├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── _static │ └── .gitignore ├── about.rst ├── api.rst ├── conf.py ├── developing_scrapers.rst ├── index.rst ├── make.bat └── using_scrapers.rst ├── pytest.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── statscraper ├── BaseScraperList.py ├── BaseScraperObject.py ├── DimensionValue.py ├── ValueList.py ├── __init__.py ├── base_scraper.py ├── compat.py ├── datatypes.py ├── datatypes │ ├── LICENSE │ ├── README.md │ ├── datatypes.csv │ └── values │ │ ├── currencies.csv │ │ ├── genders.csv │ │ ├── marital_statuses.csv │ │ ├── periods │ │ ├── academic-terms │ │ │ └── semesters.csv │ │ ├── months.csv │ │ └── quarters.csv │ │ ├── regions │ │ ├── eu.csv │ │ └── sweden │ │ │ ├── counties.csv │ │ │ └── municipalities.csv │ │ └── road_types.csv ├── exceptions.py └── scrapers │ ├── CranesScraper.py │ ├── PXWebScraper.py │ ├── SCBScraper.py │ ├── SMHIScraper.py │ ├── StatistikcentralenScraper.py │ ├── VantetiderScraper.py │ ├── VehicleScraper.py │ ├── __init__.py │ ├── uka_scraper.py │ └── work_injury_scraper.py ├── tests ├── scrapertests │ ├── test_injury_scraper.py │ ├── test_pxweb_scraper.py │ ├── test_smhi_scraper.py │ ├── test_vantetider_scraper.py │ └── test_vehicle_scraper.py ├── test-datatypes.py ├── test-scb.py ├── test_base_scraper.py ├── test_dialects.py └── test_resultset.py └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | /__init__.py 6 | *pyc 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # OS X 94 | .DS_Store 95 | geckodriver.log -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | - 2.0.2 2 | 3 | - Remove debug prints from SMHI scraper 4 | - Upgrade BeautifulSoup to work with Python 3.10+ 5 | 6 | - 2.0.1 7 | 8 | - Use https endpoint in SCB Scraper. 9 | 10 | - 2.0.0 11 | 12 | - Python 2 support deprecated. We will slowly phase out support.
13 | - Fix a bug with `DimensionValue.translate()` in Python 3. 14 | - Make `translate()` raise errors when it cannot translate. 15 | - The municipality of Gotland is now known as 'Region Gotland' (was: Gotlands kommun). 16 | - Added some useful built-in filters to the SCB scraper, to get results by e.g. municipality. 17 | - Upstream fix for typo in datatype region:sweden/municipality Vännäs kommun 18 | - The SCB scraper now raises an exception when an error message is returned 19 | - Fix a Python 3 bug in the SMHI scraper 20 | 21 | - 1.0.7 22 | 23 | - Bara kommun added to Swedish municipalities 24 | - Remove logic from SCBScraper that is already handled by BaseScraper 25 | 26 | - 1.0.6 27 | 28 | - Added dialect:skatteverket (two/four digit county/municipality codes) 29 | - Added data type for road category 30 | - Make SCB scraper treat a “Region” as, well, a region 31 | 32 | - 1.0.5 33 | - Added station key to SMHI scraper 34 | 35 | - 1.0.4 36 | - Added SMHI scraper 37 | 38 | - 1.0.3 39 | - Re-add demo scrapers that accidentally got left out in the first release 40 | 41 | - 1.0.0 42 | - First release 43 | 44 | - 1.0.0.dev2 45 | 46 | - Implement translation 47 | - Add Dataset.fetch_next() as generator for results 48 | 49 | - 1.0.0.dev1 50 | 51 | - Semantic versioning starts here 52 | - Implement datatypes and dialects 53 | 54 | - 0.0.2 55 | 56 | - Added some demo scrapers 57 | - The cursor is now moved when accessing datasets 58 | - Renamed methods for moving cursor: move_up(), move_to() 59 | - Added tests 60 | - Added datatypes subtree 61 | 62 | - 0.0.1 63 | - First version 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Journalism++ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | recursive-include statscraper/datatypes * 3 | recursive-include statscraper/scrapers *.py 4 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Statscraper is a base library for building web scrapers for statistical data, with a helper ontology for (primarily Swedish) statistical data.
A set of ready-to-use scrapers is included. 2 | 3 | For users 4 | ========= 5 | 6 | You can use Statscraper as a foundation for your next scraper, or try out any of the included scrapers. With Statscraper comes a unified interface for scraping, and some useful helper methods for scraper authors. 7 | 8 | Full documentation: ReadTheDocs_ 9 | 10 | For updates and discussion: Facebook_ 11 | 12 | By `Journalism++ Stockholm `_, and Robin Linderborg. 13 | 14 | Installing 15 | ---------- 16 | 17 | .. code:: bash 18 | 19 | pip install statscraper 20 | 21 | Using a scraper 22 | --------------- 23 | Scrapers act like “cursors” that move around a hierarchy of datasets and collections of datasets. Collections and datasets are referred to as “items”. 24 | 25 | :: 26 | 27 | ┏━ Collection ━━━ Collection ━┳━ Dataset 28 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 29 | ┗━ Collection ┣━ Dataset ┗━ Dataset 30 | ┗━ Dataset 31 | 32 | ╰─────────────────────────┬───────────────────────╯ 33 | items 34 | 35 | Here's a simple example, with a scraper that returns only a single dataset: The number of cranes spotted at Hornborgarsjön each day as scraped from `Länsstyrelsen i Västra Götalands län <http://web05.lansstyrelsen.se/transtat_O/transtat.asp>`_. 36 | 37 | .. code:: python 38 | 39 | >>> from statscraper.scrapers import Cranes 40 | 41 | >>> scraper = Cranes() 42 | >>> scraper.items # List available datasets 43 | [<Dataset: Number of cranes>] 44 | 45 | >>> dataset = scraper["Number of cranes"] 46 | >>> dataset.dimensions 47 | [<Dimension: date (Day of the month)>, <Dimension: month>, <Dimension: year>] 48 | 49 | >>> row = dataset.data[0] # first row in this dataset 50 | >>> row 51 | <Result: 7 (value)> 52 | >>> row.dict 53 | {'value': '7', u'date': u'7', u'month': u'march', u'year': u'2015'} 54 | 55 | >>> df = dataset.data.pandas # get this dataset as a Pandas dataframe 56 | 57 | Building a scraper 58 | ------------------ 59 | Scrapers are built by extending a base scraper, or a derivative of it. You need to provide a method for listing datasets or collections of datasets, and for fetching data. 60 | 61 | Statscraper is built for statistical data, meaning that it's most useful when the data you are scraping/fetching can be organized with a numerical value in each row: 62 | 63 | ======== ====== ======= 64 | city year value 65 | ======== ====== ======= 66 | Voi 2009 45483 67 | Kabarnet 2006 10191 68 | Taveta 2009 67505 69 | ======== ====== ======= 70 | 71 | A scraper can override these methods: 72 | 73 | * `_fetch_itemslist(item)` to yield collections or datasets at the current cursor position 74 | * `_fetch_data(dataset)` to yield rows from the currently selected dataset 75 | * `_fetch_dimensions(dataset)` to yield dimensions available for the currently selected dataset 76 | * `_fetch_allowed_values(dimension)` to yield allowed values for a dimension 77 | 78 | A number of hooks are available for more advanced scrapers. These are called by adding the `on` decorator to a method: 79 | 80 | .. code:: python 81 | 82 | @BaseScraper.on("up") 83 | def my_method(self): 84 | # Do something when the user moves up one level 85 | 86 | For developers 87 | ============== 88 | These instructions are for developers working on the BaseScraper. See above for instructions for developing a scraper using the BaseScraper. 89 | 90 | Downloading 91 | ----------- 92 | 93 | .. code:: bash 94 | 95 | git clone https://github.com/jplusplus/statscraper 96 | python setup.py install 97 | 98 | This repo includes `statscraper-datatypes` as a subtree. To update this, do: 99 | 100 | .. 
code:: bash 101 | 102 | git subtree pull --prefix statscraper/datatypes git@github.com:jplusplus/statscraper-datatypes.git master --squash 103 | 104 | 105 | Tests 106 | ----- 107 | 108 | Since 2.0.0 we are using pytest. To run an individual test: 109 | 110 | .. code:: bash 111 | 112 | python3 -m pytest tests/test-datatypes.py 113 | 114 | 115 | Changelog 116 | --------- 117 | The changelog has been moved to `CHANGELOG.md `_. 118 | 119 | .. _Facebook: https://www.facebook.com/groups/skrejperpark 120 | .. _ReadTheDocs: http://statscraper.readthedocs.io 121 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = statscraper 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jplusplus/statscraper/c75ed0474967c96c86f8def1223e55aebb80f631/docs/_static/.gitignore -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | About Statscraper 3 | ================= 4 | 5 | Statscraper is a base library for building web scrapers for statistical data, with a helper ontology for (primarily Swedish) statistical data. A set of ready-to-use scrapers are included. With Statscraper comes a unified interface for scraping, and some useful helper methods for scraper authors. 6 | 7 | Statscraper is developed by Jens Finnäs and Leo Wallentin from Journalism++, and Robin Linderborg from SVT Nyheter. 8 | 9 | The first stable version was released in August 2017. Statscraper is sponsored by Internetfonden/Stiftelsen för internetinfrastruktur and Journalism++ Stockholm. 10 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | API Documentation 3 | ================= 4 | 5 | Documentation of statscraper's public API. 6 | 7 | 8 | Main Interface 9 | -------------- 10 | 11 | .. autoclass:: statscraper.BaseScraper 12 | :members: 13 | .. autoclass:: statscraper.BaseScraperList 14 | :members: get 15 | .. autoclass:: statscraper.BaseScraperObject 16 | .. autoclass:: statscraper.Collection 17 | .. autoclass:: statscraper.Dataset 18 | .. autoclass:: statscraper.Dimension 19 | .. autoclass:: statscraper.DimensionList 20 | .. autoclass:: statscraper.DimensionValue 21 | .. autoclass:: statscraper.Item 22 | .. autoclass:: statscraper.Result 23 | .. autoclass:: statscraper.ResultSet 24 | .. autoclass:: statscraper.ValueList 25 | 26 | 27 | Exceptions 28 | -------------- 29 | 30 | .. 
autoclass:: statscraper.exceptions.DatasetNotInView 31 | .. autoclass:: statscraper.exceptions.InvalidData 32 | .. autoclass:: statscraper.exceptions.InvalidID 33 | .. autoclass:: statscraper.exceptions.NoSuchDatatype 34 | .. autoclass:: statscraper.exceptions.NoSuchItem -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from version import long_version, short_version, name, copyright, authors, short_desc 9 | 10 | # 11 | # statscraper documentation build configuration file, created by 12 | # sphinx-quickstart on Sun Mar 12 19:20:49 2017. 13 | # 14 | # This file is execfile()d with the current directory set to its 15 | # containing dir. 16 | # 17 | # Note that not all possible configuration values are present in this 18 | # autogenerated file. 19 | # 20 | # All configuration values have a default; values that are commented out 21 | # serve to show the default. 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # 27 | # import os 28 | # import sys 29 | # sys.path.insert(0, os.path.abspath('.')) 30 | 31 | 32 | # -- General configuration ------------------------------------------------ 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = ['sphinx.ext.autodoc'] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = name 57 | copyright = copyright 58 | author = authors 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = short_version 66 | # The full version, including alpha/beta/rc tags. 67 | release = long_version 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This patterns also effect to html_static_path and html_extra_path 79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'friendly' 83 | 84 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
85 | todo_include_todos = False 86 | 87 | 88 | # -- Options for HTML output ---------------------------------------------- 89 | 90 | # The theme to use for HTML and HTML Help pages. See the documentation for 91 | # a list of builtin themes. 92 | # 93 | html_theme = 'alabaster' 94 | 95 | # Theme options are theme-specific and customize the look and feel of a theme 96 | # further. For a list of options available for each theme, see the 97 | # documentation. 98 | # 99 | # html_theme_options = {} 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ['_static'] 105 | 106 | html_sidebars = { 107 | '**': ['globaltoc.html', 'sourcelink.html', 'searchbox.html'], 108 | } 109 | 110 | 111 | # -- Options for HTMLHelp output ------------------------------------------ 112 | 113 | # Output file base name for HTML help builder. 114 | htmlhelp_basename = 'statscraperdoc' 115 | 116 | 117 | # -- Options for LaTeX output --------------------------------------------- 118 | 119 | latex_elements = { 120 | # The paper size ('letterpaper' or 'a4paper'). 121 | # 122 | # 'papersize': 'letterpaper', 123 | 124 | # The font size ('10pt', '11pt' or '12pt'). 125 | # 126 | # 'pointsize': '10pt', 127 | 128 | # Additional stuff for the LaTeX preamble. 129 | # 130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 134 | # 'figure_align': 'htbp', 135 | } 136 | 137 | # Grouping the document tree into LaTeX files. List of tuples 138 | # (source start file, target name, title, 139 | # author, documentclass [howto, manual, or own class]). 140 | latex_documents = [ 141 | (master_doc, 'statscraper.tex', 'statscraper Documentation', 142 | authors, 'manual'), 143 | ] 144 | 145 | 146 | # -- Options for manual page output --------------------------------------- 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [ 151 | (master_doc, name, 'statscraper Documentation', 152 | [author], 1) 153 | ] 154 | 155 | 156 | # -- Options for Texinfo output ------------------------------------------- 157 | 158 | # Grouping the document tree into Texinfo files. List of tuples 159 | # (source start file, target name, title, author, 160 | # dir menu entry, description, category) 161 | texinfo_documents = [ 162 | (master_doc, name, 'statscraper Documentation', 163 | author, name, short_desc, 'Miscellaneous'), 164 | ] 165 | -------------------------------------------------------------------------------- /docs/developing_scrapers.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Developing scrapers 3 | =================== 4 | 5 | The scraper can navigate though an hierarchy of collections and datasets. Collections and datasets are refered to as “items”. 6 | 7 | :: 8 | 9 | ┏━ Collection ━━━ Collection ━┳━ Dataset 10 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 11 | ┗━ Collection ┣━ Dataset ┗━ Dataset 12 | ┗━ Dataset 13 | 14 | ╰─────────────────────────┬───────────────────────╯ 15 | items 16 | 17 | 18 | Scrapers are built by extending the BaseScraper class, or a subclass of it. Every scraper must override the methods :code:`_fetch_itemslist` and :code:`_fetch_data`: 19 | 20 | * :code:`_fetch_itemslist(self, item)` must yield items at the current position. 
21 | * :code:`_fetch_data(self, dataset, query)` must yield rows from a dataset. 22 | 23 | Other methods that a scraper can choose to override are: 24 | 25 | * :code:`_fetch_dimensions(self, dataset)` should yield dimensions available on a dataset. 26 | * :code:`_fetch_allowed_values(self, dimension)` should yield allowed values for a dimension. 27 | 28 | A number of hooks are available for more advanced scrapers. These are called by adding the :code:`on` decorator to a method: 29 | 30 | .. code:: python 31 | 32 | @BaseScraper.on("up") 33 | def my_method(self): 34 | # Do something when the cursor moves up one level 35 | 36 | Check out the `statscraper/scrapers <https://github.com/jplusplus/statscraper/tree/master/statscraper/scrapers>`_ directory for some scraper examples. 37 | 38 | Below is the full code for the CranesScraper scraper used in the chapter `Using Scrapers `_: 39 | 40 | .. code:: python 41 | 42 | # encoding: utf-8 43 | """ A scraper to fetch daily cranes sightings at Hornborgasjön 44 | from http://web05.lansstyrelsen.se/transtat_O/transtat.asp 45 | This is intended to be a minimal example of a scraper 46 | using Beautiful Soup. 47 | """ 48 | import requests 49 | from bs4 import BeautifulSoup 50 | from statscraper import BaseScraper, Dataset, Dimension, Result 51 | 52 | 53 | class Cranes(BaseScraper): 54 | 55 | def _fetch_itemslist(self, item): 56 | """ There is only one dataset. """ 57 | yield Dataset("Number of cranes") 58 | 59 | def _fetch_dimensions(self, dataset): 60 | """ Declaring available dimensions like this is not mandatory, 61 | but nice, especially if they differ from dataset to dataset. 62 | 63 | If you are using a built-in datatype, you can specify the dialect 64 | you are expecting, to have values normalized. This scraper will 65 | look for Swedish month names (e.g. 'Januari'), but return them 66 | according to the Statscraper standard ('january'). 67 | """ 68 | yield Dimension(u"date", label="Day of the month") 69 | yield Dimension(u"month", datatype="month", dialect="swedish") 70 | yield Dimension(u"year", datatype="year") 71 | 72 | def _fetch_data(self, dataset, query=None): 73 | html = requests.get("http://web05.lansstyrelsen.se/transtat_O/transtat.asp").text 74 | soup = BeautifulSoup(html, 'html.parser') 75 | table = soup.find("table", "line").find_all("table")[2].findNext("table") 76 | rows = table.find_all("tr") 77 | column_headers = rows.pop(0).find_all("td", recursive=False) 78 | years = [x.text for x in column_headers[2:]] 79 | for row in rows: 80 | cells = row.find_all("td") 81 | date = cells.pop(0).text 82 | month = cells.pop(0).text 83 | i = 0 84 | for value in cells: 85 | # Each column from here is a year. 86 | if value.text: 87 | yield Result(value.text.encode("utf-8"), { 88 | "date": date, 89 | "month": month, 90 | "year": years[i], 91 | }) 92 | i += 1 93 | 94 | ----- 95 | Hooks 96 | ----- 97 | Some scrapers might need to execute certain tasks as the user moves around the items tree. There are a number of hooks that can be used to run code in response to an event. A scraper class method is attached to a hook by using the :code:`BaseScraper.on` decorator, with the name of the hook as the only argument. Here is an example of a hook in a Selenium-based scraper, used to refresh the browser each time the end user navigates to the top-most collection. 98 | 99 | .. 
code:: python 100 | 101 | @BaseScraper.on("top") 102 | def refresh_browser(self): 103 | """ Refresh browser, to reset all forms """ 104 | self.browser.refresh() 105 | 106 | Available hooks are: 107 | 108 | * init: Called when initiating the class 109 | * up: Called when trying to go up one level (even if the scraper failed moving up) 110 | * top: Called when moving to top level 111 | * select: Called when trying to move to a specific Collection or Dataset. The target item will be provided as an argument to the function. 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =========================================== 2 | Statscraper: Standardizing Swedish scrapers 3 | =========================================== 4 | 5 | **Statscraper** provides a common set of guidelines, base classes and standards for writing scrapers for Swedish agencies' websites. Scrapers that comply with these standards provide a unified abstraction layer to the end-user, in terms of both usage and data output. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | about 12 | using_scrapers 13 | developing_scrapers 14 | api 15 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=statscraper 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/using_scrapers.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Using scrapers 3 | ============== 4 | 5 | Every scraper built on Statscraper shares the same interface towards the user. Here's sample code using one of the included demo scrapers, to fetch the number of cranes spotted at Hornborgarsjön each day from `Länsstyrelsen i Västra Götalands län <http://web05.lansstyrelsen.se/transtat_O/transtat.asp>`_: 6 | 7 | .. 
code:: python 8 | 9 | >>> from statscraper.scrapers import Cranes 10 | 11 | >>> scraper = Cranes() 12 | >>> scraper.items # List available datasets 13 | [<Dataset: Number of cranes>] 14 | 15 | >>> dataset = scraper["Number of cranes"] 16 | >>> dataset.dimensions 17 | [<Dimension: date (Day of the month)>, <Dimension: month>, <Dimension: year>] 18 | 19 | >>> row = dataset.data[0] # first row in this dataset 20 | >>> row 21 | <Result: 7 (value)> 22 | >>> row.dict 23 | {'value': '7', u'date': u'7', u'month': u'march', u'year': u'2015'} 24 | >>> row.int 25 | 7 26 | >>> row.tuple 27 | ('7', {u'date': u'7', u'month': u'march', u'year': u'2015'}) 28 | 29 | >>> df = dataset.data.pandas # get this dataset as a Pandas dataframe 30 | 31 | 32 | Exploring sites 33 | --------------- 34 | Scrapers act like “cursors” that move around a hierarchy of datasets and collections of datasets. Collections and datasets are referred to as “items”. 35 | 36 | :: 37 | 38 | ┏━ Collection ━━━ Collection ━┳━ Dataset 39 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 40 | ┗━ Collection ┣━ Dataset ┗━ Dataset 41 | ┗━ Dataset 42 | 43 | ╰─────────────────────────┬───────────────────────╯ 44 | items 45 | 46 | The cursor is moved around the item tree as needed when you access properties or data, but you can also move manually around the items, if you want to be in full control. Some scrapers, e.g. those that need to fill out and post forms, or handle session data, might require that you move the cursor around manually. For most simple scrapers, e.g. those accessing an API, this should not be necessary. 47 | 48 | Moving the cursor manually: 49 | 50 | .. code:: python 51 | 52 | >>> from statscraper.scrapers import PXWeb 53 | 54 | >>> scraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 55 | >>> scraper.items 56 | [, , ...] 57 | 58 | >>> scraper.move_to("vrm").move_to("synt").move_to("080_synt_tau_203.px") 59 | >>> scraper.current_item 60 | 61 | 62 | >>> scraper.move_up() 63 | >>> scraper.current_item 64 | 65 | >>> scraper.move_to("010_synt_tau_101.px") 66 | >>> scraper.current_item 67 | 68 | 69 | >>> scraper.move_to_top() 70 | >>> scraper.move_to(0) # Moving by index works too 71 | 72 | 73 | The datasets above could also be accessed like this: 74 | 75 | .. code:: python 76 | 77 | >>> from statscraper.scrapers import PXWeb 78 | 79 | >>> scraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 80 | 81 | >>> collection = scraper["vrm"]["synt"] 82 | >>> collection 83 | 84 | 85 | >>> dataset_1 = collection["080_synt_tau_203.px"] 86 | >>> dataset_2 = collection["010_synt_tau_101.px"] 87 | 88 | At any given point, :code:`scraper["foo"]` is shorthand for :code:`scraper.current_item.items["foo"]`. 89 | 90 | If you want to loop through every available dataset a scraper can offer, there is a :code:`Scraper.descendants` property that will recursively move to every item in the tree. Here is an example that will find all datasets in the SCB API that have monthly data: 91 | 92 | .. code:: python 93 | 94 | >>> from statscraper.scrapers import SCB 95 | 96 | >>> scraper = SCB() 97 | >>> for dataset in scraper.descendants: 98 | >>> if dataset.dimensions["Tid"].label == u"månad": 99 | >>> print("Ahoy! Dataset %s has monthly data!" % dataset) 100 | 101 | Exploring datasets 102 | ------------------ 103 | 104 | Much like itemslists (:code:`Collection.items`), datasets are only fetched when you are inspecting or interacting with them. 105 | 106 | The actual data is stored in a property called data: 107 | 108 | .. 
code:: python 109 | 110 | >>> from statscraper.scrapers import Cranes 111 | 112 | >>> scraper = Cranes() 113 | >>> dataset = scraper.items[0] 114 | >>> for row in dataset.data: 115 | >>> print("%s cranes were spotted on %s" % (row.value, row["date"])) 116 | 117 | The data property will hold a list of result objects. The list can be converted to a few other formats, e.g. a pandas dataframe: 118 | 119 | .. code:: python 120 | 121 | >>> from statscraper.scrapers import Cranes 122 | 123 | >>> scraper = Cranes() 124 | >>> dataset = scraper.items[0] 125 | >>> df = dataset.data.pandas # convert to pandas dataframe 126 | 127 | If you want to query a site or database for some subset of the available data, you can use the :code:`fetch()` method on the dataset (or on the scraper, to fetch data from the current position, if any): 128 | 129 | .. code:: python 130 | 131 | >>> dataset = scraper.items[0] 132 | >>> data = dataset.fetch(query={'year': "2017"}) 133 | 134 | or 135 | 136 | .. code:: python 137 | 138 | >>> scraper.move_to(0) 139 | >>> data = scraper.fetch(query={'year': "2017"}) 140 | 141 | Available dimensions can be inspected through the .dimensions property: 142 | 143 | .. code:: python 144 | 145 | >>> dataset.dimensions 146 | [, ] 147 | 148 | Note however that a scraper does not necessarily need to provide dimensions. If :code:`Dataset.dimensions` is None, it could simply mean that the scraper itself is not sure what to expect from the data. 149 | 150 | Dialects 151 | -------- 152 | 153 | Scraper authors can use the included :code:`Datatypes` module to have a standardised ontology for common statistical dimensions. If a dimension uses a built-in datatype, it can be translated to a different dialect. For instance, Swedish municipalities come in the following dialects: 154 | 155 | - :code:`short`: :code:`"Ale"` 156 | - :code:`numerical`: :code:`"1440"` 157 | - :code:`wikidata`: :code:`"Q498470"` 158 | - :code:`brå`: :code:`"8617"` 159 | - :code:`scb`: :code:`"1440 Ale kommun"` 160 | 161 | By default, Statscraper prefers human-readable representations, and municipality values are internally stored like this: :code:`u"Borås kommun"`. The philosophy here is that human-readable ids speed up debugging and make it easy to spot errors during scraping and analysis. Yes, we do use Unicode for ids. It's 2017 after all. 162 | 163 | .. code:: python 164 | 165 | >>> from statscraper.scrapers import Cranes 166 | 167 | >>> scraper = Cranes() 168 | >>> data = scraper.items[0].data 169 | >>> row = data[0] 170 | >>> row["month"] 171 | <DimensionValue: march (month)> 172 | >>> row["month"].translate("swedish") 173 | u'mars' 174 | 175 | For available datatypes, domains, values and dialects, see the `statscraper-datatypes repo <https://github.com/jplusplus/statscraper-datatypes>`_. 
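You can also inspect a datatype and its dialects directly, through the :code:`Datatype` class that statscraper exports. A minimal sketch — the datatype id :code:`municipality` and the exact output shown are assumptions here; see datatypes.csv in the statscraper-datatypes repo for the authoritative ids, values and dialects:

.. code:: python

    >>> from statscraper import Datatype

    >>> datatype = Datatype("municipality")  # id assumed; datatype ids live in datatypes.csv
    >>> datatype.dialects  # output is illustrative
    ['short', 'numerical', 'wikidata', 'brå', 'scb', 'skatteverket']
    >>> datatype.allowed_values["Ale kommun"].dialects["numerical"]
    ['1440']

Each allowed value carries a dict mapping dialect names to lists of representations, which is what :code:`DimensionValue.translate()` uses under the hood.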
176 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | norecursedirs = scrapertests 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | pandas==0.25.3 3 | requests==2.28.2 4 | six==1.11.0 5 | Sphinx==1.6.7 6 | pytest==5.3.5 7 | 8 | # Scraper dependencies 9 | beautifulsoup4==4.11.1 10 | selenium==3.9.0 11 | xlrd==1.0.0 12 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 0 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | 7 | [flake8] 8 | max-line-length = 90 9 | 10 | [options] 11 | python_requires = >=3.6 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from version import version, name, authors, email, short_desc 3 | 4 | 5 | def readme(): 6 | """Import README for use as long_description.""" 7 | with open("README.rst") as f: 8 | return f.read() 9 | 10 | 11 | setup( 12 | name=name, 13 | version=version, 14 | description=short_desc, 15 | long_description=readme(), 16 | url="https://github.com/jplusplus/statscraper", 17 | author=authors, 18 | author_email=email, 19 | license="MIT", 20 | packages=["statscraper"], 21 | zip_safe=False, 22 | install_requires=[ 23 | "pandas", 24 | "six", 25 | "requests", 26 | ], 27 | include_package_data=True, 28 | download_url="https://github.com/jplusplus/skrejperpark/archive/%s.tar.gz" 29 | % version, 30 | ) 31 | -------------------------------------------------------------------------------- /statscraper/BaseScraperList.py: -------------------------------------------------------------------------------- 1 | import six 2 | from .compat import unicode 3 | from .exceptions import NoSuchItem 4 | 5 | 6 | class BaseScraperList(list): 7 | """ Lists of dimensions, values, etc. all inherit this class 8 | for some common convenience methods, such as get_by_label() 9 | """ 10 | 11 | _CONTAINS = object 12 | 13 | def get(self, key): 14 | """Provide alias for bracket notation.""" 15 | return self[key] 16 | 17 | def get_by_label(self, label): 18 | """ Return the first item with a specific label, 19 | or None. 
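E.g. dataset.dimensions.get_by_label("Day of the month") returns the date dimension of the bundled Cranes demo scraper.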
20 | """ 21 | return next((x for x in self if x.label == label), None) 22 | 23 | def __getitem__(self, key): 24 | """ Make it possible to get item by id or value identity.""" 25 | if isinstance(key, six.string_types): 26 | if isinstance(key, unicode): 27 | def f(x): 28 | return (x.id == key) 29 | else: 30 | def f(x): 31 | return (x.id == unicode(key, encoding="utf-8")) 32 | elif isinstance(key, self._CONTAINS): 33 | def f(x): 34 | return (x is key) 35 | else: 36 | return list.__getitem__(self, key) 37 | 38 | try: 39 | return next(iter(filter(f, self))) 40 | except StopIteration: 41 | # No such item 42 | raise NoSuchItem("No such %s: %s" % (self._CONTAINS.__name__, key)) 43 | 44 | def __contains__(self, item): 45 | """ Make the 'in' keyword check for value/id """ 46 | if isinstance(item, six.string_types): 47 | return bool(len(list(filter(lambda x: x.value == item, self)))) 48 | else: 49 | return super(BaseScraperList, self).__contains__(item) -------------------------------------------------------------------------------- /statscraper/BaseScraperObject.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | class BaseScraperObject(object): 5 | """ Objects like items, dimensions, values etc all inherit 6 | this class. BaseScraperObjects are typicalliy stored in a 7 | BaseScraperList. 8 | """ 9 | 10 | def get(self, key): 11 | """Provide alias for bracket notation.""" 12 | return self[key] 13 | 14 | @property 15 | def value(self): 16 | """ This is the value used for testing membership, 17 | comparison, etc. Overloaded for classes that store 18 | a value separate from the id, e.g. DimensionValue, 19 | that might have something like {id: 'year', value: 2017} 20 | """ 21 | if hasattr(self, '_value'): 22 | return self._value 23 | else: 24 | return self.id 25 | 26 | @value.setter 27 | def value(self, value): 28 | """ This is the value used for testing membership, 29 | comparison, etc. Overloaded for classes that store 30 | a value separate from the id, e.g. 
DimensionValue, 31 | that might have something like {id: 'year', value: 2017} 32 | """ 33 | self._value = value 34 | 35 | def __eq__(self, other): 36 | """ Enable equality check by string """ 37 | if self is other: 38 | return True 39 | elif isinstance(other, six.string_types): 40 | return (self.value == other) 41 | else: 42 | return super(BaseScraperObject, self) == other 43 | 44 | def __nonzero__(self): 45 | """ Make nonzero check value """ 46 | return bool(self.value) 47 | 48 | def __len__(self): 49 | """ Make len check value """ 50 | return len(self.value) 51 | 52 | def __int__(self): 53 | """ Make int return value """ 54 | return int(self.value) 55 | 56 | def __str__(self): 57 | if isinstance(self.value, six.string_types): 58 | try: 59 | if six.PY2: 60 | return self.value.encode("utf-8") 61 | else: 62 | return self.value 63 | except (UnicodeEncodeError, UnicodeDecodeError): 64 | return self.value 65 | else: 66 | return str(self.value) 67 | 68 | def __repr__(self): 69 | if self.label is None: 70 | label = self.id 71 | else: 72 | label = self.label.encode("utf-8") if six.PY2 else self.label # encode only on Python 2; bytes would garble the repr on Python 3 73 | if str(self) != str(label): 74 | return '<%s: %s (%s)>' % (type(self).__name__, 75 | str(self), 76 | label) 77 | else: 78 | return '<%s: %s>' % (type(self).__name__, 79 | str(self)) 80 | -------------------------------------------------------------------------------- /statscraper/DimensionValue.py: -------------------------------------------------------------------------------- 1 | """This file contains a class representing a value in a dataset.""" 2 | from .BaseScraperObject import BaseScraperObject 3 | 4 | 5 | class DimensionValue(BaseScraperObject): 6 | """The value for a dimension inside a Resultset.""" 7 | 8 | def __init__(self, value, dimension, label=None): 9 | """Value can be any type. dimension is a Dimension() object.""" 10 | self.value = value 11 | self._dimension = dimension 12 | self._label = label 13 | self._id = dimension.id 14 | 15 | @property 16 | def id(self): 17 | return self._id 18 | 19 | @id.setter 20 | def id(self, value): 21 | self._id = value 22 | 23 | @property 24 | def label(self): 25 | return self._label 26 | 27 | @label.setter 28 | def label(self, value): 29 | self._label = value 30 | 31 | @property 32 | def dimension(self): 33 | return self._dimension 34 | 35 | @dimension.setter 36 | def dimension(self, value): 37 | self._dimension = value 38 | 39 | def translate(self, dialect): 40 | """Translate this value to a different dialect.""" 41 | if self.dimension.datatype is None: 42 | raise Exception(f"""\ 43 | A value must belong to a dimension of a specific datatype to be translated. \ 44 | {self.dimension} does not have a datatype.""") 45 | dt = self.dimension.datatype 46 | if self.value not in dt.allowed_values: 47 | raise Exception(f"""\ 48 | {self.value} is not an allowed value for this datatype, and cannot be translated.""") 49 | 50 | translations = dt.allowed_values[self.value] 51 | translation = ",".join([x.replace(",", "\\,") 52 | for x in translations.dialects[dialect]]) 53 | return translation 54 | -------------------------------------------------------------------------------- /statscraper/ValueList.py: -------------------------------------------------------------------------------- 1 | import six 2 | from .compat import unicode 3 | from .BaseScraperList import BaseScraperList 4 | from .DimensionValue import DimensionValue 5 | from .exceptions import NoSuchItem 6 | 7 | class ValueList(BaseScraperList): 8 | """A list of dimension values. 9 | 10 | allowed_values uses this class, to allow checking membership. 
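E.g. "march" in some_dimension.allowed_values compares the string against each DimensionValue's value.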
11 | """ 12 | 13 | def __getitem__(self, key): 14 | """Make it possible to get value by value or value identity.""" 15 | if isinstance(key, six.string_types): 16 | if isinstance(key, unicode): 17 | def f(x): 18 | return (x.value == key) 19 | else: 20 | def f(x): 21 | return (x.value == unicode(key, encoding="utf-8")) 22 | elif isinstance(key, DimensionValue): 23 | def f(x): 24 | return (x is key) 25 | else: 26 | return list.__getitem__(self, key) 27 | try: 28 | val = next(iter(filter(f, self))) 29 | return val 30 | except IndexError: 31 | # No such id 32 | raise NoSuchItem("No such value") 33 | 34 | def __contains__(self, item): 35 | """ in should look for value, not id. """ 36 | if isinstance(item, six.string_types): 37 | return bool(len(list(filter(lambda x: x.value == item, self)))) 38 | else: 39 | return super(ValueList, self).__contains__(item) 40 | -------------------------------------------------------------------------------- /statscraper/__init__.py: -------------------------------------------------------------------------------- 1 | # Exceptions 2 | from .exceptions import * 3 | 4 | # Classes 5 | from .DimensionValue import DimensionValue 6 | from .BaseScraperList import BaseScraperList 7 | from .BaseScraperObject import BaseScraperObject 8 | from .ValueList import ValueList 9 | from .datatypes import Datatype 10 | from .base_scraper import (BaseScraper, Item, Collection, Dataset, Result, 11 | ResultSet, ItemList, Dimension, DimensionList) 12 | 13 | # Contants 14 | from .base_scraper import ROOT, TYPE_DATASET, TYPE_COLLECTION 15 | -------------------------------------------------------------------------------- /statscraper/base_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | u""" 4 | This file contains the base class for scrapers. The scraper can navigate 5 | though an hierarchy of collections and datasets. Collections and datasets 6 | are refered to as “items”. 7 | 8 | ┏━ Collection ━━━ Collection ━┳━ Dataset 9 | ROOT ━╋━ Collection ━┳━ Dataset ┣━ Dataset 10 | ┗━ Collection ┣━ Dataset ┗━ Dataset 11 | ┗━ Dataset 12 | 13 | ╰───────────────────────┬─────────────────────╯ 14 | items 15 | 16 | A scraper can override three methods: 17 | * _fetch_itemslist(item) yields items at the current position 18 | * _fetch_dimensions(dataset) yields dimensions available on a dataset 19 | * _fetch_data(dataset) syield rows from a dataset 20 | 21 | A number of hooks are avaiable for more advanced scrapers. 
These are called 22 | by adding the on decorator on a method: 23 | 24 | @BaseScraper.on("up") 25 | def my_method(self): 26 | # Do something when the cusor moves up one level 27 | 28 | """ 29 | import six 30 | from hashlib import md5 31 | from json import dumps 32 | import pandas as pd 33 | from collections import deque 34 | from copy import copy 35 | from .exceptions import NoSuchItem, InvalidID 36 | from .datatypes import Datatype 37 | from .BaseScraperObject import BaseScraperObject 38 | from .BaseScraperList import BaseScraperList 39 | from .DimensionValue import DimensionValue 40 | from .ValueList import ValueList 41 | 42 | if six.PY3: 43 | unicode = str 44 | 45 | try: 46 | from itertools import ifilter as filter 47 | except ImportError: 48 | pass 49 | 50 | TYPE_DATASET = "Dataset" 51 | TYPE_COLLECTION = "Collection" 52 | ROOT = "" # Special id for root position 53 | VALUE_KEY = "value" # key/column holding the value of a result or dimension 54 | """ Constants for item types and id's """ 55 | 56 | 57 | class ResultSet(list): 58 | """The result of a dataset query. 59 | 60 | This is essentially a list of Result objects. 61 | """ 62 | 63 | _pandas = None 64 | dataset = None 65 | 66 | @property 67 | def list_of_dicts(self): 68 | """Return a list of dictionaries, with the key "value" for values.""" 69 | return [dict(x) for x in self] 70 | 71 | @property 72 | def pandas(self): 73 | """Return a Pandas dataframe.""" 74 | if self._pandas is None: 75 | self._pandas = pd.DataFrame().from_records(self.list_of_dicts) 76 | return self._pandas 77 | 78 | def translate(self, dialect): 79 | """Return a copy of this ResultSet in a different dialect.""" 80 | new_resultset = copy(self) 81 | new_resultset.dialect = dialect 82 | 83 | for result in new_resultset: 84 | for dimensionvalue in result.dimensionvalues: 85 | dimensionvalue.value = dimensionvalue.translate(dialect) 86 | return new_resultset 87 | 88 | def append(self, val): 89 | """Connect any new results to the resultset. 90 | 91 | This is where all the heavy lifting is done for creating results: 92 | - We add a datatype here, so that each result can handle 93 | validation etc independently. This is so that scraper authors 94 | don't need to worry about creating and passing around datatype objects. 95 | - As the scraper author yields result objects, we append them to 96 | a resultset. 97 | - This is also where we normalize dialects. 98 | """ 99 | val.resultset = self 100 | val.dataset = self.dataset 101 | 102 | # Check result dimensions against available dimensions for this dataset 103 | if val.dataset: 104 | dataset_dimensions = self.dataset.dimensions 105 | for k, v in val.raw_dimensions.items(): 106 | if k not in dataset_dimensions: 107 | d = Dimension(k) 108 | else: 109 | d = dataset_dimensions[k] 110 | 111 | # Normalize if we have a datatype and a foreign dialect 112 | normalized_value = unicode(v) 113 | if d.dialect and d.datatype: 114 | if d.dialect in d.datatype.dialects: 115 | for av in d.allowed_values: 116 | # Not all allowed_value have all dialects 117 | if unicode(v) in av.dialects.get(d.dialect, []): 118 | normalized_value = av.value 119 | # Use first match 120 | # We do not support multiple matches 121 | # This is by design. 
122 | break 123 | 124 | # Create DimensionValue object 125 | if isinstance(v, DimensionValue): 126 | dim = v 127 | v.value = normalized_value 128 | else: 129 | if k in dataset_dimensions: 130 | dim = DimensionValue(normalized_value, d) 131 | else: 132 | dim = DimensionValue(normalized_value, Dimension()) 133 | 134 | val.dimensionvalues.append(dim) 135 | 136 | # Add last list of dimension values to the ResultSet 137 | # They will usually be the same for each result 138 | self.dimensionvalues = val.dimensionvalues 139 | 140 | super(ResultSet, self).append(val) 141 | 142 | 143 | class DimensionList(BaseScraperList): 144 | """A one dimensional list of dimensions.""" 145 | 146 | pass 147 | 148 | 149 | class Result(BaseScraperObject): 150 | u"""A “row” in a result. 151 | 152 | A result contains a numerical value, 153 | and optionally a set of dimensions with values. 154 | """ 155 | 156 | def __init__(self, value, dimensions={}): 157 | """Value is supposed, but not strictly required to be numerical.""" 158 | self.value = value 159 | self.label = VALUE_KEY 160 | self.raw_dimensions = dimensions 161 | self.dimensionvalues = DimensionList() 162 | 163 | def __getitem__(self, key): 164 | """ Make it possible to get dimensions by name. """ 165 | if isinstance(key, six.string_types): 166 | return self.dimensionvalues[key] 167 | else: 168 | return list.__getitem__(self, key) 169 | 170 | def __iter__(self): 171 | """ dict representation is like: 172 | {value: 123, dimension_1: "foo", dimension_2: "bar"} 173 | """ 174 | yield (VALUE_KEY, self.value) 175 | for dv in self.dimensionvalues: 176 | yield (dv.id, 177 | dv.value) 178 | 179 | @property 180 | def dict(self): 181 | return dict(self) 182 | 183 | @property 184 | def int(self): 185 | return int(self) 186 | 187 | @property 188 | def str(self): 189 | return str(int(self)) 190 | 191 | @property 192 | def tuple(self): 193 | """ Tuple conversion to (value, dimensions), e.g.: 194 | (123, {dimension_1: "foo", dimension_2: "bar"}) 195 | """ 196 | return (self.value, {dv.id: dv.value for dv in self.dimensionvalues}) 197 | 198 | 199 | class Dimension(BaseScraperObject): 200 | """A dimension in a dataset.""" 201 | 202 | def __init__(self, id_=None, label=None, 203 | allowed_values=None, datatype=None, 204 | dialect=None, domain=None): 205 | """A single dimension. 206 | 207 | If allowed_values are specified, they will override any 208 | allowed values for the datatype 209 | """ 210 | if id_ is None: 211 | id_ = "default" 212 | if id_ == VALUE_KEY: 213 | raise InvalidID("'%s' is not a valid Dimension id." % VALUE_KEY) 214 | self.id = id_ 215 | self._allowed_values = None 216 | self.datatype = None 217 | if label is None: 218 | self.label = id_ 219 | else: 220 | self.label = label 221 | if datatype: 222 | self.datatype = Datatype(datatype) 223 | self._allowed_values = self.datatype.allowed_values 224 | self.dialect = dialect 225 | if allowed_values: 226 | # Override allowed values from datatype, if any 227 | # 228 | # If allowed values is given as a list of values, create 229 | # value objects using an empty dimension. 
230 | self._allowed_values = ValueList() 231 | for val in allowed_values: 232 | if isinstance(val, DimensionValue): 233 | self._allowed_values.append(val) 234 | else: 235 | self._allowed_values.append(DimensionValue(val, 236 | Dimension()) 237 | ) 238 | 239 | @property 240 | def allowed_values(self): 241 | """Return a list of allowed values.""" 242 | if self._allowed_values is None: 243 | self._allowed_values = ValueList() 244 | for val in self.scraper._fetch_allowed_values(self): 245 | if isinstance(val, DimensionValue): 246 | self._allowed_values.append(val) 247 | else: 248 | self._allowed_values.append(DimensionValue(val, 249 | Dimension())) 250 | return self._allowed_values 251 | 252 | 253 | class ItemList(BaseScraperList): 254 | """A one dimensional list of items. 255 | 256 | Has some convenience getters and setters for scrapers 257 | """ 258 | 259 | @property 260 | def type(self): 261 | """Check if this is a list of Collections or Datasets.""" 262 | try: 263 | return self[0].type 264 | except IndexError: 265 | return None 266 | 267 | def empty(self): 268 | """Empty this list (delete all contents).""" 269 | del self[:] 270 | return self 271 | 272 | def append(self, val): 273 | """Connect any new items to the scraper.""" 274 | val.scraper = self.scraper 275 | val._collection_path = copy(self.collection._collection_path) 276 | val._collection_path.append(val) 277 | super(ItemList, self).append(val) 278 | 279 | 280 | class Item(BaseScraperObject): 281 | """Common base class for collections and datasets.""" 282 | 283 | # These are populated when added to an itemlist 284 | parent = None # Parent item 285 | _items = None # ItemList with children 286 | _collection_path = None # All ancestors 287 | 288 | def __init__(self, id_, label=None, blob=None): 289 | """Use blob to store any custom data.""" 290 | self.id = id_ 291 | self.blob = blob 292 | if label is None: 293 | self.label = id_ 294 | else: 295 | self.label = label 296 | self._collection_path = deque([self]) # Will be overwritten when attached to an ItemList 297 | 298 | def _move_here(self): 299 | """Move the cursor to this item.""" 300 | cu = self.scraper.current_item 301 | # Already here? 302 | if self is cu: 303 | return 304 | # A child? 305 | if cu.items and self in cu.items: 306 | self.scraper.move_to(self) 307 | return 308 | # A parent? 309 | if self is cu.parent: 310 | self.scraper.move_up() 311 | # A sibling? 312 | if self.parent and self in self.parent.items: 313 | self.scraper.move_up() 314 | self.scraper.move_to(self) 315 | return 316 | # Last resort: Move to top and all the way down again 317 | self.scraper.move_to_top() 318 | for step in self.path: 319 | self.scraper.move_to(step) 320 | 321 | @property 322 | def path(self): 323 | """All named collections above, including the current, but not root.""" 324 | steps = list(self._collection_path) 325 | steps.pop(0) 326 | return steps 327 | 328 | @property 329 | def type(self): 330 | """Check if this is a Collection or Dataset.""" 331 | try: 332 | if isinstance(self, Collection): 333 | return TYPE_COLLECTION 334 | else: 335 | return TYPE_DATASET 336 | except IndexError: 337 | return None 338 | 339 | 340 | class Collection(Item): 341 | """A collection can contain other collections or datasets. 342 | 343 | Collections are the non-leaf nodes of a scraper's item tree.
344 | 345 | Basic Usage:: 346 | 347 | >>> from statscraper import Collection 348 | >>> c = Collection() 349 | 350 | """ 351 | 352 | @property 353 | def is_root(self): 354 | """Check if root element.""" 355 | if self.id == ROOT: 356 | return True 357 | else: 358 | return None 359 | 360 | @property 361 | def items(self): 362 | """ItemList of children.""" 363 | if self.scraper.current_item is not self: 364 | self._move_here() 365 | 366 | if self._items is None: 367 | self._items = ItemList() 368 | self._items.scraper = self.scraper 369 | self._items.collection = self 370 | for i in self.scraper._fetch_itemslist(self): 371 | i.parent = self 372 | if i.type == TYPE_DATASET and i.dialect is None: 373 | i.dialect = self.scraper.dialect 374 | self._items.append(i) 375 | return self._items 376 | 377 | def __getitem__(self, key): 378 | """Provide bracket notation. 379 | 380 | collection["abc"] is shorthand for collection.items["abc"] 381 | """ 382 | if self.scraper.current_item is not self: 383 | self._move_here() 384 | try: 385 | return self.items[key] 386 | except IndexError: 387 | # No such id 388 | raise NoSuchItem("No such item in Collection") 389 | 390 | def get(self, key): 391 | """Provide alias for bracket notation.""" 392 | return self[key] 393 | 394 | 395 | class Dataset(Item): 396 | """A dataset. Can be empty.""" 397 | 398 | _data = None # We store one ResultSet for each unique query 399 | _dimensions = None 400 | dialect = None 401 | query = None 402 | 403 | def __init__(self, id_, label=None, blob=None): 404 | super(Dataset, self).__init__(id_, label, blob) 405 | self._data = {} 406 | 407 | @property 408 | def items(self): 409 | """A dataset has no children.""" 410 | return None 411 | 412 | @property 413 | def _hash(self): 414 | """Return a hash for the current query. 415 | 416 | This hash is _not_ a unique representation of the dataset! 417 | """ 418 | dump = dumps(self.query, sort_keys=True) 419 | if isinstance(dump, str): 420 | dump = dump.encode('utf-8') 421 | return md5(dump).hexdigest() 422 | 423 | def fetch_next(self, query=None, **kwargs): 424 | """Generator to yield data one row at a time. 425 | Yields a Result, not the entire ResultSet. The containing ResultSet 426 | can be accessed through `Result.resultset`, but be careful not to 427 | manipulate the ResultSet until it is populated (when this generator 428 | is empty), or you may see unexpected results. 
429 | """ 430 | if query: 431 | self.query = query 432 | 433 | hash_ = self._hash 434 | if hash_ in self._data: 435 | for result in self._data[hash_]: 436 | yield result 437 | 438 | if self.scraper.current_item is not self: 439 | self._move_here() 440 | 441 | self._data[hash_] = ResultSet() 442 | self._data[hash_].dialect = self.dialect 443 | self._data[hash_].dataset = self 444 | for result in self.scraper._fetch_data(self, 445 | query=self.query, 446 | **kwargs): 447 | self._data[hash_].append(result) 448 | yield result 449 | 450 | def fetch(self, query=None, **kwargs): 451 | """Ask scraper to return data for the current dataset.""" 452 | if query: 453 | self.query = query 454 | 455 | hash_ = self._hash 456 | if hash_ in self._data: 457 | return self._data[hash_] 458 | 459 | if self.scraper.current_item is not self: 460 | self._move_here() 461 | 462 | rs = ResultSet() 463 | rs.dialect = self.dialect 464 | rs.dataset = self 465 | for result in self.scraper._fetch_data(self, 466 | query=self.query, 467 | **kwargs): 468 | rs.append(result) 469 | self._data[hash_] = rs 470 | return self._data[hash_] 471 | 472 | @property 473 | def data(self): 474 | """Data as a property, given current query.""" 475 | return self.fetch(query=self.query) 476 | 477 | @property 478 | def dimensions(self): 479 | """Available dimensions, if defined.""" 480 | # First of all: Select this dataset 481 | if self.scraper.current_item is not self: 482 | self._move_here() 483 | 484 | if self._dimensions is None: 485 | self._dimensions = DimensionList() 486 | for d in self.scraper._fetch_dimensions(self): 487 | d.dataset = self 488 | d.scraper = self.scraper 489 | self._dimensions.append(d) 490 | return self._dimensions 491 | 492 | @property 493 | def shape(self): 494 | """Compute the shape of the dataset as (rows, cols).""" 495 | if not self.data: 496 | return (0, 0) 497 | return (len(self.data), len(self.dimensions)) 498 | 499 | 500 | class BaseScraper(Collection): 501 | """The base class for scrapers.""" 502 | 503 | # Hooks 504 | _hooks = { 505 | 'init': [], # Called when initiating the class 506 | 'up': [], # Called when trying to go up one level 507 | 'top': [], # Called when moving to top level 508 | 'select': [], # Called when trying to move to a Collection or Dataset 509 | } 510 | 511 | dialect = None 512 | 513 | @classmethod 514 | def on(cls, hook): 515 | """Hook decorator.""" 516 | def decorator(function_): 517 | cls._hooks[hook].append(function_) 518 | return function_ 519 | return decorator 520 | 521 | def __repr__(self): 522 | return u'' % self.__class__.__name__ 523 | 524 | def __init__(self, *args, **kwargs): 525 | """Initiate with a ROOT collection on top.""" 526 | self.current_item = Collection(ROOT) 527 | self.current_item.scraper = self 528 | self.root = self.current_item 529 | 530 | for f in self._hooks["init"]: 531 | f(self, *args, **kwargs) 532 | 533 | def __getitem__(self, key): 534 | """ Make scraper[a] shorthand for scraper.items[a] 535 | """ 536 | return self.items[key] 537 | 538 | @property 539 | def items(self): 540 | """ItemList of collections or datasets at the current position. 
521 |     def __repr__(self):
522 |         return u'<Scraper: %s>' % self.__class__.__name__
523 | 
524 |     def __init__(self, *args, **kwargs):
525 |         """Initiate with a ROOT collection on top."""
526 |         self.current_item = Collection(ROOT)
527 |         self.current_item.scraper = self
528 |         self.root = self.current_item
529 | 
530 |         for f in self._hooks["init"]:
531 |             f(self, *args, **kwargs)
532 | 
533 |     def __getitem__(self, key):
534 |         """Make scraper[a] shorthand for scraper.items[a].
535 |         """
536 |         return self.items[key]
537 | 
538 |     @property
539 |     def items(self):
540 |         """ItemList of collections or datasets at the current position.
541 | 
542 |         None will be returned in case of no further levels
543 |         """
544 |         return self.current_item.items
545 | 
546 |     def fetch(self, query=None, **kwargs):
547 |         """Let the current item fetch its data."""
548 |         return self.current_item.fetch(query, **kwargs)
549 | 
550 |     @property
551 |     def parent(self):
552 |         """Return the item above the current one, if any."""
553 |         return self.current_item.parent
554 | 
555 |     @property
556 |     def path(self):
557 |         """All named collections above, including the current, but not root."""
558 |         return self.current_item.path
559 | 
560 |     def move_to_top(self):
561 |         """Move to the root item."""
562 |         self.current_item = self.root
563 |         for f in self._hooks["top"]:
564 |             f(self)
565 |         return self
566 | 
567 |     def move_up(self):
568 |         """Move up one level in the hierarchy, unless already on top."""
569 |         if self.current_item.parent is not None:
570 |             self.current_item = self.current_item.parent
571 | 
572 |         for f in self._hooks["up"]:
573 |             f(self)
574 |         if self.current_item is self.root:
575 |             for f in self._hooks["top"]:
576 |                 f(self)
577 |         return self
578 | 
579 |     def move_to(self, id_):
580 |         """Select a child item by id (str), reference or index."""
581 |         if self.items:
582 |             try:
583 |                 self.current_item = self.items[id_]
584 |             except (StopIteration, IndexError, NoSuchItem):
585 |                 raise NoSuchItem
586 |         for f in self._hooks["select"]:
587 |             f(self, id_)
588 |         return self
589 | 
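    # Cursor navigation sketch (the ids are hypothetical):
    #
    #     scraper.move_to("Labour market").move_to("Employment")
    #     scraper.move_up()      # back to "Labour market"
    #     scraper.move_to_top()  # back to the ROOT collection
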
590 |     def _fetch_itemslist(self, item):
591 |         """Must be overridden by scraper authors, to yield items.
592 | 
593 |         Should yield items (Collections or Datasets) at the
594 |         current cursor position. E.g. something like this:
595 | 
596 |         list = get_items(self.current_item)
597 |         for item in list:
598 |             if item.type == "Collection":
599 |                 yield Collection(item.id)
600 |             else:
601 |                 yield Dataset(item.id)
602 |         """
603 |         raise Exception("This scraper has no method for fetching list items!")
604 | 
605 |     def _fetch_dimensions(self, dataset):
606 |         """Should be overridden by scraper authors, to yield dimensions."""
607 |         from warnings import warn
608 |         warn("This scraper has no method for fetching dimensions.",
609 |              RuntimeWarning)
610 |         return
611 |         yield
612 |         # raise Exception("This scraper has no method for fetching dimensions!")
613 | 
614 |     def _fetch_allowed_values(self, dimension):
615 |         """Can be overridden by scraper authors, to yield allowed values."""
616 |         # Fall back to a single None if no allowed values are defined
617 |         values = self.allowed_values if self.allowed_values is not None else [None]
618 |         for allowed_value in values:
619 |             yield allowed_value
620 | 
621 |     def _fetch_data(self, dataset, query=None):
622 |         """Must be overridden by scraper authors, to yield dataset rows."""
623 |         raise Exception("This scraper has no method for fetching data!")
624 | 
625 |     @property
626 |     def descendants(self):
627 |         """Recursively return every dataset below the current item."""
628 |         for i in self.current_item.items:
629 |             self.move_to(i)
630 |             if i.type == TYPE_COLLECTION:
631 |                 for c in self.descendants:
632 |                     yield c
633 |             else:
634 |                 yield i
635 |             self.move_up()
636 | 
637 |     @property
638 |     def children(self):
639 |         """Former, misleading name for descendants."""
640 |         from warnings import warn
641 |         warn("Deprecated. Use Scraper.descendants.", DeprecationWarning)
642 |         for descendant in self.descendants:
643 |             yield descendant
644 | 
645 | 
646 | # Solve any circular dependencies here:
647 | 
648 | DimensionList._CONTAINS = Dimension
649 | ValueList._CONTAINS = DimensionValue
650 | ItemList._CONTAINS = Item
651 | 
--------------------------------------------------------------------------------
/statscraper/compat.py:
--------------------------------------------------------------------------------
 1 | import six
 2 | 
 3 | if six.PY3:
 4 |     from io import BytesIO as StringIO
 5 |     from json import JSONDecodeError
 6 |     unicode = str
 7 | elif six.PY2:
 8 |     from StringIO import StringIO
 9 |     unicode = unicode
10 |     JSONDecodeError = ValueError
11 | 
--------------------------------------------------------------------------------
/statscraper/datatypes.py:
--------------------------------------------------------------------------------
 1 | """Contains code for parsing datatypes from the statscraper-datatypes repo."""
 2 | from glob import iglob
 3 | from itertools import chain
 4 | from csv import DictReader
 5 | from csv import reader as CsvReader
 6 | from .exceptions import NoSuchDatatype
 7 | from .DimensionValue import DimensionValue
 8 | from .ValueList import ValueList
 9 | import os
10 | 
11 | DIR_PATH = os.path.dirname(os.path.realpath(__file__))
12 | DATATYPES_FILE = os.path.join(DIR_PATH, "datatypes", "datatypes.csv")
13 | VALUE_DELIMITOR = ','
14 | 
15 | 
16 | class Datatype(object):
17 |     """Represent a datatype, initiated by id."""
18 | 
19 |     def __init__(self, id):
20 |         """Id is a datatype from datatypes.csv."""
21 |         self.id = id
22 |         self.allowed_values = ValueList()
23 | 
24 |         data = None
25 |         with open(DATATYPES_FILE, 'r') as csvfile:
26 |             reader = DictReader(csvfile)
27 |             for row in reader:
28 |                 if row["id"] == id:
29 |                     data = row
30 |                     break
31 |         if data is None:
32 |             raise NoSuchDatatype(id)
33 |         self.value_type = data["value_type"]
34 |         self.description = data["description"]
35 |         domain = data["allowed_values"]
36 |         if domain:
37 |             for file_ in self._get_csv_files(domain):
38 |                 with open(file_, 'r') as csvfile:
39 |                     reader = DictReader(csvfile)
40 |                     dialect_names = [x
41 |                                      for x in reader.fieldnames
42 |                                      if x.startswith("dialect:")]
43 |                     self.dialects = [d[8:] for d in dialect_names]
44 |                     for row in reader:
45 |                         value = DimensionValue(row["id"],
46 |                                                self,
47 |                                                label=row["label"])
48 |                         dialects = {x: None for x in self.dialects}
49 | 
50 |                         for d in dialect_names:
51 |                             # parse this cell as a csv row
52 |                             csvreader = CsvReader([row[d]],
53 |                                                   delimiter=VALUE_DELIMITOR,
54 |                                                   skipinitialspace=True,
55 |                                                   strict=True)
56 |                             values = next(csvreader)
57 |                             dialects[d[8:]] = values
58 |                         value.dialects = dialects
59 |                         self.allowed_values.append(value)
60 | 
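    # A hedged usage sketch; the expected values below simply reflect the
    # CSV files bundled with this package (see values/genders.csv):
    #
    #     >>> dt = Datatype("gender")
    #     >>> dt.value_type
    #     'str'
    #     >>> len(dt.allowed_values)
    #     4
    #     >>> dt.dialects
    #     ['wikidata', 'scb', 'vcard', 'foaf']
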
61 |     def _get_csv_files(self, domain):
62 |         domain = os.path.join(*domain.split("/"))
63 | 
64 |         # We are fetching both by filename and dir name
65 |         # so that regions/kenya will match anything in
66 |         # `datatypes/values/regions/kenya/*.csv`
67 |         # and/or `datatypes/values/regions/kenya.csv`
68 |         #
69 |         # There is probably an easier way to do this
70 |         # FIXME the below function fetches /foo/bar/regions/kenya as well, but we probably want ^regions/kenya
71 |         value_path_1 = os.path.join(DIR_PATH, "datatypes", "values", domain)
72 |         value_path_2 = os.path.join(DIR_PATH, "datatypes", "values")
73 |         files_1 = chain.from_iterable(iglob(os.path.join(root, '*.csv'))
74 |                                       for root, dirs, files in os.walk(value_path_1))
75 |         files_2 = chain.from_iterable(iglob(os.path.join(root, domain + '.csv'))
76 |                                       for root, dirs, files in os.walk(value_path_2))
77 |         for f in chain(files_1, files_2):
78 |             yield f
79 | 
80 |     def __str__(self):
81 |         return str(self.id)
82 | 
83 |     def __repr__(self):
84 |         return '<Datatype: %s>' % str(self)
85 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Journalism++
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/README.md:
--------------------------------------------------------------------------------
 1 | # Statscraper Datatypes
 2 | This repo contains data types (e.g. ”Swedish municipality”) with value types (e.g. ”string”), allowed values, lookup tables for alternative names, and definitions. It is used by the Statscraper repo, as a semi-standardized ontology for scrapers.
 3 | 
 4 | All datatypes are listed in `/datatypes.csv`. Allowed values are in the `/values` folder, organized in further subfolders by domain.
 5 | 
 6 | ## Data types (datatypes.csv)
 7 | `datatypes.csv` contains for each datatype:
 8 | - `id`: A unique id. We use human-readable ids.
 9 | - `description`: Should include a definition
10 | - `value_type`: `int`, `float`, `str`, `date` or `bool`
11 | - `allowed_values`: See below
12 | 
13 | ### Value types
14 | Each data type can have one of the following value types:
15 | 
16 | * `int` – a value that can be parsed as an integer
17 | * `float` – a value that can be parsed as a floating-point number
18 | * `str` – a value that can be parsed as a string. Empty strings are considered null.
19 | * `date` – an ISO 8601 date, e.g. `2016-07-05`, `2016-07-05T13:00:00`, `2016-W27`, or `1981-04`.
20 | * `bool` – 1 for True and 0 for False. Blank means null.
21 | 
22 | ## Allowed values
23 | 
24 | Some data types, and some metadata fields, have a predefined set of allowed values (such as “regions”). In some domains, allowed values may be organized in categories (such as “Swedish municipalities”, “Swedish counties”).
25 | 
26 | Allowed values are specified in CSV files under the `values` directory, optionally structured in subfolders by domain, e.g. `regions/sweden/municipalities.csv`. They are referenced like this: `regions/sweden/municipalities`, and `regions`. If there is a `regions/` folder, there cannot be a `regions.csv` in the same directory.
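
For example (an illustrative reading of the files in this repo): `marital_status` in `datatypes.csv` references `marital_statuses`, which resolves to `values/marital_statuses.csv`:

```csv
id,label,description,dialect:scb
unmarried,unmarried,"Currently unmarried, including those widowed or divorced","OG,SK,ÄNKL"
married,married,Currently married,G
```

A single `dialect:` cell may hold several comma-separated alternative ids, as in the SCB dialect of `unmarried` above; the dialect columns are described below.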
27 | 
28 | The allowed-values CSV files contain:
29 | 
30 | * `id`: A unique id. We use human-readable ids, e.g. "Stockholms kommun", not "0180"
31 | * `label`: An optional label
32 | * `dialect:*`: Columns prefixed with `dialect:` contain corresponding ids, e.g. names used by major statistical providers, Wikidata ids, etc.
33 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/datatypes.csv:
--------------------------------------------------------------------------------
 1 | "id","description","value_type","allowed_values"
 2 | "int","This datatype holds integer values","int",
 3 | "float","This datatype holds floating point values","float",
 4 | "bool","This datatype holds boolean values","bool",
 5 | "str","This datatype holds string values. Empty strings are considered null.","str",
 6 | "date","An ISO 8601 date/time stamp","date",
 7 | "quarter","A quarter of a gregorian calendar year","str","periods/quarters"
 8 | "gender","A subject's gender, in most data by legal definition","str","genders"
 9 | "marital_status","A subject's marital status","str","marital_statuses"
10 | "year","A (proleptic) gregorian year","int",
11 | "month","A month in a (proleptic) gregorian year","str","periods/months"
12 | "academic_term","An academic term, e.g. a semester, trimester, etc","str","periods/academic-terms"
13 | "region","An administrative territorial region.","str","regions"
14 | "week","An ISO 8601 week number, week starting on Monday, first week of the year has at least four days","int",
15 | "currency","Currencies, following ISO 4217 where possible","str","currencies"
16 | "road_type","A road type and/or network","str","road_types"
17 | "road_number","A numeric road number in a road system, e.g. 4 for the international e-road “E4”","int",
18 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/currencies.csv:
--------------------------------------------------------------------------------
 1 | "id","label","description","dialect:sv","dialect:num","dialect:wikidata"
 2 | "AED","United Arab Emirates dirham","United Arab Emirates dirham","Emiratisk dirham",784,"Q200294"
 3 | "AFN","Afghan afghani","Afghan afghani","Afghani",971,"Q199471"
 4 | "ALL","Albanian lek","Albanian lek","Lek",8,"Q130498"
 5 | "AMD","Armenian dram","Armenian dram","Dram",51,"Q130498"
 6 | "ANG","Netherlands Antillean guilder","Netherlands Antillean guilder","Antillergulden",532,"Q200337"
 7 | "AOA","Angolan kwanza","Angolan kwanza","Kwanza",973,"Q199578"
 8 | "ARS","Argentine peso","Argentine peso","Argentinsk peso",32,"Q259502"
 9 | "AUD","Australian dollar","Australian dollar","Australisk dollar",36,"Q232270"
10 | "AWG","Aruban florin","Aruban florin","Arubansk florin",533,"Q483725"
11 | "AZN","Azerbaijani manat","Azerbaijani manat","Azerbajdzjansk manat",944,"Q179620"
12 | "BAM","Bosnia and Herzegovina convertible mark","Bosnia and Herzegovina convertible mark","Konvertibilna marka",977,"Q179620"
13 | "BBD","Barbados dollar","Barbados dollar","Barbadisk dollar",52,"Q194351"
14 | "BDT","Bangladeshi taka","Bangladeshi taka","Taka",50,"Q172540"
15 | "BGN","Bulgarian lev","Bulgarian lev","Lev",975,"Q201871"
16 | "BHD","Bahraini dinar","Bahraini dinar","Bahrainsk dinar",48,"Q238007"
17 | "BIF","Burundian franc","Burundian franc","Burundisk franc",108,"Q210478"
18 | "BMD","Bermudian dollar","Bermudian dollar","Bermudisk dollar",60,"Q206319"
19 | 
"BND","Brunei dollar","Brunei dollar","Bruneisk dollar",96,"Q206319" 20 | "BOB","Boliviano","Boliviano","Boliviano",68,"Q173117" 21 | "BRL","Brazilian real","Brazilian real","Real",986,"Q194339" 22 | "BSD","Bahamian dollar","Bahamian dollar","Bahamansk dollar",44,"Q201799" 23 | "BTN","Bhutanese ngultrum","Bhutanese ngultrum","Ngultrum",64,"Q201799" 24 | "BWP","Botswana pula","Botswana pula","Pula",72,"Q186794" 25 | "BYR","Belarusian ruble","Former Belarusian ruble","Vitrysk rubel",974,"Q275112" 26 | "BZD","Belize dollar","Belize dollar","Belizisk dollar",84,"Q1104069" 27 | "CAD","Canadian dollar","Canadian dollar","Kanadensisk dollar",124,"Q4734" 28 | "CDF","Congolese franc","Congolese franc","Kongolesisk franc",976,"Q25344" 29 | "CHF","Swiss franc","Swiss franc","Schweizisk franc",756,"Q200050" 30 | "CLP","Chilean peso","Chilean peso","Chilensk peso",152,"Q1378945" 31 | "CNY","Chinese yuan","Chinese yuan","Renminbi",156,"Q244819" 32 | "COP","Colombian peso","Colombian peso","Colombiansk peso",170,"Q244819" 33 | "CRC","Costa Rican colon","Costa Rican colon","Costaricansk colón",188,"Q201505" 34 | "CUP","Cuban peso","Cuban peso","Kubansk peso",192,"Q201505" 35 | "CVE","Cape Verde escudo","Cape Verde escudo","Kapverdisk escudo",132,"Q131016" 36 | "CZK","Czech koruna","Czech koruna","Tjeckisk krona",203,"Q4594" 37 | "DJF","Djiboutian franc","Djiboutian franc","Djiboutisk franc",262,"Q25417" 38 | "DKK","Danish krone","Danish krone","Dansk krona",208,"Q242922" 39 | "DOP","Dominican peso","Dominican peso","Dominikansk peso",214,"Q199674" 40 | "DZD","Algerian dinar","Algerian dinar","Algerisk dinar",12,"Q199462" 41 | "EGP","Egyptian pound","Egyptian pound","Egyptiskt pund",818,"Q171503" 42 | "ERN","Eritrean nakfa","Eritrean nakfa","Nakfa",232,"Q206243" 43 | "ETB","Ethiopian birr","Ethiopian birr","Birr",230,"Q206243" 44 | "EUR","Euro","Euro","Euro",978,"Q4916" 45 | "FJD","Fiji dollar","Fiji dollar","Fijidollar",242,"Q330044" 46 | "FKP","Falkland Islands pound","Falkland Islands pound","Falklandspund",238,"Q25224" 47 | "GBP","Pound sterling","Pound sterling","Brittiskt pund",826,"Q4608" 48 | "GEL","Georgian lari","Georgian lari","Georgiska lari",981,"Q4608" 49 | "GHS","Ghanaian cedi","Ghanaian cedi","Ghana Cedi",936,"Q41429" 50 | "GIP","Gibraltar pound","Gibraltar pound","Gibraltarpund",292,"Q202885" 51 | "GMD","Gambian dalasi","Gambian dalasi","Dalasi",270,"Q213311" 52 | "GNF","Guinean franc","Guinean franc","Guinesisk franc",324,"Q207396" 53 | "GTQ","Guatemalan quetzal","Guatemalan quetzal","Quetzal",320,"Q213005" 54 | "GYD","Guyanese dollar","Guyanese dollar","Guyansk dollar",328,"Q31015" 55 | "HKD","Hong Kong dollar","Hong Kong dollar","Hongkongdollar",344,"Q4719" 56 | "HNL","Honduran lempira","Honduran lempira","Lempira",340,"Q26360" 57 | "HRK","Croatian kuna","Croatian kuna","Kroatisk kuna",191,"Q203955" 58 | "HTG","Haitian gourde","Haitian gourde","Gourde",332,"Q47190" 59 | "HUF","Hungarian forint","Hungarian forint","Forint",348,"Q41588" 60 | "IDR","Indonesian rupiah","Indonesian rupiah","Rupiah",360,"Q131309" 61 | "ILS","Israeli new shekel","Israeli new shekel","Shekel",376,"Q80524" 62 | "INR","Indian rupee","Indian rupee","Indisk rupie",356,"Q193094" 63 | "IQD","Iraqi dinar","Iraqi dinar","Irakisk dinar",368,"Q188608" 64 | "IRR","Iranian rial","Iranian rial","Iransk rial",364,"Q188608" 65 | "ISK","Icelandic króna","Icelandic króna","Isländsk krona",352,"Q209792" 66 | "JMD","Jamaican dollar","Jamaican dollar","Jamaicansk dollar",388,"Q203722" 67 | "JOD","Jordanian dinar","Jordanian 
dinar","Jordansk dinar",400,"Q8146" 68 | "JPY","Japanese yen","Japanese yen","Yen",392,"Q202882" 69 | "KES","Kenyan shilling","Kenyan shilling","Kenyansk shilling",404,"Q35881" 70 | "KGS","Kyrgyzstani som","Kyrgyzstani som","Kirgizistansk som",417, 71 | "KHR","Cambodian riel","Cambodian riel","Riel",116, 72 | "KMF","Comoro franc","Comoro franc","Komoransk franc",174,"Q106720" 73 | "KPW","North Korean won","North Korean won","Nordkoreansk won",408,"Q202040" 74 | "KRW","South Korean won","South Korean won","Sydkoreansk won",410,"Q193098" 75 | "KWD","Kuwaiti dinar","Kuwaiti dinar","Kuwaitisk dinar",414,"Q319885" 76 | "KYD","Cayman Islands dollar","Cayman Islands dollar","Caymansk dollar",136,"Q173751" 77 | "KZT","Kazakhstani tenge","Kazakhstani tenge","Tenge",398,"Q200055" 78 | "LAK","Lao kip","Lao kip","Kip",418,"Q201880" 79 | "LBP","Lebanese pound","Lebanese pound","Libanesiskt pund",422,"Q4596" 80 | "LKR","Sri Lankan rupee","Sri Lankan rupee","Lankesisk rupie",144,"Q242988" 81 | "LRD","Liberian dollar","Liberian dollar","Liberiansk dollar",430,"Q208039" 82 | "LSL","Lesotho loti","Lesotho loti","Loti",426,"Q190699" 83 | "LYD","Libyan dinar","Libyan dinar","Libysk dinar",434,"Q200192" 84 | "MAD","Moroccan dirham","Moroccan dirham","Marockansk dirham",504,"Q181129" 85 | "MDL","Moldovan leu","Moldovan leu","Moldavisk leu",498,"Q4584" 86 | "MGA","Malagasy ariary","Malagasy ariary","Ariary",969,"Q177875" 87 | "MKD","Macedonian denar","Macedonian denar","Makedonisk denar",807, 88 | "MMK","Myanmar kyat","Myanmar kyat","Kyat",104,"Q183435" 89 | "MNT","Mongolian tögrög","Mongolian tögrög","Tögrög",496,"Q241214" 90 | "MOP","Macanese pataca","Macanese pataca","Pataca",446,"Q207024" 91 | "MRO","Mauritanian ouguiya","Mauritanian ouguiya","Ouguiya",478,"Q212967" 92 | "MUR","Mauritian rupee","Mauritian rupee","Mauritisk rupie",480,"Q206600" 93 | "MVR","Maldivian rufiyaa","Maldivian rufiyaa","Rufiyah",462,"Q211694" 94 | "MWK","Malawian kwacha","Malawian kwacha","malawisk kwacha",454,"Q4730" 95 | "MXN","Mexican peso","Mexican peso","Mexikansk peso",484,"Q163712" 96 | "MYR","Malaysian ringgit","Malaysian ringgit","Ringgit",458,"Q200753" 97 | "MZN","Mozambican metical","Mozambican metical","Metical",943,"Q202462" 98 | "NAD","Namibian dollar","Namibian dollar","Namibisk dollar",516,"Q203567" 99 | "NGN","Nigerian naira","Nigerian naira","Naira",566,"Q207312" 100 | "NIO","Nicaraguan córdoba","Nicaraguan córdoba","Córdoba",558,"Q132643" 101 | "NOK","Norwegian krone","Norwegian krone","Norsk krona",578,"Q202895" 102 | "NPR","Nepalese rupee","Nepalese rupee","Nepalesisk rupie",524,"Q1472704" 103 | "NZD","New Zealand dollar","New Zealand dollar","Nyzeeländsk dollar",554,"Q272290" 104 | "OMR","Omani rial","Omani rial","Omansk rial",512,"Q210472" 105 | "PAB","Panamanian balboa","Panamanian balboa","Balboa",590,"Q210472" 106 | "PEN","Peruvian Sol","Peruvian Sol","Nuevo sol",604,"Q200759" 107 | "PGK","Papua New Guinean kina","Papua New Guinean kina","Kina",598,"Q17193" 108 | "PHP","Philippine peso","Philippine peso","Filippinsk peso",608,"Q188289" 109 | "PKR","Pakistani rupee","Pakistani rupee","Pakistansk rupie",586,"Q188289" 110 | "PLN","Polish złoty","Polish złoty","Złoty",985,"Q207514" 111 | "PYG","Paraguayan guaraní","Paraguayan guaraní","Guarani",600,"Q206386" 112 | "QAR","Qatari riyal","Qatari riyal","Qatarisk rial",634,"Q206386" 113 | "RON","Romanian leu","Romanian leu","Rumänsk leu",946,"Q172524" 114 | "RSD","Serbian dinar","Serbian dinar","Serbisk dinar",941,"Q41044" 115 | "RUB","Russian ruble","Russian 
ruble","Rysk rubel",643,"Q4741" 116 | "RWF","Rwandan franc","Rwandan franc","Rwandisk franc",646,"Q199857" 117 | "SAR","Saudi riyal","Saudi riyal","Saudiarabisk rial",682,"Q4597" 118 | "SBD","Solomon Islands dollar","Solomon Islands dollar","Salomondollar",90,"Q4597" 119 | "SCR","Seychelles rupee","Seychelles rupee","Seychellisk rupie",690,"Q271206" 120 | "SDG","Sudanese pound","Sudanese pound","Sudanesiskt pund",938,"Q271206" 121 | "SEK","Swedish krona/kronor","Swedish krona/kronor","Svensk krona",752,"Q190951" 122 | "SGD","Singapore dollar","Singapore dollar","Singaporiansk dollar",702,"Q374453" 123 | "SHP","Saint Helena pound","Saint Helena pound","Sankthelenskt pund",654,"Q4587" 124 | "SLL","Sierra Leonean leone","Sierra Leonean leone","Leone",694,"Q4603" 125 | "SOS","Somali shilling","Somali shilling","Somalisk shilling",706,"Q202036" 126 | "SRD","Surinamese dollar","Surinamese dollar","Surinamesisk dollar",968,"Q244366" 127 | "SSP","South Sudanese pound","South Sudanese pound","Sydsudanesiskt pund",728,"Q193712" 128 | "STD","São Tomé and Príncipe dobra","São Tomé and Príncipe dobra","Dobra",678,"Q240468" 129 | "SYP","Syrian pound","Syrian pound","Syriskt pund",760,"Q4823" 130 | "SZL","Swazi lilangeni","Swazi lilangeni","Lilangeni",748,"Q177882" 131 | "THB","Thai baht","Thai baht","Baht",764,"Q199886" 132 | "TJS","Tajikistani somoni","Tajikistani somoni","Somoni",972,"Q199886" 133 | "TMM","Turkmenistani manat","Former Turkmenistani manat","Turkmenistansk manat",795,"Q486637" 134 | "TND","Tunisian dinar","Tunisian dinar","Tunisisk dinar",788,"Q4613" 135 | "TOP","Tongan paʻanga","Tongan paʻanga (often rendered as pa’anga)","Pa'anga",776,"Q172872" 136 | "TRY","Turkish lira","Turkish lira","Turkisk lira",949,"Q242890" 137 | "TTD","Trinidad and Tobago dollar","Trinidad and Tobago dollar","Trinidaddollar",780,"Q208526" 138 | "TWD","New Taiwan dollar","New Taiwan dollar","Taiwanesisk dollar",901,"Q4589" 139 | "TZS","Tanzanian shilling","Tanzanian shilling","Tanzanisk shilling",834,"Q4589" 140 | "UAH","Ukrainian hryvnia","Ukrainian hryvnia","Hryvnja",980,"Q4598" 141 | "UGX","Ugandan shilling","Ugandan shilling","Ugandisk shilling",800,"Q4917" 142 | "USD","United States dollar","United States dollar","Amerikansk dollar",840,"Q209272" 143 | "UYU","Uruguayan peso","Uruguayan peso","Uruguayansk peso",858,"Q209272" 144 | "UZS","Uzbekistan som","Uzbekistan som","Uzbekistansk som",860,"Q203757" 145 | "VEF","Venezuelan bolívar","Venezuelan bolívar","Bolívar",937,"Q203757" 146 | "VND","Vietnamese đồng","Vietnamese đồng","Dong",704,"Q192090" 147 | "VUV","Vanuatu vatu","Vanuatu vatu","Vatu",548,"Q207523" 148 | "WST","Samoan tālā","Samoan tālā","Tala",882,"Q4588" 149 | "XAF","CFA franc BEAC","Central African CFA franc","CFA-franc (BEAC)",950,"Q847739" 150 | "XCD","East Caribbean dollar","East Caribbean dollar","Östkaribisk dollar",951,"Q26365" 151 | "XOF","CFA franc BCEAO","West African CFA franc","CFA-franc (BCEAO)",952,"Q861690" 152 | "XPF","CFP franc","Pacifique CFP franc","CFP-franc",953,"Q240512" 153 | "YER","Yemeni rial","Yemeni rial","Jemenitisk rial",886,"Q181907" 154 | "ZAR","South African rand","South African rand","Rand",710,"Q73408" 155 | "ZMK","Zambian kwacha","Zambian kwacha","Zambisk kwacha",894,"Q73408" 156 | "ZWD","Zimbabwean dollar ","Zimbabwean dollar ","Zimbabwisk dollar",716,"Q182803" 157 | -------------------------------------------------------------------------------- /statscraper/datatypes/values/genders.csv: 
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:wikidata,dialect:scb,dialect:vcard,dialect:foaf
 2 | male,male,sex or gender is male,Q6581097,1,Male,male
 3 | female,female,sex or gender is female,Q6581072,2,Female,female
 4 | other,other,"sex or gender is not male or female, e.g. intersex, or in some jurisdictions, a third legal gender",Q1097630,,Other,
 5 | unknown,unknown,"there is a sex or gender (i.e. not null or none), but we do not know it",Q19798648,,Unknown,
 6 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/marital_statuses.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:scb
 2 | unmarried,unmarried,"Currently unmarried, including those widowed or divorced","OG,SK,ÄNKL"
 3 | married,married,Currently married,G
 4 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/periods/academic-terms/semesters.csv:
--------------------------------------------------------------------------------
 1 | "id","label","description","dialect:swedish","dialect:numeric"
 2 | "semester_1","First semester","The first semester of the academic year, in a two-term system. This semester will often, but not always, start in August, September, or October in the Northern Hemisphere, and in February or March in the Southern Hemisphere.","HT",1
 3 | "semester_2","Second semester","The second semester of the academic year, in a two-term system. This semester will often, but not always, start in January or February in the Northern Hemisphere, and in July, August, September, or October in the Southern Hemisphere.","VT",2
 4 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/periods/months.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:wikidata,dialect:swedish,dialect:numeric,dialect:gnd
 2 | january,January,First month of the gregorian year. Assume local timezone.,Q108,januari,1,4334971-7
 3 | february,February,Second month of the gregorian year. Assume local timezone.,Q109,februari,2,4405048-3
 4 | march,March,Third month of the gregorian year. Assume local timezone.,Q110,mars,3,4334968-7
 5 | april,April,Fourth month of the gregorian year. Assume local timezone.,Q118,april,4,4334965-1
 6 | may,May,Fifth month of the gregorian year. Assume local timezone.,Q119,maj,5,4168620-2
 7 | june,June,Sixth month of the gregorian year. Assume local timezone.,Q120,juni,6,4405052-5
 8 | july,July,Seventh month of the gregorian year. Assume local timezone.,Q121,juli,7,4389806-3
 9 | august,August,Eighth month of the gregorian year. Assume local timezone.,Q122,augusti,8,4389837-3
10 | september,September,Ninth month of the gregorian year. Assume local timezone.,Q123,september,9,4389807-5
11 | october,October,Tenth month of the gregorian year. Assume local timezone.,Q124,oktober,10,4389801-4
12 | november,November,Eleventh month of the gregorian year. Assume local timezone.,Q125,november,11,4389811-7
13 | december,December,Twelfth month of the gregorian year. Assume local timezone.,Q126,december,12,4299252-7
14 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/periods/quarters.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:numeric
 2 | q1,First quarter,"January 1 – March 31 inclusive, in local timezone. 90 or 91 days.",1
 3 | q2,Second quarter,"April 1 – June 30 inclusive, in local timezone. 91 days.",2
 4 | q3,Third quarter,"July 1 – September 30 inclusive, in local timezone. 92 days.",3
 5 | q4,Fourth quarter,"October 1 – December 31 inclusive, in local timezone. 92 days.",4
 6 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/regions/eu.csv:
--------------------------------------------------------------------------------
 1 | id,label,description,dialect:wikidata
 2 | eu,EU,"The EU. This refers to the union as a whole. For the set of member states, use one of the below",Q458
 3 | eu-12,EU 12,"EU members after the establishment of the union in 1993: Belgium (BE), Denmark (DK), France (FR), Germany (DE), Greece (EL), Ireland (IE), Italy (IT), Luxembourg (LU), Netherlands (NL), Portugal (PT), Spain (ES) and United Kingdom (UK)",Q17627986
 4 | eu-15,EU 15,"EU members after the 1995 enlargement: EU 12, plus Austria (AT), Finland (FI) and Sweden (SE)",Q4590816
 5 | eu-21,EU 21,"EU 15, plus Czech Republic, Hungary, Poland, Slovak Republic, before they were EU members.",
 6 | eu-25,EU 25,"EU members after the 2004 enlargement: EU 15, plus Cyprus (CY), the Czech Republic (CZ), Estonia (EE), Hungary (HU), Latvia (LV), Lithuania (LT), Malta (MT), Poland (PL), Slovakia (SK) and Slovenia (SI)",Q19933476
 7 | eu-27,EU 27,"EU members after the 2007 enlargement: EU 25, plus Bulgaria and Romania.",Q29440613
 8 | eu-28,EU 28,"EU members after the 2013 enlargement: EU 27, plus Croatia",Q16681601
 9 | 
--------------------------------------------------------------------------------
/statscraper/datatypes/values/regions/sweden/counties.csv:
--------------------------------------------------------------------------------
 1 | "id","label","description","dialect:short","dialect:skatteverket","dialect:numerical","dialect:wikidata","dialect:arbetsmiljoverket"
 2 | "Blekinge län","Blekinge län",,"Blekinge","10",10,"Q102377","Blekinge Län"
 3 | "Dalarnas län","Dalarnas län",,"Dalarna","20",20,"Q103732","Dalarnas Län"
 4 | "Gävleborgs län","Gävleborgs län",,"Gävleborg","21",21,"Q103699","Gävleborgs Län"
 5 | "Hallands län","Hallands län",,"Halland","13",13,"Q103691","Hallands Län"
 6 | "Jämtlands län","Jämtlands län",,"Jämtland","23",23,"Q103679","Jämtlands Län"
 7 | "Jönköpings län","Jönköpings län",,"Jönköping","06",6,"Q103672","Jönköpings Län"
 8 | "Kalmar län","Kalmar län",,"Kalmar","08",8,"Q103707","Kalmar Län"
 9 | "Kronobergs län","Kronobergs län",,"Kronoberg","07",7,"Q104746","Kronobergs Län"
10 | "Norrbottens län","Norrbottens län",,"Norrbotten","25",25,"Q103686","Norrbottens Län"
11 | "Örebro län","Örebro län",,"Örebro","18",18,"Q104257","Örebro Län"
12 | "Östergötlands län","Östergötlands län",,"Östergötland","05",5,"Q104940","Östergötlands Län"
13 | "Skåne län","Skåne län",,"Skåne","12",12,"Q103659","Skåne Län"
14 | "Södermanlands län","Södermanlands län",,"Södermanland","04",4,"Q106915","Södermanlands Län"
15 | "Stockholms län","Stockholms län",,"Stockholm","01",1,"Q104231","Stockholms Län"
16 | "Uppsala län","Uppsala län",,"Uppsala","03",3,"Q104926","Uppsala Län"
17 | "Värmlands län","Värmlands län",,"Värmland","17",17,"Q106789","Värmlands Län"
län",,"Värmland","17",17,"Q106789","Värmlands Län" 18 | "Västerbottens län","Västerbottens län",,"Västerbotten","24",24,"Q104877","Västerbottens Län" 19 | "Västernorrlands län","Västernorrlands län",,"Västernorrland","22",22,"Q104891","Västernorrlands Län" 20 | "Västmanlands län","Västmanlands län",,"Västmanland","19",19,"Q105075","Västmanlands Län" 21 | "Västra Götalands län","Västra Götalands län","Since 1998","Västra Götaland","14",14,"Q103093","Västra Götalands Län" 22 | "Skaraborgs län","Skaraborgs län","Since 1998 part of Västra Götalands län","Skaraborg","16",16,"Q922842", 23 | "Älvsborgs län","Älvsborgs län","Since 1998 part of Västra Götalands län","Älvsborg","15",15,"Q254990", 24 | "Göteborgs och Bohus län","Göteborgs och Bohus län","Since 1998 part of Västra Götalands län","Göteborgs och Bohus län","14",14,"Q579801", 25 | "Gotlands län","Gotlands län","For some purposes identical to `regions/sweden/municipalities/Gotlands kommun`, as the adinistrative entity Region Gotland since 2011.","Gotland","09",9,"Q103738","Gotlands Län" 26 | -------------------------------------------------------------------------------- /statscraper/datatypes/values/regions/sweden/municipalities.csv: -------------------------------------------------------------------------------- 1 | id,label,description,dialect:short,dialect:skatteverket,dialect:numerical,dialect:wikidata,dialect:brå,dialect:scb 2 | Ale kommun,Ale kommun,Ale municipality,Ale,1440,1440,Q498470,8617,1440 Ale kommun 3 | Alingsås kommun,Alingsås kommun,Alingsås municipality,Alingsås,1489,1489,Q503162,8618,1489 Alingsås kommun 4 | Älmhults kommun,Älmhults kommun,Älmhult municipality,Älmhult,0765,765,Q254799,8428,0765 Älmhults kommun 5 | Älvdalens kommun,Älvdalens kommun,Älvdalen municipality,Älvdalen,2039,2039,Q123855,8363,2039 Älvdalens kommun 6 | Alvesta kommun,Alvesta kommun,Alvesta municipality,Alvesta,0764,764,Q182007,8421,0764 Alvesta kommun 7 | Älvkarleby kommun,Älvkarleby kommun,Älvkarleby municipality,Älvkarleby,0319,319,Q59858,8561,0319 Älvkarleby kommun 8 | Älvsbyns kommun,Älvsbyns kommun,Älvsbyn municipality,Älvsbyn,2560,2560,Q255025,8438,2560 Älvsbyns kommun 9 | Åmåls kommun,Åmåls kommun,Åmål municipality,Åmål,1492,1492,Q271079,8695,1492 Åmåls kommun 10 | Aneby kommun,Aneby kommun,Aneby municipality,Aneby,0604,604,Q503167,8394,0604 Aneby kommun 11 | Ånge kommun,Ånge kommun,Ånge municipality,Ånge,2260,2260,Q271107,8602,2260 Ånge kommun 12 | Ängelholms kommun,Ängelholms kommun,Ängelholm municipality,Ängelholm,1292,1292,Q255206,8491,1292 Ängelholms kommun 13 | Arboga kommun,Arboga kommun,Arboga municipality,Arboga,1984,1984,Q431271,8605,1984 Arboga kommun 14 | Åre kommun,Åre kommun,Åre municipality,Åre,2321,2321,Q271153,8391,2321 Åre kommun 15 | Årjängs kommun,Årjängs kommun,Årjäng municipality,Årjäng,1765,1765,Q2777887,8579,1765 Årjängs kommun 16 | Arjeplogs kommun,Arjeplogs kommun,Arjeplog municipality,Arjeplog,2506,2506,Q493855,8431,2506 Arjeplogs kommun 17 | Arvidsjaurs kommun,Arvidsjaurs kommun,Arvidsjaur municipality,Arvidsjaur,2505,2505,Q499404,8430,2505 Arvidsjaurs kommun 18 | Arvika kommun,Arvika kommun,Arvika municipality,Arvika,1784,1784,Q511365,8564,1784 Arvika kommun 19 | Åsele kommun,Åsele kommun,Åsele municipality,Åsele,2463,2463,Q271274,8595,2463 Åsele kommun 20 | Askersunds kommun,Askersunds kommun,Askersund municipality,Askersund,1882,1882,Q509501,8698,1882 Askersunds kommun 21 | Åstorps kommun,Åstorps kommun,Åstorp municipality,Åstorp,1277,1277,Q201813,8490,1277 Åstorps kommun 22 | Åtvidabergs kommun,Åtvidabergs 
23 | Avesta kommun,Avesta kommun,Avesta municipality,Avesta,2084,2084,Q500071,8349,2084 Avesta kommun
24 | Bara kommun,Bara kommun,"Bara municipality, a former municipality of Sweden still appearing in some online datasets. Avoid using the short name form, as it may cause confusion.",Bara,1229,1229,Q10426242,,1229 Bara kommun
25 | Båstads kommun,Båstads kommun,Båstad municipality,Båstad,1278,1278,Q499464,8448,1278 Båstads kommun
26 | Bengtsfors kommun,Bengtsfors kommun,Bengtsfors municipality,Bengtsfors,1460,1460,Q267462,8619,1460 Bengtsfors kommun
27 | Bergs kommun,Bergs kommun,Berg municipality,Berg,2326,2326,Q845930,8385,2326 Bergs kommun
28 | Bjurholms kommun,Bjurholms kommun,Bjurholm municipality,Bjurholm,2403,2403,Q376003,8581,2403 Bjurholms kommun
29 | Bjuvs kommun,Bjuvs kommun,Bjuv municipality,Bjuv,1260,1260,Q511310,8445,1260 Bjuvs kommun
30 | Bodens kommun,Bodens kommun,Boden municipality,Boden,2582,2582,Q509476,8441,2582 Bodens kommun
31 | Bollebygds kommun,Bollebygds kommun,Bollebygd municipality,Bollebygd,1443,1443,Q503102,8620,1443 Bollebygds kommun
32 | Bollnäs kommun,Bollnäs kommun,Bollnäs municipality,Bollnäs,2183,2183,Q62469,8367,2183 Bollnäs kommun
33 | Borås kommun,Borås kommun,Borås municipality,Borås,1490,1490,Q894327,8621,1490 Borås kommun
34 | Borgholms kommun,Borgholms kommun,Borgholm municipality,Borgholm,0885,885,Q509546,8408,0885 Borgholms kommun
35 | Borlänge kommun,Borlänge kommun,Borlänge municipality,Borlänge,2081,2081,Q503209,8350,2081 Borlänge kommun
36 | Botkyrka kommun,Botkyrka kommun,Botkyrka municipality,Botkyrka,0127,127,Q113718,8495,0127 Botkyrka kommun
37 | Boxholms kommun,Boxholms kommun,Boxholm municipality,Boxholm,0560,560,Q509514,8711,0560 Boxholms kommun
38 | Bräcke kommun,Bräcke kommun,Bräcke municipality,Bräcke,2305,2305,Q504261,8386,2305 Bräcke kommun
39 | Bromölla kommun,Bromölla kommun,Bromölla municipality,Bromölla,1272,1272,Q514830,8446,1272 Bromölla kommun
40 | Burlövs kommun,Burlövs kommun,Burlöv municipality,Burlöv,1231,1231,Q186662,8447,1231 Burlövs kommun
41 | Dals-Eds kommun,Dals-Eds kommun,Dals-Ed municipality,Dals-Ed,1438,1438,Q503132,8622,1438 Dals-Eds kommun
42 | Danderyds kommun,Danderyds kommun,Danderyd municipality,Danderyd,0162,162,Q113679,8496,0162 Danderyds kommun
43 | Degerfors kommun,Degerfors kommun,Degerfors municipality,Degerfors,1862,1862,Q509469,8699,1862 Degerfors kommun
44 | Dorotea kommun,Dorotea kommun,Dorotea municipality,Dorotea,2425,2425,Q132334,8582,2425 Dorotea kommun
45 | Eda kommun,Eda kommun,Eda municipality,Eda,1730,1730,Q498447,8565,1730 Eda kommun
46 | Ekerö kommun,Ekerö kommun,Ekerö municipality,Ekerö,0125,125,Q492614,8497,0125 Ekerö kommun
47 | Eksjö kommun,Eksjö kommun,Eksjö municipality,Eksjö,0686,686,Q512035,8395,0686 Eksjö kommun
48 | Emmaboda kommun,Emmaboda kommun,Emmaboda municipality,Emmaboda,0862,862,Q509452,8409,0862 Emmaboda kommun
49 | Enköpings kommun,Enköpings kommun,Enköping municipality,Enköping,0381,381,Q509568,8555,0381 Enköpings kommun
50 | Eskilstuna kommun,Eskilstuna kommun,Eskilstuna municipality,Eskilstuna,0484,484,Q503144,8545,0484 Eskilstuna kommun
51 | Eslövs kommun,Eslövs kommun,Eslöv municipality,Eslöv,1285,1285,Q1130264,8449,1285 Eslövs kommun
52 | Essunga kommun,Essunga kommun,Essunga municipality,Essunga,1445,1445,Q503127,8623,1445 Essunga kommun
53 | Fagersta kommun,Fagersta kommun,Fagersta municipality,Fagersta,1982,1982,Q47018,8606,1982 Fagersta kommun
54 | Falkenbergs
kommun,Falkenbergs kommun,Falkenberg municipality,Falkenberg,1382,1382,Q508168,8378,1382 Falkenbergs kommun 55 | Falköpings kommun,Falköpings kommun,Falköping municipality,Falköping,1499,1499,Q503214,8624,1499 Falköpings kommun 56 | Falu kommun,Falu kommun,Falun municipality,Falun,2080,2080,Q501545,8351,2080 Falu kommun 57 | Färgelanda kommun,Färgelanda kommun,Färgelanda municipality,Färgelanda,1439,1439,Q499441,8625,1439 Färgelanda kommun 58 | Filipstads kommun,Filipstads kommun,Filipstad municipality,Filipstad,1782,1782,Q503204,8566,1782 Filipstads kommun 59 | Finspångs kommun,Finspångs kommun,Finspång municipality,Finspång,0562,562,Q503180,8712,0562 Finspångs kommun 60 | Flens kommun,Flens kommun,Flen municipality,Flen,0482,482,Q27982,8546,0482 Flens kommun 61 | Forshaga kommun,Forshaga kommun,Forshaga municipality,Forshaga,1763,1763,Q499385,8567,1763 Forshaga kommun 62 | Gagnefs kommun,Gagnefs kommun,Gagnef municipality,Gagnef,2026,2026,Q2642771,8352,2026 Gagnefs kommun 63 | Gällivare kommun,Gällivare kommun,Gällivare municipality,Gällivare,2523,2523,Q493815,8437,2523 Gällivare kommun 64 | Gävle kommun,Gävle kommun,Gävle municipality,Gävle,2180,2180,Q510010,8368,2180 Gävle kommun 65 | Gislaveds kommun,Gislaveds kommun,Gislaved municipality,Gislaved,0662,662,Q505259,8396,0662 Gislaveds kommun 66 | Gnesta kommun,Gnesta kommun,Gnesta municipality,Gnesta,0461,461,Q239332,8547,0461 Gnesta kommun 67 | Gnosjö kommun,Gnosjö kommun,Gnosjö municipality,Gnosjö,0617,617,Q509609,8397,0617 Gnosjö kommun 68 | Göteborgs kommun,Göteborgs kommun,Göteborg municipality,Göteborg,1480,1480,Q52502,8628,1480 Göteborgs kommun 69 | Götene kommun,Götene kommun,Götene municipality,Götene,1471,1471,Q511281,8660,1471 Götene kommun 70 | Region Gotland,Region Gotland,"Region Gotland, formally ”Gotland municipality”",Gotland,0980,980,Q374794,8365,0980 Region Gotland 71 | Grästorps kommun,Grästorps kommun,Grästorp municipality,Grästorp,1444,1444,Q503148,8626,1444 Grästorps kommun 72 | Grums kommun,Grums kommun,Grums municipality,Grums,1764,1764,Q503122,8568,1764 Grums kommun 73 | Gullspångs kommun,Gullspångs kommun,Gullspång municipality,Gullspång,1447,1447,Q503193,8627,1447 Gullspångs kommun 74 | Habo kommun,Habo kommun,Habo municipality,Habo,0643,643,Q503198,8398,0643 Habo kommun 75 | Håbo kommun,Håbo kommun,Håbo municipality,Håbo,0305,305,Q511253,8557,0305 Håbo kommun 76 | Hagfors kommun,Hagfors kommun,Hagfors municipality,Hagfors,1783,1783,Q511407,8569,1783 Hagfors kommun 77 | Hällefors kommun,Hällefors kommun,Hällefors municipality,Hällefors,1863,1863,Q220881,8701,1863 Hällefors kommun 78 | Hallsbergs kommun,Hallsbergs kommun,Hallsberg municipality,Hallsberg,1861,1861,Q508180,8700,1861 Hallsbergs kommun 79 | Hallstahammars kommun,Hallstahammars kommun,Hallstahammar municipality,Hallstahammar,1961,1961,Q47019,8607,1961 Hallstahammars kommun 80 | Halmstads kommun,Halmstads kommun,Halmstad municipality,Halmstad,1380,1380,Q504692,8379,1380 Halmstads kommun 81 | Hammarö kommun,Hammarö kommun,Hammarö municipality,Hammarö,1761,1761,Q499359,8570,1761 Hammarö kommun 82 | Haninge kommun,Haninge kommun,Haninge municipality,Haninge,0136,136,Q113692,8498,0136 Haninge kommun 83 | Haparanda kommun,Haparanda kommun,Haparanda municipality,Haparanda,2583,2583,Q510310,8442,2583 Haparanda kommun 84 | Härjedalens kommun,Härjedalens kommun,Härjedalen municipality,Härjedalen,2361,2361,Q513421,8387,2361 Härjedalens kommun 85 | Härnösands kommun,Härnösands kommun,Härnösand municipality,Härnösand,2280,2280,Q209634,8597,2280 Härnösands 
kommun 86 | Härryda kommun,Härryda kommun,Härryda municipality,Härryda,1401,1401,Q500125,8663,1401 Härryda kommun 87 | Hässleholms kommun,Hässleholms kommun,Hässleholm municipality,Hässleholm,1293,1293,Q508125,8451,1293 Hässleholms kommun 88 | Heby kommun,Heby kommun,Heby municipality,Heby,0331,331,Q516308,8556,0331 Heby kommun 89 | Heby kommun före 2007,Heby kommun före 2007,"Heby before 2007, when they changed county. Most of the time you would probably want to use Heby kommun even before 2007",Heby före 2007,1917,1917,Q516308,8556,1917 Heby kommun 90 | Hedemora kommun,Hedemora kommun,Hedemora municipality,Hedemora,2083,2083,Q507684,8353,2083 Hedemora kommun 91 | Helsingborgs kommun,Helsingborgs kommun,Helsingborg municipality,Helsingborg,1283,1283,Q487648,8450,1283 Helsingborgs kommun 92 | Herrljunga kommun,Herrljunga kommun,Herrljunga municipality,Herrljunga,1466,1466,Q503111,8661,1466 Herrljunga kommun 93 | Hjo kommun,Hjo kommun,Hjo municipality,Hjo,1497,1497,Q428749,8662,1497 Hjo kommun 94 | Hofors kommun,Hofors kommun,Hofors municipality,Hofors,2104,2104,Q62464,8369,2104 Hofors kommun 95 | Höganäs kommun,Höganäs kommun,Höganäs municipality,Höganäs,1284,1284,Q505013,8452,1284 Höganäs kommun 96 | Högsby kommun,Högsby kommun,Högsby municipality,Högsby,0821,821,Q510233,8411,0821 Högsby kommun 97 | Höörs kommun,Höörs kommun,Höör municipality,Höör,1267,1267,Q266401,8454,1267 Höörs kommun 98 | Hörby kommun,Hörby kommun,Hörby municipality,Hörby,1266,1266,Q504619,8453,1266 Hörby kommun 99 | Huddinge kommun,Huddinge kommun,Huddinge municipality,Huddinge,0126,126,Q492575,8499,0126 Huddinge kommun 100 | Hudiksvalls kommun,Hudiksvalls kommun,Hudiksvall municipality,Hudiksvall,2184,2184,Q29963,8370,2184 Hudiksvalls kommun 101 | Hultsfreds kommun,Hultsfreds kommun,Hultsfred municipality,Hultsfred,0860,860,Q512002,8410,0860 Hultsfreds kommun 102 | Hylte kommun,Hylte kommun,Hylte municipality,Hylte,1315,1315,Q498477,8380,1315 Hylte kommun 103 | Järfälla kommun,Järfälla kommun,Järfälla municipality,Järfälla,0123,123,Q301259,8500,0123 Järfälla kommun 104 | Jokkmokks kommun,Jokkmokks kommun,Jokkmokk municipality,Jokkmokk,2510,2510,Q512048,8432,2510 Jokkmokks kommun 105 | Jönköpings kommun,Jönköpings kommun,Jönköping municipality,Jönköping,0680,680,Q504689,8399,0680 Jönköpings kommun 106 | Kalix kommun,Kalix kommun,Kalix municipality,Kalix,2514,2514,Q117091,8434,2514 Kalix kommun 107 | Kalmar kommun,Kalmar kommun,Kalmar municipality,Kalmar,0880,880,Q508153,8412,0880 Kalmar kommun 108 | Karlsborgs kommun,Karlsborgs kommun,Karlsborg municipality,Karlsborg,1446,1446,Q499435,8664,1446 Karlsborgs kommun 109 | Karlshamns kommun,Karlshamns kommun,Karlshamn municipality,Karlshamn,1082,1082,Q510223,8343,1082 Karlshamns kommun 110 | Karlskoga kommun,Karlskoga kommun,Karlskoga municipality,Karlskoga,1883,1883,Q509634,8702,1883 Karlskoga kommun 111 | Karlskrona kommun,Karlskrona kommun,Karlskrona municipality,Karlskrona,1080,1080,Q1128384,8344,1080 Karlskrona kommun 112 | Karlstads kommun,Karlstads kommun,Karlstad municipality,Karlstad,1780,1780,Q498453,8571,1780 Karlstads kommun 113 | Katrineholms kommun,Katrineholms kommun,Katrineholm municipality,Katrineholm,0483,483,Q508140,8548,0483 Katrineholms kommun 114 | Kävlinge kommun,Kävlinge kommun,Kävlinge municipality,Kävlinge,1261,1261,Q513370,8457,1261 Kävlinge kommun 115 | Kils kommun,Kils kommun,Kil municipality,Kil,1715,1715,Q499393,8572,1715 Kils kommun 116 | Kinda kommun,Kinda kommun,Kinda municipality,Kinda,0513,513,Q515299,8713,0513 Kinda kommun 117 | Kiruna 
kommun,Kiruna kommun,Kiruna municipality,Kiruna,2584,2584,Q499474,8443,2584 Kiruna kommun 118 | Klippans kommun,Klippans kommun,Klippan municipality,Klippan,1276,1276,Q504614,8455,1276 Klippans kommun 119 | Knivsta kommun,Knivsta kommun,Knivsta municipality,Knivsta,0330,330,Q504465,8558,0330 Knivsta kommun 120 | Köpings kommun,Köpings kommun,Köping municipality,Köping,1983,1983,Q42009,8610,1983 Köpings kommun 121 | Kramfors kommun,Kramfors kommun,Kramfors municipality,Kramfors,2282,2282,Q514815,8598,2282 Kramfors kommun 122 | Kristianstads kommun,Kristianstads kommun,Kristianstad municipality,Kristianstad,1290,1290,Q498857,8456,1290 Kristianstads kommun 123 | Kristinehamns kommun,Kristinehamns kommun,Kristinehamn municipality,Kristinehamn,1781,1781,Q510364,8573,1781 Kristinehamns kommun 124 | Krokoms kommun,Krokoms kommun,Krokom municipality,Krokom,2309,2309,Q514707,8388,2309 Krokoms kommun 125 | Kumla kommun,Kumla kommun,Kumla municipality,Kumla,1881,1881,Q504988,8703,1881 Kumla kommun 126 | Kungälvs kommun,Kungälvs kommun,Kungälv municipality,Kungälv,1482,1482,Q511394,8665,1482 Kungälvs kommun 127 | Kungsbacka kommun,Kungsbacka kommun,Kungsbacka municipality,Kungsbacka,1384,1384,Q499380,8381,1384 Kungsbacka kommun 128 | Kungsörs kommun,Kungsörs kommun,Kungsör municipality,Kungsör,1960,1960,Q47169,8609,1960 Kungsörs kommun 129 | Laholms kommun,Laholms kommun,Laholm municipality,Laholm,1381,1381,Q487502,8382,1381 Laholms kommun 130 | Landskrona kommun,Landskrona kommun,Landskrona municipality,Landskrona,1282,1282,Q502298,8458,1282 Landskrona kommun 131 | Laxå kommun,Laxå kommun,Laxå municipality,Laxå,1860,1860,Q515326,8704,1860 Laxå kommun 132 | Lekebergs kommun,Lekebergs kommun,Lekeberg municipality,Lekeberg,1814,1814,Q515282,8705,1814 Lekebergs kommun 133 | Leksands kommun,Leksands kommun,Leksand municipality,Leksand,2029,2029,Q509651,8354,2029 Leksands kommun 134 | Lerums kommun,Lerums kommun,Lerum municipality,Lerum,1441,1441,Q503188,8666,1441 Lerums kommun 135 | Lessebo kommun,Lessebo kommun,Lessebo municipality,Lessebo,0761,761,Q509488,8422,0761 Lessebo kommun 136 | Lidingö kommun,Lidingö kommun,Lidingö municipality,Lidingö,0186,186,Q3120654,8501,0186 Lidingö kommun 137 | Lidköpings kommun,Lidköpings kommun,Lidköping municipality,Lidköping,1494,1494,Q515358,8667,1494 Lidköpings kommun 138 | Lilla Edets kommun,Lilla Edets kommun,Lilla Edet municipality,Lilla Edet,1462,1462,Q511241,8668,1462 Lilla Edets kommun 139 | Lindesbergs kommun,Lindesbergs kommun,Lindesberg municipality,Lindesberg,1885,1885,Q514858,8706,1885 Lindesbergs kommun 140 | Linköpings kommun,Linköpings kommun,Linköping municipality,Linköping,0580,580,Q499410,8714,0580 Linköpings kommun 141 | Ljungby kommun,Ljungby kommun,Ljungby municipality,Ljungby,0781,781,Q504235,8423,0781 Ljungby kommun 142 | Ljusdals kommun,Ljusdals kommun,Ljusdal municipality,Ljusdal,2161,2161,Q515235,8371,2161 Ljusdals kommun 143 | Ljusnarsbergs kommun,Ljusnarsbergs kommun,Ljusnarsberg municipality,Ljusnarsberg,1864,1864,Q514739,8707,1864 Ljusnarsbergs kommun 144 | Lomma kommun,Lomma kommun,Lomma municipality,Lomma,1262,1262,Q427991,8459,1262 Lomma kommun 145 | Ludvika kommun,Ludvika kommun,Ludvika municipality,Ludvika,2085,2085,Q503184,8355,2085 Ludvika kommun 146 | Luleå kommun,Luleå kommun,Luleå municipality,Luleå,2580,2580,Q177019,8439,2580 Luleå kommun 147 | Lunds kommun,Lunds kommun,Lund municipality,Lund,1281,1281,Q505018,8460,1281 Lunds kommun 148 | Lycksele kommun,Lycksele kommun,Lycksele municipality,Lycksele,2481,2481,Q948462,8583,2481 
Lycksele kommun 149 | Lysekils kommun,Lysekils kommun,Lysekil municipality,Lysekil,1484,1484,Q503173,8669,1484 Lysekils kommun 150 | Malå kommun,Malå kommun,Malå municipality,Malå,2418,2418,Q501540,8584,2418 Malå kommun 151 | Malmö kommun,Malmö kommun,Malmö municipality,Malmö,1280,1280,Q503361,8461,1280 Malmö kommun 152 | Malung-Sälens kommun,Malung-Sälens kommun,"Malung-Sälen municipality, formerly known as Malungs kommun",Malung-Sälen,2023,2023,Q504266,8356,2023 Malung-Sälens kommun 153 | Mariestads kommun,Mariestads kommun,Mariestad municipality,Mariestad,1493,1493,Q427422,8670,1493 Mariestads kommun 154 | Markaryds kommun,Markaryds kommun,Markaryd municipality,Markaryd,0767,767,Q240574,8671,0767 Markaryds kommun 155 | Marks kommun,Marks kommun,Mark municipality,Mark,1463,1463,Q500153,8424,1463 Marks kommun 156 | Melleruds kommun,Melleruds kommun,Mellerud municipality,Mellerud,1461,1461,Q501438,8672,1461 Melleruds kommun 157 | Mjölby kommun,Mjölby kommun,Mjölby municipality,Mjölby,0586,586,Q267030,8715,0586 Mjölby kommun 158 | Mölndals kommun,Mölndals kommun,Mölndal municipality,Mölndal,1481,1481,Q511270,8674,1481 Mölndals kommun 159 | Mönsterås kommun,Mönsterås kommun,Mönsterås municipality,Mönsterås,0861,861,Q515250,8413,0861 Mönsterås kommun 160 | Mora kommun,Mora kommun,Mora municipality,Mora,2062,2062,Q504239,8357,2062 Mora kommun 161 | Mörbylånga kommun,Mörbylånga kommun,Mörbylånga municipality,Mörbylånga,0840,840,Q514756,8414,0840 Mörbylånga kommun 162 | Motala kommun,Motala kommun,Motala municipality,Motala,0583,583,Q508108,8716,0583 Motala kommun 163 | Mullsjö kommun,Mullsjö kommun,Mullsjö municipality,Mullsjö,0642,642,Q505076,8400,0642 Mullsjö kommun 164 | Munkedals kommun,Munkedals kommun,Munkedal municipality,Munkedal,1430,1430,Q389040,8673,1430 Munkedals kommun 165 | Munkfors kommun,Munkfors kommun,Munkfors municipality,Munkfors,1762,1762,Q501494,8574,1762 Munkfors kommun 166 | Nacka kommun,Nacka kommun,Nacka municipality,Nacka,0182,182,Q946647,8502,0182 Nacka kommun 167 | Nässjö kommun,Nässjö kommun,Nässjö municipality,Nässjö,0682,682,Q505096,8401,0682 Nässjö kommun 168 | Nora kommun,Nora kommun,Nora municipality,Nora,1884,1884,Q285894,8708,1884 Nora kommun 169 | Norbergs kommun,Norbergs kommun,Norberg municipality,Norberg,1962,1962,Q37404,8611,1962 Norbergs kommun 170 | Nordanstigs kommun,Nordanstigs kommun,Nordanstig municipality,Nordanstig,2132,2132,Q514805,8372,2132 Nordanstigs kommun 171 | Nordmalings kommun,Nordmalings kommun,Nordmaling municipality,Nordmaling,2401,2401,Q514722,8585,2401 Nordmalings kommun 172 | Norrköpings kommun,Norrköpings kommun,Norrköping municipality,Norrköping,0581,581,Q504676,8717,0581 Norrköpings kommun 173 | Norrtälje kommun,Norrtälje kommun,Norrtälje municipality,Norrtälje,0188,188,Q214048,8503,0188 Norrtälje kommun 174 | Norsjö kommun,Norsjö kommun,Norsjö municipality,Norsjö,2417,2417,Q507644,8586,2417 Norsjö kommun 175 | Nybro kommun,Nybro kommun,Nybro municipality,Nybro,0881,881,Q515318,8415,0881 Nybro kommun 176 | Nyköpings kommun,Nyköpings kommun,Nyköping municipality,Nyköping,0480,480,Q500267,8549,0480 Nyköpings kommun 177 | Nykvarns kommun,Nykvarns kommun,Nykvarn municipality,Nykvarn,0140,140,Q499460,8504,0140 Nykvarns kommun 178 | Nynäshamns kommun,Nynäshamns kommun,Nynäshamn municipality,Nynäshamn,0192,192,Q505090,8505,0192 Nynäshamns kommun 179 | Ockelbo kommun,Ockelbo kommun,Ockelbo municipality,Ockelbo,2101,2101,Q505109,8373,2101 Ockelbo kommun 180 | Öckerö kommun,Öckerö kommun,Öckerö 
municipality,Öckerö,1407,1407,Q293928,8696,1407 Öckerö kommun 181 | Ödeshögs kommun,Ödeshögs kommun,Ödeshög municipality,Ödeshög,0509,509,Q293970,8723,0509 Ödeshögs kommun 182 | Olofströms kommun,Olofströms kommun,Olofström municipality,Olofström,1060,1060,Q504257,8345,1060 Olofströms kommun 183 | Örebro kommun,Örebro kommun,Örebro municipality,Örebro,1880,1880,Q297718,8709,1880 Örebro kommun 184 | Örkelljunga kommun,Örkelljunga kommun,Örkelljunga municipality,Örkelljunga,1257,1257,Q297936,8492,1257 Örkelljunga kommun 185 | Örnsköldsviks kommun,Örnsköldsviks kommun,Örnsköldsvik municipality,Örnsköldsvik,2284,2284,Q298003,8603,2284 Örnsköldsviks kommun 186 | Orsa kommun,Orsa kommun,Orsa municipality,Orsa,2034,2034,Q504630,8358,2034 Orsa kommun 187 | Orusts kommun,Orusts kommun,Orust municipality,Orust,1421,1421,Q338752,8675,1421 Orusts kommun 188 | Osby kommun,Osby kommun,Osby municipality,Osby,1273,1273,Q504594,8478,1273 Osby kommun 189 | Oskarshamns kommun,Oskarshamns kommun,Oskarshamn municipality,Oskarshamn,0882,882,Q505006,8416,0882 Oskarshamns kommun 190 | Österåkers kommun,Österåkers kommun,Österåker municipality,Österåker,0117,117,Q117728,8543,0117 Österåkers kommun 191 | Östersunds kommun,Östersunds kommun,Östersund municipality,Östersund,2380,2380,Q306789,8392,2380 Östersunds kommun 192 | Östhammars kommun,Östhammars kommun,Östhammar municipality,Östhammar,0382,382,Q59093,8562,0382 Östhammars kommun 193 | Östra Göinge kommun,Östra Göinge kommun,Östra Göinge municipality,Östra Göinge,1256,1256,Q307370,8493,1256 Östra Göinge kommun 194 | Ovanåkers kommun,Ovanåkers kommun,Ovanåker municipality,Ovanåker,2121,2121,Q505085,8374,2121 Ovanåkers kommun 195 | Överkalix kommun,Överkalix kommun,Överkalix municipality,Överkalix,2513,2513,Q307569,8433,2513 Överkalix kommun 196 | Övertorneå kommun,Övertorneå kommun,Övertorneå municipality,Övertorneå,2518,2518,Q307603,8435,2518 Övertorneå kommun 197 | Oxelösunds kommun,Oxelösunds kommun,Oxelösund municipality,Oxelösund,0481,481,Q505246,8550,0481 Oxelösunds kommun 198 | Pajala kommun,Pajala kommun,Pajala municipality,Pajala,2521,2521,Q186230,8436,2521 Pajala kommun 199 | Partille kommun,Partille kommun,Partille municipality,Partille,1402,1402,Q125222,8676,1402 Partille kommun 200 | Perstorps kommun,Perstorps kommun,Perstorp municipality,Perstorp,1275,1275,Q504249,8479,1275 Perstorps kommun 201 | Piteå kommun,Piteå kommun,Piteå municipality,Piteå,2581,2581,Q507656,8440,2581 Piteå kommun 202 | Ragunda kommun,Ragunda kommun,Ragunda municipality,Ragunda,2303,2303,Q515342,8389,2303 Ragunda kommun 203 | Rättviks kommun,Rättviks kommun,Rättvik municipality,Rättvik,2031,2031,Q504244,8359,2031 Rättviks kommun 204 | Robertsfors kommun,Robertsfors kommun,Robertsfors municipality,Robertsfors,2409,2409,Q507670,8587,2409 Robertsfors kommun 205 | Ronneby kommun,Ronneby kommun,Ronneby municipality,Ronneby,1081,1081,Q515373,8346,1081 Ronneby kommun 206 | Säffle kommun,Säffle kommun,Säffle municipality,Säffle,1785,1785,Q511326,8577,1785 Säffle kommun 207 | Sala kommun,Sala kommun,Sala municipality,Sala,1981,1981,Q37399,8612,1981 Sala kommun 208 | Salems kommun,Salems kommun,Salem municipality,Salem,0128,128,Q1255130,8506,0128 Salems kommun 209 | Sandvikens kommun,Sandvikens kommun,Sandviken municipality,Sandviken,2181,2181,Q149539,8375,2181 Sandvikens kommun 210 | Säters kommun,Säters kommun,Säter municipality,Säter,2082,2082,Q1345439,8361,2082 Säters kommun 211 | Sävsjö kommun,Sävsjö kommun,Sävsjö municipality,Sävsjö,0684,684,Q505239,8402,0684 Sävsjö kommun 212 | 
Sigtuna kommun,Sigtuna kommun,Sigtuna municipality,Sigtuna,0191,191,Q216915,8507,0191 Sigtuna kommun 213 | Simrishamns kommun,Simrishamns kommun,Simrishamn municipality,Simrishamn,1291,1291,Q504626,8480,1291 Simrishamns kommun 214 | Sjöbo kommun,Sjöbo kommun,Sjöbo municipality,Sjöbo,1265,1265,Q504601,8481,1265 Sjöbo kommun 215 | Skara kommun,Skara kommun,Skara municipality,Skara,1495,1495,Q499421,8677,1495 Skara kommun 216 | Skellefteå kommun,Skellefteå kommun,Skellefteå municipality,Skellefteå,2482,2482,Q430780,8588,2482 Skellefteå kommun 217 | Skinnskattebergs kommun,Skinnskattebergs kommun,Skinnskatteberg municipality,Skinnskatteberg,1904,1904,Q37462,8613,1904 Skinnskattebergs kommun 218 | Skövde kommun,Skövde kommun,Skövde municipality,Skövde,1496,1496,Q501452,8678,1496 Skövde kommun 219 | Skurups kommun,Skurups kommun,Skurup municipality,Skurup,1264,1264,Q515266,8482,1264 Skurups kommun 220 | Smedjebackens kommun,Smedjebackens kommun,Smedjebacken municipality,Smedjebacken,2061,2061,Q505046,8360,2061 Smedjebackens kommun 221 | Söderhamns kommun,Söderhamns kommun,Söderhamn municipality,Söderhamn,2182,2182,Q145835,8376,2182 Söderhamns kommun 222 | Söderköpings kommun,Söderköpings kommun,Söderköping municipality,Söderköping,0582,582,Q515680,8718,0582 Söderköpings kommun 223 | Södertälje kommun,Södertälje kommun,Södertälje municipality,Södertälje,0181,181,Q516336,8535,0181 Södertälje kommun 224 | Sollefteå kommun,Sollefteå kommun,Sollefteå municipality,Sollefteå,2283,2283,Q221990,8599,2283 Sollefteå kommun 225 | Sollentuna kommun,Sollentuna kommun,Sollentuna municipality,Sollentuna,0163,163,Q503746,8508,0163 Sollentuna kommun 226 | Solna kommun,Solna kommun,Solna municipality,Solna,0184,184,Q109010,8509,0184 Solna kommun 227 | Sölvesborgs kommun,Sölvesborgs kommun,Sölvesborg municipality,Sölvesborg,1083,1083,Q515409,8347,1083 Sölvesborgs kommun 228 | Sorsele kommun,Sorsele kommun,Sorsele municipality,Sorsele,2422,2422,Q501463,8589,2422 Sorsele kommun 229 | Sotenäs kommun,Sotenäs kommun,Sotenäs municipality,Sotenäs,1427,1427,Q501420,8679,1427 Sotenäs kommun 230 | Staffanstorps kommun,Staffanstorps kommun,Staffanstorp municipality,Staffanstorp,1230,1230,Q504609,8483,1230 Staffanstorps kommun 231 | Stenungsunds kommun,Stenungsunds kommun,Stenungsund municipality,Stenungsund,1415,1415,Q511438,8680,1415 Stenungsunds kommun 232 | Stockholms kommun,Stockholms kommun,Stockholm municipality,Stockholm,0180,180,Q506250,8510,0180 Stockholms kommun 233 | Storfors kommun,Storfors kommun,Storfors municipality,Storfors,1760,1760,Q505936,8575,1760 Storfors kommun 234 | Storumans kommun,Storumans kommun,Storuman municipality,Storuman,2421,2421,Q499415,8590,2421 Storumans kommun 235 | Strängnäs kommun,Strängnäs kommun,Strängnäs municipality,Strängnäs,0486,486,Q501532,8551,0486 Strängnäs kommun 236 | Strömstads kommun,Strömstads kommun,Strömstad municipality,Strömstad,1486,1486,Q501424,8681,1486 Strömstads kommun 237 | Strömsunds kommun,Strömsunds kommun,Strömsund municipality,Strömsund,2313,2313,Q514770,8390,2313 Strömsunds kommun 238 | Sundbybergs kommun,Sundbybergs kommun,Sundbyberg municipality,Sundbyberg,0183,183,Q972564,8534,0183 Sundbybergs kommun 239 | Sundsvalls kommun,Sundsvalls kommun,Sundsvall municipality,Sundsvall,2281,2281,Q504994,8600,2281 Sundsvalls kommun 240 | Sunne kommun,Sunne kommun,Sunne municipality,Sunne,1766,1766,Q501205,8576,1766 Sunne kommun 241 | Surahammars kommun,Surahammars kommun,Surahammar municipality,Surahammar,1907,1907,Q34078,8614,1907 Surahammars kommun 242 | Svalövs 
kommun,Svalövs kommun,Svalöv municipality,Svalöv,1214,1214,Q504227,8484,1214 Svalövs kommun 243 | Svedala kommun,Svedala kommun,Svedala municipality,Svedala,1263,1263,Q515706,8485,1263 Svedala kommun 244 | Svenljunga kommun,Svenljunga kommun,Svenljunga municipality,Svenljunga,1465,1465,Q501487,8682,1465 Svenljunga kommun 245 | Täby kommun,Täby kommun,Täby municipality,Täby,0160,160,Q493066,8537,0160 Täby kommun 246 | Tanums kommun,Tanums kommun,Tanum municipality,Tanum,1435,1435,Q511228,8683,1435 Tanums kommun 247 | Tibro kommun,Tibro kommun,Tibro municipality,Tibro,1472,1472,Q501432,8684,1472 Tibro kommun 248 | Tidaholms kommun,Tidaholms kommun,Tidaholm municipality,Tidaholm,1498,1498,Q501459,8685,1498 Tidaholms kommun 249 | Tierps kommun,Tierps kommun,Tierp municipality,Tierp,0360,360,Q510198,8559,0360 Tierps kommun 250 | Timrå kommun,Timrå kommun,Timrå municipality,Timrå,2262,2262,Q504983,8601,2262 Timrå kommun 251 | Tingsryds kommun,Tingsryds kommun,Tingsryd municipality,Tingsryd,0763,763,Q515491,8425,0763 Tingsryds kommun 252 | Tjörns kommun,Tjörns kommun,Tjörn municipality,Tjörn,1419,1419,Q501448,8686,1419 Tjörns kommun 253 | Tomelilla kommun,Tomelilla kommun,Tomelilla municipality,Tomelilla,1270,1270,Q515519,8486,1270 Tomelilla kommun 254 | Töreboda kommun,Töreboda kommun,Töreboda municipality,Töreboda,1473,1473,Q501470,8689,1473 Töreboda kommun 255 | Torsås kommun,Torsås kommun,Torsås municipality,Torsås,0834,834,Q515551,8417,0834 Torsås kommun 256 | Torsby kommun,Torsby kommun,Torsby municipality,Torsby,1737,1737,Q510135,8578,1737 Torsby kommun 257 | Tranås kommun,Tranås kommun,Tranås municipality,Tranås,0687,687,Q505071,8403,0687 Tranås kommun 258 | Tranemo kommun,Tranemo kommun,Tranemo municipality,Tranemo,1452,1452,Q501479,8687,1452 Tranemo kommun 259 | Trelleborgs kommun,Trelleborgs kommun,Trelleborg municipality,Trelleborg,1287,1287,Q504219,8487,1287 Trelleborgs kommun 260 | Trollhättans kommun,Trollhättans kommun,Trollhättan municipality,Trollhättan,1488,1488,Q28532,8688,1488 Trollhättans kommun 261 | Trosa kommun,Trosa kommun,Trosa municipality,Trosa,0488,488,Q505064,8552,0488 Trosa kommun 262 | Tyresö kommun,Tyresö kommun,Tyresö municipality,Tyresö,0138,138,Q113730,8536,0138 Tyresö kommun 263 | Uddevalla kommun,Uddevalla kommun,Uddevalla municipality,Uddevalla,1485,1485,Q501442,8690,1485 Uddevalla kommun 264 | Ulricehamns kommun,Ulricehamns kommun,Ulricehamn municipality,Ulricehamn,1491,1491,Q382808,8691,1491 Ulricehamns kommun 265 | Umeå kommun,Umeå kommun,Umeå municipality,Umeå,2480,2480,Q507709,8591,2480 Umeå kommun 266 | Upplands Väsby kommun,Upplands Väsby kommun,Upplands Väsby municipality,Upplands Väsby,0114,114,Q499425,8539,0114 Upplands Väsby kommun 267 | Upplands-Bro kommun,Upplands-Bro kommun,Upplands-Bro municipality,Upplands-Bro,0139,139,Q113673,8538,0139 Upplands-Bro kommun 268 | Uppsala kommun,Uppsala kommun,Uppsala municipality,Uppsala,0380,380,Q59091,8560,0380 Uppsala kommun 269 | Uppsala kommun före 2003,Uppsala kommun före 2003,Uppsala municipality before Knivsta broke away in 2003. 
Most of the time you would probably want to use Uppsala kommun even before 2003,Uppsala,0380,380,Q59091,8560,0380 Uppsala kommun före 2003 270 | Uppvidinge kommun,Uppvidinge kommun,Uppvidinge municipality,Uppvidinge,0760,760,Q515505,8426,0760 Uppvidinge kommun 271 | Vadstena kommun,Vadstena kommun,Vadstena municipality,Vadstena,0584,584,Q515969,8719,0584 Vadstena kommun 272 | Vaggeryds kommun,Vaggeryds kommun,Vaggeryd municipality,Vaggeryd,0665,665,Q605329,8404,0665 Vaggeryds kommun 273 | Valdemarsviks kommun,Valdemarsviks kommun,Valdemarsvik municipality,Valdemarsvik,0563,563,Q509997,8720,0563 Valdemarsviks kommun 274 | Vallentuna kommun,Vallentuna kommun,Vallentuna municipality,Vallentuna,0115,115,Q501526,8540,0115 Vallentuna kommun 275 | Vänersborgs kommun,Vänersborgs kommun,Vänersborg municipality,Vänersborg,1487,1487,Q511426,8693,1487 Vänersborgs kommun 276 | Vännäs kommun,Vännäs kommun,Vännäs municipality,Vännäs,2460,2460,Q500210,8594,2460 Vännäs kommun 277 | Vansbro kommun,Vansbro kommun,Vansbro municipality,Vansbro,2021,2021,Q501551,8362,2021 Vansbro kommun 278 | Vara kommun,Vara kommun,Vara municipality,Vara,1470,1470,Q501428,8692,1470 Vara kommun 279 | Varbergs kommun,Varbergs kommun,Varberg municipality,Varberg,1383,1383,Q179180,8383,1383 Varbergs kommun 280 | Vårgårda kommun,Vårgårda kommun,Vårgårda municipality,Vårgårda,1442,1442,Q511297,8694,1442 Vårgårda kommun 281 | Värmdö kommun,Värmdö kommun,Värmdö municipality,Värmdö,0120,120,Q493841,8542,0120 Värmdö kommun 282 | Värnamo kommun,Värnamo kommun,Värnamo municipality,Värnamo,0683,683,Q280562,8406,0683 Värnamo kommun 283 | Västerås kommun,Västerås kommun,Västerås municipality,Västerås,1980,1980,Q34550,8615,1980 Västerås kommun 284 | Västerviks kommun,Västerviks kommun,Västervik municipality,Västervik,0883,883,Q515477,8419,0883 Västerviks kommun 285 | Vaxholms kommun,Vaxholms kommun,Vaxholm municipality,Vaxholm,0187,187,Q500090,8541,0187 Vaxholms kommun 286 | Växjö kommun,Växjö kommun,Växjö municipality,Växjö,0780,780,Q500217,8427,0780 Växjö kommun 287 | Vellinge kommun,Vellinge kommun,Vellinge municipality,Vellinge,1233,1233,Q511338,8488,1233 Vellinge kommun 288 | Vetlanda kommun,Vetlanda kommun,Vetlanda municipality,Vetlanda,0685,685,Q505052,8405,0685 Vetlanda kommun 289 | Vilhelmina kommun,Vilhelmina kommun,Vilhelmina municipality,Vilhelmina,2462,2462,Q515861,8592,2462 Vilhelmina kommun 290 | Vimmerby kommun,Vimmerby kommun,Vimmerby municipality,Vimmerby,0884,884,Q505057,8418,0884 Vimmerby kommun 291 | Vindelns kommun,Vindelns kommun,Vindeln municipality,Vindeln,2404,2404,Q504505,8593,2404 Vindelns kommun 292 | Vingåkers kommun,Vingåkers kommun,Vingåker municipality,Vingåker,0428,428,Q249378,8553,0428 Vingåkers kommun 293 | Ydre kommun,Ydre kommun,Ydre municipality,Ydre,0512,512,Q515699,8721,0512 Ydre kommun 294 | Ystads kommun,Ystads kommun,Ystad municipality,Ystad,1286,1286,Q505102,8489,1286 Ystads kommun 295 | -------------------------------------------------------------------------------- /statscraper/datatypes/values/road_types.csv: -------------------------------------------------------------------------------- 1 | "id","label","description","dialect:wikidata","dialect:sv" 2 | "e-road","E-road","A road in the international E-road network","Q106123","e" 3 | "national_road","National road","A primary road class in many countries. 
Often crossing a large part of the country ","Q1716124","rv"
4 | "county_road","County road","A secondary road class in some countries.",,"lv"
5 | "street","Street","A public thoroughfare in a built environment, not part of a national road system","Q79007",
6 | "road","Other road","A generic road type","Q34442",
-------------------------------------------------------------------------------- /statscraper/exceptions.py: --------------------------------------------------------------------------------
1 | class InvalidID(Exception):
2 |     """This string is not allowed as an id at this point.
3 |     Note: Inherits from Exception instead of StandardError
4 |     for Python3.x compatibility reasons."""
5 |
6 |     pass
7 |
8 |
9 | class NoSuchItem(IndexError):
10 |     """No such Collection or Dataset."""
11 |
12 |     pass
13 |
14 |
15 | class DatasetNotInView(IndexError):
16 |     """Tried to operate on a dataset that is not visible.
17 |
18 |     This can be raised by a scraper if the cursor needs to
19 |     move before inspecting an item.
20 |     """
21 |
22 |     pass
23 |
24 |
25 | class InvalidData(Exception):
26 |     """The scraper encountered some invalid data."""
27 |
28 |     pass
29 |
30 |
31 | class NoSuchDatatype(Exception):
32 |     """No datatype with that id."""
33 |
34 |     pass
35 |
-------------------------------------------------------------------------------- /statscraper/scrapers/CranesScraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """ A scraper to fetch daily crane sightings at Hornborgasjön
3 | from http://web05.lansstyrelsen.se/transtat_O/transtat.asp
4 | This is intended to be a minimal example of a scraper
5 | using Beautiful Soup.
6 | """
7 | import requests
8 | from bs4 import BeautifulSoup
9 | from statscraper import BaseScraper, Dataset, Dimension, Result
10 |
11 |
12 | class Cranes(BaseScraper):
13 |
14 |     def _fetch_itemslist(self, item):
15 |         """ There is only one dataset. """
16 |         yield Dataset("Number of cranes")
17 |
18 |     def _fetch_dimensions(self, dataset):
19 |         """ Declaring available dimensions like this is not mandatory,
20 |         but nice, especially if they differ from dataset to dataset.
21 |
22 |         If you are using a built-in datatype, you can specify the dialect
23 |         you are expecting, to have values normalized. This scraper will
24 |         look for Swedish month names (e.g. 'Januari'), but return them
25 |         according to the Statscraper standard ('january').
26 |         """
27 |         yield Dimension(u"date", label="Day of the month")
28 |         yield Dimension(u"month", datatype="month", dialect="swedish")
29 |         yield Dimension(u"year", datatype="year")
30 |
31 |     def _fetch_data(self, dataset, query=None):
32 |         html = requests.get("http://web05.lansstyrelsen.se/transtat_O/transtat.asp").text
33 |         soup = BeautifulSoup(html, 'html.parser')
34 |         table = soup.find("table", "line").find_all("table")[2].findNext("table")
35 |         rows = table.find_all("tr")
36 |         column_headers = rows.pop(0).find_all("td", recursive=False)
37 |         years = [x.text for x in column_headers[2:]]
38 |         for row in rows:
39 |             cells = row.find_all("td")
40 |             date = cells.pop(0).text
41 |             month = cells.pop(0).text
42 |             i = 0
43 |             for value in cells:
44 |                 # Each column from here is a year.
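                # (Illustrative note, not part of the original file: for the
                # cell under the "2017" header on the row for 1 March, the
                # code below would yield roughly
                #     Result("12", {"date": "1", "month": "Mars", "year": "2017"})
                # where the count "12" is a hypothetical value; the month
                # dimension's swedish dialect then normalizes "Mars" to "march".)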
45 |                 if value.text:
46 |                     yield Result(value.text, {
47 |                         "date": date,
48 |                         "month": month,
49 |                         "year": years[i],
50 |                     })
51 |                 i += 1
52 |
-------------------------------------------------------------------------------- /statscraper/scrapers/PXWebScraper.py: --------------------------------------------------------------------------------
1 | """A wrapper around the PX-Web API.
2 |
3 | As implementations and versions vary, this is best used as a base class,
4 | for more specific scrapers to extend.
5 |
6 | If used directly, an API endpoint must be set:
7 |     scraper = PXWeb(base_url="http://api.example.com/")
8 |     # ...or:
9 |     scraper = PXWeb()
10 |     scraper.base_url = "http://api.example.com/"
11 | """
12 |
13 | import requests
14 | from statscraper import (BaseScraper, Collection, Result,
15 |                          Dataset, Dimension, InvalidData)
16 | from statscraper.compat import JSONDecodeError
17 |
18 |
19 | class PXWeb(BaseScraper):
20 |     """Scraper."""
21 |
22 |     base_url = None  # API endpoint
23 |
24 |     @BaseScraper.on("init")
25 |     def _get_args(self, *args, **kwargs):
26 |         """Store `base_url`, if given on init.
27 |
28 |         This is convenient when the PXWeb scraper is used directly by an end user.
29 |         """
30 |         if "base_url" in kwargs and kwargs["base_url"]:
31 |             self.base_url = kwargs["base_url"]
32 |
33 |     def _api_path(self, item):
34 |         """Get the API path for the current cursor position."""
35 |         if self.base_url is None:
36 |             raise NotImplementedError("base_url not set")
37 |         path = "/".join([x.blob["id"] for x in item.path])
38 |         return "/".join([self.base_url, path])
39 |
40 |     def _fetch_itemslist(self, item):
41 |         data = requests.get(self._api_path(item)).json()
42 |
43 |         for d in data:
44 |             if d["type"] == "l":
45 |                 yield Collection(d["id"], label=d["text"], blob=d)
46 |             else:
47 |                 yield Dataset(d["id"], label=d["text"], blob=d)
48 |
49 |     def _fetch_dimensions(self, dataset):
50 |         data = requests.get(self._api_path(dataset)).json()
51 |         try:
52 |             for d in data["variables"]:
53 |                 yield Dimension(d["code"],
54 |                                 label=d["text"],
55 |                                 allowed_values=d["values"])
56 |
57 |         except KeyError:
58 |             yield None
59 |
60 |     def _fetch_data(self, dataset, query):
61 |         if query is None:
62 |             query = {}
63 |         body = {
64 |             'query': [{
65 |                 'code': key,
66 |                 'selection': {
67 |                     'filter': filtertype,
68 |                     # value can be a list or a single value
69 |                     'values': value if isinstance(value, list) else [value]
70 |                 }
71 |             } for key, (filtertype, value) in query.items()],
72 |             'response': {
73 |                 'format': "json"
74 |             }
75 |         }
76 |         try:
77 |             raw = requests.post(self._api_path(dataset), json=body)
78 |             if raw.headers["content-type"] == "text/html":
79 |                 # This is an error message
80 |                 raise InvalidData(f"""Error message from PX Web:
81 |
82 | {raw.content}
83 |
84 | Check your query for spelling errors, or try reducing the size.
85 | """)
86 |             data = raw.json()
87 |         except JSONDecodeError:
88 |             raise InvalidData("""No valid response from PX Web.
89 | Check your query for spelling errors, or try reducing the size.
90 | This error is frequently due to a too large result being requested.""")
91 |
92 |         # All available dimensions are not always returned.
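        # (Hedged usage sketch, not part of the original file: each query
        # entry is consumed above as a (filtertype, values) tuple, so a
        # direct call could look like
        #     scraper.fetch({"Region": ("item", ["0180", "0114"])})
        # where "item" and both region codes are hypothetical examples.)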
93 | # What is returned depends on the query 94 | raw_return_dimension = data["columns"] 95 | # Filter out dimensions only 96 | raw_return_dimension = [x for x in raw_return_dimension if x["type"] != "c"] 97 | 98 | for row in data[u"data"]: 99 | for value in row[u"values"]: 100 | dimensions = {} 101 | # 'key' contains one value for each dimension, 102 | # always preserving order. 103 | for d, v in zip(raw_return_dimension, row[u"key"]): 104 | dimensions[d["code"]] = v 105 | 106 | yield Result(value, dimensions=dimensions) 107 | -------------------------------------------------------------------------------- /statscraper/scrapers/SCBScraper.py: -------------------------------------------------------------------------------- 1 | """A wrapper around the SCB API.""" 2 | from .PXWebScraper import PXWeb, Dimension 3 | 4 | 5 | class SCB(PXWeb): 6 | """The SCB API uses PXWeb. We just hardcode the url.""" 7 | 8 | base_url = 'https://api.scb.se/OV0104/v1/doris/sv/ssd' 9 | COUNTIES = [ 10 | "01", "03", "04", "05", "06", "07", "08", "09", "10", "12", "13", 11 | "14", "17", "18", "19", "20", "21", "22", "23", "24", "25" 12 | ] 13 | MUNICIPALITIES = [ 14 | "0114", "0115", "0117", "0120", "0123", "0125", "0126", "0127", "0128", 15 | "0136", "0138", "0139", "0140", "0160", "0162", "0163", "0180", "0181", 16 | "0182", "0183", "0184", "0186", "0187", "0188", "0191", "0192", "0305", 17 | "0319", "0330", "0331", "0360", "0380", "0381", "0382", "0428", "0461", 18 | "0480", "0481", "0482", "0483", "0484", "0486", "0488", "0509", "0512", 19 | "0513", "0560", "0561", "0562", "0563", "0580", "0581", "0582", "0583", 20 | "0584", "0586", "0604", "0617", "0642", "0643", "0662", "0665", "0680", 21 | "0682", "0683", "0684", "0685", "0686", "0687", "0760", "0761", "0763", 22 | "0764", "0765", "0767", "0780", "0781", "0821", "0834", "0840", "0860", 23 | "0861", "0862", "0880", "0881", "0882", "0883", "0884", "0885", "0980", 24 | "1060", "1080", "1081", "1082", "1083", "1214", "1230", "1231", "1233", 25 | "1256", "1257", "1260", "1261", "1262", "1263", "1264", "1265", "1266", 26 | "1267", "1270", "1272", "1273", "1275", "1276", "1277", "1278", "1280", 27 | "1281", "1282", "1283", "1284", "1285", "1286", "1287", "1290", "1291", 28 | "1292", "1293", "1315", "1380", "1381", "1382", "1383", "1384", "1401", 29 | "1402", "1407", "1415", "1419", "1421", "1427", "1430", "1435", "1438", 30 | "1439", "1440", "1441", "1442", "1443", "1444", "1445", "1446", "1447", 31 | "1452", "1460", "1461", "1462", "1463", "1465", "1466", "1470", "1471", 32 | "1472", "1473", "1480", "1481", "1482", "1484", "1485", "1486", "1487", 33 | "1488", "1489", "1490", "1491", "1492", "1493", "1494", "1495", "1496", 34 | "1497", "1498", "1499", "1715", "1730", "1737", "1760", "1761", "1762", 35 | "1763", "1764", "1765", "1766", "1780", "1781", "1782", "1783", "1784", 36 | "1785", "1814", "1860", "1861", "1862", "1863", "1864", "1880", "1881", 37 | "1882", "1883", "1884", "1885", "1904", "1907", "1960", "1961", "1962", 38 | "1980", "1981", "1982", "1983", "1984", "2021", "2023", "2026", "2029", 39 | "2031", "2034", "2039", "2061", "2062", "2080", "2081", "2082", "2083", 40 | "2084", "2085", "2101", "2104", "2121", "2132", "2161", "2180", "2181", 41 | "2182", "2183", "2184", "2260", "2262", "2280", "2281", "2282", "2283", 42 | "2284", "2303", "2305", "2309", "2313", "2321", "2326", "2361", "2380", 43 | "2401", "2403", "2404", "2409", "2417", "2418", "2421", "2422", "2425", 44 | "2460", "2462", "2463", "2480", "2481", "2482", "2505", "2506", "2510", 45 | 
"2513", "2514", "2518", "2521", "2523", "2560", "2580", "2581", "2582", 46 | "2583", "2584" 47 | ] 48 | 49 | def _fetch_dimensions(self, dataset): 50 | """Yield all dimensions. 51 | 52 | We override this method just to set the correct datatype 53 | and dialect for regions. 54 | """ 55 | for dimension in super()._fetch_dimensions(dataset): 56 | if dimension.id == "Region": 57 | yield Dimension(dimension.id, 58 | datatype="region", 59 | dialect="skatteverket", 60 | label=dimension.label) 61 | else: 62 | yield dimension 63 | 64 | def _fetch_data(self, dataset, query={}, by=None): 65 | """Allow adding municipalities or counties to a query.""" 66 | if by == "municipality": 67 | query["Region"] = ("vs:RegionKommun07EjAggr", self.MUNICIPALITIES) 68 | elif by == "county": 69 | query["Region"] = ("vs:RegionLän07EjAggr", self.COUNTIES) 70 | return super()._fetch_data(dataset, query) 71 | -------------------------------------------------------------------------------- /statscraper/scrapers/SMHIScraper.py: -------------------------------------------------------------------------------- 1 | try: 2 | import StringIO 3 | except ImportError: 4 | import io as StringIO 5 | 6 | import requests 7 | import csv 8 | from datetime import datetime 9 | from bs4 import BeautifulSoup 10 | 11 | from statscraper import BaseScraper, Collection, Dimension, Dataset, Result, DimensionValue 12 | 13 | VERSION = "1.0" 14 | # LEVELS = ["api","parameter"] 15 | PERIODS = [ 16 | "corrected-archive", 17 | "latest-hour", 18 | "latest-day", 19 | "latest-months", 20 | ] 21 | 22 | 23 | class SMHI(BaseScraper): 24 | base_url = "http://opendata.smhi.se/apidocs/" 25 | 26 | def _fetch_itemslist(self, current_item): 27 | """ Get a all available apis 28 | """ 29 | if current_item.is_root: 30 | html = requests.get(self.base_url).text 31 | soup = BeautifulSoup(html, 'html.parser') 32 | for item_html in soup.select(".row .col-md-6"): 33 | try: 34 | label = item_html.select_one("h2").text 35 | except Exception: 36 | continue 37 | yield API(label, blob=item_html) 38 | else: 39 | # parameter = current_item.parent 40 | # data = requests.get(parameter.url) 41 | for resource in current_item.json["resource"]: 42 | label = u"{}, {}".format(resource["title"], resource["summary"]) 43 | yield SMHIDataset(label, blob=resource) 44 | 45 | def _fetch_dimensions(self, parameter): 46 | yield StationDimension("station") 47 | # Hack: This redundant of the station dimension, but 48 | # necessary to be able to include both station name 49 | # (=readabilty) and key in resultset. 50 | # It would be better if the ResultSet object could 51 | # handle both label and key print. 
52 |         yield Dimension("station_key")
53 |         yield Dimension("period", allowed_values=PERIODS)
54 |         yield Dimension("parameter")
55 |
56 |         example_data = parameter._get_example_csv()
57 |         for dim in example_data.columns:
58 |             yield Dimension(dim)
59 |
60 |     def _fetch_allowed_values(self, dimension):
61 |         if dimension.id == "station":
62 |             for station in dimension.dataset.json["station"]:
63 |                 yield Station(
64 |                     station["key"],
65 |                     dimension,
66 |                     label=station["name"],
67 |                     blob=station
68 |                 )
69 |         else:
70 |             yield None
71 |
72 |     def _fetch_data(self, dataset, query={}, include_inactive_stations=False):
73 |         """ Yield dataset rows.
74 |         """
75 |         parameter = dataset
76 |         station_dim = dataset.dimensions["station"]
77 |         all_stations = station_dim.allowed_values
78 |         # Step 1: Prepare query
79 |         if "station" not in query:
80 |             if include_inactive_stations:
81 |                 # Get all stations
82 |                 query["station"] = list(all_stations)
83 |             else:
84 |                 # Get only active stations
85 |                 query["station"] = list(station_dim.active_stations())
86 |         else:
87 |             if not isinstance(query["station"], list):
88 |                 query["station"] = [query["station"]]
89 |             # Make sure that the queried stations actually exist
90 |             query["station"] = [all_stations.get_by_label(x) for x in query["station"]]
91 |
92 |         if "period" not in query:
93 |             # TODO: I'd prefer to do dataset.get("period").allowed_values here
94 |             query["period"] = PERIODS
95 |
96 |         elif not isinstance(query["period"], list):
97 |             query["period"] = [query["period"]]
98 |
99 |         for period in query["period"]:
100 |             if period not in PERIODS:
101 |                 msg = u"{} is not an allowed period".format(period)
102 |                 raise Exception(msg)
103 |
104 |         # Step 2: Get data
105 |         for station in query["station"]:
106 |             for period in query["period"]:
107 |                 url = dataset.url.replace(
108 |                     ".json",
109 |                     f"/station/{station.key}/period/{period}/data.csv"
110 |                 )
111 |                 r = requests.get(url)
112 |
113 |                 if r.status_code == 200:
114 |                     raw_data = DataCsv().from_string(r.content).to_dictlist()
115 |
116 |                     # TODO: This is a very hard-coded parse function.
117 |                     # Expects fixed start row and number of cols.
118 |                     for row in raw_data:
119 |                         value_col = parameter.id.split(",")[0]
120 |                         value = float(row[value_col])
121 |
122 |                         row["parameter"] = parameter.id
123 |                         row["station"] = station.label
124 |                         row["station_key"] = station.key
125 |                         row["period"] = period
126 |
127 |                         row.pop(value_col, None)
128 |
129 |                         datapoint = Result(value, row)
130 |
131 |                         yield datapoint
132 |
133 |                 elif r.status_code == 404:
134 |                     print("Warning: no data at {}".format(url))
135 |                 else:
136 |                     raise Exception("Connection error for {}".format(url))
137 |
138 |
139 | class API(Collection):
140 |     """
141 |     """
142 |     level = "api"
143 |
144 |     @property
145 |     def key(self):
146 |         return self.blob.select_one("a").get("href").replace("/index.html", "")
147 |
148 |     @property
149 |     def url(self):
150 |         return "http://opendata-download-{}.smhi.se/api/version/{}.json"\
151 |             .format(self.key, VERSION)
152 |
153 |     @property
154 |     def json(self):
155 |         return self._get_json_blob()
156 |
157 |     def _get_json_blob(self):
158 |         # Update blob
159 |         error_msg = "Scraper does not support parsing of '{}' yet.".format(self.id)
160 |         try:
161 |             r = requests.get(self.url)
162 |         except Exception:
163 |             # Catch e.g. "opendata-download-grid.smhi.se"
"opendata-download-grid.smhi.se" 164 | raise NotImplementedError(error_msg) 165 | if r.status_code == 404: 166 | raise NotImplementedError(error_msg) 167 | 168 | return r.json() 169 | 170 | 171 | class StationDimension(Dimension): 172 | 173 | def active_stations(self): 174 | """ Get a list of all active stations 175 | """ 176 | return (x for x in self.allowed_values if x.is_active) 177 | 178 | 179 | class Station(DimensionValue): 180 | def __init__(self, value, dimension, label=None, blob=None): 181 | super(Station, self).__init__(value, dimension, label=label) 182 | 183 | self.key = value 184 | self.summary = blob["summary"] 185 | self.updated = datetime.fromtimestamp(blob["updated"]/1000) 186 | self.blob = blob 187 | 188 | # Was there an update in the last 100 days? 189 | self.is_active = (datetime.now() - self.updated).days < 100 190 | 191 | def __repr__(self): 192 | if self.is_active: 193 | status = "active" 194 | else: 195 | status = "inactive" 196 | return ""\ 197 | .format(self.label.encode("utf-8"), status) 198 | 199 | 200 | class SMHIDataset(Dataset): 201 | @property 202 | def key(self): 203 | return self.blob["key"] 204 | 205 | @property 206 | def url(self): 207 | api = self.parent 208 | return "http://opendata-download-{}.smhi.se/api/version/{}/parameter/{}.json"\ 209 | .format(api.key, VERSION, self.key) 210 | 211 | @property 212 | def json(self): 213 | if not hasattr(self, "_json"): 214 | self._json = requests.get(self.url).json() 215 | return self._json 216 | 217 | def get_stations_list(self): 218 | """ Get a dict list of all stations with properties such as 219 | latitude and longitude 220 | """ 221 | stations = self.dimensions["station"].allowed_values 222 | return self._format_station_list(stations) 223 | 224 | def get_active_stations_list(self): 225 | """ Get a dict list of all stations with properties such as 226 | latitude and longitude 227 | """ 228 | stations = self.dimensions["station"].active_stations() 229 | return self._format_station_list(stations) 230 | 231 | def _get_example_csv(self): 232 | """For dimension parsing 233 | """ 234 | station_key = self.json["station"][0]["key"] 235 | period = "corrected-archive" 236 | url = self.url.replace( 237 | ".json", 238 | f"/station/{station_key}/period/{period}/data.csv" 239 | ) 240 | 241 | r = requests.get(url) 242 | if r.status_code == 200: 243 | return DataCsv().from_string(r.content) 244 | else: 245 | raise Exception("Error connecting to api") 246 | 247 | def _format_station_list(self, stations): 248 | data = [] 249 | for station in stations: 250 | json_data = station.blob 251 | # Inlude all props but link 252 | json_data.pop('link', None) 253 | data.append(station.blob) 254 | 255 | return data 256 | 257 | 258 | class DataCsv(object): 259 | columns = [] 260 | data = [] 261 | 262 | def from_file(self, file_path): 263 | with open(file_path) as f: 264 | self._parse(f) 265 | 266 | return self 267 | 268 | def from_string(self, csv_content): 269 | if isinstance(csv_content, bytes): 270 | csv_content = csv_content.decode("utf-8") 271 | f = StringIO.StringIO(csv_content) 272 | self._parse(f) 273 | 274 | return self 275 | 276 | def to_dictlist(self): 277 | return [ 278 | dict(zip(self.columns, row)) 279 | for row in self.data 280 | ] 281 | 282 | def _parse(self, f): 283 | rows = list(csv.reader(f, delimiter=';')) 284 | tables = [] 285 | table = [] 286 | for i, row in enumerate(rows): 287 | is_last = i == len(rows) - 1 288 | 289 | # Check if new table 290 | if is_empty(row): 291 | if len(table) > 0: 292 | tables.append(table) 
293 |                     table = []
294 |                 continue
295 |
296 |             is_header = len(table) == 0
297 |             if is_header:
298 |                 n_cols = table_width(row)
299 |
300 |             table.append(row[:n_cols])
301 |
302 |             if is_last:
303 |                 tables.append(table)
304 |
305 |         data_table = tables[-1]
306 |         self.columns = data_table[0]
307 |         try:
308 |             self.data = data_table[1:]
309 |         except IndexError:
310 |             self.data = []
311 |
312 |
313 | def is_empty(row):
314 |     """ Check if a csv row (represented as a list
315 |     of values) is empty.
316 |
317 |     [] => True
318 |     ["","","foo"] => True
319 |     ["foo","bar"] => False
320 |     """
321 |     if len(row) == 0:
322 |         return True
323 |     if row[0] == "":
324 |         return True
325 |     return False
326 |
327 |
328 | def table_width(row):
329 |     """ Get number of cols in row
330 |     ["col1", "col2","","","other_col"] => 2
331 |     """
332 |
333 |     for i, val in enumerate(row):
334 |         if val == "":
335 |             break
336 |     return i
337 |
-------------------------------------------------------------------------------- /statscraper/scrapers/StatistikcentralenScraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """ A wrapper around the Statistikcentralen/Tilastokeskus API,
3 | demonstrating how to extend a scraper in the scraper park.
4 |
5 | The user can select 'fi' or 'sv' as their preferred language like this:
6 |
7 |     scraper = Statistikcentralen("fi")
8 |     # ...or:
9 |     scraper = Statistikcentralen()
10 |     scraper.lang = "fi"
11 | """
12 | from .PXWebScraper import PXWeb
13 |
14 |
15 | class Statistikcentralen(PXWeb):
16 |
17 |     lang = "sv"
18 |     _available_languages = ["sv", "fi"]
19 |
20 |     @property
21 |     def base_url(self):
22 |         return 'http://pxnet2.stat.fi/pxweb/api/v1/%s/StatFin/' % self.lang
23 |
24 |     @PXWeb.on("init")
25 |     def _get_lang(self, *args, **kwargs):
26 |         """ Let users select language
27 |         """
28 |         if "lang" in kwargs:
29 |             if kwargs["lang"] in self._available_languages:
30 |                 self.lang = kwargs["lang"]
31 |
-------------------------------------------------------------------------------- /statscraper/scrapers/VantetiderScraper.py: --------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import requests_cache
4 | from requests.exceptions import RequestException
5 | from itertools import product
6 | import re
7 |
8 | requests_cache.install_cache()
9 |
10 | from statscraper.base_scraper import (BaseScraper, Collection,
11 |                                       Dataset, Dimension, Result)
12 |
13 | BASE_URL = u"http://www.vantetider.se/Kontaktkort/"
14 |
15 | class VantetiderScraper(BaseScraper):
16 |
17 |     def _fetch_itemslist(self, current_item):
18 |         # Get start page
19 |         html = self._get_html(BASE_URL + "Sveriges")
20 |         soup = BeautifulSoup(html, 'html.parser')
21 |         # Get links to datasets
22 |         links = soup.find_all("ul", {"class":"main-nav page-width"})[0]\
23 |             .find_all("li")[1]\
24 |             .find_all("a")\
25 |             [2:]  # First two are not relevant
26 |
27 |         ids = [x.get("href").split("/Sveriges/")[-1].replace("/","")
28 |                for x in links]
29 |         labels = [x.text for x in links]
30 |
31 |         for id_, label in zip(ids, labels):
32 |             # Get html of dataset page
33 |             yield VantetiderDataset(id_, label=label)
34 |
35 |     def _fetch_dimensions(self, dataset):
36 |         dimensions = {}
37 |         dataset_id = dataset.id
38 |         try:
39 |             form = [x for x in dataset.soup.find_all("form")
40 |                     if "/Kontaktkort/" in x.get("action")][0]
41 |         except IndexError:
42 |             # http://www.vantetider.se/Kontaktkort/Sveriges/Aterbesok
43 |             # does not have a form element
44 |             form = dataset.soup.find("div", {"class": "container_12 filter_section specialised_operation"})
45 |         self._form = form
46 |
47 |         # 1. Get select elements (dropdowns)
48 |         select_elems = form.find_all("select")
49 |         for elem in select_elems:
50 |             elem_id = elem.get("name")
51 |             dim_id = elem_id.replace("select_","")
52 |
53 |             dim = VantetiderDimension(dim_id)
54 |             dim.elem_id = elem_id
55 |             dim.elem = elem
56 |             yield dim
57 |
58 |         # 2. Get checkboxes (gender, ownership)
59 |         checkbox_elems = [x for x in form.find_all("input", {"type": "checkbox"})]
60 |         checkbox_labels = [x.text for x in form.find_all("label", {"class": "checkbox"})]
61 |         for elem, label in zip(checkbox_elems, checkbox_labels):
62 |             elem_id = elem.get("name")
63 |             dim_id = elem_id.replace("checkbox_","")
64 |
65 |             dim = VantetiderDimension(dim_id)
66 |             dim.elem_id = elem_id
67 |             dim.elem = elem
68 |             yield dim
69 |
70 |
71 |         # 3. Get radio buttons
72 |         radio_elems = [x for x in form.find_all("input", {"type": "radio"})]
73 |         elem_ids = get_unique([x.get("name") for x in radio_elems])
74 |
75 |         for elem_id in elem_ids:
76 |             elems = [x for x in radio_elems if x.get("name") == elem_id]
77 |             dim_id = elem_id  # radio button names carry no prefix to strip
78 |
79 |             dim = VantetiderDimension(dim_id)
80 |             dim.elem_id = elem_id
81 |             dim.elem = elems
82 |             yield dim
83 |
84 |
85 |         # 4. Add measure and measure key
86 |         yield VantetiderDimension("measure", label="Nyckeltal")
87 |
88 |
89 |     def _fetch_data(self, dataset, query):
90 |         only_region = list(query.keys()) == ["region"]
91 |         NO_QUERY_DIMS = ["measure"]
92 |         #
93 |         NOT_IMPLEMENTED_DIMS = ["unit", "services"]
94 |
95 |         for dim_id in NOT_IMPLEMENTED_DIMS:
96 |             if dim_id in query.keys():
97 |                 msg = "Querying by {} is not implemented.".format(dim_id)
98 |                 raise NotImplementedError(msg)
99 |
100 |         form_keys = [x.elem_id for x in dataset.dimensions if x.id not in NO_QUERY_DIMS]
101 |
102 |         queries = []
103 |
104 |         # Create payload for post request
105 |         # Get a list of values to query by
106 |         query_values = []
107 |         for dim in dataset.dimensions:
108 |             if dim.id in NO_QUERY_DIMS:
109 |                 continue
110 |
111 |             # Pass default value if dimension is not in query
112 |             if dim.id not in query:
113 |                 value = [dim.default_value]
114 |
115 |             else:
116 |                 # Translate passed values to ids
117 |                 value = query[dim.id]
118 |                 if not isinstance(value, list):
119 |                     value = [value]
120 |
121 |             if value is None:
122 |                 raise ValueError()
123 |             query_values.append(value)
124 |
125 |         queries = list(product(*query_values))
126 |
127 |         self.log.info(u"Making a total of {} queries".format(len(queries)))
128 |
129 |         data = []
130 |
131 |         for _query in queries:
132 |             payload = dict(zip(form_keys, _query))
133 |             url = dataset.get_url(payload["select_region"])
134 |
135 |             for row in dataset._parse_result_page(url, payload, only_region=only_region):
136 |                 yield row
137 |
138 |
139 |     # HELPER METHODS
140 |     def _get_html(self, url):
141 |         """ Get html from url
142 |         """
143 |         self.log.info(u"/GET {}".format(url))
144 |         r = requests.get(url)
145 |         if hasattr(r, 'from_cache'):
146 |             if r.from_cache:
147 |                 self.log.info("(from cache)")
148 |
149 |         if r.status_code != 200:
150 |             throw_request_err(r)
151 |
152 |         return r.content
153 |
154 |     def _post_html(self, url, payload):
155 |         self.log.info(u"/POST {} with {}".format(url, payload))
156 |         r = requests.post(url, payload)
157 |         if r.status_code != 200:
158 |             throw_request_err(r)
159 |
160 |         return r.content
161 |
162 |     def _get_json(self, url):
163 |         """ Get json from url
164 |         """
165 |         self.log.info(u"/GET " + url)
166 |         r = requests.get(url)
167 |         if hasattr(r, 'from_cache'):
168 |             if r.from_cache:
169 |                 self.log.info("(from cache)")
170 |         if r.status_code != 200:
171 |             throw_request_err(r)
172 |
173 |         return r.json()
174 |
175 |
176 |
177 |     @property
178 |     def log(self):
179 |         if not hasattr(self, "_logger"):
180 |             self._logger = PrintLogger()
181 |         return self._logger
182 |
183 |
184 | class VantetiderDataset(Dataset):
185 |
186 |     def get_url(self, region="Sverige"):
187 |         region_slug = self._get_region_slug(region)
188 |         return BASE_URL + region_slug + "/" + self.id
189 |
190 |     @property
191 |     def html(self):
192 |         if not hasattr(self, "_html"):
193 |             url = self.get_url()
194 |             self._html = self.scraper._get_html(url)
195 |         return self._html
196 |
197 |     @property
198 |     def soup(self):
199 |         return BeautifulSoup(self.html, 'html.parser')
200 |
201 |     @property
202 |     def regions(self):
203 |         """ Get a list of all regions
204 |         """
205 |         regions = []
206 |         elem = self.dimensions["region"].elem
207 |         for option_elem in elem.find_all("option"):
208 |             region = option_elem.text.strip()
209 |             regions.append(region)
210 |
211 |         return regions
212 |
213 |
214 |     def _get_region_slug(self, id_or_label):
215 |         """ Get the regional slug to be used in url
216 |             "Norrbotten" => "Norrbottens"
217 |
218 |             :param id_or_label: Id or label of region
219 |         """
220 |         #region = self.dimensions["region"].get(id_or_label)
221 |         region = id_or_label
222 |         slug = region\
223 |             .replace(u" ","-")\
224 |             .replace(u"ö","o")\
225 |             .replace(u"Ö","O")\
226 |             .replace(u"ä","a")\
227 |             .replace(u"å","a") + "s"
228 |
229 |         EXCEPTIONS = {
230 |             "Jamtland-Harjedalens": "Jamtlands",
231 |             "Rikets": "Sveriges",
232 |         }
233 |         if slug in EXCEPTIONS:
234 |             slug = EXCEPTIONS[slug]
235 |
236 |         return slug
237 |
238 |     def _parse_result_page(self, url, payload, only_region=False):
239 |         """ Get data from a result page
240 |             :param url: url to query
241 |             :param payload: payload to pass
242 |             :return: a dictlist with data
243 |         """
244 |         data = []
245 |         try:
246 |
247 |             if only_region:
248 |                 html = self.scraper._get_html(url)
249 |             else:
250 |                 html = self.scraper._post_html(url, payload=payload)
251 |
252 |         except RequestException500:
253 |
254 |             self.scraper.log.warning(u"Status code 500 on {} with {}".format(url, payload))
255 |             return None
256 |
257 |
258 |         current_selection = self._get_current_selection(html)
259 |
260 |         table = Datatable(html)
261 |         data = []
262 |         for row in table.data:
263 |             region_or_unit_id, region_or_unit_label = row["region_or_unit"]
264 |             if region_or_unit_label in self.regions:
265 |                 row["region"] = region_or_unit_label
266 |                 row["unit"] = None
267 |             else:
268 |                 row["region"] = None
269 |                 row["unit"] = region_or_unit_label
270 |
271 |             value = row["value"]
272 |
273 |             row.pop("value", None)
274 |             row.pop("region_or_unit", None)
275 |
276 |             for dim in self.dimensions:
277 |                 if dim.id not in row:
278 |                     row[dim.id] = current_selection[dim.id][1]  # gets label
279 |
280 |
281 |
282 |             data.append(Result(value, row))
283 |         return data
284 |
285 |     def _get_current_selection(self, html):
286 |         if isinstance(html, str):
287 |             html = BeautifulSoup(html, "html.parser")
288 |         current_selection = {}
289 |         for dim in self.dimensions:
290 |             if dim.id in ["measure"]:
291 |                 continue
292 |
293 |             elem = html.select("[name={}]".format(dim.elem_id))
294 |
295 |             if len(elem) > 1 or len(elem) == 0:
296 |                 msg = u"Expected exactly one form element named {}".format(dim.elem_id)
297 |                 raise Exception(msg)
298 |             else:
299 |                 elem = elem[0]
300 |
301 |             if dim.elem_type == "select":
302 |                 try:
303 |                     option_elem = elem.select_one("[selected]")
304 |                     selected_id = get_option_value(option_elem)
305 |                     selected_label = get_option_text(option_elem)
306 |                 except AttributeError:
307 |                     option_elem = elem.select_one("option")
308 |                     selected_id = get_option_value(option_elem)
309 |                     selected_label = get_option_text(option_elem)
310 |
311 |                 selected_cat = selected_id
312 |             elif dim.elem_type == "radio":
313 |                 # Radio buttons are not handled here yet
314 |                 raise NotImplementedError()
315 |             elif dim.elem_type == "checkbox":
316 |                 selected_cat = elem.has_attr("checked")
317 |                 selected_label = selected_cat
318 |
319 |             current_selection[dim.id] = (selected_cat, selected_label)
320 |
321 |         return current_selection
322 |
323 | class VantetiderDimension(Dimension):
324 |     """A dimension in a Vantetider dataset."""
325 |
326 |     @property
327 |     def elem_type(self):
328 |         """ :returns: "select"|"radio"|"checkbox"
329 |         """
330 |         if not hasattr(self, "_elem_type"):
331 |             self._elem_type = get_elem_type(self.elem)
332 |         return self._elem_type
333 |
334 |
335 |     @property
336 |     def default_value(self):
337 |         """ The default category when making a query
338 |         """
339 |         if not hasattr(self, "_default_value"):
340 |             if self.elem_type == "select":
341 |                 try:
342 |                     # Get option marked "selected"
343 |                     def_value = get_option_value(self.elem.select_one("[selected]"))
344 |                 except AttributeError:
345 |                     # ...or if that one doesn't exist get the first option
346 |                     def_value = get_option_value(self.elem.select_one("option"))
347 |
348 |             elif self.elem_type == "checkbox":
349 |                 def_value = self.elem.get("value")
350 |
351 |             elif self.elem_type == "radio":
352 |                 def_value = [x for x in self.elem if x.has_attr("checked")][0].get("value")
353 |
354 |             self._default_value = def_value
355 |
356 |             assert def_value is not None
357 |
358 |         return self._default_value
359 |
360 | class PrintLogger():
361 |     """ Empty "fake" logger
362 |     """
363 |
364 |     def log(self, msg, *args, **kwargs):
365 |         print(msg)
366 |
367 |     def debug(self, msg, *args, **kwargs):
368 |         print(msg)
369 |
370 |     def info(self, msg, *args, **kwargs):
371 |         print(msg)
372 |
373 |     def warning(self, msg, *args, **kwargs):
374 |         print(msg)
375 |
376 |     def error(self, msg, *args, **kwargs):
377 |         print(msg)
378 |
379 |     def critical(self, msg, *args, **kwargs):
380 |         print(msg)
381 |
382 |
383 | # UTILS
384 | class Datatable(object):
385 |     def __init__(self, html):
386 |         self.soup = BeautifulSoup(html, 'html.parser')
387 |         self.data = self._parse_values()
388 |         self._measures = None
389 |         # Assumption: the data table is the last table on the page
390 |
391 |
392 |     @property
393 |     def has_tabs(self):
394 |         """ Does the table have tabs?
395 |             Like http://www.vantetider.se/Kontaktkort/Sveriges/VantatKortareAn60Dagar/
396 |         """
397 |         return len(self.soup.select(".table_switch")) > 0
398 |
399 |     @property
400 |     def has_horizontal_scroll(self):
401 |         """ Does the table have horizontal scroll?
402 |             Like http://www.vantetider.se/Kontaktkort/Sveriges/VantatKortareAn60Dagar/
403 |         """
404 |         return len(self.soup.select(".DTFC_ScrollWrapper")) > 0
405 |
406 |     @property
407 |     def has_vertical_scroll(self):
408 |         """ Does the table have vertical scroll?
409 |             Like http://www.vantetider.se/Kontaktkort/Sveriges/PrimarvardTelefon/
410 |         """
411 |         return bool(self.soup.select_one("#DataTables_Table_0_wrapper"))
412 |
413 |
414 |
415 |     @property
416 |     def measures(self):
417 |         """ Get a list of the measures of this datatable.
418 |             Measures can be "Antal Besök inom 7 dagar",
19 |             "Måluppfyllelse vårdgarantin", etc.
420 |         """
421 |         if self._measures is None:
422 |             self._measures = get_unique([x["measure"] for x in self.data])
423 |
424 |         return self._measures
425 |
426 |     def _parse_values(self):
427 |         """ Get values
428 |         """
429 |         data = []
430 |         if self.has_tabs:
431 |             def _parse_tab_text(tab):
432 |                 # Annoying html in tabs
433 |                 if tab.select_one(".visible_normal"):
434 |                     return tab.select_one(".visible_normal").text
435 |                 else:
436 |                     return tab.text
437 |
438 |             sub_table_ids = [_parse_tab_text(x) for x in self.soup.select(".table_switch li")]
439 |             sub_tables = self.soup.select(".dataTables_wrapper")
440 |             assert len(sub_tables) == len(sub_table_ids)
441 |             assert len(sub_tables) > 0
442 |
443 |             for measure, table in zip(sub_table_ids, sub_tables):
444 |                 if self.has_horizontal_scroll:
445 |                     _data = self._parse_horizontal_scroll_table(table)
446 |                     for region, col, value in _data:
447 |                         data.append({
448 |                             "region_or_unit": region,
449 |                             "select_period": col,  # Hardcode warning!
450 |                             "measure": measure, "value": value,
451 |                         })
452 |
453 |         else:
454 |             if self.has_horizontal_scroll:
455 |                 raise NotImplementedError()
456 |
457 |             if self.has_vertical_scroll:
458 |                 table = self.soup.select_one("#DataTables_Table_0_wrapper")
459 |                 _data = self._parse_vertical_scroll_table(table)
460 |             else:
461 |                 table = self.soup.select(".chart.table.scrolling")[-1]
462 |                 _data = self._parse_regular_table(table)
463 |
464 |             for region, measure, value in _data:
465 |                 data.append({
466 |                     "region_or_unit": region,
467 |                     "measure": measure,
468 |                     "value": value
469 |                 })
470 |
471 |         return data
472 |
473 |     def _parse_horizontal_scroll_table(self, table_html):
474 |         """ Get list of dicts from horizontally scrollable table
475 |         """
476 |         row_labels = [parse_text(x.text) for x in table_html.select(".DTFC_LeftBodyWrapper tbody tr")]
477 |         row_label_ids = [None] * len(row_labels)
478 |         cols = [parse_text(x.text) for x in table_html.select(".dataTables_scrollHead th")]
479 |         value_rows = table_html.select(".dataTables_scrollBody tbody tr")
480 |
481 |         values = []
482 |         for row_i, value_row in enumerate(value_rows):
483 |             row_values = [parse_value(x.text) for x in value_row.select("td")]
484 |             values.append(row_values)
485 |
486 |         sheet = Sheet(zip(row_label_ids, row_labels), cols, values)
487 |
488 |         return sheet.long_format
489 |
490 |     def _parse_vertical_scroll_table(self, table_html):
491 |         value_rows = table_html.select("tbody tr")
492 |         row_labels = [parse_text(x.select_one("td").text) for x in value_rows]
493 |         row_label_ids = [None] * len(row_labels)
494 |         if table_html.select_one("td .clickable"):
495 |             row_label_ids = [parse_landsting(x.select_one("td .clickable").get("onclick")) for x in value_rows]
496 |
497 |         cols = [parse_text(x.text) for x in table_html.select(".dataTables_scrollHead th")][1:]
498 |         values = []
499 |         for row in value_rows:
500 |             row_values = [parse_value(x.text) for x in row.select("td")[1:]]
501 |             values.append(row_values)
502 |
503 |         sheet = Sheet(zip(row_label_ids, row_labels), cols, values)
504 |
505 |         return sheet.long_format
506 |
507 |     def _parse_regular_table(self, table_html):
508 |         value_rows = table_html.select("tbody tr")
509 |         row_labels = [parse_text(x.select_one("td").text) for x in value_rows]
510 |         row_label_ids = [None] * len(row_labels)
511 |         if table_html.select_one("td .clickable"):
512 |             row_label_ids = [parse_landsting(x.select_one("td .clickable").get("onclick")) for x in value_rows]
513 |         cols = [parse_text(x.text) for x in table_html.select("th")][1:]
514 |         values = []
515 |         for row in value_rows:
516 |             row_values = [parse_value(x.text) for x in row.select("td")[1:]]
517 |             values.append(row_values)
518 |
519 |         sheet = Sheet(zip(row_label_ids, row_labels), cols, values)
520 |
521 |         return sheet.long_format
522 |
523 |
524 |
525 | class Sheet(object):
526 |     """ Represents a two-dimensional sheet/table with data
527 |     """
528 |     def __init__(self, rows, cols, values):
529 |         """
530 |         :param rows: a list with row values
531 |         :param cols: a list with column headers
532 |         :param values: a list of lists with row values
533 |         """
534 |         self.values_by_row = values
535 |         self.values = flatten(values)
536 |
537 |         if len(rows) * len(cols) != len(self.values):
538 |             raise ValueError(
539 |                 "Error initing sheet: {} rows x {} cols don't add up; got {} values."
540 |                 .format(len(rows), len(cols), len(self.values)))
541 |
542 |         assert len(rows) == len(values)
543 |         assert len(cols) == len(values[0])
544 |
545 |         self.row_index = rows
546 |         self.col_index = cols
547 |
548 |     @property
549 |     def as_dictlist(self):
550 |         """ Returns a dictlist with values
551 |             [
552 |                 {
553 |                     "row": "row_a",
554 |                     "col": "col_a",
555 |                     "value": 1,
556 |                 }
557 |             ]
558 |         """
559 |         data = []
560 |         for row_i, row in enumerate(self.row_index):
561 |             for col_i, col in enumerate(self.col_index):
562 |                 value = self.values_by_row[row_i][col_i]
563 |                 data.append({
564 |                     "row": row,
565 |                     "col": col,
566 |                     "value": value,
567 |                 })
568 |         return data
569 |
570 |     @property
571 |     def long_format(self):
572 |         return zip(
573 |             repeat(self.row_index, len(self.col_index)),
574 |             self.col_index * len(self.row_index),
575 |             self.values
576 |         )
577 |
578 | def get_unique(l):
579 |     """ Get unique values from list.
580 |     Placed outside the class because `list` conflicts with our internal
581 |     method with the same name.
582 |     """
583 |     return list(set(l))
584 |
585 | def get_elem_type(elem):
586 |     """ Get elem type of soup selection
587 |         :param elem: a soup element
588 |     """
589 |     elem_type = None
590 |     if isinstance(elem, list):
591 |         if elem[0].get("type") == "radio":
592 |             elem_type = "radio"
593 |         else:
594 |             raise ValueError(u"Unknown element type: {}".format(elem))
595 |
596 |     elif elem.name == "select":
597 |         elem_type = "select"
598 |
599 |     elif elem.name == "input":
600 |         elem_type = elem.get("type")
601 |
602 |     else:
603 |         raise ValueError(u"Unknown element type: {}".format(elem))
604 |
605 |     # To be removed
606 |     assert elem_type is not None
607 |
608 |     return elem_type
609 |
610 | def get_option_value(elem):
611 |     """ Get the value attribute, or if it doesn't exist the text
612 |     content.
613 |     <option value="foo">bar</option> => "foo"
614 |     <option>bar</option> => "bar"
615 |     :param elem: a soup element
616 |     """
617 |     value = elem.get("value")
618 |     if value is None:
619 |         value = elem.text.strip()
620 |     if value is None or value == "":
621 |         msg = u"Error parsing value from {}.".format(elem)
622 |         raise ValueError(msg)
623 |
624 |     return value
625 |
626 | def get_option_text(elem):
627 |     """ Get the text of an option.
628 |     <option value="foo">bar</option> => "bar"
629 |     <option>bar</option> => "bar"
630 |     :param elem: a soup element
631 |     """
632 |     return elem.text.strip()
633 |
634 |
635 | def parse_value(val):
636 |     """ Parse values from html
637 |     """
638 |     val = val.replace("%", " ")\
639 |         .replace(" ","")\
640 |         .replace(",", ".")\
641 |         .replace("st","").strip()
642 |
643 |     missing = ["Ejdeltagit", "N/A"]
644 |     if val in missing:
645 |         return val
646 |     elif val == "":
647 |         return None
648 |
649 |     return float(val)
650 |
651 | def parse_text(val):
652 |     """ Format strings fetched from html
653 |     """
654 |     return val.replace("\n", " ").strip()
655 |
656 | def parse_landsting(val):
657 |     """ Get region/unit id from "handle_click_event_landsting(this, 1)"
658 |     """
659 |     try:
660 |         return re.search(r"\(this, (\d+)", val).group(1)
661 |     except AttributeError:
662 |         return None
663 |
664 | def is_string(val):
665 |     return isinstance(val, str)
666 |
667 | def flatten(l):
668 |     """Flatten list of lists
669 |     """
670 |     return [item for sublist in l for item in sublist]
671 |
672 | def repeat(l, n):
673 |     """ Repeat all items in list n times
674 |         repeat([1,2,3], 2) => [1,1,2,2,3,3]
675 |         http://stackoverflow.com/questions/24225072/repeating-elements-of-a-list-n-times
676 |     """
677 |     return [x for x in l for i in range(n)]
678 |
679 | def is_int(s):
680 |     try:
681 |         int(s)
682 |         return True
683 |     except ValueError:
684 |         return False
685 |
686 | from collections import Counter  # needed below; missing from the module header
687 | def guess_measure_unit(values):
688 |     last_words = [x.split(" ")[-1] for x in values]
689 |     counts = Counter(last_words).most_common()
690 |     max_share = float(counts[0][1] / float(len(values)))
691 |     if max_share <= 0.5:
692 |         raise ValueError(u"Not sure how to interpret the measure unit in: {}".format(values))
693 |
694 |     return counts[0][0]
695 |
696 | class RequestException404(RequestException):
697 |     pass
698 |
699 | class RequestException500(RequestException):
700 |     pass
701 |
-------------------------------------------------------------------------------- /statscraper/scrapers/VehicleScraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | import pandas as pd
4 | import json
5 | from statscraper import BaseScraper, Dataset, Dimension, Result
6 |
7 | MONTHS = ['januari', 'februari', 'mars', 'april', 'maj', 'juni',
8 |           'juli', 'augusti', 'september', 'oktober', 'november', 'december']
9 |
10 |
11 | class Vehicles(BaseScraper):
12 |     """Vehicle statistics from Transportstyrelsen.
13 |
14 |     :return: :class:`Vehicles <Vehicles>` object
15 |     :rtype: statscraper.BaseScraper
16 |
17 |     Usage::
18 |
19 |         >>> from statscraper.scrapers import Vehicles
20 |         >>> scraper = Vehicles()
21 |         >>> scraper.items
22 |         # [<Dataset: Vehicles>]
23 |     """
24 |
25 |     BASE_URL = ('https://www.transportstyrelsen.se/globalassets/'
26 |                 'global/press/statistik/fordonsstatistik/{year}/'
27 |                 'fordonsstatistik-{month}-{year}.xlsx')
28 |
29 |     def _clean_data(self, df, year, month):
30 |         df = df.dropna(how='all', axis=1)
31 |         df = df.dropna(how='all', axis=0)
32 |         df = df.drop('Totalsumma', axis=1)
33 |         df = df.rename(columns={'Unnamed: 1': 'vehicle_type'})
34 |         df = df[df['vehicle_type'] != 'Totalsumma']
35 |         df.loc[:, 'year'] = year
36 |         df.loc[:, 'month'] = month
37 |         df = pd.melt(df,
38 |                      id_vars=['vehicle_type', 'month', 'year'],
39 |                      value_vars=['AVREGISTRERAD', 'AVSTÄLLD', 'ITRAFIK'],
40 |                      var_name='status')
41 |         return df
42 |
43 |     def _fetch_itemslist(self, item):
44 |         """There's one dataset spread out in many files."""
45 |         yield Dataset('Vehicles')
46 |
47 |     def _fetch_dimensions(self, dataset):
48 |         yield Dimension('year', datatype='year')
49 |         yield Dimension('month')  # TODO: Convert to datatype month
50 |         yield Dimension('vehicle_type')
51 |         yield Dimension('status')
52 |
53 |     def _fetch_data(self, dataset, query=None):
54 |         files = [(y, m) for y in query['years'] for m in query['months']]
55 |         frames = []
56 |
57 |         # Download and clean every monthly Excel file
58 |         for file in files:
59 |             year, month = file
60 |             url = self.BASE_URL.format(year=year, month=MONTHS[month])
61 |             frame = self._clean_data(pd.read_excel(url), year, month)
62 |             frames.append(frame)
63 |
64 |         # Yield individual rows of type Result from the dataframe
65 |         raw_data = pd.concat(frames)
66 |         for i, row in raw_data.iterrows():
67 |             val = row.pop('value')
68 |             yield Result(val, json.loads(row.to_json()))
69 |
-------------------------------------------------------------------------------- /statscraper/scrapers/__init__.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """ Expose scraper classes here.
3 | """
4 |
5 | from .SCBScraper import SCB
6 | from .PXWebScraper import PXWeb
7 | from .CranesScraper import Cranes
8 | from .VehicleScraper import Vehicles
9 | from .SMHIScraper import SMHI
10 |
-------------------------------------------------------------------------------- /statscraper/scrapers/uka_scraper.py: --------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | u""" A scraper to fetch Swedish university application statistics from
3 | the Swedish Higher Education Authority (Universitetskanslerämbetet, UKÄ),
4 | at http://statistik.uka.se
5 | """
6 | from statscraper import BaseScraper, Dataset, Dimension, Result, Collection
7 | import requests
8 | from bs4 import BeautifulSoup
9 |
10 |
11 | class UKA(BaseScraper):
12 |
13 |     def _fetch_itemslist(self, item):
14 |         """ We only offer regional application stats.
15 |         Other collections are differently structured.
16 |         """
17 |         if item.is_root:
18 |             yield Collection("regional",
19 |                              label="New students by area and school.")
20 |         else:
21 |             yield Dataset("municipality",
22 |                           label="Students by municipality, school, semester.")
23 |
24 |     def _fetch_dimensions(self, dataset):
25 |         """ Iterate through semesters, counties and municipalities.
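        (Illustrative reading of _fetch_data below, stated as an assumption:
        the source database numbers terms so that index 5 is VT 1993 and
        index 6 is HT 1993, i.e. two terms per calendar year from 1993 on.)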
26 | """ 27 | yield Dimension(u"school") 28 | yield Dimension(u"year", 29 | datatype="year") 30 | yield Dimension(u"semester", 31 | datatype="academic_term", 32 | dialect="swedish") # HT/VT 33 | yield Dimension(u"municipality", 34 | datatype="year", 35 | domain="sweden/municipalities") 36 | 37 | def _fetch_data(self, dataset, query): 38 | url = "http://statistik.uka.se/4.5d85793915901d205f935d0f.12.5d85793915901d205f965eab.portlet?action=resultat&view=resultTable&frageTyp=3&frageNr=240&tid=%s&grupp1=%s&grupp2=%s" 39 | thenmap_url = "http://api.thenmap.net/v1/se-7/data/%s?data_props=name|kommunkod" 40 | # 6 is 1993, the first year in the db 41 | if query is None: 42 | query = {} 43 | if "from" not in query: 44 | query['from'] = 1993 45 | if "semesters" not in query: 46 | query['semesters'] = (2016 - query["from"]) * 2 47 | start = (query["from"] - 1993) * 2 + 5 48 | terms = range(start, 49 | start + query["semesters"] + 2) 50 | for t in terms: 51 | # Get all municipalities, and their codes, from this year 52 | year = ((t - 5) / 2) + 1993 53 | semester = ["HT", "VT"][t % 2] 54 | municipalities = requests.get(thenmap_url % year).json() 55 | for id_, municipality_ in municipalities["data"].items(): 56 | municipality = municipality_.pop() 57 | code = municipality["kommunkod"].zfill(4) 58 | c, m = code[:2], code[2:] 59 | html = requests.get(url % (t, c, m)).text 60 | soup = BeautifulSoup(html, 'html.parser') 61 | table = soup.find("table") 62 | # The first rows are headers, the last are empty 63 | rows = table.find_all("tr")[5:-2] 64 | for row in rows: 65 | cells = row.find_all("td") 66 | 67 | yield Result(cells[2].text.strip(), { 68 | "municipality": municipality["name"], 69 | "school": cells[0].text.strip(), 70 | "semester": semester, 71 | "year": year, 72 | }) 73 | -------------------------------------------------------------------------------- /statscraper/scrapers/work_injury_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ A scraper to fetch Swedish work injury stats from 3 | http://webbstat.av.se 4 | 5 | This is an example of a scraper using Selenium. 6 | TODO: Move some useful functionality to a SeleciumFirefoxScraper 7 | 8 | To change download location: 9 | export STATSCRAPER_TEMPDIR="/path/to/temp/dir" 10 | 11 | """ 12 | from selenium import webdriver 13 | from selenium.webdriver.common.keys import Keys 14 | from selenium.webdriver.common.action_chains import ActionChains 15 | from selenium.webdriver.support.wait import WebDriverWait 16 | from statscraper import BaseScraper, Collection, Dataset, Result, Dimension 17 | import os 18 | from glob import iglob 19 | from time import sleep 20 | from uuid import uuid4 21 | from xlrd import open_workbook 22 | from selenium.webdriver.support import expected_conditions as EC 23 | from selenium.webdriver.common.by import By 24 | 25 | DEFAULT_TEMPDIR = "./tmp" 26 | TEMPDIR_ENVVAR = "STATSCRAPER_TEMPDIR" 27 | PAGELOAD_TIMEOUT = 90 # seconds 28 | 29 | 30 | class WorkInjuries(BaseScraper): 31 | 32 | tempdir = "./tmp" 33 | 34 | @BaseScraper.on("init") 35 | def initiate_browser(self): 36 | 37 | # Create a unique tempdir for downloaded files 38 | tempdir = os.getenv(TEMPDIR_ENVVAR, DEFAULT_TEMPDIR) 39 | tempsubdir = uuid4().hex 40 | # TODO: Remove this directory when finished! 
41 |         self.tempdir = os.path.join(tempdir, tempsubdir)
42 |         try:
43 |             # Try and create directory before checking if it exists,
44 |             # to avoid race condition
45 |             os.makedirs(self.tempdir)
46 |         except OSError:
47 |             if not os.path.isdir(self.tempdir):
48 |                 raise
49 |
50 |         profile = webdriver.FirefoxProfile()
51 |         # Set download location, avoid download dialogues if possible
52 |         # Different settings needed for different Firefox versions
53 |         # This will be a long list...
54 |         profile.set_preference('browser.download.folderList', 2)
55 |         profile.set_preference('browser.download.manager.showWhenStarting', False)
56 |         profile.set_preference('browser.download.manager.closeWhenDone', True)
57 |         profile.set_preference('browser.download.dir', self.tempdir)
58 |         profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream;application/vnd.ms-excel")
59 |         profile.set_preference("browser.helperApps.alwaysAsk.force", False)
60 |         profile.set_preference("browser.download.manager.useWindow", False)
61 |
62 |         self.browser = webdriver.Firefox(profile)
63 |
64 |         self.browser.get('http://webbstat.av.se')
65 |         detailed_cls = "Document_TX_GOTOTAB_Avancerad"
66 |         """ The button for expanded detailed options. This
67 |         also happens to be a good indicator as to whether
68 |         all content is loaded.
69 |         """
70 |
71 |         # Wait for a content element, and 3 extra seconds just in case
72 |         WebDriverWait(self.browser, PAGELOAD_TIMEOUT)\
73 |             .until(EC.presence_of_element_located((By.CLASS_NAME,
74 |                                                    detailed_cls)))
75 |         self.browser.implicitly_wait(3)
76 |
77 |         self.browser\
78 |             .find_element_by_class_name(detailed_cls)\
79 |             .find_element_by_tag_name("td")\
80 |             .click()
81 |         # Wait for a content element, and 3 extra seconds just in case
82 |         WebDriverWait(self.browser, PAGELOAD_TIMEOUT)\
83 |             .until(EC.presence_of_element_located((By.CLASS_NAME,
84 |                                                    detailed_cls)))
85 |         self.browser.implicitly_wait(3)
86 |
87 |     @BaseScraper.on("select")
88 |     def switch_dataset(self, id_):
89 |         (c, r, p) = self.current_item.blob
90 |
91 |         # Select collection
92 |         xpath = "//div[@title='%s']" % c
93 |         # `c` can be either "Arbetsolycka" or "Arbetssjukdom"
94 |         button = self.browser.find_element_by_xpath(xpath)
95 |         button.click()
96 |
97 |         # select Kommun or Län
98 |         xpath = '//div[@class="QvContent"]/div[@class="QvGrid"]//div[@title="Visa tabell per:"]'
99 |         self.browser\
100 |             .find_element_by_xpath(xpath)\
101 |             .click()
102 |         region = "Kommun" if r == "kommun" else "Län"
103 |         xpath = "//div[@class='QvListbox']//div[@title='%s']" % region
104 |         self.browser\
105 |             .find_element_by_xpath(xpath)\
106 |             .click()
107 |
108 |         # select Månad or År
109 |         xpath = '//div[@class="QvContent"]/div[@class="QvGrid"]//div[@title="Tidsenhet:"]'
110 |         self.browser\
111 |             .find_element_by_xpath(xpath)\
112 |             .click()
113 |         period = "Månad" if p == u"månad" else "År och månad"
114 |         xpath = "//div[@class='QvListbox']//div[@title='%s']" % period
115 |         self.browser\
116 |             .find_element_by_xpath(xpath)\
117 |             .click()
118 |
119 |     def _fetch_dimensions(self, dataset):
120 |         """ Declaring available dimensions like this is not mandatory,
121 |         but nice, especially if they differ from dataset to dataset.
122 |
123 |         If you are using a built-in datatype, you can specify the dialect
124 |         you are expecting, to have values normalized. This scraper will
125 |         look for Swedish month names (e.g. 'Januari'), but return them
126 |         according to the Statscraper standard ('january').
127 | """ 128 | yield Dimension(u"region", 129 | label="municipality or county", 130 | datatype="region", 131 | dialect="arbetsmiljoverket") 132 | yield Dimension(u"period", 133 | label="Year or month") 134 | 135 | def _fetch_itemslist(self, item): 136 | """ We define two collection: 137 | - Number of work injuries ("Arbetsolycka") 138 | - Number of workrelated diseases ("Arbetssjukdom") 139 | Each contains four datasets: 140 | - Per municipality and year 141 | - Per county and year 142 | - Per municipality and month 143 | - Per municipality and year 144 | """ 145 | if item.is_root: 146 | for c in ["Arbetsolycka", "Arbetssjukdom"]: 147 | yield Collection(c, blob=(c, None, None)) 148 | else: 149 | c = item.id 150 | for r in [u"kommun", u"län"]: 151 | for p in [u"år", u"månad"]: 152 | yield Dataset(u"%s-%s-%s" % (c, r, p), 153 | blob=(c, r, p), 154 | label=u"%s, antal per %s och %s" % (c, r, p)) 155 | 156 | def _fetch_data(self, dataset, query=None): 157 | (c, r, p) = dataset.blob 158 | 159 | self.browser\ 160 | .find_element_by_xpath("//div[@title='Skicka till Excel']")\ 161 | .click() 162 | # Press enter trice in case of any prompts 163 | actions = ActionChains(self.browser) 164 | actions.send_keys(Keys.RETURN) 165 | actions.send_keys(Keys.RETURN) 166 | actions.send_keys(Keys.RETURN) 167 | actions.perform() 168 | # Wait for download 169 | i = 0 170 | while not os.listdir(self.tempdir): 171 | sleep(1) 172 | i += 1 173 | if i > PAGELOAD_TIMEOUT: 174 | # TODO: Use a suitable basescraper exception 175 | raise Exception("Download timed out") 176 | sleep(20) # TODO: We need to check that the file is complete. 177 | # Something like this: 178 | # https://stackoverflow.com/questions/35891393/how-to-get-file-download-complete-status-using-selenium-web-driver-c-sharp#35892347 179 | 180 | # WARNING: Assuming the latest downloaded xls to be our file. 181 | # This is obviously not 100 % water proof. 
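# A sketch of the completeness check mentioned above (assuming
# Firefox, which keeps a "*.part" file next to an unfinished
# download):
#
#     def download_is_complete(directory):
#         files = os.listdir(directory)
#         return bool(files) and not any(f.endswith(".part") for f in files)
#
# Polling that predicate instead of the fixed sleep(20) above would
# be both faster and more robust.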
182 | latest_download = max(iglob(os.path.join(self.tempdir, "*.xls")), 183 | key=os.path.getctime) 184 | workbook = open_workbook(latest_download) 185 | sheet = workbook.sheet_by_index(0) 186 | periods = sheet.row_values(0)[2:-1] 187 | periods = [int(x) for x in periods] 188 | for n in range(1, sheet.nrows): 189 | row = sheet.row_values(n) 190 | region = row.pop(0) 191 | row.pop(0) # empty due to merged cells 192 | if region == "Total": 193 | break 194 | # enumerate() keeps each value paired with its period column 195 | for i, col in enumerate(row[:-1]): 196 | yield Result( 197 | int(col), 198 | { 199 | "region": region, 200 | "period": periods[i], 201 | } 202 | ) 203 | -------------------------------------------------------------------------------- /tests/scrapertests/test_injury_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from unittest import TestCase 4 | from statscraper.scrapers.work_injury_scraper import WorkInjuries 5 | 6 | 7 | class TestInjuries(TestCase): 8 | 9 | def setup_method(self, test_method): 10 | self.scraper = WorkInjuries() 11 | 12 | def test_can_fetch(self): 13 | collection = self.scraper[1] # "Arbetssjukdom" 14 | dataset = collection[3] 15 | data = dataset.data 16 | self.assertTrue(len(data)) 17 | -------------------------------------------------------------------------------- /tests/scrapertests/test_pxweb_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from unittest import TestCase 4 | 5 | from statscraper.scrapers import PXWeb 6 | 7 | 8 | class TestPXWeb(TestCase): 9 | 10 | def test_init_scraper(self): 11 | """Extending the PXWebScraper.""" 12 | pxscraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 13 | self.assertTrue(len(pxscraper.items)) 14 | 15 | def test_navigating_tree(self): 16 | """Navigate the tree.""" 17 | scraper = PXWeb(base_url="http://pxnet2.stat.fi/pxweb/api/v1/sv/StatFin/") 18 | scraper.move_to("tym")\ 19 | .move_to(u"tyonv")\ 20 | .move_to(u"statfin_pxt_tym_tyonv_001.px") 21 | data = scraper.fetch() 22 | self.assertTrue(len(data)) 23 | -------------------------------------------------------------------------------- /tests/scrapertests/test_smhi_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from unittest import TestCase 3 | import pandas as pd 4 | from statscraper.scrapers.SMHIScraper import SMHI, Collection, API, SMHIDataset, Station 5 | 6 | 7 | class TestSMHI(TestCase): 8 | 9 | def setUp(self): 10 | """Setting up scraper.""" 11 | 12 | def test_fetch_api(self): 13 | scraper = SMHI() 14 | apis = scraper.items 15 | self.assertTrue(len(apis) > 0) 16 | api = apis[0] 17 | self.assertIsInstance(api, API) 18 | 19 | self.assertFalse(api.label is None) 20 | self.assertFalse(api.url is None) 21 | self.assertTrue(isinstance(api.json, dict)) 22 | 23 | 24 | 25 | def test_fetch_dataset(self): 26 | u"""Moving to an “API”.""" 27 | scraper = SMHI() 28 | api = scraper.get("Meteorological Observations") 29 | for dataset in api: 30 | self.assertFalse(dataset.label is None) 31 | self.assertFalse(dataset.url is None) 32 | # Make sure it's a dataset 33 | self.assertTrue(isinstance(dataset, SMHIDataset)) 34 | # Get dimensions 35 | self.assertGreater(len(dataset.dimensions), 0) 36 | 37 | def test_fetch_allowed_values(self): 38 | scraper = SMHI() 39 | api = scraper.get("Meteorological Observations") 40 | dataset = api.items[0] 41 | stations = dataset.dimensions["station"].allowed_values 42 |
active_stations = list(dataset.dimensions["station"].active_stations()) 43 | self.assertTrue(len(stations) > 0) 44 | self.assertTrue(len(active_stations) > 0) 45 | 46 | station = dataset.dimensions["station"].allowed_values.get_by_label(u"Växjö A") 47 | self.assertTrue(isinstance(station, Station)) 48 | 49 | self.assertFalse(station.label is None) 50 | 51 | periods = dataset.dimensions["period"].allowed_values 52 | self.assertEqual(len(periods), 4) 53 | 54 | 55 | def test_query(self): 56 | scraper = SMHI() 57 | api = scraper.get("Meteorological Observations") 58 | dataset = api.items[0] 59 | data = dataset.fetch({"station": u"Växjö A", "period": "latest-months"}) 60 | self.assertTrue(len(data) > 0) 61 | 62 | 63 | def test_get_stations_list(self): 64 | scraper = SMHI() 65 | api = scraper.get("Meteorological Observations") 66 | dataset = api.items[0] 67 | stations = dataset.get_stations_list() 68 | self.assertTrue(len(stations) > 0) 69 | for station in stations: 70 | self.assertTrue("longitude" in station) 71 | 72 | active_stations = dataset.get_active_stations_list() 73 | 74 | self.assertTrue(len(active_stations) > 0) 75 | self.assertTrue(len(stations) > len(active_stations)) 76 | 77 | def test_iterate_queries(self): 78 | # Make the same query to multiple datasets 79 | scraper = SMHI() 80 | api = scraper.get("Meteorological Observations") 81 | datasets = [ 82 | u"Nederbördsmängd, summa, 1 gång per månad", 83 | u"Lufttemperatur, medel, 1 gång per månad", 84 | ] 85 | dfs = [] 86 | for dataset_name in datasets: 87 | query = { 88 | "period": ["corrected-archive"], 89 | "station": "Abisko" 90 | } 91 | 92 | res = api.get(dataset_name).fetch(query) 93 | dfs.append(res.pandas) 94 | 95 | # Merge the two resultsets into one dataframe 96 | df = pd.concat(dfs) 97 | 98 | # Make sure that both parameters (datasets) are in 99 | # the final dataframe 100 | parameters = df["parameter"].unique() 101 | self.assertTrue(len(parameters) == 2) 102 | for parameter in parameters: 103 | self.assertTrue(parameter in datasets) 104 | -------------------------------------------------------------------------------- /tests/scrapertests/test_vantetider_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from unittest import TestCase 3 | 4 | from statscraper.scrapers.VantetiderScraper import VantetiderScraper 5 | 6 | 7 | class TestVantetider(TestCase): 8 | 9 | def setUp(self): 10 | self.scraper = VantetiderScraper() 11 | 12 | def test_has_items(self): 13 | """The scraper should find items.""" 14 | self.assertTrue(len(self.scraper.items)) 15 | 16 | 17 | def test_fetch_dataset(self): 18 | u"""Fetch a dataset by id.""" 19 | dataset = self.scraper.get("PrimarvardBesok") 20 | 21 | #self.assertTrue(isinstance(dataset, Dataset)) 22 | self.assertEqual(len(self.scraper.items), 9) 23 | 24 | 25 | def test_fetch_dimensions(self): 26 | u"""Each dataset should have dimensions.""" 27 | for dataset in self.scraper.items: 28 | self.assertGreater(len(dataset.dimensions), 0) 29 | 30 | def test_basic_query(self): 31 | dataset = self.scraper.get("PrimarvardTelefon") 32 | res = dataset.fetch({"region": ["Blekinge"]}) 33 | df = res.pandas 34 | self.assertGreater(df.shape[0], 0) 35 | 36 | def test_multi_period_query(self): 37 | dataset = self.scraper.get("PrimarvardBesok") 38 | res = dataset.fetch({ 39 | "region": ["Stockholm"], 40 | "year": ["2017", "2016"] 41 | }) 42 | df = res.pandas 43 | self.assertGreater(df.shape[0], 0) 44 | --------------------------------------------------------------------------------
/tests/scrapertests/test_vehicle_scraper.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from unittest import TestCase 4 | from statscraper.scrapers import Vehicles 5 | 6 | 7 | class TestVehicles(TestCase): 8 | 9 | def setUp(self): 10 | self.scraper = Vehicles() 11 | 12 | def test_has_items(self): 13 | self.assertTrue(len(self.scraper.items)) 14 | 15 | def test_has_datasets(self): 16 | datasets = self.scraper.items 17 | self.assertTrue(len(datasets)) 18 | 19 | def test_can_fetch(self): 20 | dataset = self.scraper.items[0] 21 | data = dataset.fetch(query={'years': [2017], 'months': [0, 1]}) 22 | self.assertTrue(len(data)) -------------------------------------------------------------------------------- /tests/test-datatypes.py: -------------------------------------------------------------------------------- 1 | """Test datatypes.""" 2 | from statscraper.datatypes import Datatype 3 | from statscraper import Dimension, DimensionValue 4 | 5 | 6 | def test_allowed_values(): 7 | """Datatypes should have allowed values.""" 8 | dt = Datatype("region") 9 | assert("Ale kommun" in dt.allowed_values) 10 | 11 | 12 | def test_b(): 13 | """Dimension values should be translatable.""" 14 | d = Dimension("municipality", datatype="region", domain="sweden/municipalities") 15 | dv = DimensionValue("Ale kommun", d) 16 | assert(dv.translate("numerical") == "1440") 17 | -------------------------------------------------------------------------------- /tests/test-scb.py: -------------------------------------------------------------------------------- 1 | """Test SCB/PXWeb scraper.""" 2 | from statscraper.scrapers import SCB 3 | from statscraper.exceptions import InvalidData 4 | import pytest 5 | 6 | 7 | def test_get_data(): 8 | """We should be able to access a dataset by path.""" 9 | scraper = SCB() 10 | scraper.move_to("HE").move_to("HE0110").move_to("HE0110F").move_to("Tab1DispInkN") 11 | data = scraper.fetch({ 12 | "ContentsCode": ("item", "000002VY"), 13 | "InkomstTyp": ("item", "FastInkIn"), 14 | }, by="municipality") 15 | 16 | assert "Region" in data.dataset.dimensions 17 | assert "InkomstTyp" in data.dataset.dimensions 18 | 19 | df = data.pandas 20 | assert "value" in df.columns 21 | assert "Region" in df.columns 22 | assert "InkomstTyp" in df.columns 23 | 24 | 25 | def test_values(): 26 | """Make sure values are numerical.""" 27 | scraper = SCB() 28 | scraper.move_to("HE").move_to("HE0110").move_to("HE0110F").move_to("Tab1DispInkN") 29 | data = scraper.fetch({ 30 | "ContentsCode": ("item", "000002VY"), 31 | "InkomstTyp": ("item", "FastInkIn"), 32 | }, by="municipality") 33 | assert data[0].value.isnumeric() 34 | 35 | 36 | def test_invalid_query(): 37 | """We should raise an error on invalid queries.""" 38 | scraper = SCB() 39 | scraper.move_to("HE").move_to("HE0110").move_to("HE0110F").move_to("Tab1DispInkN") 40 | with pytest.raises(InvalidData): 41 | scraper.fetch({ 42 | "foo": ("bar", "buzz"), 43 | }, by="municipality") 44 | -------------------------------------------------------------------------------- /tests/test_base_scraper.py: -------------------------------------------------------------------------------- 1 | """Tests for scraper base class.""" 2 | from unittest import TestCase 3 | from statscraper import (BaseScraper, Dataset, Dimension, Result, 4 | DimensionValue, Collection, ROOT, NoSuchItem) 5 | 6 | 7 | class Scraper(BaseScraper): 8 | """A scraper with hardcoded yields.""" 9 | 10 | def _fetch_itemslist(self, item): 11 | yield
Dataset("Dataset_1") 12 | yield Dataset("Dataset_2") 13 | yield Dataset("Dataset_3") 14 | 15 | def _fetch_dimensions(self, dataset): 16 | yield Dimension("date") 17 | 18 | # Assign a label to one of the allowed values 19 | mun = Dimension("municipality", allowed_values=[ 20 | "Umeå kommun", 21 | "Robertsfors kommun"]) 22 | mun.allowed_values["Robertsfors kommun"].label = "Robertsfors kommun" 23 | yield mun 24 | 25 | yield Dimension("gender") 26 | 27 | def _fetch_allowed_values(self, dimension): 28 | if dimension.id == "gender": 29 | yield DimensionValue("male", dimension, label="Men") 30 | yield DimensionValue("female", dimension, label="Women") 31 | 32 | def _fetch_data(self, dataset, query=None): 33 | if dataset.id == "Dataset_1": 34 | yield Result(127, { 35 | "date": "2017-08-10", 36 | "municipality": "Robertsfors kommun", 37 | }) 38 | elif dataset.id == "Dataset_2": 39 | yield Result(12, { 40 | "date": "2017-02-06", 41 | "municipality": "Umeå kommun", 42 | }) 43 | yield Result(130, { 44 | "date": "2017-02-07", 45 | "municipality": "Robertsfors kommun", 46 | }) 47 | 48 | 49 | class NestedScraper(Scraper): 50 | """A scraper with hardcoded yields. 51 | 52 | ROOT - Collection_1 - Dataset_1 53 | - Collection_2 - [Dataset_2, Dataset_3] 54 | """ 55 | 56 | def _fetch_itemslist(self, item): 57 | if item.id == ROOT: 58 | yield Collection("Collection_1") 59 | yield Collection("Collection_2") 60 | elif item.id == "Collection_1": 61 | yield Dataset("Dataset_1") 62 | elif item.id == "Collection_2": 63 | yield Dataset("Dataset_2") 64 | yield Dataset("Dataset_3") 65 | else: 66 | raise Exception("This can not possibly happen.") 67 | 68 | 69 | class CallbackScraper(Scraper): 70 | """A scraper with callbacks.""" 71 | 72 | @BaseScraper.on("init") 73 | def initiation_code(self): 74 | self.initiated = True 75 | 76 | def _fetch_itemslist(self, item): 77 | yield Dataset("Dataset_1") 78 | yield Dataset("Dataset_2") 79 | 80 | 81 | class TestBaseScraper(TestCase): 82 | """Testing base functionality.""" 83 | 84 | def test_init(self): 85 | """Extending the basescraper.""" 86 | scraper = Scraper() 87 | self.assertTrue(scraper.current_item.id == ROOT) 88 | 89 | def test_inspect_item(self): 90 | """Fetching items from an itemlist.""" 91 | scraper = Scraper() 92 | self.assertTrue(scraper.items[0] == scraper.items["Dataset_1"]) 93 | 94 | def test_move_to_item(self): 95 | """Moving the cursor up and down the tree.""" 96 | scraper = Scraper() 97 | scraper.move_to("Dataset_1") 98 | self.assertTrue(isinstance(scraper.current_item, Dataset)) 99 | self.assertTrue(scraper.current_item.id == "Dataset_1") 100 | 101 | scraper.move_up() 102 | scraper.move_to(1) 103 | self.assertTrue(isinstance(scraper.current_item, Dataset)) 104 | self.assertTrue(scraper.current_item.id == "Dataset_2") 105 | 106 | scraper.move_up() 107 | scraper.move_to(scraper.items[2]) 108 | self.assertTrue(isinstance(scraper.current_item, Dataset)) 109 | self.assertTrue(scraper.current_item.id == "Dataset_3") 110 | 111 | def test_chained_move_to(self): 112 | """Use chaining to move.""" 113 | scraper = Scraper() 114 | scraper.move_to("Dataset_1").move_up().move_to("Dataset_2") 115 | self.assertTrue(scraper.current_item.id == "Dataset_2") 116 | 117 | def test_stop_at_root(self): 118 | """Trying to move up from the root should do nothing.""" 119 | scraper = Scraper() 120 | scraper.move_up().move_up().move_up().move_up() 121 | self.assertTrue(scraper.current_item.is_root) 122 | 123 | def test_itemslist_contains(self): 124 | """Make sure 'in' keyword works with 
ItemList.""" 125 | scraper = Scraper() 126 | self.assertTrue("Dataset_1" in scraper.items) 127 | self.assertTrue(scraper.items[0] in scraper.items) 128 | 129 | def test_select_missing_item(self): 130 | """Select an Item by ID that doesn't exist.""" 131 | scraper = Scraper() 132 | with self.assertRaises(NoSuchItem): 133 | scraper.move_to("non_existing_item") 134 | 135 | def test_item_knows_parent(self): 136 | """Make sure an item knows who its parent is.""" 137 | scraper = Scraper() 138 | parent = scraper.current_item 139 | dataset = scraper["Dataset_1"] 140 | scraper.move_to("Dataset_1") 141 | self.assertTrue(scraper.parent.id == dataset.parent.id == 142 | scraper.current_item.parent.id == parent.id) 143 | 144 | def test_fetch_dataset(self): 145 | """Query a dataset for some data.""" 146 | scraper = Scraper() 147 | dataset = scraper[0] 148 | self.assertEqual(dataset.data[0]["municipality"], "Robertsfors kommun") 149 | 150 | def test_unselected_visible_dataset(self): 151 | """Query a dataset not selected, but visible.""" 152 | scraper = Scraper() 153 | dataset = scraper["Dataset_1"] 154 | scraper.move_to("Dataset_2") 155 | self.assertEqual(dataset.data[0]["municipality"], "Robertsfors kommun") 156 | 157 | def test_cached_data(self): 158 | """Query a dataset not selected but cached.""" 159 | scraper = Scraper() 160 | data_1 = scraper["Dataset_1"].data 161 | scraper.move_up().move_to("Dataset_2") 162 | self.assertEqual(data_1[0]["municipality"], "Robertsfors kommun") 163 | 164 | def test_get_dimension(self): 165 | """Get dimensions for a dataset.""" 166 | scraper = Scraper() 167 | dataset = scraper[0] 168 | self.assertTrue(len(dataset.dimensions)) 169 | self.assertTrue(isinstance(dataset.dimensions[0], Dimension)) 170 | 171 | dim = dataset.dimensions["municipality"] 172 | self.assertTrue(isinstance(dim, Dimension)) 173 | 174 | dim = dataset.dimensions.get("municipality") 175 | self.assertTrue(isinstance(dim, Dimension)) 176 | 177 | def test_select_allowed_values(self): 178 | """List allowed values from dimension.""" 179 | scraper = Scraper() 180 | dataset = scraper[0] 181 | 182 | municipality = dataset.dimensions["municipality"] 183 | self.assertTrue("Robertsfors kommun" in municipality.allowed_values) 184 | 185 | allowed_value = municipality.allowed_values["Robertsfors kommun"] 186 | self.assertEqual(allowed_value, "Robertsfors kommun") 187 | 188 | # We also want to be able to fetch allowed values by label 189 | allowed_value_by_label = municipality.allowed_values.get_by_label("Robertsfors kommun") 190 | self.assertEqual(allowed_value, allowed_value_by_label) 191 | 192 | gender = dataset.dimensions["gender"] 193 | self.assertEqual(len(gender.allowed_values), 2) 194 | 195 | # Get an allowed value by key 196 | female = gender.allowed_values["female"] 197 | 198 | # Get an allowed value by label 199 | female_by_label = gender.allowed_values.get_by_label("Women") 200 | 201 | # The two methods above should fetch the same item 202 | self.assertEqual(female, female_by_label) 203 | self.assertEqual(female.id, "gender") 204 | self.assertEqual(female.value, "female") 205 | self.assertEqual(female.label, "Women") 206 | 207 | def test_move_deep_manually(self): 208 | """Use the NestedScraper to move more than one step.""" 209 | scraper = NestedScraper() 210 | scraper.move_to("Collection_1") 211 | self.assertTrue("Dataset_1" in scraper.items) 212 | 213 | scraper.move_to("Dataset_1") 214 | self.assertEqual("Dataset_1", scraper.current_item) 215 | self.assertTrue(len(scraper.current_item.data)) 216 | 217 | 
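# (move_to_top() takes the cursor all the way back to ROOT before descending again)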
scraper.move_to_top().move_to("Collection_2") 218 | self.assertTrue("Dataset_2" in scraper.items) 219 | self.assertTrue("Dataset_3" in scraper.items) 220 | 221 | scraper.move_up().move_to("Collection_1") 222 | self.assertTrue("Dataset_1" in scraper.items) 223 | 224 | def test_move_deep_automatically(self): 225 | """Use the NestedScraper to move more than one step, 226 | and make sure the cursor follows along as needed.""" 227 | scraper = NestedScraper() 228 | 229 | collection_2 = scraper.items["Collection_2"] 230 | self.assertTrue(len(collection_2.items)) 231 | 232 | scraper.move_to_top() 233 | dataset_1 = scraper["Collection_1"]["Dataset_1"] 234 | self.assertTrue(len(dataset_1.data)) 235 | 236 | dataset_2 = collection_2["Dataset_2"] 237 | self.assertTrue(len(dataset_2.data)) 238 | 239 | self.assertTrue(len(dataset_1.data)) 240 | 241 | def test_callbacks(self): 242 | """Extending the basescraper.""" 243 | scraper = CallbackScraper() 244 | self.assertTrue(scraper.initiated) 245 | -------------------------------------------------------------------------------- /tests/test_dialects.py: -------------------------------------------------------------------------------- 1 | """Tests related to the concept of certain datatypes having values with dialects.""" 2 | from unittest import TestCase 3 | from statscraper import (BaseScraper, Dataset, Result, Dimension, DimensionValue) 4 | 5 | 6 | class Scraper(BaseScraper): 7 | """A scraper with hardcoded yields.""" 8 | 9 | def _fetch_itemslist(self, item): 10 | yield Dataset("Dataset_1") 11 | 12 | def _fetch_dimensions(self, dataset): 13 | yield Dimension(u"municipality", datatype="region") 14 | 15 | def _fetch_data(self, dataset, query=None): 16 | yield Result(127, { 17 | "municipality": "Robertsfors kommun", 18 | }) 19 | yield Result(17, { 20 | "municipality": "Region Gotland", 21 | }) 22 | 23 | 24 | class TestDialects(TestCase): 25 | """Test translated values.""" 26 | 27 | def test_translations(self): 28 | """Test standalone translation.""" 29 | municipalities = Dimension("municipality", 30 | datatype="region", domain="sweden/municipalities") 31 | municipality = DimensionValue("Stockholms kommun", municipalities) 32 | assert municipality.translate("numerical") == "180" 33 | 34 | def test_dialects(self): 35 | """Test translation inside a scraper.""" 36 | scraper = Scraper() 37 | data1 = scraper.items[0].data 38 | self.assertEqual(str(data1[0]["municipality"]), "Robertsfors kommun") 39 | 40 | data2 = data1.translate("scb") 41 | self.assertEqual(str(data2[0]["municipality"]), "2409 Robertsfors kommun") 42 | -------------------------------------------------------------------------------- /tests/test_resultset.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from statscraper import Result, ResultSet 4 | from pandas.api import types as ptypes 5 | 6 | 7 | class TestResultSet(TestCase): 8 | 9 | def test_pandas_export(self): 10 | """Get results as pandas dataframe.""" 11 | result = ResultSet() 12 | result.append(Result(45483, {'city': "Voi"})) 13 | df = result.pandas 14 | self.assertTrue(ptypes.is_numeric_dtype(df.value)) 15 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | name = "statscraper" 4 | 5 | short_version = "2.0.2" 6 | long_version = short_version 7 | 8 | short_desc = """\ 9 | A base class for building web 
scrapers for statistical data.\ 10 | """ 11 | authors = u"Jens Finnäs and Leo Wallentin, J++; Robin Linderborg" 12 | year = date.today().year 13 | copyright = "%s, %s" % (year, authors) 14 | email = "stockholm@jplusplus.org" 15 | 16 | version = long_version 17 | --------------------------------------------------------------------------------