├── .all-contributorsrc ├── .bumpversion.cfg ├── .github └── workflows │ ├── pytest.yml │ └── release.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .ruff.toml ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── docs └── dataset.md ├── environment.yml ├── examples ├── analyse_PV_data_for_9th_Aug_2019.ipynb ├── analyse_metadata_for_UK.ipynb ├── animate_PV_yield_map.ipynb ├── compute_grid_points_for_UK.ipynb ├── download_pv_timeseries.ipynb ├── get_all_rss_systems.py ├── get_all_systems_in_region.ipynb ├── get_metadata.ipynb ├── query_API_for_all_UK_grid_points.ipynb └── quick_start.ipynb ├── infrastructure └── docker │ ├── Dockerfile_dev │ └── Dockerfile_prod ├── pvoutput ├── __init__.py ├── consts.py ├── daterange.py ├── exceptions.py ├── grid_search │ ├── __init__.py │ ├── app.py │ ├── clip.py │ ├── grid_search.py │ └── natural_earth.py ├── mapscraper.py ├── prcoess.py ├── pvoutput.py └── utils.py ├── requirements.txt ├── scripts ├── fetch_pv_timeseries.py └── scrape_country_codes.py ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── data ├── create_mapscraper_test_files.py ├── create_test_hdf.py ├── mapscraper_dict_of_dfs.pickle ├── mapscraper_soup.pickle └── test.hdf ├── test_daterange.py ├── test_grid_search.py ├── test_mapscraper.py ├── test_process.py ├── test_pvoutput.py └── test_utils.py /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "JackKelly", 10 | "name": "Jack Kelly", 11 | "avatar_url": "https://avatars.githubusercontent.com/u/460756?v=4", 12 | "profile": "http://jack-kelly.com", 13 | "contributions": [ 14 | "code" 15 | ] 16 | }, 17 | { 18 | "login": "ssmssam", 19 | "name": "Sam Murphy-Sugrue", 20 | "avatar_url": "https://avatars.githubusercontent.com/u/39378848?v=4", 21 | "profile": "https://github.com/ssmssam", 22 | "contributions": [ 23 | "code" 24 | ] 25 | }, 26 | { 27 | "login": "gabrieltseng", 28 | "name": "Gabriel Tseng", 29 | "avatar_url": "https://avatars.githubusercontent.com/u/29063740?v=4", 30 | "profile": "https://gabrieltseng.github.io/", 31 | "contributions": [ 32 | "code" 33 | ] 34 | }, 35 | { 36 | "login": "JamieTaylor-TUOS", 37 | "name": "Jamie Taylor", 38 | "avatar_url": "https://avatars.githubusercontent.com/u/12187350?v=4", 39 | "profile": "http://www.solar.sheffield.ac.uk/", 40 | "contributions": [ 41 | "code" 42 | ] 43 | }, 44 | { 45 | "login": "peterdudfield", 46 | "name": "Peter Dudfield", 47 | "avatar_url": "https://avatars.githubusercontent.com/u/34686298?v=4", 48 | "profile": "https://github.com/peterdudfield", 49 | "contributions": [ 50 | "infra" 51 | ] 52 | }, 53 | { 54 | "login": "vnshanmukh", 55 | "name": "Shanmukh Chava", 56 | "avatar_url": "https://avatars.githubusercontent.com/u/67438038?v=4", 57 | "profile": "https://github.com/vnshanmukh", 58 | "contributions": [ 59 | "code" 60 | ] 61 | }, 62 | { 63 | "login": "Antsthebul", 64 | "name": "Antsthebul", 65 | "avatar_url": "https://avatars.githubusercontent.com/u/56587872?v=4", 66 | "profile": "https://github.com/Antsthebul", 67 | "contributions": [ 68 | "code" 69 | ] 70 | }, 71 | { 72 | "login": "rachtsingh", 73 | "name": "Rachit Singh", 74 | "avatar_url": "https://avatars.githubusercontent.com/u/1606892?v=4", 75 | "profile": "http://www.rachitsingh.com", 76 | "contributions": [ 77 | "data", 78 | "code" 79 | ] 80 | }, 81 | { 82 | "login": "devsjc", 83 | "name": "devsjc", 84 | 
"avatar_url": "https://avatars.githubusercontent.com/u/47188100?v=4", 85 | "profile": "https://github.com/devsjc", 86 | "contributions": [ 87 | "code" 88 | ] 89 | } 90 | ], 91 | "contributorsPerLine": 7, 92 | "projectName": "pvoutput", 93 | "projectOwner": "openclimatefix", 94 | "repoType": "github", 95 | "repoHost": "https://github.com", 96 | "skipCi": true, 97 | "commitConvention": "angular" 98 | } 99 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | commit = True 3 | tag = True 4 | current_version = 0.1.33 5 | message = Bump version: {current_version} → {new_version} [skip ci] 6 | 7 | [bumpversion:file:setup.py] 8 | search = version="{current_version}" 9 | replace = version="{new_version}" 10 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | jobs: 5 | call-run-python-tests: 6 | uses: openclimatefix/.github/.github/workflows/python-test.yml@main 7 | with: 8 | # pytest-cov looks at this folder 9 | pytest_cov_dir: "pvoutput" 10 | # these packages are installed. They are needed for 'cartopy' 11 | sudo_apt_install: "libgeos-dev libproj-dev" 12 | # these packages are installed. They are needed for 'cartopy' 13 | brew_install: "c-blosc hdf5 geos proj" 14 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Bump version and auto-release 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | call-run-python-release: 8 | uses: openclimatefix/.github/.github/workflows/python-release.yml@v1.7.2 9 | secrets: 10 | token: ${{ secrets.PYPI_API_TOKEN }} 11 | PAT_TOKEN: ${{ secrets.PAT_TOKEN }} 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # JetBrains 84 | .idea 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | # Project-specific 130 | .pvoutput.yml 131 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v5.0.0 7 | hooks: 8 | # list of supported hooks: https://pre-commit.com/hooks.html 9 | - id: trailing-whitespace 10 | - id: end-of-file-fixer 11 | - id: debug-statements 12 | - id: detect-private-key 13 | 14 | # python code formatting/linting 15 | - repo: https://github.com/astral-sh/ruff-pre-commit 16 | # Ruff version. 17 | rev: "v0.11.5" 18 | hooks: 19 | - id: ruff 20 | args: [--fix] 21 | - repo: https://github.com/psf/black 22 | rev: 25.1.0 23 | hooks: 24 | - id: black 25 | args: [--line-length, "100"] 26 | # yaml formatting 27 | - repo: https://github.com/pre-commit/mirrors-prettier 28 | rev: v4.0.0-alpha.8 29 | hooks: 30 | - id: prettier 31 | types: [yaml] 32 | -------------------------------------------------------------------------------- /.ruff.toml: -------------------------------------------------------------------------------- 1 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 2 | select = ["E", "F", "D", "I"] 3 | ignore = ["D200","D202","D210","D212","D415","D105",] 4 | 5 | # Allow autofix for all enabled rules (when `--fix`) is provided. 6 | fixable = ["A", "B", "C", "D", "E", "F", "I"] 7 | unfixable = [] 8 | 9 | # Exclude a variety of commonly ignored directories. 
10 | exclude = [ 11 | ".bzr", 12 | ".direnv", 13 | ".eggs", 14 | ".git", 15 | ".hg", 16 | ".mypy_cache", 17 | ".nox", 18 | ".pants.d", 19 | ".pytype", 20 | ".ruff_cache", 21 | ".svn", 22 | ".tox", 23 | ".venv", 24 | "__pypackages__", 25 | "_build", 26 | "buck-out", 27 | "build", 28 | "dist", 29 | "node_modules", 30 | "venv", 31 | "tests", 32 | ] 33 | 34 | # Same as Black. 35 | line-length = 100 36 | 37 | # Allow unused variables when underscore-prefixed. 38 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 39 | 40 | # Assume Python 3.10. 41 | target-version = "py310" 42 | fix = false 43 | # Group violations by containing file. 44 | format = "github" 45 | ignore-init-module-imports = true 46 | 47 | [mccabe] 48 | # Unlike Flake8, default to a complexity level of 10. 49 | max-complexity = 10 50 | 51 | [pydocstyle] 52 | # Use Google-style docstrings. 53 | convention = "google" 54 | 55 | [per-file-ignores] 56 | "__init__.py" = ["F401", "E402"] 57 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial # required for Python >= 3.7 2 | language: python 3 | python: 3.7 4 | install: pip install -e git+https://github.com/openclimatefix/pvoutput#egg=pvoutput 5 | script: py.test 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2019 Open Climate Fix 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![All Contributors](https://img.shields.io/badge/all_contributors-9-orange.svg?style=flat-square)](#contributors-) 3 | 4 | 5 | [![codecov](https://codecov.io/gh/openclimatefix/pvoutput/branch/main/graph/badge.svg?token=GTQDR2ZZ2S)](https://codecov.io/gh/openclimatefix/pvoutput) 6 | 7 | Download historical solar photovoltaic data from [PVOutput.org](https://pvoutput.org). 8 | 9 | This code is a work-in-progress. The aim is to provide both a Python library for interacting with [PVOutput.org's API](https://pvoutput.org/help.html#api), and a set of scripts for downloading lots of data :) 10 | 11 | # Installation 12 | 13 | ```bash 14 | $ pip install pvoutput-ocf 15 | ``` 16 | 17 | ## Register with PVOutput.org 18 | 19 | You need to get an API key *and* a system ID from PVOutput.org. 20 | 21 | If you don't own a physical PV system, click the "energy consumption only" box 22 | when registering on PVOutput. If you don't include a 23 | system ID, then you'll get a "401 Unauthorized" response from the PVOutput API. 24 | 25 | You can pass the API key and system ID into the `PVOutput` constructor. 26 | Or, create a `~/.pvoutput.yml` file which looks like: 27 | 28 | ```yaml 29 | api_key: 30 | system_id: 31 | ``` 32 | 33 | The default location of the `.pvoutput.yml` is the user's home directory, expanded from `~`. This can be overridden by setting the `PVOUTPUT_CONFIG` environment variable. 34 | 35 | e.g. `export PVOUTPUT_CONFIG="/my/preferred/location/.pvoutput.yml"` 36 | 37 | Alternatively, you can set `API_KEY`, `SYSTEM_ID` and `DATA_SERVICE_URL` (see below) as environmental variables. 38 | 39 | ### API quotas and paid subscriptions 40 | Please see [here](https://pvoutput.org/help/data_services.html) for update info. 41 | 42 | #### Free 43 | 44 | For free, PVOutput.org gives you 60 API requests per hour. In a single API request you can download one day of data for one PV system. (See PVOutput's docs for more info about [rate limits](https://pvoutput.org/help/api_specification.html#rate-limits).) 45 | 46 | #### Donate 47 | [Donating to PVOutput.org](https://pvoutput.org/help/donations.html#donations) increases your API quota to 300 requests per hour. 48 | 49 | #### Paid 50 | To get more historical data, you can pay $800 Australian dollars for a year's 'Live System History' subscription for a single country ([more info here](https://pvoutput.org/help/data_services.html). And [here's PVOutput.org's full price list](https://pvoutput.org/services.jsp)). 51 | This allows you to use the [`get batch status`](https://pvoutput.org/help/data_services.html#get-batch-status-service) API to download 900 PV-system-*years* per hour. 52 | 53 | If you have subscribed to PVOutput's data service then either 54 | - add `data_service_url` to your configuration file (`~/.pvoutput.yml`) or 55 | - pass `data_service_url` to the `PVOutput` constructor. 
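For reference, here is a minimal sketch of how these settings can be passed in from Python. The keyword arguments and the `get_status` call follow the Quick Start notebook and may change between versions, and the system ID and date are arbitrary examples, so treat this as illustrative rather than definitive:

```python
from pvoutput import PVOutput

# api_key and system_id can instead come from ~/.pvoutput.yml or from the
# API_KEY / SYSTEM_ID environment variables described above.
pv = PVOutput(
    api_key="YOUR_API_KEY",        # placeholder, not a real key
    system_id="YOUR_SYSTEM_ID",    # placeholder, not a real system ID
    data_service_url="https://pvoutput.org/",  # only needed with a paid data-service subscription
)

# Download one day of data for one PV system (an arbitrary example ID).
# Each call like this consumes one API request from your hourly quota.
timeseries = pv.get_status(pv_system_id=10033, date="2019-08-09")
print(timeseries.head())
```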
56 | 57 | The `data_service_url` should end in `.org` (note the `data_service_url` doesn't include the `/service/r2` part of the URL) 58 | For example: `data_service_url: https://pvoutput.org/` 59 | 60 | 61 | ## Install pvoutput Python library 62 | 63 | ```bash 64 | pip install -e git+https://github.com/openclimatefix/pvoutput.git@main#egg=pvoutput-ocf 65 | ``` 66 | 67 | # Usage 68 | 69 | See the [Quick Start notebook](examples/quick_start.ipynb). 70 | 71 | ## Contributors ✨ 72 | 73 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 |
Jack Kelly
Jack Kelly

💻
Sam Murphy-Sugrue
Sam Murphy-Sugrue

💻
Gabriel Tseng
Gabriel Tseng

💻
Jamie Taylor
Jamie Taylor

💻
Peter Dudfield
Peter Dudfield

🚇
Shanmukh Chava
Shanmukh Chava

💻
Antsthebul
Antsthebul

💻
Rachit Singh
Rachit Singh

🔣 💻
devsjc
devsjc

💻
95 | 96 | 97 | 98 | 99 | 100 | 101 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 102 | -------------------------------------------------------------------------------- /docs/dataset.md: -------------------------------------------------------------------------------- 1 | ## `UK_PV_timeseries.hdf` 2 | 3 | ### `missing_dates` table 4 | 5 | Sometimes we query PVOutput.api for a particular date and PV system ID, and PVOutput.org returns no data. The `missing_dates` table records these pairs of PV system IDs and dates, so we don't retry these missing dates (and hence chew through our API quota!) 6 | 7 | Columns: 8 | 9 | - `pv_system_id`: index column, integer 10 | - `missing_start_date_PV_localtime` and `missing_end_date_PV_localtime`: The start and end of the date range of missing dates for this system ID. `pd.HDFStore` doesn't support `date` columns, so these are actual `pd.Timestamp` objects. 11 | - `datetime_of_API_request`: For data retrieved on or after 2019-08-06, this contains the UTC datetime of the API request. For data retrieved between 2019-08-05 and 2019-08-06, this has been manually backfilled with '2019-08-05 00:00'. For data retrieved before 2019-08-05, this columns contains `NaT` - these rows should be treated with some suspicion, because my data retrieval code may have been malformatting the date string for the PVOutput.org API, and hence may contain some 'missing dates' which aren't actually missing! A tell-tale might be if there are duplicated rows. 12 | 13 | ### `metadata` table 14 | 15 | ### `timeseries/` tables 16 | 17 | Columns: 18 | - `datetime`: index column, pd.DatetimeIndex, [localtime to the PV system](https://forum.pvoutput.org/t/clarification-are-date-times-in-local-or-utc/570/2). 19 | - `datetime_of_API_request`: The datetime at which we sent the API request. Will be `NaT` for data retrieved before about 2019-08-06 13:00 UTC. 20 | - `query_date`: The date (in localtime to the PV system) used in the query to the PVOutput.org API. Will be `NaT` for data retrieved before about 2019-08-06 13:00 UTC. 21 | - ... 
other columns contain data from PVOutput.org 22 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pvoutput 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python>=3.7 6 | - pytest 7 | - pyyaml 8 | - pytables 9 | - pandas 10 | - matplotlib 11 | - jupyterlab 12 | - urllib3 13 | - requests 14 | - beautifulsoup4 15 | -------------------------------------------------------------------------------- /examples/get_all_rss_systems.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/examples/get_all_rss_systems.py -------------------------------------------------------------------------------- /examples/get_metadata.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 51, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pandas as pd\n", 11 | "import time\n", 12 | "from datetime import datetime\n", 13 | "from pvoutput import *" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 23, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "INPUT_PV_LIST_FILENAME = '/home/jack/data/pvoutput.org/processed/UK_PV_listing_metadata.hdf'\n", 23 | "OUTPUT_METADATA_FILENAME = '/home/jack/data/pvoutput.org/processed/UK_PV_metadata.csv'\n", 24 | "PV_STATS_FILENAME = '/home/jack/data/pvoutput.org/processed/UK_PV_stats.csv'" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "pv_systems = pd.read_hdf(INPUT_PV_LIST_FILENAME, 'metadata')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 42, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | "
Array TiltInverterLocationOrientationPanelsShadesystem_capacitysystem_nameOutputsGenerationAverageEfficiencysystem_capacity_kwefficiency_kWh_per_kWsystem_1_tilt_degreessystem_2_tilt_degrees
system_id
2696511.0°Enphase M215United Kingdom CV47South 0.0°336x250W Q Cells Pro-G3 250Low84.000kWmfl_phc1813 days399.238MWh220.208kWh2.654kWh/kW84.002.65411.0NaN
24768NaNaurora trioUnited Kingdom OX7South 180.0°1050x240W qidongNo252.000kWmfl_qfa446 days307.029MWh688.405kWh2.732kWh/kW252.002.732NaNNaN
1154215.0°SMA TRI-Power 1700 TLUnited Kingdom PE11South 180.0°200x250W Emmvee ES 230 M60 BNo50.000kWWray Farms Solar System2437 days293.684MWh120.510kWh2.477kWh/kW50.002.47715.0NaN
6699140.0°UnknownUnited Kingdom HR8South 180.0° / South West 225.0°152x325W + 80x325W UnknownNo75.400kWLedbury Community Hospital1434 days279.902MWh195.190kWh2.603kWh/kW75.402.60340.0NaN
511615.0°Fronius CL36; Datamanager 2.0United Kingdom OX7South 180.0°182x235W Q.Base-G2 235No42.770kWmfl_scf2538 days267.470MWh105.386kWh2.493kWh/kW42.772.49315.0NaN
\n", 197 | "
" 198 | ], 199 | "text/plain": [ 200 | " Array Tilt Inverter Location \\\n", 201 | "system_id \n", 202 | "26965 11.0° Enphase M215 United Kingdom CV47 \n", 203 | "24768 NaN aurora trio United Kingdom OX7 \n", 204 | "11542 15.0° SMA TRI-Power 1700 TL United Kingdom PE11 \n", 205 | "66991 40.0° Unknown United Kingdom HR8 \n", 206 | "5116 15.0° Fronius CL36; Datamanager 2.0 United Kingdom OX7 \n", 207 | "\n", 208 | " Orientation Panels \\\n", 209 | "system_id \n", 210 | "26965 South 0.0° 336x250W Q Cells Pro-G3 250 \n", 211 | "24768 South 180.0° 1050x240W qidong \n", 212 | "11542 South 180.0° 200x250W Emmvee ES 230 M60 B \n", 213 | "66991 South 180.0° / South West 225.0° 152x325W + 80x325W Unknown \n", 214 | "5116 South 180.0° 182x235W Q.Base-G2 235 \n", 215 | "\n", 216 | " Shade system_capacity system_name Outputs \\\n", 217 | "system_id \n", 218 | "26965 Low 84.000kW mfl_phc 1813 days \n", 219 | "24768 No 252.000kW mfl_qfa 446 days \n", 220 | "11542 No 50.000kW Wray Farms Solar System 2437 days \n", 221 | "66991 No 75.400kW Ledbury Community Hospital 1434 days \n", 222 | "5116 No 42.770kW mfl_scf 2538 days \n", 223 | "\n", 224 | " Generation Average Efficiency system_capacity_kw \\\n", 225 | "system_id \n", 226 | "26965 399.238MWh 220.208kWh 2.654kWh/kW 84.00 \n", 227 | "24768 307.029MWh 688.405kWh 2.732kWh/kW 252.00 \n", 228 | "11542 293.684MWh 120.510kWh 2.477kWh/kW 50.00 \n", 229 | "66991 279.902MWh 195.190kWh 2.603kWh/kW 75.40 \n", 230 | "5116 267.470MWh 105.386kWh 2.493kWh/kW 42.77 \n", 231 | "\n", 232 | " efficiency_kWh_per_kW system_1_tilt_degrees system_2_tilt_degrees \n", 233 | "system_id \n", 234 | "26965 2.654 11.0 NaN \n", 235 | "24768 2.732 NaN NaN \n", 236 | "11542 2.477 15.0 NaN \n", 237 | "66991 2.603 40.0 NaN \n", 238 | "5116 2.493 15.0 NaN " 239 | ] 240 | }, 241 | "execution_count": 42, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "pv_systems.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Retrieve metadata using get_pv_metadata" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 68, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Get list of systems we got from the PVOutput.org API search\n", 264 | "pv_sys_api_search = pd.read_csv(\n", 265 | " '/home/jack/data/pvoutput.org/raw/uk_pv_systems.csv',\n", 266 | " index_col='system_id',\n", 267 | " usecols=['system_id'])" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 69, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "2559 systems already processed.\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "if os.path.exists(OUTPUT_METADATA_FILENAME):\n", 285 | " output_metadata = pd.read_csv(OUTPUT_METADATA_FILENAME, index_col='system_id', usecols=['system_id'])\n", 286 | " systems_already_processed = output_metadata.index\n", 287 | " header = False\n", 288 | "else:\n", 289 | " systems_already_processed = []\n", 290 | " header = True\n", 291 | " \n", 292 | "print(len(systems_already_processed), 'systems already processed.')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 73, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "1471 PV systems left to process.\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "pv_systems_to_process = list(\n", 310 | " 
(set(pv_systems.index).union(pv_sys_api_search.index)) - \n", 311 | " set(systems_already_processed))\n", 312 | "print(len(pv_systems_to_process), 'PV systems left to process.')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 67, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | " 0 of 1971 | ID = 26572\n", 325 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 326 | "Waiting 54 minutes...\n", 327 | "Done waiting! Retrying...\n", 328 | " 300 of 1971 | ID = 3074\n", 329 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 330 | "Waiting 56 minutes...\n", 331 | "Done waiting! Retrying...\n", 332 | " 600 of 1971 | ID = 4185\n", 333 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 334 | "Waiting 57 minutes...\n", 335 | "Done waiting! Retrying...\n", 336 | " 900 of 1971 | ID = 37689\n", 337 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 338 | "Waiting 56 minutes...\n", 339 | "Done waiting! Retrying...\n", 340 | " 1200 of 1971 | ID = 30248\n", 341 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 342 | "Waiting 57 minutes...\n", 343 | "Done waiting! Retrying...\n", 344 | " 1500 of 1971 | ID = 6555\n", 345 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 346 | "Waiting 56 minutes...\n", 347 | "Done waiting! Retrying...\n", 348 | " 1800 of 1971 | ID = 40277\n", 349 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 350 | "Waiting 57 minutes...\n", 351 | "Done waiting! Retrying...\n", 352 | " 1970 of 1971 | ID = 57336" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "n = len(pv_systems_to_process)\n", 358 | "for i, pv_system_id in enumerate(pv_systems_to_process):\n", 359 | " print('\\r', '{:>4d}'.format(i), 'of', n, '| ID =', pv_system_id, end='', flush=True)\n", 360 | " pv_metadata = get_pv_metadata(pv_system_id).to_frame().T.set_index('system_id')\n", 361 | " pv_metadata.to_csv(\n", 362 | " OUTPUT_METADATA_FILENAME,\n", 363 | " mode='a',\n", 364 | " header=header)\n", 365 | " header = False" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 95, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "system_id,system_name,system_size_watts,postcode,number_of_panels,panel_power_watts,panel_brand,num_inverters,inverter_power_watts,inverter_brand,orientation,array_tilt_degrees,shade,install_date,latitude,longitude,status_interval_minutes,number_of_panels_secondary,panel_power_watts_secondary,orientation_secondary,array_tilt_degrees_secondary\n", 378 | "19397,AndyT's,3900,BS22,20,195,ZN Shine 195w,1,3500,Kaco 4202,S,30.0,No,2011-11-21 00:00:00,51.36,-2.92,5,0,0,,\n", 379 | "8195,Kilmarnock Roof,3750,KA3,15,250,Sanyo 250 HIT,1,4000,Omniksol,S,30.0,No,2011-11-07 00:00:00,55.64,-4.49,10,0,0,,\n", 380 | "8200,Flat 5,3430,E8,14,245,,1,3000,sb3000,S,25.0,Low,2011-12-12 00:00:00,51.54,-0.06,5,0,0,,\n", 381 | "8204,Sooper-Dooper Solar,2940,GU2,12,245,SunTech STP245S-20/Wd,1,3000,Kaco Powador 3002,S,19.0,No,2012-05-11 00:00:00,51.24,-0.59,10,0,0,,\n", 382 | "8205,58GPR,4000,BS48,16,250,Sanyo component Europe GmbH,1,50,Sma sunny boysb3800v,S,,No,2011-11-10 00:00:00,51.42,-2.74,10,0,0,,\n", 383 | "32783,olaf-UK,3780,B92,14,270,canadian solar CS6P-270MM,1,3600,SMA Sunny Boy 
3600TL,S,25.0,No,2014-10-15 00:00:00,52.43,-1.77,5,0,0,,\n", 384 | "8208,48 St Saviours,4000,PR5,16,250,Sharp,1,4000,SMA,SW,35.0,No,2012-02-21 00:00:00,53.73,-2.65,10,0,0,,\n", 385 | "40978,Sma 2Kw,2000,CF31,8,250,Hyundai,1,2000,Sma 2000HF,S,30.0,No,2011-12-01 00:00:00,51.5,-3.57,5,0,0,,\n", 386 | "24599,LongfellowPV,3750,MK16,15,250,Yingli YL250C-30b,1,3600,Fronius IG TL 3.6,SW,30.0,Low,2013-05-28 00:00:00,52.083376,-0.729613,5,0,0,,\n" 387 | ] 388 | } 389 | ], 390 | "source": [ 391 | "!head $OUTPUT_METADATA_FILENAME" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "## Retrieve metadata using get_statistics" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# get stats\n", 408 | "if os.path.exists(PV_STATS_FILENAME):\n", 409 | " header = False\n", 410 | " stats_processed = pd.read_csv(PV_STATS_FILENAME, index_col='system_id', usecols=['system_id'])\n", 411 | "else:\n", 412 | " stats_processed = []\n", 413 | " header = True\n", 414 | " \n", 415 | "pv_systems_to_get_stats = set(pv_systems_filtered.index).union(pv_sys_api_search.index)\n", 416 | "\n", 417 | "print(len(pv_systems_to_get_stats), 'total PV systems')\n", 418 | "print(len(stats_processed), 'system IDs already loaded')\n", 419 | "stats_to_process = set(pv_systems_to_get_stats) - set(stats_processed.index)\n", 420 | "# re-order\n", 421 | "# stats_to_process = pd.Series(1, index=stats_to_process).reindex(pv_systems_filtered.index).dropna().index\n", 422 | "print(len(stats_to_process), 'system IDs to load')\n", 423 | " \n", 424 | "for i, system_id in enumerate(stats_to_process):\n", 425 | " print('\\r', i, system_id, end=' ', flush=True)\n", 426 | " try:\n", 427 | " pv_stats = get_pv_statistic(system_id)\n", 428 | " except NoStatusFound:\n", 429 | " print('No status found for', system_id)\n", 430 | " # Create a blank row\n", 431 | " index = ['system_id'] + list(range(1, 11))\n", 432 | " pv_stats = pd.Series(\n", 433 | " [system_id] + ([''] * 10),\n", 434 | " index=index)\n", 435 | " pv_stats['system_id'] = int(pv_stats['system_id'])\n", 436 | " pv_stats['stats_downloaded_on_utc'] = datetime.utcnow()\n", 437 | " pv_stats = pv_stats.to_frame().T.set_index('system_id')\n", 438 | " with open(PV_STATS_FILENAME, mode='a') as fh:\n", 439 | " pv_stats.to_csv(fh, header=header)\n", 440 | " header = False" 441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python 3", 447 | "language": "python", 448 | "name": "python3" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.7.3" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 2 465 | } 466 | -------------------------------------------------------------------------------- /infrastructure/docker/Dockerfile_dev: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /pvoutput 4 | 5 | RUN apt-get -qq update \ 6 | && apt-get -qq install -y --no-install-recommends \ 7 | git \ 8 | curl \ 9 | git \ 10 | wget \ 11 | libproj-dev \ 12 | proj-data \ 13 | proj-bin \ 14 | libgeos-dev \ 15 | libgdal-dev \ 16 | python3-gdal \ 17 | gdal-bin \ 18 | && apt-get autoclean && apt-get autoremove \ 19 | > /dev/null 
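# Example of building and running this development image from the repository
# root; the image tag "pvoutput-dev" is just an illustrative name:
#   docker build -f infrastructure/docker/Dockerfile_dev -t pvoutput-dev .
#   docker run --rm -it -p 1234:1234 -v "$(pwd)":/pvoutput pvoutput-dev
# The Jupyter server started by the CMD at the end of this file is then
# reachable on http://localhost:1234.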
20 | 21 | COPY requirements.txt /pvoutput/requirements.txt 22 | 23 | RUN pip install -U pip && pip install --no-cache-dir -r /pvoutput/requirements.txt > /dev/null 24 | 25 | EXPOSE 1234 26 | 27 | CMD ["jupyter", "notebook", "--allow-root", "--ip", "0.0.0.0", "--port", "1234", "--no-browser"] 28 | -------------------------------------------------------------------------------- /infrastructure/docker/Dockerfile_prod: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /pvoutput 4 | 5 | RUN pip install -e git+https://github.com/openclimatefix/pvoutput#egg=pvoutput > /dev/null 6 | 7 | COPY examples/ /pvoutput/examples/ 8 | 9 | EXPOSE 1234 10 | 11 | CMD ["jupyter", "notebook", "--allow-root", "--ip", "0.0.0.0", "--port", "1234", "--no-browser", "/pvoutput/examples/quick_start.ipynb"] 12 | -------------------------------------------------------------------------------- /pvoutput/__init__.py: -------------------------------------------------------------------------------- 1 | """Init PVoutput library""" 2 | 3 | from .pvoutput import * # noqa 4 | 5 | __version__ = 0.1 6 | -------------------------------------------------------------------------------- /pvoutput/consts.py: -------------------------------------------------------------------------------- 1 | """Constants used in this repo""" 2 | 3 | import os 4 | from datetime import timedelta 5 | from urllib.parse import urljoin 6 | 7 | BASE_URL = "https://pvoutput.org" 8 | MAP_URL = urljoin(BASE_URL, "map.jsp") 9 | REGIONS_URL = urljoin(BASE_URL, "region.jsp") 10 | 11 | # Country codes used by PVOutput.org on, for example, 12 | # https://pvoutput.org/map.jsp. Taken from 13 | # https://pvoutput.org/help.html#api-addsystem. 14 | 15 | PV_OUTPUT_COUNTRY_CODES = { 16 | "New South Wales": 1, 17 | "Afghanistan": 2, 18 | "Akrotiri": 3, 19 | "Albania": 4, 20 | "Algeria": 5, 21 | "American Samoa": 6, 22 | "Andorra": 7, 23 | "Angola": 8, 24 | "Anguilla": 9, 25 | "Antarctica": 10, 26 | "Antigua and Barbuda": 11, 27 | "Arctic Ocean": 12, 28 | "Argentina": 13, 29 | "Armenia": 14, 30 | "Aruba": 15, 31 | "Ashmore and Cartier Islands": 16, 32 | "Atlantic Ocean": 17, 33 | "Austria": 18, 34 | "Azerbaijan": 19, 35 | "Bahamas, The": 20, 36 | "Bahrain": 21, 37 | "Bangladesh": 22, 38 | "Barbados": 23, 39 | "Belarus": 24, 40 | "Belgium": 25, 41 | "Belize": 26, 42 | "Benin": 27, 43 | "Bermuda": 28, 44 | "Bhutan": 29, 45 | "Bolivia": 30, 46 | "Bosnia and Herzegovina": 31, 47 | "Botswana": 32, 48 | "Bouvet Island": 33, 49 | "Brazil": 34, 50 | "British Indian Ocean Territory": 35, 51 | "British Virgin Islands": 36, 52 | "Brunei": 37, 53 | "Bulgaria": 38, 54 | "Burkina Faso": 39, 55 | "Burma": 40, 56 | "Burundi": 41, 57 | "Cambodia": 42, 58 | "Cameroon": 43, 59 | "Canada": 44, 60 | "Cape Verde": 45, 61 | "Cayman Islands": 46, 62 | "Central African Republic": 47, 63 | "Chad": 48, 64 | "Chile": 49, 65 | "China": 50, 66 | "Christmas Island": 51, 67 | "Clipperton Island": 52, 68 | "Cocos (Keeling) Islands": 53, 69 | "Colombia": 54, 70 | "Comoros": 55, 71 | "Congo, Democratic Republic of the": 56, 72 | "Congo, Republic of the": 57, 73 | "Cook Islands": 58, 74 | "Coral Sea Islands": 59, 75 | "Costa Rica": 60, 76 | "Cote d'Ivoire": 61, 77 | "Croatia": 62, 78 | "Cuba": 63, 79 | "Curacao": 64, 80 | "Cyprus": 65, 81 | "Czech Republic": 66, 82 | "Denmark": 67, 83 | "Dhekelia": 68, 84 | "Djibouti": 69, 85 | "Dominica": 70, 86 | "Dominican Republic": 71, 87 | "Ecuador": 72, 88 | "Egypt": 73, 89 | "El Salvador": 74, 90 | 
"Equatorial Guinea": 75, 91 | "Eritrea": 76, 92 | "Estonia": 77, 93 | "Ethiopia": 78, 94 | "Falkland Islands": 79, 95 | "Faroe Islands": 80, 96 | "Fiji": 81, 97 | "Finland": 82, 98 | "France": 83, 99 | "French Polynesia": 84, 100 | "French Southern and Antarctic Lands": 85, 101 | "Gabon": 86, 102 | "Gambia, The": 87, 103 | "Gaza Strip": 88, 104 | "Georgia": 89, 105 | "Germany": 90, 106 | "Ghana": 91, 107 | "Gibraltar": 92, 108 | "Greece": 93, 109 | "Greenland": 94, 110 | "Grenada": 95, 111 | "Guam": 96, 112 | "Guatemala": 97, 113 | "Guernsey": 98, 114 | "Guinea": 99, 115 | "Guinea-Bissau": 100, 116 | "Guyana": 101, 117 | "Haiti": 102, 118 | "Heard Island and McDonald Islands": 103, 119 | "Holy See (Vatican City)": 104, 120 | "Honduras": 105, 121 | "Hong Kong": 106, 122 | "Hungary": 107, 123 | "Iceland": 108, 124 | "India": 109, 125 | "Indian Ocean": 110, 126 | "Indonesia": 111, 127 | "Iran": 112, 128 | "Iraq": 113, 129 | "Ireland": 114, 130 | "Isle of Man": 115, 131 | "Israel": 116, 132 | "Italy": 117, 133 | "Jamaica": 118, 134 | "Jan Mayen": 119, 135 | "Japan": 120, 136 | "Jersey": 121, 137 | "Jordan": 122, 138 | "Kazakhstan": 123, 139 | "Kenya": 124, 140 | "Kiribati": 125, 141 | "Korea, North": 126, 142 | "Korea, South": 127, 143 | "Kosovo": 128, 144 | "Kuwait": 129, 145 | "Kyrgyzstan": 130, 146 | "Laos": 131, 147 | "Latvia": 132, 148 | "Lebanon": 133, 149 | "Lesotho": 134, 150 | "Liberia": 135, 151 | "Libya": 136, 152 | "Liechtenstein": 137, 153 | "Lithuania": 138, 154 | "Luxembourg": 139, 155 | "Macau": 140, 156 | "Macedonia": 141, 157 | "Madagascar": 142, 158 | "Malawi": 143, 159 | "Malaysia": 144, 160 | "Maldives": 145, 161 | "Mali": 146, 162 | "Malta": 147, 163 | "Marshall Islands": 148, 164 | "Mauritania": 149, 165 | "Mauritius": 150, 166 | "Mayotte": 151, 167 | "Mexico": 152, 168 | "Micronesia": 153, 169 | "Moldova": 154, 170 | "Monaco": 155, 171 | "Mongolia": 156, 172 | "Montenegro": 157, 173 | "Montserrat": 158, 174 | "Morocco": 159, 175 | "Mozambique": 160, 176 | "Namibia": 161, 177 | "Nauru": 162, 178 | "Navassa Island": 163, 179 | "Nepal": 164, 180 | "Netherlands": 165, 181 | "New Caledonia": 166, 182 | "New Zealand": 167, 183 | "Nicaragua": 168, 184 | "Niger": 169, 185 | "Nigeria": 170, 186 | "Niue": 171, 187 | "Norfolk Island": 172, 188 | "Northern Mariana Islands": 173, 189 | "Norway": 174, 190 | "Oman": 175, 191 | "Pakistan": 176, 192 | "Palau": 177, 193 | "Panama": 178, 194 | "Papua New Guinea": 179, 195 | "Paracel Islands": 180, 196 | "Paraguay": 181, 197 | "Peru": 182, 198 | "Philippines": 183, 199 | "Pitcairn Islands": 184, 200 | "Poland": 185, 201 | "Portugal": 186, 202 | "Puerto Rico": 187, 203 | "Qatar": 188, 204 | "Romania": 189, 205 | "Russia": 190, 206 | "Rwanda": 191, 207 | "Saint Barthelemy": 192, 208 | "Saint Helena, Ascension, and Tristan da Cunha": 193, 209 | "Saint Kitts and Nevis": 194, 210 | "Saint Lucia": 195, 211 | "Saint Martin": 196, 212 | "Saint Pierre and Miquelon": 197, 213 | "Saint Vincent and the Grenadines": 198, 214 | "Samoa": 199, 215 | "San Marino": 200, 216 | "Sao Tome and Principe": 201, 217 | "Saudi Arabia": 202, 218 | "Senegal": 203, 219 | "Serbia": 204, 220 | "Seychelles": 205, 221 | "Sierra Leone": 206, 222 | "Singapore": 207, 223 | "Sint Maarten": 208, 224 | "Slovakia": 209, 225 | "Slovenia": 210, 226 | "Solomon Islands": 211, 227 | "Somalia": 212, 228 | "South Africa": 213, 229 | "South Georgia and South Sandwich Is.": 214, 230 | "Southern Ocean": 215, 231 | "Spain": 216, 232 | "Spratly Islands": 217, 233 | "Sri Lanka": 218, 234 | 
"Sudan": 219, 235 | "Suriname": 220, 236 | "Svalbard": 221, 237 | "Swaziland": 222, 238 | "Sweden": 223, 239 | "Switzerland": 224, 240 | "Syria": 225, 241 | "Taiwan": 226, 242 | "Tajikistan": 227, 243 | "Tanzania": 228, 244 | "Thailand": 229, 245 | "Timor-Leste": 230, 246 | "Togo": 231, 247 | "Tokelau": 232, 248 | "Tonga": 233, 249 | "Trinidad and Tobago": 234, 250 | "Tunisia": 235, 251 | "Turkey": 236, 252 | "Turkmenistan": 237, 253 | "Turks and Caicos Islands": 238, 254 | "Tuvalu": 239, 255 | "Uganda": 240, 256 | "Ukraine": 241, 257 | "United Arab Emirates": 242, 258 | "United Kingdom": 243, 259 | "United States": 244, 260 | "Uruguay": 245, 261 | "Uzbekistan": 246, 262 | "Vanuatu": 247, 263 | "Venezuela": 248, 264 | "Vietnam": 249, 265 | "Virgin Islands": 250, 266 | "Wake Island": 251, 267 | "Wallis and Futuna": 252, 268 | "West Bank": 253, 269 | "Western Sahara": 254, 270 | "Yemen": 255, 271 | "Zambia": 256, 272 | "Zimbabwe": 257, 273 | } 274 | 275 | PV_OUTPUT_MAP_COLUMN_NAMES = { 276 | "timeseries_duration": "c", 277 | "average_generation_per_day": "avg", 278 | "efficiency": "gss", 279 | "power_generation": "atg", 280 | "capacity": "ss", 281 | "address": "o", 282 | "name": "sn", 283 | } 284 | 285 | 286 | ONE_DAY = timedelta(days=1) 287 | 288 | PV_OUTPUT_DATE_FORMAT = "%Y%m%d" 289 | CONFIG_FILENAME = os.environ.get("PVOUTPUT_CONFIG", os.path.expanduser("~/.pvoutput.yml")) 290 | RATE_LIMIT_PARAMS_TO_API_HEADERS = { 291 | "rate_limit_remaining": "X-Rate-Limit-Remaining", 292 | "rate_limit_total": "X-Rate-Limit-Limit", 293 | "rate_limit_reset_time": "X-Rate-Limit-Reset", 294 | } 295 | -------------------------------------------------------------------------------- /pvoutput/daterange.py: -------------------------------------------------------------------------------- 1 | """Date Range Class""" 2 | 3 | from dataclasses import dataclass 4 | from datetime import date, datetime, timedelta 5 | from typing import Iterable, List, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | @dataclass 12 | class DateRange: 13 | """Date Range Class""" 14 | 15 | start_date: date 16 | end_date: date 17 | 18 | def __init__(self, start_date, end_date): 19 | """Init""" 20 | self.start_date = safe_convert_to_date(start_date) 21 | self.end_date = safe_convert_to_date(end_date) 22 | 23 | def intersection(self, other): 24 | """ 25 | Get intersection of this DateRange and other 26 | """ 27 | new_start = max(self.start_date, other.start_date) 28 | new_end = min(self.end_date, other.end_date) 29 | if new_start < new_end: 30 | return DateRange(new_start, new_end) 31 | 32 | def date_range(self) -> np.array: 33 | """ 34 | Make date range 35 | """ 36 | return pd.date_range(self.start_date, self.end_date, freq="D").date 37 | 38 | def total_days(self) -> int: 39 | """ 40 | Find the total number of days 41 | """ 42 | return ( 43 | np.timedelta64(self.end_date - self.start_date) 44 | .astype("timedelta64[D]") 45 | .astype(np.float32) 46 | ) 47 | 48 | def split_into_years(self) -> List: 49 | """ 50 | Split start and end dates into list of years 51 | 52 | """ 53 | duration = self.end_date - self.start_date 54 | num_years = duration / timedelta(days=365) 55 | if num_years <= 1: 56 | return [self] 57 | else: 58 | end_date = self.end_date 59 | new_date_ranges = [] 60 | for year_back in range(np.ceil(num_years).astype(int)): 61 | start_date = end_date - timedelta(days=365) 62 | if start_date < self.start_date: 63 | start_date = self.start_date 64 | new_date_ranges.append(DateRange(start_date, end_date)) 65 | 
end_date = start_date 66 | return new_date_ranges 67 | 68 | 69 | def get_date_range_list(dates: Iterable[date]) -> List[DateRange]: 70 | """ 71 | Get data range lists for dates 72 | 73 | Args: 74 | dates: list of dates 75 | 76 | Returns: list of date ranges 77 | """ 78 | if not dates: 79 | return [] 80 | dates = np.array(dates) 81 | dates = np.sort(dates) 82 | dates_diff = np.diff(dates) 83 | location_of_gaps = np.where(dates_diff > timedelta(days=1))[0] 84 | index_of_last_date = len(dates) - 1 85 | location_of_gaps = list(location_of_gaps) 86 | location_of_gaps.append(index_of_last_date) 87 | 88 | start_i = 0 89 | date_range_list = [] 90 | for end_i in location_of_gaps: 91 | date_range = DateRange(start_date=dates[start_i], end_date=dates[end_i]) 92 | date_range_list.append(date_range) 93 | start_i = end_i + 1 94 | 95 | return date_range_list 96 | 97 | 98 | def safe_convert_to_date(dt: Union[datetime, date, str]) -> date: 99 | """ 100 | Convert datetime to date 101 | 102 | Args: 103 | dt: datetime, date or string 104 | 105 | Returns: date 106 | """ 107 | if isinstance(dt, str): 108 | dt = pd.Timestamp(dt) 109 | if isinstance(dt, datetime): 110 | return dt.date() 111 | if isinstance(dt, date): 112 | return dt 113 | 114 | 115 | def merge_date_ranges_to_years(date_ranges: Iterable[DateRange]) -> List[DateRange]: 116 | """ 117 | Merge date ranges to years 118 | 119 | Args: 120 | date_ranges: List of DateRanges, in ascending chronological order. 121 | 122 | Returns: 123 | List of DateRanges, each representing a year, in descending order. 124 | """ 125 | if not date_ranges: 126 | return [] 127 | 128 | # Split multi-year date ranges 129 | date_ranges_split = [] 130 | for date_range in date_ranges[::-1]: 131 | date_ranges_split.extend(date_range.split_into_years()) 132 | 133 | years_to_download = [] 134 | for date_range in date_ranges_split: 135 | if years_to_download: 136 | intersection = date_range.intersection(years_to_download[-1]) 137 | if intersection == date_range: 138 | # date_range falls within the last year to retrieve, 139 | # so we can ignore this date_range 140 | continue 141 | elif intersection is None: 142 | # No overlap 143 | date_to = date_range.end_date 144 | else: 145 | # Overlap 146 | date_to = intersection.start_date 147 | 148 | else: 149 | date_to = date_range.end_date 150 | 151 | date_from = date_to - timedelta(days=365) 152 | years_to_download.append(DateRange(date_from, date_to)) 153 | 154 | return years_to_download 155 | -------------------------------------------------------------------------------- /pvoutput/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom Exeception classes""" 2 | 3 | import requests 4 | 5 | 6 | class BadStatusCode(Exception): 7 | """Bad status code excepction""" 8 | 9 | def __init__(self, response: requests.Response, message: str = ""): 10 | """Init""" 11 | self.response = response 12 | super(BadStatusCode, self).__init__(message) 13 | 14 | def __str__(self) -> str: 15 | """String method""" 16 | string = super(BadStatusCode, self).__str__() 17 | string += "Status code: {}; ".format(self.response.status_code) 18 | string += "Response content: {}; ".format(self.response.content) 19 | string += "Response headers: {}; ".format(self.response.headers) 20 | return string 21 | 22 | 23 | class NoStatusFound(BadStatusCode): 24 | """Exeception for when no status code is found""" 25 | 26 | pass 27 | 28 | 29 | class RateLimitExceeded(BadStatusCode): 30 | """Class for rate limit is exceeded""" 31 | 32 | 
pass 33 | -------------------------------------------------------------------------------- /pvoutput/grid_search/__init__.py: -------------------------------------------------------------------------------- 1 | """Init for grid saerch module""" 2 | 3 | from .grid_search import GridSearch 4 | -------------------------------------------------------------------------------- /pvoutput/grid_search/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate gridded lat/lon coordinates that can be used for fixed radius searches across a region. 3 | 4 | Provides both an importable method and a CLI. 5 | 6 | .. code:: console 7 | 8 | $ python app.py -h 9 | 10 | - Jamie Taylor 11 | - First Authored: 2021-11-16 12 | """ 13 | 14 | import argparse 15 | import logging 16 | import os 17 | import sys 18 | from typing import Optional 19 | 20 | from pvoutput.grid_search.grid_search import GridSearch 21 | from pvoutput.grid_search.natural_earth import NaturalEarth 22 | 23 | 24 | def parse_options(): 25 | """Parse command line options.""" 26 | parser = argparse.ArgumentParser( 27 | description=("This is a command line interface (CLI) for " "the grid_search module."), 28 | epilog="Jamie Taylor, 2021-11-16", 29 | ) 30 | parser.add_argument( 31 | "--bbox", 32 | metavar="", 33 | dest="bbox", 34 | action="store", 35 | type=str, 36 | required=False, 37 | default=None, 38 | help="Specify a bounding box to search. Can be used in conjunction with " "--countries", 39 | ) 40 | parser.add_argument( 41 | "--countries", 42 | metavar="[,[,...]]", 43 | dest="countries", 44 | action="store", 45 | type=str, 46 | required=False, 47 | default=None, 48 | help="Specify a list of countries, searching only grid points that fall " 49 | "within these countries' boundaries. Specify one or more countries, " 50 | "separated by commas (default is all). Country names must match those " 51 | "used in the Natural Earth dataset (HINT: run this code with the " 52 | "--list-countries option to list them). This option can be used in " 53 | "conjunction with --bbox, in which case the search will only include " 54 | "grid points within both the bounding box and the countries list.", 55 | ) 56 | parser.add_argument( 57 | "--radial-clip", 58 | metavar="", 59 | dest="radial_clip", 60 | action="store", 61 | type=str, 62 | required=False, 63 | default=None, 64 | help="Specify a radius to clip to. Can be used in conjunction with --bbox " 65 | "and --countries. Pass the latitude, longitude and radius as a " 66 | "comma-separated string. Radius should be in km.", 67 | ) 68 | parser.add_argument( 69 | "--list-countries", 70 | dest="list_countries", 71 | action="store_true", 72 | required=False, 73 | help="List the country names that can be used for the " "--countries option.", 74 | ) 75 | parser.add_argument( 76 | "--buffer", 77 | metavar="", 78 | dest="buffer", 79 | action="store", 80 | type=float, 81 | required=False, 82 | default=0.0, 83 | help="Specify a buffer/tolerance for including grid points i.e. include " 84 | "grid points that fall within kilometers of the target " 85 | "boundary. 
Default is 0km.", 86 | ) 87 | parser.add_argument( 88 | "--search-radius", 89 | metavar="", 90 | dest="search_radius", 91 | action="store", 92 | type=float, 93 | required=False, 94 | default=25.0, 95 | help="Specify the radial search limit around each grid point in " "kilometers.", 96 | ) 97 | parser.add_argument( 98 | "--local-crs-epsg", 99 | metavar="", 100 | dest="local_crs_epsg", 101 | action="store", 102 | type=int, 103 | required=False, 104 | default=4087, 105 | help="Optionally provide the EPSG code of a local co-ordinate Reference " 106 | "System (CRS) for improved accuracy. e.g. set to 27700 (OSGB36 / " 107 | "British National Grid) if searching the British Isles.", 108 | ) 109 | parser.add_argument( 110 | "--cache-dir", 111 | metavar="", 112 | dest="cache_dir", 113 | action="store", 114 | type=str, 115 | required=False, 116 | default=None, 117 | help="Specify a directory to use for caching downloaded boundary files.", 118 | ) 119 | parser.add_argument( 120 | "--show", 121 | dest="show", 122 | action="store_true", 123 | required=False, 124 | help="Set this flag to show a plot of the grid.", 125 | ) 126 | parser.add_argument( 127 | "-o", 128 | "--outfile", 129 | metavar="", 130 | dest="outfile", 131 | action="store", 132 | type=str, 133 | required=False, 134 | help="Specify a filename to save the grid to.", 135 | ) 136 | options = parser.parse_args() 137 | 138 | def handle_options(options): 139 | """Validate command line args and pre-process where necessary.""" 140 | if options.bbox is not None: 141 | options.bbox = list(map(lambda x: float(x.strip()), options.bbox.split(","))) 142 | if options.radial_clip is not None: 143 | options.radial_clip = list( 144 | map(lambda x: float(x.strip()), options.radial_clip.split(",")) 145 | ) 146 | if options.cache_dir is not None: 147 | if not os.path.isdir(options.cache_dir): 148 | logging.error(f"The cache_dir '{options.cache_dir}' does not exist.") 149 | raise ValueError(f"The cache_dir '{options.cache_dir}' does not exist.") 150 | if options.countries: 151 | options.countries = list(map(lambda x: str(x.strip()), options.countries.split(","))) 152 | earth = NaturalEarth(options.cache_dir) 153 | _, countries = earth.get_hires_world_boundaries() 154 | for country in options.countries: 155 | if country not in countries: 156 | logging.error(f"The country '{country}' is invalid.") 157 | raise ValueError(f"The country '{country}' is invalid.") 158 | if options.outfile is not None and os.path.exists(options.outfile): 159 | check = query_yes_no( 160 | f"The output file '{options.outfile}' already exists, results " 161 | "will be overwritten, do you want to continue?", 162 | "no", 163 | ) 164 | if check is False: 165 | print("Quitting...") 166 | sys.exit(0) 167 | return options 168 | 169 | return handle_options(options) 170 | 171 | 172 | def main(): 173 | """Run main app""" 174 | options = parse_options() 175 | grd = GridSearch(cache_dir=options.cache_dir) 176 | if options.list_countries: 177 | grd.nat_earth.list_countries() 178 | sys.exit() 179 | grd.generate_grid( 180 | bbox=options.bbox, 181 | countries=options.countries, 182 | radial_clip=options.radial_clip, 183 | buffer=options.buffer, 184 | search_radius=options.search_radius, 185 | local_crs_epsg=options.local_crs_epsg, 186 | save_to=options.outfile, 187 | show=options.show, 188 | ) 189 | 190 | 191 | if __name__ == "__main__": 192 | fmt = "%(asctime)s [%(levelname)s] - %(message)s (%(filename)s:%(funcName)s)" 193 | datefmt = "%Y-%m-%dT%H:%M:%SZ" 194 | logging.basicConfig(format=fmt, 
datefmt=datefmt, level=os.environ.get("LOGLEVEL", "INFO")) 195 | main() 196 | 197 | 198 | def query_yes_no(question: str, default: Optional[str] = "yes") -> bool: 199 | """Ask a yes/no question via input() and return the answer as boolean. 200 | 201 | Args: 202 | question: 203 | The question presented to the user. 204 | default: 205 | The presumed answer if the user just hits . It must be "yes" (the default), "no" 206 | or None (meaning an answer is required of the user). 207 | 208 | Returns: 209 | Return value is True for "yes" or False for "no". 210 | """ 211 | valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} 212 | if default is None: 213 | prompt = " [y/n] " 214 | elif default == "yes": 215 | prompt = " [Y/n] " 216 | elif default == "no": 217 | prompt = " [y/N] " 218 | else: 219 | raise ValueError("invalid default answer: '%s'" % default) 220 | while True: 221 | sys.stdout.write(question + prompt) 222 | choice = input().lower() 223 | if default is not None and choice == "": 224 | return valid[default] 225 | elif choice in valid: 226 | return valid[choice] 227 | else: 228 | sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n") 229 | -------------------------------------------------------------------------------- /pvoutput/grid_search/clip.py: -------------------------------------------------------------------------------- 1 | """Clipping function for coordinates""" 2 | 3 | from typing import Iterable, Optional, Tuple, Union 4 | 5 | import geopandas as gpd 6 | import pandas as pd 7 | from shapely.geometry import Point, Polygon 8 | 9 | 10 | def clip_to_radius( 11 | coords: Union[pd.DataFrame, gpd.GeoDataFrame], 12 | latitude: float, 13 | longitude: float, 14 | radius: Optional[float] = None, 15 | search_radius: Optional[float] = None, 16 | local_crs_epsg: int = 4087, 17 | ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: 18 | """Clip coordinates to a radius. 19 | 20 | Remove any coordinates which do not lie within x. 21 | 22 | Args: 23 | coords: 24 | A pandas DataFrame or geopandas GeoDataFrame of coordinates with columns: latitude, 25 | longitude. 26 | latitude: 27 | Latitude of the center of the radial search. 28 | longitude: 29 | Longitude of the center of the radial search. 30 | radius: 31 | Set the radial search limit in km. 32 | search_radius: 33 | Optionally set the radial search limit around each grid point in kilometers. If set, the 34 | code will consider coords to be included if any part of the search radius overlaps the 35 | outter radius. 36 | local_crs_epsg: 37 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 38 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 39 | British Isles. 40 | 41 | Returns: 42 | As per `coords` but containing only the subset of the input coordinates which fall within 43 | `radius` km of the lat/lon. 
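    Typical usage example (illustrative values; assumes `coords` is a GeoDataFrame
    of point geometries with its CRS set):
        nearby = clip_to_radius(coords, latitude=52.0, longitude=-1.5, radius=50)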
44 | """ 45 | center = Point(longitude, latitude) 46 | radius_ = ( 47 | gpd.GeoSeries(center) 48 | .set_crs("EPSG:4326") 49 | .to_crs(f"EPSG:{local_crs_epsg}") 50 | .buffer(radius * 1000.0)[0] 51 | ) 52 | if search_radius is None: 53 | coords["selected"] = coords.within(radius_) 54 | else: 55 | coords_ = coords.to_crs(f"EPSG:{local_crs_epsg}").buffer(search_radius * 1000.0) 56 | coords["selected"] = coords_.intersects(radius_) 57 | return coords.loc[coords["selected"]].drop(columns="selected") 58 | 59 | 60 | def bounding_box_from_radius( 61 | latitude: float, longitude: float, radius: float, local_crs_epsg: int = 4087 62 | ) -> Tuple[float]: 63 | """Convert a radial search around a given lat/lon to a bounding box. 64 | 65 | Args: 66 | latitude: 67 | Latitude of the center of the radial search. 68 | longitude: 69 | Longitude of the center of the radial search. 70 | radius: 71 | Set the radial search limit in km. 72 | local_crs_epsg: 73 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 74 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 75 | British Isles. 76 | 77 | Returns: 78 | Four element tuple defining a bounding box: (min_lat, min_lon, max_lat, max_lon). 79 | """ 80 | center = Point(longitude, latitude) 81 | search_radius = ( 82 | gpd.GeoSeries(center) 83 | .set_crs("EPSG:4326") 84 | .to_crs(f"EPSG:{local_crs_epsg}") 85 | .buffer(radius * 1000.0) 86 | ) 87 | bounds = search_radius.to_crs("EPSG:4326").bounds.loc[0].to_numpy() 88 | return bounds[1], bounds[0], bounds[3], bounds[2] 89 | 90 | 91 | def buffer_bounding_box_bounds( 92 | bbox: Optional[Iterable[float]] = None, buffer: float = 0, local_crs_epsg: int = 4087 93 | ) -> Tuple[float]: 94 | """Buffer a bounding box by a distance in km. 95 | 96 | Args: 97 | bbox: 98 | Four element iterable defining a bounding box: [min_lat, min_lon, max_lat, max_lon]. 99 | buffer: 100 | Optionally buffer the country boundaries before clipping, in kilometers. 101 | local_crs_epsg: 102 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 103 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 104 | British Isles. 105 | 106 | Returns: 107 | Four element tuple defining a bounding box: (min_lat, min_lon, max_lat, max_lon). 108 | """ 109 | bbox_ = buffer_bounding_box(bbox, buffer, local_crs_epsg) 110 | new_bounds = bbox_.to_crs("EPSG:4326").bounds.loc[0].to_numpy() 111 | return new_bounds[1], new_bounds[0], new_bounds[3], new_bounds[2] 112 | 113 | 114 | def buffer_bounding_box( 115 | bbox: Optional[Iterable[float]] = None, buffer: float = 0, local_crs_epsg: int = 4087 116 | ) -> gpd.GeoSeries: 117 | """Buffer a bounding box by a distance in km. 118 | 119 | Args: 120 | bbox: 121 | Four element iterable defining a bounding box: [min_lat, min_lon, max_lat, max_lon]. 122 | buffer: 123 | Optionally buffer the country boundaries before clipping, in kilometers. 124 | local_crs_epsg: 125 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 126 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 127 | British Isles. 128 | 129 | Returns: 130 | A geopandas GeoSeries containing the buffered geometry. 
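    Typical usage example (illustrative values; roughly a UK bounding box buffered
    by 10 km):
        buffered = buffer_bounding_box(bbox=[49.9, -8.2, 60.9, 1.8], buffer=10)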
131 | """ 132 | bbox_ = Polygon( 133 | [(bbox[1], bbox[0]), (bbox[1], bbox[2]), (bbox[3], bbox[2]), (bbox[3], bbox[0])] 134 | ) 135 | bbox_ = ( 136 | gpd.GeoSeries(bbox_) 137 | .set_crs("EPSG:4326") 138 | .to_crs(f"EPSG:{local_crs_epsg}") 139 | .buffer(buffer * 1000.0) 140 | ) 141 | return bbox_ 142 | 143 | 144 | def clip_to_bbox( 145 | coords: Union[pd.DataFrame, gpd.GeoDataFrame], 146 | bbox: Iterable[float], 147 | buffer: float = 0, 148 | search_radius: Optional[float] = None, 149 | local_crs_epsg: int = 4087, 150 | ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: 151 | """Clip coordinates to bounding box. 152 | 153 | Remove any coordinates which do not lie inside a bounding box. 154 | 155 | Args: 156 | coords: 157 | A pandas DataFrame or geopandas GeoDataFrame of coordinates with columns: latitude, 158 | longitude. 159 | bbox: 160 | Four element iterable defining a bounding box: [min_lat, min_lon, max_lat, max_lon]. 161 | buffer: 162 | Optionally buffer the country boundaries before clipping, in kilometers. 163 | search_radius: 164 | Optionally set the radial search limit around each grid point in kilometers. If set, the 165 | code will consider coords to be included if any part of the search radius overlaps the 166 | bounding box. 167 | local_crs_epsg: 168 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 169 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 170 | British Isles. 171 | 172 | Returns: 173 | As per `coords` but containing only the subset of the input coordinates which fall within 174 | `buffer` km of the bounding box. 175 | """ 176 | if search_radius is None: 177 | # TODO why is bounds not used 178 | bounds = buffer_bounding_box_bounds(bbox, buffer, local_crs_epsg) # noqa 179 | coords["selected"] = (bbox[0] <= coords.latitude <= bbox[2]) & ( 180 | bbox[1] <= coords.longitude <= bbox[3] 181 | ) 182 | else: 183 | bbox_ = buffer_bounding_box(bbox, buffer, local_crs_epsg)[0] 184 | coords_ = coords.to_crs(f"EPSG:{local_crs_epsg}").buffer(search_radius * 1000.0) 185 | coords["selected"] = coords_.intersects(bbox_) 186 | return coords.loc[coords["selected"]].drop(columns="selected") 187 | 188 | 189 | def clip_to_countries( 190 | coords: gpd.GeoDataFrame, 191 | world: gpd.GeoDataFrame, 192 | countries: Iterable[str], 193 | buffer: float = 0, 194 | search_radius: Optional[float] = None, 195 | ) -> gpd.GeoDataFrame: 196 | """Clip coordinates to country boundaries. 197 | 198 | Given a set of coordinates, some country boundary definitions, a list of countries, and a buffer 199 | distance, return the coords which fall within `buffer` kilometers of the listed countries' 200 | boundaries. 201 | 202 | Args: 203 | coords: 204 | A geopandas GeoDataFrame containing latitudes, longitudes and geometries for a set of 205 | coordinates. 206 | world: 207 | A geopandas GeoDataFrame of world boundaries geomteries, as returned by 208 | `get_world_boundaries()`. 209 | countries: 210 | A list of country names to clip the coords to. 211 | buffer: 212 | Optionally buffer the country boundaries before clipping, in kilometers. 213 | search_radius: 214 | Optionally set the radial search limit around each grid point in kilometers. If set, the 215 | code will consider coords to be included if any part of the search radius overlaps the 216 | country. 217 | 218 | Returns: 219 | As per `coords` but containing only the subset of the input coordinates which fall within 220 | `buffer` km of the given countries. 
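    Typical usage example (illustrative; assumes `world` comes from
    `NaturalEarth.get_hires_world_boundaries()` and `coords` shares its CRS):
        uk_coords = clip_to_countries(coords, world, countries=["United Kingdom"], buffer=5)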
221 | """ 222 | countries_ = world[world.name.isin(countries)] 223 | countries_ = countries_.dissolve().buffer(buffer * 1000.0)[0] 224 | if search_radius is None: 225 | coords["selected"] = coords.within(countries_) 226 | else: 227 | # Consider points outside the selected region whose search radius overlaps the region 228 | coords_ = coords.buffer(search_radius * 1000.0) 229 | coords["selected"] = coords_.intersects(countries_) 230 | return coords.loc[coords["selected"]].drop(columns="selected") 231 | -------------------------------------------------------------------------------- /pvoutput/grid_search/grid_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Grid Search Class used to make grid of latitude and longitude coordinates 3 | """ 4 | 5 | from typing import Iterable, Optional 6 | 7 | import geopandas as gpd 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import pandas as pd 11 | from pyproj import Transformer 12 | 13 | from pvoutput.grid_search.clip import ( 14 | bounding_box_from_radius, 15 | buffer_bounding_box_bounds, 16 | clip_to_bbox, 17 | clip_to_countries, 18 | clip_to_radius, 19 | ) 20 | from pvoutput.grid_search.natural_earth import NaturalEarth 21 | 22 | 23 | class GridSearch: 24 | """A class for generating a gridded search.""" 25 | 26 | def __init__(self, cache_dir: str = None) -> None: 27 | """Initialise. 28 | 29 | Args: 30 | cache_dir: Optionally provide a location to cache boundary 31 | definition files locally and avoid unnecsessary downloads. 32 | """ 33 | self.nat_earth = NaturalEarth(cache_dir) 34 | 35 | def plot_grid( 36 | self, 37 | coords: gpd.GeoDataFrame, 38 | countries: Iterable[str], 39 | bbox: Optional[Iterable[float]] = None, 40 | local_crs_epsg: int = 4087, 41 | filename: Optional[str] = None, 42 | ) -> None: 43 | """Plot grid coordinates. 44 | 45 | Plot grid coordinates over world boundaries with selected countries highlighted. 46 | 47 | Args: 48 | coords: A geopandas GeoDataFrame containing latitudes, longitudes and geometries 49 | for a set of coordinates. 50 | countries: A list of country names to clip the coords to. 51 | bbox: Optionally pass a four element iterable defining a bounding box: 52 | [min_lat, min_lon, max_lat, max_lon]. This will be used to set the scale of the 53 | plot. 54 | local_crs_epsg: Optionally provide the EPSG code of a local 55 | Co-ordinate Reference System (CRS) for improved accuracy. 56 | e.g. set to 27700 (OSGB36 / British National Grid) if searching 57 | the British Isles. The default is EPSG:4087 (a.k.a WGS 84 / World Equidistant 58 | Cylindrical), which works globally but with less accuracy. 59 | filename: Optionally pass a filename (relative or absolute) to save the plot to. 60 | Image format should be set using the file extension (i.e. .jpeg, .png or .svg). 
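        Typical usage example (illustrative; `grd` is a GridSearch instance):
            grd.plot_grid(coords, countries=["United Kingdom"], filename="grid.png")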
61 | """ 62 | world, _ = self.nat_earth.get_hires_world_boundaries() 63 | world.to_crs(f"EPSG:{local_crs_epsg}", inplace=True) 64 | coords.to_crs(f"EPSG:{local_crs_epsg}", inplace=True) 65 | if bbox is None: 66 | bbox = [ 67 | coords.latitude.min(), 68 | coords.longitude.min(), 69 | coords.latitude.max(), 70 | coords.longitude.max(), 71 | ] 72 | f, ax = plt.subplots() 73 | world.plot( 74 | ax=ax, color="palegreen", edgecolor="black", linewidth=1, label="World", zorder=1 75 | ) 76 | if countries is not None: 77 | selected = world[world.name.isin(countries)] 78 | selected.geometry.boundary.plot( 79 | ax=ax, color=None, edgecolor="gold", label="Selected countries", zorder=2 80 | ) 81 | coords.plot(ax=ax, marker="o", color="red", markersize=2, label="Grid", zorder=3) 82 | xmin = coords.geometry.bounds.minx.min() 83 | xmax = coords.geometry.bounds.maxx.max() 84 | ymin = coords.geometry.bounds.miny.min() 85 | ymax = coords.geometry.bounds.maxy.max() 86 | xpadding = (xmax - xmin) / 8 87 | ypadding = (ymax - ymin) / 8 88 | ax.set_xlim(xmin - xpadding, xmax + xpadding) 89 | ax.set_ylim(ymin - ypadding, ymax + ypadding) 90 | ax.axes.xaxis.set_visible(False) 91 | ax.axes.yaxis.set_visible(False) 92 | plt.legend(prop={"size": 6}) 93 | plt.show() 94 | if filename is not None: 95 | plt.savefig(filename, dpi=300) 96 | 97 | def generate_grid( 98 | self, 99 | bbox: Optional[Iterable[float]] = None, 100 | countries: Optional[Iterable[str]] = None, 101 | radial_clip: Optional[Iterable[float]] = None, 102 | buffer: float = 0, 103 | search_radius: float = 25, 104 | local_crs_epsg: int = 4087, 105 | save_to: Optional[str] = None, 106 | show: bool = False, 107 | ): 108 | """Use hexagonal tiling to generate a grid search with minimal overlap. 109 | 110 | Create a set of gridded coordinates which use hexagonal tiling as an efficient way to 111 | conduct a fixed-radius search of a region defined by a bounding box and/or country borders. 112 | 113 | Args: 114 | bbox: 115 | Optionally pass a four element iterable defining a bounding box: 116 | [min_lat, min_lon, max_lat, max_lon]. 117 | countries: 118 | Optionally pass a list of country names to clip the coords to. 119 | radial_clip: 120 | Optionally set a radial boundary to clip to. Pass a three element iterable 121 | containing: (, , ). 122 | buffer: 123 | Optionally buffer the bounding box and country boundaries before clipping, in 124 | kilometers. 125 | search_radius: 126 | Optionally set the radial search limit around each grid point in kilometers. 127 | Defaults to 25km. 128 | local_crs_epsg: 129 | Optionally provide the EPSG code of a local co-ordinate Reference System (CRS) for 130 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching 131 | the British Isles. The default is EPSG:4087 (a.k.a WGS 84 / World Equidistant 132 | Cylindrical), which works globally but with poor accuracy in some locations. 133 | save_to: 134 | Optionally specify a filename to save the grid co-ordinates to (CSV). 135 | show: 136 | Set to True to show a plot of the grid. 137 | 138 | Returns: 139 | A pandas DataFrame containing co-ordinates for the grid with columns: latitude, 140 | longitude. 
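        Typical usage example (illustrative; country names must match the Natural
        Earth dataset, which `NaturalEarth.list_countries()` can print):
            grd = GridSearch()
            coords = grd.generate_grid(countries=["United Kingdom"], search_radius=25)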
141 | """ 142 | # get countries 143 | world, all_countries = self.nat_earth.get_hires_world_boundaries() 144 | countries = all_countries if countries is None else countries 145 | 146 | # create bounding box 147 | if bbox is None: 148 | if radial_clip is None: 149 | bbox = ( 150 | world[world.name.isin(countries)] 151 | .dissolve() 152 | .buffer(buffer * 1000.0) 153 | .to_crs("EPSG:4326") 154 | .bounds.to_numpy()[0] 155 | ) 156 | bbox = [bbox[1], bbox[0], bbox[3], bbox[2]] 157 | else: 158 | bbox = bounding_box_from_radius( 159 | radial_clip[0], radial_clip[1], radial_clip[2], local_crs_epsg 160 | ) 161 | bbox = buffer_bounding_box_bounds(bbox, buffer) 162 | bounds = [np.round(b, 5) for b in bbox] 163 | 164 | # create x and y bounds 165 | search_radius_m = search_radius * 1000.0 166 | wgs84_to_projected = Transformer.from_crs(4326, local_crs_epsg, always_xy=True) 167 | projected_to_wgs84 = Transformer.from_crs(local_crs_epsg, 4326, always_xy=True) 168 | xmin, ymin = wgs84_to_projected.transform(bounds[1], bounds[0]) 169 | xmax, ymax = wgs84_to_projected.transform(bounds[3], bounds[2]) 170 | y_interval = search_radius_m * np.cos(np.radians(30)) 171 | x_interval = search_radius_m * 3 172 | x_offset = 0 173 | 174 | # create coordinates 175 | coords = [] 176 | for y in np.arange(ymin - y_interval * 3, ymax + y_interval + search_radius_m, y_interval): 177 | xmin_ = xmin - search_radius_m - x_offset 178 | xmax_ = xmax + x_interval + search_radius_m + x_offset 179 | for x in np.arange(xmin_, xmax_, x_interval): 180 | coords.append(projected_to_wgs84.transform(x, y)) 181 | if x_offset == 0: 182 | x_offset = search_radius_m * 1.5 183 | else: 184 | x_offset = 0 185 | coords = pd.DataFrame(coords, columns=["longitude", "latitude"]) 186 | coords = gpd.GeoDataFrame( 187 | coords, geometry=gpd.points_from_xy(coords.longitude, coords.latitude) 188 | ) 189 | coords = coords.set_crs("EPSG:4326").to_crs("EPSG:4087") 190 | coords = clip_to_countries( 191 | coords=coords, 192 | world=world, 193 | countries=countries, 194 | buffer=buffer, 195 | search_radius=search_radius, 196 | ) 197 | coords = clip_to_bbox( 198 | coords=coords, 199 | bbox=bbox, 200 | buffer=buffer, 201 | search_radius=search_radius, 202 | local_crs_epsg=local_crs_epsg, 203 | ) 204 | if radial_clip is not None: 205 | coords = clip_to_radius( 206 | coords=coords, 207 | latitude=radial_clip[0], 208 | longitude=radial_clip[1], 209 | radius=radial_clip[2], 210 | search_radius=search_radius, 211 | local_crs_epsg=local_crs_epsg, 212 | ) 213 | 214 | # show plot 215 | if show: 216 | self.plot_grid( 217 | coords, 218 | countries, 219 | bbox, 220 | ) 221 | 222 | # save plots 223 | if save_to is not None: 224 | coords.to_csv( 225 | save_to, float_format="%.5f", index=False, columns=["longitude", "latitude"] 226 | ) 227 | return coords[["latitude", "longitude"]].reset_index(drop=True) 228 | -------------------------------------------------------------------------------- /pvoutput/grid_search/natural_earth.py: -------------------------------------------------------------------------------- 1 | """Retrieve Natural Earth world boundaries.""" 2 | 3 | import logging 4 | import os 5 | from io import BytesIO 6 | from typing import Tuple 7 | 8 | import geopandas as gpd 9 | import requests 10 | from numpy.typing import NDArray 11 | 12 | 13 | class NaturalEarth: 14 | """Retrieve Natural Earth world boundaries.""" 15 | 16 | def __init__(self, cache_dir: str = None) -> None: 17 | """Initialise. 
18 | 19 | Args: 20 | cache_dir: 21 | Optionally provide a location to cache boundary definition files locally and avoid 22 | unnecsessary downloads. 23 | 24 | Raises: 25 | ValueError: If the cache_dir does not exist. 26 | """ 27 | self.cache_dir = cache_dir 28 | if self.cache_dir is not None: 29 | if not os.path.isdir(cache_dir): 30 | logging.error("The cache_dir does not exist.") 31 | raise ValueError("The cache_dir does not exist.") 32 | self.world_hires = None 33 | self.countries_hires = None 34 | self.world_lores = None 35 | self.countries_lores = None 36 | 37 | def get_hires_world_boundaries(self) -> Tuple[gpd.GeoDataFrame, NDArray]: 38 | """Load high res world boundaries. 39 | 40 | Download the high resolution country boundaries GIS file from the Natural Earth website and 41 | optionally cache locally. 42 | 43 | Returns: 44 | A tuple containing (`world`, `countries`). `world` is a Geopandas GeoDataFrame with 45 | geometries and metadata for all country borders. `countries` is a list of unique country 46 | names for which geometries exist. Boundaries will be in the EPSG:4087 projected CRS. 47 | 48 | Typical usage example: 49 | world, countries = get_world_boundaries() 50 | """ 51 | if self.world_hires is not None and self.countries_hires is not None: 52 | return self.world_hires, self.countries_hires 53 | if self.cache_dir: 54 | cache_file = os.path.join(self.cache_dir, "ne_10m_admin_0_countries.zip") 55 | else: 56 | cache_file = None 57 | if cache_file is not None and os.path.isfile(cache_file): 58 | data = cache_file 59 | else: 60 | headers = { 61 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " 62 | "(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" 63 | } 64 | url = ( 65 | "https://www.naturalearthdata.com/http//www.naturalearthdata.com/" 66 | "download/10m/cultural/ne_10m_admin_0_countries.zip" 67 | ) 68 | req = requests.get(url, headers=headers) 69 | data = BytesIO(req.content) 70 | if cache_file is not None: 71 | with open(cache_file, "wb") as fid: 72 | fid.write(req.content) 73 | self.world_hires = gpd.read_file(data).to_crs("EPSG:4087") 74 | cols2keep = {"NAME": "name", "CONTINENT": "continent", "geometry": "geometry"} 75 | self.world_hires = self.world_hires[list(cols2keep.keys())].rename(columns=cols2keep) 76 | self.countries_hires = self.world_hires.name.unique() 77 | return self.world_hires, self.countries_hires 78 | 79 | def get_lores_world_boundaries(self) -> Tuple[gpd.GeoDataFrame, NDArray]: 80 | """Load low resolution world boundaries. 81 | 82 | Load the low res world boundaries GIS file (`naturalearth_lowres`) from Geopandas datasets. 83 | Useful for visualisations and/or to speed up computation. 84 | 85 | Returns: 86 | A tuple containing (`world`, `countries`). `world` is a Geopandas GeoDataFrame with 87 | geometries and metadata for all country borders. `countries` is a list of unique country 88 | names for which geometries exist. Boundaries will be in the EPSG:4087 projected CRS. 
89 | 90 | Typical usage example: 91 | world, countries = get_world_boundaries() 92 | """ 93 | if self.world_lores is not None and self.countries_lores is not None: 94 | return self.world_lores, self.countries_lores 95 | self.world_lores = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).to_crs( 96 | "EPSG:4087" 97 | ) 98 | self.world_lores.drop(columns=["pop_est", "iso_a3", "gdp_md_est"], inplace=True) 99 | self.countries = self.world_lores.name.unique() 100 | return self.world_lores, self.countries_lores 101 | 102 | def list_countries(self, res: str = "hires") -> Tuple[gpd.GeoDataFrame, NDArray]: 103 | """Print a list of country names to stdout. 104 | 105 | Print a list of country names whose geometries are available in the world boundaries GIS 106 | file. 107 | 108 | Args: 109 | res: 110 | Optionally switch between 'hires' and 'lores', although in theory both country lists 111 | should be identical, there may be some countries that are not included in the lores 112 | boundaries due to downsampling of borders. Names may also have changed between the 113 | two (e.g. Macedonia -> North Macedonia). 114 | 115 | Raises: 116 | ValueError: If `res` is not one of: 'lores', 'hires'. 117 | """ 118 | if res == "hires": 119 | _, countries = self.get_hires_world_boundaries() 120 | elif res == "lores": 121 | _, countries = self.get_lores_world_boundaries() 122 | else: 123 | logging.error("The `res` arg should be one of: 'lores', 'hires'.") 124 | raise ValueError("The `res` arg should be one of: 'lores', 'hires'.") 125 | countries.sort() 126 | print(f"Available countries are:\n{', '.join(countries)}") 127 | 128 | return countries 129 | -------------------------------------------------------------------------------- /pvoutput/mapscraper.py: -------------------------------------------------------------------------------- 1 | """Code for scraping for pv systems""" 2 | 3 | import re 4 | from copy import copy 5 | from typing import Iterable, Optional, Union 6 | 7 | import pandas as pd 8 | import requests 9 | from bs4 import BeautifulSoup 10 | 11 | from pvoutput.consts import ( 12 | MAP_URL, 13 | PV_OUTPUT_COUNTRY_CODES, 14 | PV_OUTPUT_MAP_COLUMN_NAMES, 15 | REGIONS_URL, 16 | ) 17 | 18 | _MAX_NUM_PAGES = 1024 19 | 20 | 21 | def get_pv_systems_for_country( 22 | country: Union[str, int], 23 | ascending: Optional[bool] = None, 24 | sort_by: Optional[str] = None, 25 | max_pages: int = _MAX_NUM_PAGES, 26 | region: Optional[str] = None, 27 | ) -> pd.DataFrame: 28 | """ 29 | Get all pv systems for on country 30 | 31 | Args: 32 | country: either a string such as 'United Kingdom' 33 | (see consts.PV_OUTPUT_COUNTRY_CODES for all recognised strings), 34 | or a PVOutput.org country code, in the range [1, 257]. 35 | ascending: if True, ask PVOutput.org to sort results by ascending. 36 | If False, sort by descending. If None, use PVOutput.org's default 37 | sort order. 38 | sort_by: The column to ask PVOutput.org to sort by. One of: 39 | timeseries_duration, 40 | average_generation_per_day, 41 | efficiency, 42 | power_generation, 43 | capacity, 44 | address, 45 | name 46 | max_pages: The maximum number of search pages to scrape. 
47 | region: Optional input, #TODO 48 | 49 | Returns: pd.DataFrame with index system_id (int) and these columns: 50 | name, system_DC_capacity_W, panel, inverter, address, orientation, 51 | array_tilt_degrees, shade, timeseries_duration, 52 | total_energy_gen_Wh, average_daily_energy_gen_Wh 53 | average_efficiency_kWh_per_kW 54 | """ 55 | country_code = _convert_to_country_code(country) 56 | regions = [region] if region else get_regions_for_country(country_code) 57 | all_metadata = [] 58 | for region in regions: 59 | for page_number in range(max_pages): 60 | print( 61 | "\rReading page {:2d} for region: {}".format(page_number, region), 62 | end="", 63 | flush=True, 64 | ) 65 | url = _create_map_url( 66 | country_code=country_code, 67 | page_number=page_number, 68 | ascending=ascending, 69 | sort_by=sort_by, 70 | region=region, 71 | ) 72 | soup = get_soup(url) 73 | if _page_is_blank(soup): 74 | break 75 | metadata = _process_metadata(soup) 76 | metadata["region"] = region 77 | all_metadata.append(metadata) 78 | 79 | if not _page_has_next_link(soup): 80 | break 81 | 82 | return pd.concat(all_metadata) 83 | 84 | 85 | # ########### LOAD HTML ################### 86 | 87 | 88 | def _create_map_url( 89 | country_code: Optional[int] = None, 90 | page_number: Optional[int] = None, 91 | ascending: Optional[bool] = None, 92 | sort_by: Optional[str] = None, 93 | region: Optional[str] = None, 94 | ) -> str: 95 | """ 96 | Create a map URL 97 | 98 | Args: 99 | country_code: Country code 100 | page_number: Get this page number of the search results. Zero-indexed. 101 | The first page is page 0, the second page is page 1, etc. 102 | ascending: option for ascending or descending 103 | sort_by: sort results by (optional) 104 | region: region of country (optional) 105 | 106 | """ 107 | _check_country_code(country_code) 108 | 109 | if ascending is None: 110 | sort_order = None 111 | else: 112 | sort_order = "asc" if ascending else "desc" 113 | 114 | if sort_by is None: 115 | sort_by_pv_output_col_name = None 116 | else: 117 | try: 118 | sort_by_pv_output_col_name = PV_OUTPUT_MAP_COLUMN_NAMES[sort_by] 119 | except KeyError: 120 | raise ValueError("sort_by must be one of {}".format(PV_OUTPUT_MAP_COLUMN_NAMES.keys())) 121 | 122 | url_params = { 123 | "country": country_code, 124 | "p": page_number, 125 | "d": sort_order, 126 | "o": sort_by_pv_output_col_name, 127 | "region": region, 128 | } 129 | 130 | url_params_list = [ 131 | "{}={}".format(key, value) for key, value in url_params.items() if value is not None 132 | ] 133 | query_string = "&".join(url_params_list) 134 | url = copy(MAP_URL) 135 | if query_string: 136 | url += "?" + query_string 137 | return url 138 | 139 | 140 | def _raise_country_error(country, msg=""): 141 | country_codes = PV_OUTPUT_COUNTRY_CODES.values() 142 | raise ValueError( 143 | "Wrong value country='{}'. {}country must be an integer country" 144 | " code in the range [{}, {}], or one of {}.".format( 145 | country, 146 | msg, 147 | min(country_codes), 148 | max(country_codes), 149 | ", ".join(PV_OUTPUT_COUNTRY_CODES.keys()), 150 | ) 151 | ) 152 | 153 | 154 | def _check_country_code(country_code: Union[None, int]): 155 | if country_code is None: 156 | return 157 | country_codes = PV_OUTPUT_COUNTRY_CODES.values() 158 | if not min(country_codes) <= country_code <= max(country_codes): 159 | _raise_country_error(country_code, "country outside of valid range! 
") 160 | 161 | 162 | def _convert_to_country_code(country: Union[str, int]) -> int: 163 | if isinstance(country, str): 164 | try: 165 | return PV_OUTPUT_COUNTRY_CODES[country] 166 | except KeyError: 167 | _raise_country_error(country) 168 | 169 | elif isinstance(country, int): 170 | _check_country_code(country) 171 | return country 172 | 173 | 174 | def _page_has_next_link(soup: BeautifulSoup): 175 | return bool(soup.find_all("a", text="Next")) 176 | 177 | 178 | # ############ PROCESS HTML ######################### 179 | 180 | 181 | def _process_metadata(soup: BeautifulSoup, return_constituents=False) -> pd.DataFrame: 182 | pv_system_size_metadata = _process_system_size_col(soup) 183 | index = pv_system_size_metadata.index 184 | pv_systems_metadata = [ 185 | pv_system_size_metadata, 186 | _process_output_col(soup, index), 187 | _process_generation_and_average_cols(soup, index), 188 | _process_efficiency_col(soup, index), 189 | ] 190 | 191 | df = pd.concat(pv_systems_metadata, axis="columns") 192 | df = _convert_metadata_cols_to_numeric(df) 193 | df["system_DC_capacity_W"] = df["capacity_kW"] * 1e3 194 | del df["capacity_kW"] 195 | if return_constituents: 196 | pv_systems_metadata.append(df) 197 | return tuple(pv_systems_metadata) 198 | return df 199 | 200 | 201 | def _process_system_size_col(soup: BeautifulSoup) -> pd.DataFrame: 202 | pv_system_size_col = soup.find_all("a", href=re.compile(r"display\.jsp\?sid=")) 203 | metadata = [] 204 | for row in pv_system_size_col: 205 | metadata_for_row = {} 206 | 207 | # Get system ID 208 | href = row.attrs["href"] 209 | p = re.compile(r"^display\.jsp\?sid=(\d+)$") 210 | href_match = p.match(href) 211 | metadata_for_row["system_id"] = href_match.group(1) 212 | 213 | # Process title (lots of metadata in here!) 214 | title, title_meta = row.attrs["title"].split("|") 215 | 216 | # Name and capacity 217 | p = re.compile(r"(.*) (\d+\.\d+kW)") 218 | title_match = p.match(title) 219 | metadata_for_row["name"] = title_match.group(1) 220 | metadata_for_row["capacity"] = title_match.group(2) 221 | 222 | # Other key-value pairs: 223 | key_value = title_meta.split("
") 224 | key_value_dict = {} 225 | for line in key_value: 226 | key_value_split = line.split(":") 227 | key = key_value_split[0].strip() 228 | # Some values have a colon(!) 229 | value = ":".join(key_value_split[1:]).strip() 230 | key_value_dict[key] = value 231 | metadata_for_row.update(key_value_dict) 232 | 233 | # Some cleaning 234 | # Remove from Location 235 | location = metadata_for_row["Location"] 236 | p = re.compile(r"()?(.*)") 237 | img_groups = p.search(location).groups() 238 | if img_groups[0] is not None: 239 | metadata_for_row["Location"] = img_groups[1].strip() 240 | 241 | metadata.append(metadata_for_row) 242 | 243 | df = pd.DataFrame(metadata) 244 | df["system_id"] = pd.to_numeric(df["system_id"]) 245 | df = df.set_index("system_id") 246 | df.columns = [col_name.lower() for col_name in df.columns] 247 | df.rename( 248 | { 249 | "location": "address", 250 | "panels": "panel", 251 | "array tilt": "array_tilt_degrees", 252 | "capacity": "capacity_kW", 253 | }, 254 | axis="columns", 255 | inplace=True, 256 | ) 257 | return df 258 | 259 | 260 | def _remove_str_and_convert_to_numeric(series: pd.Series, string_to_remove: str) -> pd.Series: 261 | series = series.str.replace(string_to_remove, "") 262 | return pd.to_numeric(series) 263 | 264 | 265 | def _convert_metadata_cols_to_numeric(df: pd.DataFrame) -> pd.DataFrame: 266 | for col_name, string_to_remove in [ 267 | # ('array_tilt_degrees', '°'), 268 | ("capacity_kW", "kW"), 269 | ("average_efficiency_kWh_per_kW", "kWh/kW"), 270 | ]: 271 | df[col_name] = _remove_str_and_convert_to_numeric(df[col_name], string_to_remove) 272 | 273 | return df 274 | 275 | 276 | def _process_output_col(soup: BeautifulSoup, index: Optional[Iterable] = None) -> pd.Series: 277 | 278 | # get all data 279 | outputs_col = soup.find_all(text=re.compile(r"\d Days")) 280 | 281 | # format data as strings 282 | outputs_col = [str(col) for col in outputs_col] 283 | 284 | # make into pandas Series 285 | duration = pd.Series(outputs_col, name="timeseries_duration", index=index) 286 | 287 | # change to timedeltas 288 | return pd.to_timedelta(duration.astype("unicode")) 289 | 290 | 291 | def _convert_energy_to_numeric_watt_hours(series: pd.Series) -> pd.Series: 292 | data = [] 293 | for unit, multiplier in [("kWh", 1e3), ("MWh", 1e6)]: 294 | selection = series[series.str.contains(unit)] 295 | selection = selection.str.replace(unit, "") 296 | selection = selection.str.replace(",", "") 297 | selection = pd.to_numeric(selection) 298 | selection *= multiplier 299 | data.append(selection) 300 | return pd.concat(data) 301 | 302 | 303 | def _process_generation_and_average_cols( 304 | soup: BeautifulSoup, index: Optional[Iterable] = None 305 | ) -> pd.DataFrame: 306 | # _soup = deepcopy(soup) 307 | [s.decompose() for s in soup.select("a")] 308 | generation_and_average_cols = soup.find_all(text=re.compile(r"\d[Mk]Wh$")) 309 | generation_col = generation_and_average_cols[0::2] 310 | average_col = generation_and_average_cols[1::2] 311 | df = pd.DataFrame( 312 | {"total_energy_gen_Wh": generation_col, "average_daily_energy_gen_Wh": average_col}, 313 | index=index, 314 | ) 315 | 316 | for col_name in df.columns: 317 | df[col_name] = _convert_energy_to_numeric_watt_hours(df[col_name]) 318 | 319 | return df 320 | 321 | 322 | def _process_efficiency_col(soup: BeautifulSoup, index: Optional[Iterable] = None) -> pd.Series: 323 | efficiency_col = soup.find_all(text=re.compile(r"\dkWh/kW")) 324 | return pd.Series(efficiency_col, name="average_efficiency_kWh_per_kW", index=index) 325 | 
326 | 327 | def _page_is_blank(soup: BeautifulSoup) -> bool: 328 | # Pages can still be blank even if the previous page has a Next Button 329 | pv_system_size_col = soup.find_all("a", href=re.compile(r"display\.jsp\?sid=")) 330 | return not bool(pv_system_size_col) 331 | 332 | 333 | def get_soup(url, raw=False, parser="html.parser"): 334 | """ 335 | Get soupt from url 336 | 337 | Args: 338 | url: URL 339 | raw: option for raw, defaulted to False 340 | parser: parser for BeautifulSoup 341 | 342 | """ 343 | response = requests.get(url) 344 | soup = BeautifulSoup(response.text, parser) 345 | if raw: 346 | return soup 347 | return clean_soup(soup) 348 | 349 | 350 | def clean_soup(soup): 351 | """Function to clean scraped soup object. 352 | 353 | Note that the downloaded soup could change over time. 354 | Args: 355 | soup: bs4.BeautifulSoup 356 | 357 | Returns: 358 | bs4.BeautifulSoup 359 | 360 | """ 361 | for script in soup.find_all("script", src=False): 362 | script.decompose() 363 | return soup 364 | 365 | 366 | def get_regions_for_country(country_code: int): 367 | """ 368 | Get regions for on countruy 369 | 370 | Args: 371 | country_code: the country code 372 | 373 | Returns: list of regions 374 | """ 375 | region_list = [] 376 | url = f"{REGIONS_URL}?country={country_code}" 377 | soup = get_soup(url, parser="lxml") 378 | region_tags = soup.find_all("a", href=re.compile(r"map\.jsp\?country=")) 379 | for row in region_tags: 380 | href = row.attrs["href"] 381 | p = re.compile(r"^map\.jsp\?country=" + str(country_code) + r"®ion=(\w+.*)$") 382 | href_match = p.match(href) 383 | region = href_match.group(1) 384 | region_list.append(region) 385 | return region_list 386 | -------------------------------------------------------------------------------- /pvoutput/prcoess.py: -------------------------------------------------------------------------------- 1 | """Function to process data""" 2 | 3 | import logging 4 | from io import StringIO 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def process_system_status(pv_system_status_text, date) -> pd.DataFrame: 13 | """ 14 | Process raw system status 15 | 16 | Args: 17 | pv_system_status_text: string of system data, like: 18 | "1234;07:45,21,255,1,2;07:50,21,255,1;07:50,21,255,1,2" 19 | date: The date this data is from 20 | 21 | Returns: dataframe of data 22 | """ 23 | 24 | # See https://pvoutput.org/help/data_services.html#data-services-get-system-status 25 | columns = [ 26 | "cumulative_energy_gen_Wh", 27 | "instantaneous_power_gen_W", 28 | "temperature_C", 29 | "voltage", 30 | ] 31 | if pv_system_status_text == "no status found": 32 | logger.debug("Text was empty so return empty dataframe") 33 | return pd.DataFrame(columns=columns + ["system_id", "datetime"]) 34 | 35 | # get system id 36 | system_id = int(pv_system_status_text.split(";")[0]) 37 | pv_system_status_text = ";".join(pv_system_status_text.split(";")[1:]) 38 | 39 | try: 40 | one_pv_system_status = pd.read_csv( 41 | StringIO(pv_system_status_text), 42 | lineterminator=";", 43 | names=["time"] + columns, 44 | dtype={col: np.float64 for col in columns}, 45 | ).sort_index() 46 | 47 | except Exception as e: 48 | 49 | # this can happen if there is only one data value and it doesnt contain all 5 columns. 
50 | # if there is many rows of data, then it seems fine 51 | if pv_system_status_text.count(";") != 0: 52 | # the data contains more than one row, so lets raise the error 53 | raise e 54 | 55 | # how many columns does it have 56 | n_columns = pv_system_status_text.count(",") + 1 57 | 58 | one_pv_system_status = pd.read_csv( 59 | StringIO(pv_system_status_text), 60 | lineterminator=";", 61 | names=["time"] + columns[: n_columns - 1], 62 | dtype={col: np.float64 for col in columns}, 63 | ).sort_index() 64 | 65 | missing_columns = [c for c in columns if c not in one_pv_system_status.columns] 66 | one_pv_system_status[missing_columns] = np.NAN 67 | 68 | # process dataframe 69 | one_pv_system_status["system_id"] = system_id 70 | 71 | # format date 72 | one_pv_system_status["date"] = pd.to_datetime(date) 73 | one_pv_system_status = join_date_time(one_pv_system_status) 74 | 75 | return one_pv_system_status 76 | 77 | 78 | def join_date_time(one_pv_system_status: pd.DataFrame, time_format="%H:%M:%S"): 79 | """ 80 | Join date and time columns toegther 81 | 82 | Args: 83 | one_pv_system_status: dataframe with 'date' and 'time' 84 | time_format: format of time 85 | 86 | Returns: dataframe with column datetime 87 | """ 88 | 89 | # fix midnight 90 | fix_midnight_index = one_pv_system_status["time"] == "24:00" 91 | one_pv_system_status.loc[fix_midnight_index, "time"] = "00:00" 92 | 93 | # format time 94 | one_pv_system_status["time"] = pd.to_datetime(one_pv_system_status["time"]).dt.strftime( 95 | time_format 96 | ) 97 | one_pv_system_status["time"] = pd.to_timedelta(one_pv_system_status["time"]) 98 | 99 | # format date 100 | one_pv_system_status["date"] = pd.to_datetime(one_pv_system_status["date"].astype(str)) 101 | 102 | # make datetime 103 | one_pv_system_status["datetime"] = one_pv_system_status["date"] + one_pv_system_status["time"] 104 | one_pv_system_status.drop(columns=["date", "time"], inplace=True) 105 | one_pv_system_status.sort_values(by="datetime", inplace=True) 106 | 107 | one_pv_system_status.set_index("datetime", inplace=True, drop=True) 108 | 109 | return one_pv_system_status 110 | 111 | 112 | def process_batch_status(pv_system_status_text) -> pd.DataFrame: 113 | """ 114 | Process batch status text 115 | 116 | Args: 117 | pv_system_status_text: text to be procssed 118 | 119 | Returns: dataframe of data 120 | 121 | """ 122 | # See https://pvoutput.org/help.html#dataservice-getbatchstatus 123 | 124 | # PVOutput uses a non-standard format for the data. The text 125 | # needs some processing before it can be read as a CSV. 
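    # Illustrative example (made-up values): a raw line such as
    #     "20190809;07:45,100,250,11.1,230;07:50,120,260,11.2,231"
    # is expanded below into one CSV row per reading:
    #     "20190809,07:45,100,250,11.1,230"
    #     "20190809,07:50,120,260,11.2,231"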
126 | processed_lines = [] 127 | for line in pv_system_status_text.split("\n"): 128 | line_sections = line.split(";") 129 | date = line_sections[0] 130 | time_and_data = line_sections[1:] 131 | processed_line = [ 132 | "{date},{payload}".format(date=date, payload=payload) for payload in time_and_data 133 | ] 134 | processed_lines.extend(processed_line) 135 | 136 | if processed_lines: 137 | first_line = processed_lines[0] 138 | num_cols = len(first_line.split(",")) 139 | if num_cols >= 8: 140 | raise NotImplementedError("Handling of consumption data is not implemented!") 141 | 142 | processed_text = "\n".join(processed_lines) 143 | del processed_lines 144 | 145 | columns = ["cumulative_energy_gen_Wh", "instantaneous_power_gen_W", "temperature_C", "voltage"] 146 | 147 | pv_system_status = pd.read_csv( 148 | StringIO(processed_text), 149 | names=["date", "time"] + columns, 150 | # parse_dates={"datetime": ["date", "time"]}, 151 | # index_col=["datetime"], 152 | dtype={col: np.float64 for col in columns}, 153 | ).sort_index() 154 | 155 | pv_system_status = join_date_time(pv_system_status) 156 | 157 | logger.info(pv_system_status) 158 | 159 | return pv_system_status 160 | -------------------------------------------------------------------------------- /pvoutput/pvoutput.py: -------------------------------------------------------------------------------- 1 | """Main PV Output class to get data from pvoutput.org""" 2 | 3 | import logging 4 | import os 5 | import time 6 | import warnings 7 | from datetime import date, datetime, timedelta 8 | from io import StringIO 9 | from typing import Dict, Iterable, List, Optional, Union 10 | from urllib.parse import urljoin 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import requests 15 | import tables 16 | 17 | from pvoutput.consts import ( 18 | BASE_URL, 19 | CONFIG_FILENAME, 20 | ONE_DAY, 21 | PV_OUTPUT_DATE_FORMAT, 22 | RATE_LIMIT_PARAMS_TO_API_HEADERS, 23 | ) 24 | from pvoutput.daterange import DateRange, merge_date_ranges_to_years 25 | from pvoutput.exceptions import NoStatusFound, RateLimitExceeded 26 | from pvoutput.prcoess import process_batch_status, process_system_status 27 | from pvoutput.utils import ( 28 | _get_param_from_config_file, 29 | _get_response, 30 | _print_and_log, 31 | get_date_ranges_to_download, 32 | sort_and_de_dupe_pv_system, 33 | system_id_to_hdf_key, 34 | ) 35 | 36 | _LOG = logging.getLogger("pvoutput") 37 | 38 | 39 | class PVOutput: 40 | """ 41 | Main PV Output class 42 | 43 | Attributes: 44 | api_key 45 | system_id 46 | rate_limit_remaining 47 | rate_limit_total 48 | rate_limit_reset_time 49 | data_service_url 50 | """ 51 | 52 | def __init__( 53 | self, 54 | api_key: str = os.getenv("API_KEY"), 55 | system_id: str = os.getenv("SYSTEM_ID"), 56 | config_filename: Optional[str] = CONFIG_FILENAME, 57 | data_service_url: Optional[str] = os.getenv("DATA_SERVICE_URL"), 58 | ): 59 | """ 60 | Init 61 | 62 | Args: 63 | api_key: Your API key from PVOutput.org. 64 | system_id: Your system ID from PVOutput.org. If you don't have a 65 | PV system then you can register with PVOutput.org and select 66 | the 'energy consumption only' box. 67 | config_filename: Optional, the filename of the .yml config file. 68 | data_service_url: Optional. If you have subscribed to 69 | PVOutput.org's data service then add the data service URL here. 70 | This string must end in '.org'. 
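        Typical usage example (illustrative credentials):
            pv = PVOutput(api_key="YOUR_API_KEY", system_id="12345")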
71 | """ 72 | self.api_key = api_key 73 | self.system_id = system_id 74 | self.rate_limit_remaining = None 75 | self.rate_limit_total = None 76 | self.rate_limit_reset_time = None 77 | self.data_service_url = data_service_url 78 | 79 | # Set from config file if None 80 | for param_name in ["api_key", "system_id"]: 81 | if getattr(self, param_name) is None: 82 | try: 83 | param_value_from_config = _get_param_from_config_file( 84 | param_name, config_filename 85 | ) 86 | except Exception as e: 87 | msg = ( 88 | "Error loading configuration parameter {param_name}" 89 | " from config file {filename}. Either pass" 90 | " {param_name} into PVOutput constructor, or create" 91 | " config file {filename}. {exception}".format( 92 | param_name=param_name, filename=CONFIG_FILENAME, exception=e 93 | ) 94 | ) 95 | print(msg) 96 | _LOG.exception(msg) 97 | raise 98 | setattr(self, param_name, param_value_from_config) 99 | # Convert to strings 100 | setattr(self, param_name, str(getattr(self, param_name))) 101 | 102 | # Check for data_service_url 103 | if self.data_service_url is None: 104 | try: 105 | self.data_service_url = _get_param_from_config_file( 106 | "data_service_url", config_filename 107 | ) 108 | except KeyError: 109 | pass 110 | except FileNotFoundError: 111 | pass 112 | 113 | if self.data_service_url is not None: 114 | if not self.data_service_url.strip("/").endswith(".org"): 115 | raise ValueError("data_service_url must end in '.org'") 116 | 117 | def search( 118 | self, 119 | query: str, 120 | lat: Optional[float] = None, 121 | lon: Optional[float] = None, 122 | include_country: bool = True, 123 | **kwargs, 124 | ) -> pd.DataFrame: 125 | """Search for PV systems. 126 | 127 | Some quirks of the PVOutput.org API: 128 | - The maximum number of results returned by PVOutput.org is 30. 129 | If the number of returned results is 30, then there is no 130 | indication of whether there are exactly 30 search results, 131 | or if there are more than 30. Also, there is no way to 132 | request additional 'pages' of search results. 133 | - The maximum search radius is 25km 134 | 135 | Args: 136 | query: string, see https://pvoutput.org/help.html#search 137 | e.g. '5km'. 138 | lat: float, e.g. 52.0668589 139 | lon: float, e.g. -1.3484038 140 | include_country: bool, whether or not to include the country name 141 | with the returned postcode. 142 | 143 | Returns: 144 | pd.DataFrame, one row per search results. Index is PV system ID. 145 | Columns: 146 | name, 147 | system_DC_capacity_W, 148 | address, # If `include_country` is True then address is 149 | # 'country> ', 150 | # else address is ''. 
151 | orientation, 152 | num_outputs, 153 | last_output, 154 | panel, 155 | inverter, 156 | distance_km, 157 | latitude, 158 | longitude 159 | """ 160 | api_params = {"q": query, "country": int(include_country)} 161 | 162 | if lat is not None and lon is not None: 163 | api_params["ll"] = "{:f},{:f}".format(lat, lon) 164 | 165 | pv_systems_text = self._api_query(service="search", api_params=api_params, **kwargs) 166 | 167 | pv_systems = pd.read_csv( 168 | StringIO(pv_systems_text), 169 | names=[ 170 | "name", 171 | "system_DC_capacity_W", 172 | "address", 173 | "orientation", 174 | "num_outputs", 175 | "last_output", 176 | "system_id", 177 | "panel", 178 | "inverter", 179 | "distance_km", 180 | "latitude", 181 | "longitude", 182 | ], 183 | index_col="system_id", 184 | ) 185 | 186 | return pv_systems 187 | 188 | def get_status( 189 | self, 190 | pv_system_id: int, 191 | date: Union[str, datetime], 192 | historic: bool = True, 193 | timezone: Optional[str] = None, 194 | **kwargs, 195 | ) -> pd.DataFrame: 196 | """Get PV system status (e.g. power generation) for one day. 197 | 198 | The returned DataFrame will be empty if the PVOutput API 199 | returns 'status 400: No status found'. 200 | 201 | Args: 202 | pv_system_id: int 203 | date: str in format YYYYMMDD; or datetime 204 | (localtime of the PV system) 205 | timezone: the timezone of the systems. This will be used to add to the datetime. 206 | If None, it is not added 207 | 208 | Returns: 209 | pd.DataFrame: 210 | index: datetime (DatetimeIndex, localtime of the PV system) 211 | columns: (all np.float64): 212 | cumulative_energy_gen_Wh, 213 | energy_efficiency_kWh_per_kW, 214 | instantaneous_power_gen_W, 215 | average_power_gen_W, 216 | power_gen_normalised, 217 | energy_consumption_Wh, 218 | power_demand_W, 219 | temperature_C, 220 | voltage 221 | """ 222 | _LOG.info("system_id %d: Requesting system status for %s", pv_system_id, date) 223 | date = date_to_pvoutput_str(date) 224 | _check_date(date) 225 | 226 | api_params = { 227 | "d": date, # date, YYYYMMDD, localtime of the PV system 228 | "h": int(historic is True), # We want historical data. 229 | "limit": 288, # API limit is 288 (num of 5-min periods per day). 230 | "ext": 0, # Extended data; we don't want extended data. 231 | "sid1": pv_system_id, # SystemID. 232 | } 233 | 234 | try: 235 | pv_system_status_text = self._api_query( 236 | service="getstatus", api_params=api_params, **kwargs 237 | ) 238 | except NoStatusFound: 239 | _LOG.info("system_id %d: No status found for date %s", pv_system_id, date) 240 | pv_system_status_text = "" 241 | 242 | # See https://pvoutput.org/help.html#api-getstatus but make sure 243 | # you read the 'History Query' subsection, as a historical query 244 | # has slightly different return columns compared to a non-historical 245 | # query! 
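        # Illustrative only: the raw response is a ";"-separated list of records,
        # each of the form "YYYYMMDD,HH:MM,<value columns>", which is why it is
        # parsed below with lineterminator=";".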
246 | columns = ( 247 | [ 248 | "cumulative_energy_gen_Wh", 249 | "energy_efficiency_kWh_per_kW", 250 | "instantaneous_power_gen_W", 251 | "average_power_gen_W", 252 | "power_gen_normalised", 253 | "energy_consumption_Wh", 254 | "power_demand_W", 255 | "temperature_C", 256 | "voltage", 257 | ] 258 | if historic 259 | else [ 260 | "cumulative_energy_gen_Wh", 261 | "instantaneous_power_gen_W", 262 | "energy_consumption_Wh", 263 | "power_demand_W", 264 | "power_gen_normalised", 265 | "temperature_C", 266 | "voltage", 267 | ] 268 | ) 269 | 270 | pv_system_status = pd.read_csv( 271 | StringIO(pv_system_status_text), 272 | lineterminator=";", 273 | names=["date", "time"] + columns, 274 | parse_dates={"datetime": ["date", "time"]}, 275 | index_col=["datetime"], 276 | dtype={col: np.float64 for col in columns}, 277 | ).sort_index() 278 | 279 | # add timezone 280 | if timezone is not None: 281 | pv_system_status = pv_system_status.tz_localize(timezone).tz_convert("UTC") 282 | 283 | return pv_system_status 284 | 285 | def get_system_status( 286 | self, 287 | pv_system_ids: List[int], 288 | date: Union[str, datetime], 289 | timezone: Optional[str] = None, 290 | **kwargs, 291 | ) -> pd.DataFrame: 292 | """Get Batch of PV system status (e.g. power generation) for one day, for multiple systems 293 | 294 | The returned DataFrame will be empty if the PVOutput API 295 | returns 'status 400: No status found'. 296 | 297 | Args: 298 | pv_system_ids: list of ints. 299 | If you have a subscription service then multiple (up to 50) 300 | pv systems status can be queries at once 301 | date: str in format YYYYMMDD; or datetime 302 | (localtime of the PV system) 303 | timezone: the timezone of the systems. This will be used to add to the datetime. 304 | If None, it is not added 305 | 306 | Returns: 307 | pd.DataFrame: 308 | columns: (all np.float64): 309 | system_id, 310 | datetime, 311 | instantaneous_power_gen_W, 312 | cumulative_energy_gen_Wh, 313 | instantaneous_power_gen_W, 314 | energy_consumption_Wh", 315 | temperature_C, 316 | voltage, 317 | """ 318 | _LOG.info(f"system_ids {pv_system_ids}: Requesting batch system status for %s", date) 319 | date = date_to_pvoutput_str(date) 320 | _check_date(date) 321 | 322 | # join the system ids with a column 323 | all_pv_system_id = ",".join([str(idx) for idx in pv_system_ids]) 324 | 325 | api_params = { 326 | "dt": date, # date, YYYYMMDD, localtime of the PV system 327 | "sid1": all_pv_system_id, # SystemID. 328 | } 329 | 330 | try: 331 | pv_system_status_text = self._api_query( 332 | service="getsystemstatus", api_params=api_params, **kwargs 333 | ) 334 | 335 | except NoStatusFound: 336 | _LOG.info(f"system_id {all_pv_system_id}: No status found for date %s", date) 337 | pv_system_status_text = "no status found" 338 | 339 | # each pv system is on a new line 340 | pv_systems_status_text = pv_system_status_text.split("\n") 341 | 342 | pv_system_status = [] 343 | for pv_system_status_text in pv_systems_status_text: 344 | 345 | try: 346 | one_pv_system_status = process_system_status( 347 | pv_system_status_text=pv_system_status_text, date=date 348 | ) 349 | except Exception as e: 350 | _LOG.error( 351 | f"Could not change raw text into dataframe. 
Raw text is {pv_system_status_text}" 352 | ) 353 | raise e 354 | 355 | pv_system_status.append(one_pv_system_status) 356 | 357 | pv_system_status = pd.concat(pv_system_status) 358 | pv_system_status.reset_index(inplace=True) 359 | 360 | # add timezone 361 | if timezone is not None: 362 | pv_system_status["datetime"] = ( 363 | pd.DatetimeIndex(pv_system_status["datetime"]) 364 | .tz_localize(timezone) 365 | .tz_convert("UTC") 366 | ) 367 | 368 | return pv_system_status 369 | 370 | def get_batch_status( 371 | self, 372 | pv_system_id: int, 373 | date_to: Optional[Union[str, datetime]] = None, 374 | max_retries: Optional[int] = 1000, 375 | **kwargs, 376 | ) -> Union[None, pd.DataFrame]: 377 | """Get batch PV system status (e.g. power generation). 378 | 379 | The returned DataFrame will be empty if the PVOutput API 380 | returns 'status 400: No status found'. 381 | 382 | Data returned is limited to the last 366 days per request. 383 | To retrieve older data, use the date_to parameter. 384 | 385 | The PVOutput getbatchstatus API is asynchronous. When it's first 386 | called, it replies to say 'accepted'. This function will then 387 | wait a minute and call the API again to see if the data is ready. 388 | Set `max_retries` to 1 if you want to return immediately, even 389 | if data isn't ready yet (and hence this function will return None) 390 | 391 | https://pvoutput.org/help.html#dataservice-getbatchstatus 392 | 393 | Args: 394 | pv_system_id: int 395 | date_to: str in format YYYYMMDD; or datetime 396 | (localtime of the PV system). The returned timeseries will 397 | include 366 days of data: from YYYY-1MMDD to YYYYMMDD inclusive 398 | max_retries: int, number of times to retry after receiving 399 | a '202 Accepted' request. Set `max_retries` to 1 if you want 400 | to return immediately, even if data isn't ready yet (and hence 401 | this function will return None). 402 | 403 | Returns: 404 | None (if data isn't ready after retrying max_retries times) or 405 | pd.DataFrame: 406 | index: datetime (DatetimeIndex, localtime of the PV system) 407 | columns: (all np.float64): 408 | cumulative_energy_gen_Wh, 409 | instantaneous_power_gen_W, 410 | temperature_C, 411 | voltage 412 | """ 413 | api_params = {"sid1": pv_system_id} 414 | 415 | _set_date_param(date_to, api_params, "dt") 416 | 417 | for retry in range(max_retries): 418 | try: 419 | pv_system_status_text = self._api_query( 420 | service="getbatchstatus", api_params=api_params, use_data_service=True, **kwargs 421 | ) 422 | except NoStatusFound: 423 | _LOG.info("system_id %d: No status found for date_to %s", pv_system_id, date_to) 424 | pv_system_status_text = "" 425 | break 426 | 427 | if "Accepted 202" in pv_system_status_text: 428 | if retry == 0: 429 | _print_and_log("Request accepted.") 430 | if retry < max_retries - 1: 431 | _print_and_log("Sleeping for 1 second.") 432 | time.sleep(1) 433 | else: 434 | _print_and_log( 435 | "Call get_batch_status again in a minute to see if" " results are ready." 436 | ) 437 | else: 438 | break 439 | else: 440 | return 441 | 442 | return process_batch_status(pv_system_status_text) 443 | 444 | def get_metadata(self, pv_system_id: int, **kwargs) -> pd.Series: 445 | """Get metadata for a single PV system. 446 | 447 | Args: 448 | pv_system_id: int 449 | 450 | Returns: 451 | pd.Series. 
Index is: 452 | name, 453 | system_DC_capacity_W, 454 | address, 455 | num_panels, 456 | panel_capacity_W_each, 457 | panel_brand, 458 | num_inverters, 459 | inverter_capacity_W, 460 | inverter_brand, 461 | orientation, 462 | array_tilt_degrees, 463 | shade, 464 | install_date, 465 | latitude, 466 | longitude, 467 | status_interval_minutes, 468 | secondary_num_panels, 469 | secondary_panel_capacity_W_each, 470 | secondary_orientation, 471 | secondary_array_tilt_degrees 472 | """ 473 | pv_metadata_text = self._api_query( 474 | service="getsystem", 475 | api_params={ 476 | "array2": 1, # Provide data about secondary array, if present. 477 | "tariffs": 0, 478 | "teams": 0, 479 | "est": 0, 480 | "donations": 0, 481 | "sid1": pv_system_id, # SystemID 482 | "ext": 0, # Include extended data? 483 | }, 484 | **kwargs, 485 | ) 486 | 487 | _LOG.debug(f"getting metadata for {pv_system_id}") 488 | 489 | pv_metadata = pd.read_csv( 490 | StringIO(pv_metadata_text), 491 | lineterminator=";", 492 | names=[ 493 | "name", 494 | "system_DC_capacity_W", 495 | "address", 496 | "num_panels", 497 | "panel_capacity_W_each", 498 | "panel_brand", 499 | "num_inverters", 500 | "inverter_capacity_W", 501 | "inverter_brand", 502 | "orientation", 503 | "array_tilt_degrees", 504 | "shade", 505 | "install_date", 506 | "latitude", 507 | "longitude", 508 | "status_interval_minutes", 509 | "secondary_num_panels", 510 | "secondary_panel_capacity_W_each", 511 | "secondary_orientation", 512 | "secondary_array_tilt_degrees", 513 | ], 514 | parse_dates=["install_date"], 515 | nrows=1, 516 | ).squeeze() 517 | pv_metadata["system_id"] = pv_system_id 518 | pv_metadata.name = pv_system_id 519 | return pv_metadata 520 | 521 | def get_statistic( 522 | self, 523 | pv_system_id: int, 524 | date_from: Optional[Union[str, date]] = None, 525 | date_to: Optional[Union[str, date]] = None, 526 | **kwargs, 527 | ) -> pd.DataFrame: 528 | """Get summary stats for a single PV system. 529 | 530 | Args: 531 | pv_system_id: int 532 | date_from 533 | date_to 534 | 535 | Returns: 536 | pd.DataFrame: 537 | total_energy_gen_Wh, 538 | energy_exported_Wh, 539 | average_daily_energy_gen_Wh, 540 | minimum_daily_energy_gen_Wh, 541 | maximum_daily_energy_gen_Wh, 542 | average_efficiency_kWh_per_kW, 543 | num_outputs, # The number of days for which there's >= 1 val. 
544 | actual_date_from, 545 | actual_date_to, 546 | record_efficiency_kWh_per_kW, 547 | record_efficiency_date, 548 | query_date_from, 549 | query_date_to 550 | """ 551 | if date_from and not date_to: 552 | date_to = pd.Timestamp.now().date() 553 | if date_to and not date_from: 554 | date_from = pd.Timestamp("1900-01-01").date() 555 | 556 | api_params = { 557 | "c": 0, # consumption and import 558 | "crdr": 0, # credits / debits 559 | "sid1": pv_system_id, # SystemID 560 | } 561 | 562 | _set_date_param(date_from, api_params, "df") 563 | _set_date_param(date_to, api_params, "dt") 564 | 565 | try: 566 | pv_metadata_text = self._api_query( 567 | service="getstatistic", api_params=api_params, **kwargs 568 | ) 569 | except NoStatusFound: 570 | pv_metadata_text = "" 571 | 572 | columns = [ 573 | "total_energy_gen_Wh", 574 | "energy_exported_Wh", 575 | "average_daily_energy_gen_Wh", 576 | "minimum_daily_energy_gen_Wh", 577 | "maximum_daily_energy_gen_Wh", 578 | "average_efficiency_kWh_per_kW", 579 | "num_outputs", 580 | "actual_date_from", 581 | "actual_date_to", 582 | "record_efficiency_kWh_per_kW", 583 | "record_efficiency_date", 584 | ] 585 | date_cols = ["actual_date_from", "actual_date_to", "record_efficiency_date"] 586 | numeric_cols = set(columns) - set(date_cols) 587 | pv_metadata = pd.read_csv( 588 | StringIO(pv_metadata_text), 589 | names=columns, 590 | dtype={col: np.float32 for col in numeric_cols}, 591 | parse_dates=date_cols, 592 | ) 593 | if pv_metadata.empty: 594 | data = {col: np.float32(np.NaN) for col in numeric_cols} 595 | data.update({col: pd.NaT for col in date_cols}) 596 | pv_metadata = pd.DataFrame(data, index=[pv_system_id]) 597 | else: 598 | pv_metadata.index = [pv_system_id] 599 | 600 | pv_metadata["query_date_from"] = pd.Timestamp(date_from) if date_from else pd.NaT 601 | pv_metadata["query_date_to"] = pd.Timestamp(date_to) if date_to else pd.Timestamp.now() 602 | return pv_metadata 603 | 604 | def _get_statistic_with_cache( 605 | self, 606 | store_filename: str, 607 | pv_system_id: int, 608 | date_from: Optional[Union[str, date]] = None, 609 | date_to: Optional[Union[str, date]] = None, 610 | **kwargs, 611 | ) -> pd.Series: 612 | """ 613 | Get Statistic using cache 614 | 615 | Will try to get stats from store_filename['statistics']. If this 616 | fails, or if date_to > query_date_to, or if 617 | date_from < query_date_from, then will call the API. Note that the aim 618 | of this function is just to find the relevant actual_date_from and 619 | actual_date_to, so this function does not respect the other params. 
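For example (hypothetical dates): if the cached statistics row was produced by a
query covering 2019-01-01 to 2019-06-01, then passing date_to=2019-08-01 triggers
a fresh API call, whereas a request that falls inside the cached query dates simply
returns the cached row.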
620 | 
621 | Args: 
622 | store_filename: cache filename 
623 | pv_system_id: pv system id 
624 | date_from: the start date we want statistics from 
625 | date_to: the end date we want statistics from 
626 | **kwargs: 
627 | 
628 | Returns: Pandas data series holding various statistics 
629 | 
630 | """ 
631 | if date_from: 
632 | date_from = pd.Timestamp(date_from).date() 
633 | if date_to: 
634 | date_to = pd.Timestamp(date_to).date() 
635 | 
636 | def _get_fresh_statistic(): 
637 | _LOG.info("pv_system %d: Getting fresh statistic.", pv_system_id) 
638 | stats = self.get_statistic(pv_system_id, **kwargs) 
639 | with pd.HDFStore(store_filename, mode="a") as store: 
640 | try: 
641 | store.remove(key="statistics", where="index=pv_system_id") 
642 | except KeyError: 
643 | pass 
644 | store.append(key="statistics", value=stats) 
645 | return stats 
646 | 
647 | try: 
648 | stats = pd.read_hdf(store_filename, key="statistics", where="index=pv_system_id") 
649 | except (FileNotFoundError, KeyError): 
650 | return _get_fresh_statistic() 
651 | 
652 | if stats.empty: 
653 | return _get_fresh_statistic() 
654 | 
655 | query_date_from = stats.iloc[0]["query_date_from"] 
656 | query_date_to = stats.iloc[0]["query_date_to"] 
657 | 
658 | if ( 
659 | not pd.isnull(date_from) 
660 | and not pd.isnull(query_date_from) 
661 | and date_from < query_date_from.date() 
662 | ): 
663 | return _get_fresh_statistic() 
664 | 
665 | if not pd.isnull(date_to) and date_to > query_date_to.date(): 
666 | return _get_fresh_statistic() 
667 | 
668 | return stats 
669 | 
670 | def download_multiple_systems_to_disk( 
671 | self, 
672 | system_ids: Iterable[int], 
673 | start_date: datetime, 
674 | end_date: datetime, 
675 | output_filename: str, 
676 | timezone: Optional[str] = None, 
677 | min_data_availability: Optional[float] = 0.5, 
678 | use_get_batch_status_if_available: Optional[bool] = True, 
679 | ): 
680 | """Download multiple PV system IDs to disk. 
681 | 
682 | Data is saved to `output_filename` in HDF5 format. The exact data 
683 | format is documented in 
684 | https://github.com/openclimatefix/pvoutput/blob/master/docs/dataset.md 
685 | 
686 | This function is designed to be run for days (!) downloading 
687 | gigabytes of PV data :) As such, this function can be safely 
688 | interrupted and re-started. All the state required to re-start 
689 | is stored in the HDF5 file. 
690 | 
691 | Add appropriate handlers to the Python logger `pvoutput` to see progress. 
692 | 
693 | Args: 
694 | system_ids: List of PV system IDs to download. 
695 | start_date: Start of date range to download. 
696 | end_date: End of date range to download. 
697 | output_filename: HDF5 filename to write data to. 
698 | timezone: String representation of timezone of timeseries data. 
699 | e.g. 'Europe/London'. 
700 | min_data_availability: A float in the range [0, 1]. 1 means only 
701 | accept PV systems which have no days of missing data. 0 means 
702 | accept all PV systems, no matter if they have missing data. 
703 | Note that the data availability is measured against the date 
704 | range for which the PV system has data available, not from 
705 | the date range passed into this function. 
706 | use_get_batch_status_if_available: Bool. If true then will use 
707 | PVOutput's getbatchstatus API (which must be paid for, and 
708 | `data_service_url` must be set in `~/.pvoutput.yml` or when 
709 | initialising the PVOutput object). 
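Example (a sketch; the system IDs, dates and output filename are hypothetical,
and API credentials are assumed to be configured in `~/.pvoutput.yml`):

    pv = PVOutput()
    pv.download_multiple_systems_to_disk(
        system_ids=[10033, 10020],
        start_date=datetime(2019, 1, 1),
        end_date=datetime(2019, 12, 31),
        output_filename="uk_pv_timeseries.hdf",
        timezone="Europe/London",
    )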
710 | """ 
711 | n = len(system_ids) 
712 | for i, pv_system_id in enumerate(system_ids): 
713 | _LOG.info("**********************") 
714 | msg = "system_id {:d}: {:d} of {:d} ({:%})".format(pv_system_id, i + 1, n, (i + 1) / n) 
715 | _LOG.info(msg) 
716 | print("\r", msg, end="", flush=True) 
717 | 
718 | # Sorted list of DateRange objects. For each DateRange, 
719 | # we need to download from start_date to end_date inclusive. 
720 | date_ranges_to_download = get_date_ranges_to_download( 
721 | output_filename, pv_system_id, start_date, end_date 
722 | ) 
723 | 
724 | # How much data is actually available? 
725 | date_ranges_to_download = self._filter_date_range( 
726 | output_filename, pv_system_id, date_ranges_to_download, min_data_availability 
727 | ) 
728 | 
729 | if not date_ranges_to_download: 
730 | _LOG.info("system_id %d: No data left to download :)", pv_system_id) 
731 | continue 
732 | 
733 | _LOG.info( 
734 | "system_id %d: Will download these date ranges: %s", 
735 | pv_system_id, 
736 | date_ranges_to_download, 
737 | ) 
738 | 
739 | if use_get_batch_status_if_available: 
740 | if self.data_service_url: 
741 | self._download_multiple_using_get_batch_status( 
742 | output_filename, pv_system_id, date_ranges_to_download, timezone 
743 | ) 
744 | else: 
745 | raise ValueError("data_service_url is not set!") 
746 | else: 
747 | self._download_multiple_using_get_status( 
748 | output_filename, pv_system_id, date_ranges_to_download, timezone 
749 | ) 
750 | 
751 | def get_insolation_forecast( 
752 | self, 
753 | date: Union[str, datetime], 
754 | pv_system_id: Optional[int] = None, 
755 | timezone: Optional[str] = None, 
756 | lat: Optional[float] = None, 
757 | lon: Optional[float] = None, 
758 | **kwargs, 
759 | ): 
760 | """Get insolation forecast data 
761 | 
762 | This is for a given site, or a given location defined by 
763 | longitude and latitude. 
764 | 
765 | This is the estimated output for the site 
766 | based on ideal weather conditions. It also factors in site age (reducing 
767 | output by 1% each year), shade and orientation. Donation mode must be enabled. 
768 | See https://pvoutput.org/help.html#api-getinsolation 
769 | 
770 | Args: 
771 | date: str in format YYYYMMDD; or datetime 
772 | (localtime of the PV system) 
773 | pv_system_id: int 
774 | timezone: str 
775 | lat: float e.g. -27.4676 
776 | lon: float e.g. 153.0279 
777 | **kwargs: 
778 | 
779 | 
780 | Returns: dataframe of the insolation forecast 
781 | 
782 | """ 
783 | date = date_to_pvoutput_str(date) 
784 | _check_date(date, prediction=True) 
785 | api_params = { 
786 | "d": date, # date, YYYYMMDD, localtime of the PV system 
787 | "sid1": pv_system_id, # SystemID. 
788 | "tz": timezone, # defaults to configured timezone of system otherwise GMT 789 | } 790 | if lat is not None and lon is not None: 791 | api_params["ll"] = "{:f},{:f}".format(lat, lon) 792 | 793 | try: 794 | pv_insolation_text = self._api_query( 795 | service="getinsolation", api_params=api_params, **kwargs 796 | ) 797 | except NoStatusFound: 798 | _LOG.info("system_id %d: No status found for date %s", pv_system_id, date) 799 | pv_insolation_text = "" 800 | 801 | columns = ["predicted_power_gen_W", "predicted_cumulative_energy_gen_Wh"] 802 | pv_insolation = pd.read_csv( 803 | StringIO(pv_insolation_text), 804 | lineterminator=";", 805 | names=["time"] + columns, 806 | dtype={col: np.float64 for col in columns}, 807 | ).sort_index() 808 | pv_insolation.index = pd.to_datetime( 809 | date + " " + pv_insolation.time, format="%Y-%m-%d %H:%M" 810 | ) 811 | pv_insolation.drop("time", axis=1, inplace=True) 812 | return pv_insolation 813 | 814 | def _filter_date_range( 815 | self, 816 | store_filename: str, 817 | system_id: int, 818 | date_ranges: Iterable[DateRange], 819 | min_data_availability: Optional[float] = 0.5, 820 | ) -> List[DateRange]: 821 | """Check getstatistic to see if system_id has data for all date ranges. 822 | 823 | Args: 824 | system_id: PV system ID. 825 | store_filename: HDF5 filename to cache statistics to / from. 826 | date_ranges: List of DateRange objects. 827 | min_data_availability: A float in the range [0, 1]. 1 means only 828 | accept PV systems which have no days of missing data. 0 means 829 | accept all PV systems, no matter if they have missing data. 830 | """ 831 | if not date_ranges: 832 | return date_ranges 833 | 834 | stats = self._get_statistic_with_cache( 835 | store_filename, 836 | system_id, 837 | date_to=date_ranges[-1].end_date, 838 | wait_if_rate_limit_exceeded=True, 839 | ).squeeze() 840 | 841 | if pd.isnull(stats["actual_date_from"]) or pd.isnull(stats["actual_date_to"]): 842 | _LOG.info("system_id %d: Stats say there is no data!", system_id) 843 | return [] 844 | 845 | timeseries_date_range = DateRange(stats["actual_date_from"], stats["actual_date_to"]) 846 | 847 | data_availability = stats["num_outputs"] / (timeseries_date_range.total_days() + 1) 848 | 849 | if data_availability < min_data_availability: 850 | _LOG.info( 851 | "system_id %d: Data availability too low! 
Only %.0f %%.", 852 | system_id, 853 | data_availability * 100, 854 | ) 855 | return [] 856 | 857 | new_date_ranges = [] 858 | for date_range in date_ranges: 859 | new_date_range = date_range.intersection(timeseries_date_range) 860 | if new_date_range: 861 | new_date_ranges.append(new_date_range) 862 | return new_date_ranges 863 | 864 | def _download_multiple_using_get_batch_status( 865 | self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None 866 | ): 867 | years = merge_date_ranges_to_years(date_ranges_to_download) 868 | dates_to = [year.end_date for year in years] 869 | total_rows = self._download_multiple_worker( 870 | output_filename, pv_system_id, dates_to, timezone, use_get_status=False 871 | ) 872 | 873 | # Re-load data, sort, remove duplicate indicies, append back 874 | if total_rows: 875 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 876 | sort_and_de_dupe_pv_system(store, pv_system_id) 877 | 878 | def _download_multiple_using_get_status( 879 | self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None 880 | ): 881 | for date_range in date_ranges_to_download: 882 | dates = date_range.date_range() 883 | self._download_multiple_worker( 884 | output_filename, pv_system_id, dates, timezone, use_get_status=True 885 | ) 886 | 887 | def _download_multiple_worker( 888 | self, output_filename, pv_system_id, dates, timezone, use_get_status 889 | ) -> int: 890 | """ 891 | Download data with multiple workers 892 | 893 | Returns: 894 | total number of rows downloaded 895 | """ 896 | total_rows = 0 897 | for date_to_load in dates: 898 | _LOG.info("system_id %d: Requesting date: %s", pv_system_id, date_to_load) 899 | datetime_of_api_request = pd.Timestamp.utcnow() 900 | if use_get_status: 901 | timeseries = self.get_status( 902 | pv_system_id, date_to_load, wait_if_rate_limit_exceeded=True 903 | ) 904 | else: 905 | timeseries = self.get_batch_status(pv_system_id, date_to=date_to_load) 906 | if timeseries.empty: 907 | _LOG.info( 908 | "system_id %d: Got empty timeseries back for %s", pv_system_id, date_to_load 909 | ) 910 | if use_get_status: 911 | _append_missing_date_range( 912 | output_filename, 913 | pv_system_id, 914 | date_to_load, 915 | date_to_load, 916 | datetime_of_api_request, 917 | ) 918 | else: 919 | _append_missing_date_range( 920 | output_filename, 921 | pv_system_id, 922 | date_to_load - timedelta(days=365), 923 | date_to_load, 924 | datetime_of_api_request, 925 | ) 926 | else: 927 | total_rows += len(timeseries) 928 | _LOG.info(f"Adding timezone {timezone} to {total_rows} rows") 929 | timeseries = timeseries.tz_localize(timezone) 930 | _LOG.info( 931 | "system_id: %d: %d rows retrieved: %s to %s", 932 | pv_system_id, 933 | len(timeseries), 934 | timeseries.index[0], 935 | timeseries.index[-1], 936 | ) 937 | if use_get_status: 938 | check_pv_system_status(timeseries, date_to_load) 939 | else: 940 | _record_gaps( 941 | output_filename, 942 | pv_system_id, 943 | date_to_load, 944 | timeseries, 945 | datetime_of_api_request, 946 | ) 947 | timeseries["datetime_of_API_request"] = datetime_of_api_request 948 | timeseries["query_date"] = pd.Timestamp(date_to_load) 949 | key = system_id_to_hdf_key(pv_system_id) 950 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 951 | with warnings.catch_warnings(): 952 | warnings.simplefilter("ignore", tables.NaturalNameWarning) 953 | store.append(key=key, value=timeseries, data_columns=True) 954 | 955 | _LOG.info("system_id %d: %d total 
rows downloaded", pv_system_id, total_rows) 956 | return total_rows 957 | 958 | def _api_query( 959 | self, 960 | service: str, 961 | api_params: Dict, 962 | wait_if_rate_limit_exceeded: bool = False, 963 | use_data_service: bool = False, 964 | ) -> str: 965 | """Send API request to PVOutput.org and return content text. 966 | 967 | Args: 968 | service: string, e.g. 'search' or 'getstatus' 969 | api_params: dict 970 | wait_if_rate_limit_exceeded: bool 971 | use_data_service: bool 972 | 973 | Raises: 974 | NoStatusFound 975 | RateLimitExceeded 976 | """ 977 | get_response_func = ( 978 | self._get_data_service_response if use_data_service else self._get_api_response 979 | ) 980 | 981 | try: 982 | response = get_response_func(service, api_params) 983 | except Exception as e: 984 | _LOG.exception(e) 985 | raise 986 | 987 | try: 988 | return self._process_api_response(response) 989 | except RateLimitExceeded: 990 | msg = "PVOutput.org API rate limit exceeded!" " Rate limit will be reset at {}".format( 991 | self.rate_limit_reset_time 992 | ) 993 | _print_and_log(msg) 994 | if wait_if_rate_limit_exceeded: 995 | self.wait_for_rate_limit_reset() 996 | return self._api_query(service, api_params, wait_if_rate_limit_exceeded=False) 997 | 998 | raise RateLimitExceeded(response, msg) 999 | 1000 | def _get_api_response(self, service: str, api_params: Dict) -> requests.Response: 1001 | """ 1002 | Get the non-data service (free) response from pvoutput.org 1003 | 1004 | Args: 1005 | service: string, e.g. 'search', 'getstatus' 1006 | api_params: dict 1007 | """ 1008 | self._check_api_params() 1009 | # Create request headers 1010 | headers = { 1011 | "X-Rate-Limit": "1", 1012 | "X-Pvoutput-Apikey": self.api_key, 1013 | "X-Pvoutput-SystemId": self.system_id, 1014 | } 1015 | 1016 | api_url = urljoin(BASE_URL, "service/r2/{}.jsp".format(service)) 1017 | 1018 | return _get_response(api_url, api_params, headers) 1019 | 1020 | def _get_data_service_response(self, service: str, api_params: Dict) -> requests.Response: 1021 | """ 1022 | Get the data service response from pvoutput.org 1023 | 1024 | Args: 1025 | service: string, e.g. 
'getbatchstatus' 1026 | api_params: dict 1027 | """ 1028 | self._check_api_params() 1029 | if self.data_service_url is None: 1030 | raise ValueError("data_service_url must be set to use the data service!") 1031 | 1032 | headers = {"X-Rate-Limit": "1"} 1033 | api_params = api_params.copy() 1034 | api_params["key"] = self.api_key 1035 | api_params["sid"] = self.system_id 1036 | 1037 | api_url = urljoin(self.data_service_url, "data/r2/{}.jsp".format(service)) 1038 | 1039 | return _get_response(api_url, api_params, headers) 1040 | 1041 | def _check_api_params(self): 1042 | # Check we have relevant login details: 1043 | for param_name in ["api_key", "system_id"]: 1044 | if getattr(self, param_name) is None: 1045 | raise ValueError("Please set the {} parameter.".format(param_name)) 1046 | 1047 | def _set_rate_limit_params(self, headers): 1048 | for param_name, header_key in RATE_LIMIT_PARAMS_TO_API_HEADERS.items(): 1049 | header_value = int(headers[header_key]) 1050 | setattr(self, param_name, header_value) 1051 | 1052 | self.rate_limit_reset_time = pd.Timestamp.utcfromtimestamp(self.rate_limit_reset_time) 1053 | self.rate_limit_reset_time = self.rate_limit_reset_time.tz_localize("utc") 1054 | 1055 | _LOG.debug("%s", self.rate_limit_info()) 1056 | 1057 | def rate_limit_info(self) -> Dict: 1058 | """Get the rate limit information""" 1059 | info = {} 1060 | for param_name in RATE_LIMIT_PARAMS_TO_API_HEADERS: 1061 | info[param_name] = getattr(self, param_name) 1062 | return info 1063 | 1064 | def _process_api_response(self, response: requests.Response) -> str: 1065 | """Turns an API response into text. 1066 | 1067 | Args: 1068 | response: from _get_api_response() 1069 | 1070 | Returns: 1071 | content of the response. 1072 | 1073 | Raises: 1074 | UnicodeDecodeError 1075 | NoStatusFound 1076 | RateLimitExceeded 1077 | """ 1078 | if response.status_code == 400: 1079 | raise NoStatusFound(response=response) 1080 | 1081 | if response.status_code != 403: 1082 | try: 1083 | response.raise_for_status() 1084 | except Exception as e: 1085 | msg = "Bad status code! Response content = {}. Exception = {}".format( 1086 | response.content, e 1087 | ) 1088 | _LOG.exception(msg) 1089 | raise e.__class__(msg) 1090 | 1091 | self._set_rate_limit_params(response.headers) 1092 | 1093 | # Did we overshoot our quota? 1094 | if response.status_code == 403 and self.rate_limit_remaining <= 0: 1095 | raise RateLimitExceeded(response=response) 1096 | 1097 | try: 1098 | content = response.content.decode("latin1").strip() 1099 | except Exception as e: 1100 | msg = "Error decoding this string: {}\n{}".format(response.content, e) 1101 | _LOG.exception(msg) 1102 | raise 1103 | 1104 | # If we get to here then the content is valid :) 1105 | return content 1106 | 1107 | def wait_for_rate_limit_reset(self, do_sleeping: bool = True) -> int: 1108 | """ 1109 | Wait for reset limit 1110 | 1111 | Args: 1112 | do_sleeping: bool to do the sleeping, or not. 
1113 | 1114 | Returns: The number of seconds needed to sleep 1115 | """ 1116 | utc_now = pd.Timestamp.utcnow() 1117 | timedelta_to_wait = self.rate_limit_reset_time - utc_now 1118 | timedelta_to_wait += timedelta(minutes=3) # Just for safety 1119 | secs_to_wait = timedelta_to_wait.total_seconds() 1120 | retry_time_utc = utc_now + timedelta_to_wait 1121 | 1122 | # good to have the retry time in local so that user see 'their' time 1123 | # retry_time_local = retry_time_utc.tz_convert(tz=datetime.now(tzlocal()).tzname()) 1124 | retry_time_local = retry_time_utc 1125 | _print_and_log( 1126 | "Waiting {:.0f} seconds. Will retry at {} UTC".format(secs_to_wait, retry_time_local) 1127 | ) 1128 | if do_sleeping: 1129 | time.sleep(secs_to_wait) 1130 | 1131 | return secs_to_wait 1132 | 1133 | 1134 | def date_to_pvoutput_str(date: Union[str, datetime]) -> str: 1135 | """Convert datetime to date string for PVOutput.org in YYYYMMDD format.""" 1136 | if isinstance(date, str): 1137 | try: 1138 | datetime.strptime(date, PV_OUTPUT_DATE_FORMAT) 1139 | except ValueError: 1140 | return pd.Timestamp(date).strftime(PV_OUTPUT_DATE_FORMAT) 1141 | else: 1142 | return date 1143 | return date.strftime(PV_OUTPUT_DATE_FORMAT) 1144 | 1145 | 1146 | def _check_date(date: str, prediction=False): 1147 | """Check that date string 1148 | 1149 | 1. conforms to YYYYMMDD format, 1150 | 2. that the date isn't in the future. 1151 | 1152 | Raises: 1153 | ValueError if the date is 'bad'. 1154 | """ 1155 | dt = datetime.strptime(date, PV_OUTPUT_DATE_FORMAT) 1156 | if dt > datetime.now() and not prediction: 1157 | raise ValueError( 1158 | "" 1159 | "date should not be in the future. Got {}. Current date is {}.".format( 1160 | date, datetime.now() 1161 | ) 1162 | ) 1163 | 1164 | 1165 | def _set_date_param(dt, api_params, key): 1166 | if dt is not None: 1167 | dt = date_to_pvoutput_str(dt) 1168 | _check_date(dt) 1169 | api_params[key] = dt 1170 | 1171 | 1172 | def check_pv_system_status(pv_system_status: pd.DataFrame, requested_date: date): 1173 | """Checks the DataFrame returned by get_pv_system_status. 1174 | 1175 | Args: 1176 | pv_system_status: DataFrame returned by get_pv_system_status 1177 | requested_date: date. 1178 | 1179 | Raises: 1180 | ValueError if the DataFrame is incorrect. 1181 | """ 1182 | if not isinstance(pv_system_status, pd.DataFrame): 1183 | raise ValueError("pv_system_status must be a dataframe") 1184 | if not pv_system_status.empty: 1185 | index = pv_system_status.index 1186 | for d in [index[0], index[-1]]: 1187 | if not requested_date <= d.date() <= requested_date + ONE_DAY: 1188 | raise ValueError( 1189 | "A date in the index is outside the expected range." 
1190 | " Date from index={}, requested_date={}".format(d, requested_date) 1191 | ) 1192 | 1193 | 1194 | def _append_missing_date_range( 1195 | output_filename, pv_system_id, missing_start_date, missing_end_date, datetime_of_api_request 1196 | ): 1197 | 1198 | data = { 1199 | "missing_start_date_PV_localtime": pd.Timestamp(missing_start_date), 1200 | "missing_end_date_PV_localtime": pd.Timestamp(missing_end_date), 1201 | "datetime_of_API_request": datetime_of_api_request, 1202 | } 1203 | new_missing_date_range = pd.DataFrame(data, index=[pv_system_id]) 1204 | new_missing_date_range.index.name = "pv_system_id" 1205 | _LOG.info( 1206 | "system_id %d: Recording missing date range from %s to %s", 1207 | pv_system_id, 1208 | missing_start_date, 1209 | missing_end_date, 1210 | ) 1211 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 1212 | store.append(key="missing_dates", value=new_missing_date_range, data_columns=True) 1213 | 1214 | 1215 | def _record_gaps(output_filename, pv_system_id, date_to, timeseries, datetime_of_api_request): 1216 | dates_of_data = ( 1217 | timeseries["instantaneous_power_gen_W"].dropna().resample("D").mean().dropna().index.date 1218 | ) 1219 | dates_requested = pd.date_range(date_to - timedelta(days=365), date_to, freq="D").date 1220 | missing_dates = set(dates_requested) - set(dates_of_data) 1221 | missing_date_ranges = _convert_consecutive_dates_to_date_ranges(list(missing_dates)) 1222 | _LOG.info( 1223 | "system_id %d: %d missing date ranges found: \n%s", 1224 | pv_system_id, 1225 | len(missing_date_ranges), 1226 | missing_date_ranges, 1227 | ) 1228 | if len(missing_date_ranges) == 0: 1229 | return 1230 | # Convert to from date objects to pd.Timestamp objects, because HDF5 1231 | # doesn't like to store date objects. 
1232 | missing_date_ranges = missing_date_ranges.astype("datetime64") 1233 | missing_date_ranges["pv_system_id"] = pv_system_id 1234 | missing_date_ranges["datetime_of_API_request"] = datetime_of_api_request 1235 | missing_date_ranges.set_index("pv_system_id", inplace=True) 1236 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 1237 | store.append(key="missing_dates", value=missing_date_ranges, data_columns=True) 1238 | 1239 | 1240 | def _convert_consecutive_dates_to_date_ranges(missing_dates): 1241 | new_missing = [] 1242 | missing_dates = np.sort(np.unique(missing_dates)) 1243 | if len(missing_dates) == 0: 1244 | return pd.DataFrame(new_missing) 1245 | 1246 | gaps = np.diff(missing_dates).astype("timedelta64[D]").astype(int) > 1 1247 | gaps = np.where(gaps)[0] 1248 | 1249 | start_date = missing_dates[0] 1250 | for gap_i in gaps: 1251 | end_date = missing_dates[gap_i] 1252 | new_missing.append( 1253 | { 1254 | "missing_start_date_PV_localtime": start_date, 1255 | "missing_end_date_PV_localtime": end_date, 1256 | } 1257 | ) 1258 | start_date = missing_dates[gap_i + 1] 1259 | 1260 | end_date = missing_dates[-1] 1261 | new_missing.append( 1262 | {"missing_start_date_PV_localtime": start_date, "missing_end_date_PV_localtime": end_date} 1263 | ) 1264 | 1265 | return pd.DataFrame(new_missing) 1266 | -------------------------------------------------------------------------------- /pvoutput/utils.py: -------------------------------------------------------------------------------- 1 | """Util functions""" 2 | 3 | import logging 4 | import os 5 | import sys 6 | import warnings 7 | from datetime import date, datetime 8 | from typing import Dict, Iterable, List, Union 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import requests 13 | import tables 14 | import yaml 15 | from requests.adapters import HTTPAdapter 16 | from urllib3.util.retry import Retry 17 | 18 | from pvoutput.consts import CONFIG_FILENAME 19 | from pvoutput.daterange import DateRange, get_date_range_list 20 | 21 | _LOG = logging.getLogger("pvoutput") 22 | 23 | 24 | def _get_param_from_config_file(param_name, config_filename=CONFIG_FILENAME): 25 | with open(config_filename, mode="r") as fh: 26 | config_data = yaml.load(fh, Loader=yaml.Loader) 27 | try: 28 | value = config_data[param_name] 29 | except KeyError as e: 30 | print("Config file", config_filename, "does not contain a", param_name, "parameter.", e) 31 | raise 32 | return value 33 | 34 | 35 | def get_logger(filename=None, mode="a", level=logging.DEBUG, stream_handler=False): 36 | """ 37 | Get a logger 38 | 39 | Args: 40 | filename: get file handler filename 41 | mode: file handler mode 42 | level: logging level 43 | stream_handler: option to make a stream handler aswell 44 | 45 | Returns: logger 46 | """ 47 | if filename is None: 48 | filename = _get_param_from_config_file("log_filename") 49 | logger = logging.getLogger("pvoutput") 50 | logger.setLevel(level) 51 | logger.handlers = [logging.FileHandler(filename=filename, mode=mode)] 52 | if stream_handler: 53 | logger.handlers.append(logging.StreamHandler(sys.stdout)) 54 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 55 | for handler in logger.handlers: 56 | handler.setFormatter(formatter) 57 | 58 | # Attach urllib3's logger to our logger. 
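# Re-parenting these third-party loggers (below) makes their records propagate
# through the `pvoutput` handlers configured above, so HTTP retry and request
# warnings land in the same log file.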
59 | loggers_to_attach = ["urllib3", "requests"] 60 | for logger_name_to_attach in loggers_to_attach: 61 | logger_to_attach = logging.getLogger(logger_name_to_attach) 62 | logger_to_attach.parent = logger 63 | logger_to_attach.propagate = True 64 | 65 | return logger 66 | 67 | 68 | def _get_session_with_retry() -> requests.Session: 69 | max_retry_counts = dict( 70 | connect=720, # How many connection-related errors to retry on. 71 | # Set high because sometimes the network goes down for a 72 | # few hours at a time. 73 | # 720 x Retry.MAX_BACKOFF (120 s) = 86,400 s = 24 hrs 74 | read=20, # How many times to retry on read errors. 75 | status=20, # How many times to retry on bad status codes. 76 | ) 77 | retries = Retry( 78 | total=max(max_retry_counts.values()), 79 | backoff_factor=0.5, 80 | status_forcelist=[500, 502, 503, 504], 81 | **max_retry_counts 82 | ) 83 | session = requests.Session() 84 | session.mount("http://", HTTPAdapter(max_retries=retries)) 85 | session.mount("https://", HTTPAdapter(max_retries=retries)) 86 | return session 87 | 88 | 89 | def _get_response(api_url: str, api_params: Dict, headers: Dict) -> requests.Response: 90 | api_params_str = "&".join(["{}={}".format(key, value) for key, value in api_params.items()]) 91 | full_api_url = "{}?{}".format(api_url, api_params_str) 92 | session = _get_session_with_retry() 93 | response = session.get(full_api_url, headers=headers) 94 | _LOG.debug("response: status_code=%d; headers=%s", response.status_code, response.headers) 95 | return response 96 | 97 | 98 | def _print_and_log(msg: str, level: int = logging.INFO): 99 | _LOG.log(level, msg) 100 | print(msg) 101 | 102 | 103 | def get_system_ids_in_store(store_filename: str) -> List[int]: 104 | """ 105 | Get system ids in the hdf store 106 | 107 | Args: 108 | store_filename: hdf file name 109 | 110 | Returns: list of systems ids 111 | """ 112 | if not os.path.exists(store_filename): 113 | return [] 114 | with pd.HDFStore(store_filename, mode="r") as store: 115 | pv_system_ids = list(store.walk("/timeseries"))[0][2] 116 | return pd.to_numeric(pv_system_ids) 117 | 118 | 119 | def get_date_ranges_to_download( 120 | store_filename: str, 121 | system_id: int, 122 | start_date: Union[str, datetime], 123 | end_date: Union[str, datetime], 124 | ) -> List[DateRange]: 125 | """ 126 | Get the date ranges that we need downloaded 127 | 128 | If system_id in store, check if it already has data from 129 | start_date to end_date, taking into consideration missing_dates. 130 | 131 | Returns: list of DateRange objects 132 | For each DateRange we need to download from 133 | start_date to end_date inclusive. 
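For example (hypothetical store contents): if the store already holds data for
2019-01-03, then asking for 2019-01-01 to 2019-01-05 returns two DateRanges,
2019-01-01 to 2019-01-02 and 2019-01-04 to 2019-01-05.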
134 | """ 
135 | dates_to_download = list(pd.date_range(start_date, end_date, freq="D")) 
136 | dates_to_download = datetime_list_to_dates(dates_to_download) 
137 | dates_already_downloaded = get_dates_already_downloaded(store_filename, system_id) 
138 | dates_to_download = set(dates_to_download) - set(dates_already_downloaded) 
139 | missing_dates_for_id = get_missing_dates_for_id(store_filename, system_id) 
140 | dates_to_download -= set(missing_dates_for_id) 
141 | return get_date_range_list(list(dates_to_download)) 
142 | 
143 | 
144 | def get_missing_dates_for_id(store_filename: str, system_id: int) -> List: 
145 | """ 
146 | Get missing dates for one PV system id 
147 | 
148 | Args: 
149 | store_filename: filename of hdf store 
150 | system_id: system id 
151 | 
152 | Returns: list of missing dates 
153 | """ 
154 | if not os.path.exists(store_filename): 
155 | return [] 
156 | 
157 | with pd.HDFStore(store_filename, mode="r") as store: 
158 | try: 
159 | missing_dates_for_id = store.select( 
160 | key="missing_dates", 
161 | where="index=system_id", 
162 | columns=["missing_start_date_PV_localtime", "missing_end_date_PV_localtime"], 
163 | ) 
164 | except Exception as e: 
165 | _LOG.debug(e) 
166 | return [] 
167 | 
168 | missing_dates = [] 
169 | for _, row in missing_dates_for_id.iterrows(): 
170 | missing_date_range = pd.date_range( 
171 | row["missing_start_date_PV_localtime"], row["missing_end_date_PV_localtime"], freq="D" 
172 | ).tolist() 
173 | missing_dates.extend(missing_date_range) 
174 | 
175 | missing_dates = np.sort(np.unique(missing_dates)) 
176 | missing_dates = datetime_list_to_dates(missing_dates) 
177 | print() 
178 | _LOG.info("system_id %d: %d missing dates already found", system_id, len(missing_dates)) 
179 | return missing_dates 
180 | 
181 | 
182 | def datetime_list_to_dates(datetimes: Iterable[datetime]) -> Iterable[date]: 
183 | """ 
184 | Change datetime list to dates 
185 | 
186 | Args: 
187 | datetimes: list of datetimes 
188 | 
189 | Returns: datetime index of dates 
190 | """ 
191 | if not isinstance(datetimes, Iterable): 
192 | datetimes = [datetimes] 
193 | return pd.DatetimeIndex(datetimes).date 
194 | 
195 | 
196 | def get_dates_already_downloaded(store_filename, system_id) -> set: 
197 | """ 
198 | Get the dates that have already been downloaded 
199 | 
200 | Args: 
201 | store_filename: filename of hdf file 
202 | system_id: one system id 
203 | 
204 | Returns: set of datetimes already downloaded 
205 | 
206 | """ 
207 | if not os.path.exists(store_filename): 
208 | return set([]) 
209 | 
210 | with pd.HDFStore(store_filename, mode="r") as store: 
211 | key = system_id_to_hdf_key(system_id) 
212 | try: 
213 | datetimes = store.select(key=key, columns=["datetime", "query_date"]) 
214 | except KeyError: 
215 | return set([]) 
216 | else: 
217 | query_dates = datetime_list_to_dates(datetimes["query_date"].dropna()) 
218 | return set(datetimes.index.date).union(query_dates) 
219 | 
220 | 
221 | def system_id_to_hdf_key(system_id: int) -> str: 
222 | """ 
223 | Change system id to a hdf key 
224 | 
225 | Args: 
226 | system_id: system id 
227 | 
228 | Returns: key 
229 | """ 
230 | return "/timeseries/{:d}".format(system_id) 
231 | 
232 | 
233 | def sort_and_de_dupe_pv_system(store, pv_system_id): 
234 | """ 
235 | Sort and de-duplicate one PV system's timeseries 
236 | 
237 | Args: 
238 | store: store of pv systems 
239 | pv_system_id: one pv system id 
240 | 
241 | """ 
242 | key = system_id_to_hdf_key(pv_system_id) 
243 | timeseries = store[key] 
244 | timeseries.sort_index(inplace=True) 
245 | timeseries = timeseries[~timeseries.index.duplicated()] 
246 | store.remove(key) 
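# The old node is removed above and the cleaned, sorted timeseries is appended
# back under the same key below; HDF5 table nodes can't easily be edited in
# place, so re-writing the node is how the duplicated rows get dropped.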
247 | with warnings.catch_warnings(): 248 | warnings.simplefilter("ignore", tables.NaturalNameWarning) 249 | store.append(key, timeseries, data_columns=True) 250 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pyproj 4 | pyshp 5 | shapely 6 | cython>0.15.1 7 | geopandas 8 | pytest 9 | pyyaml 10 | tables 11 | matplotlib 12 | jupyter 13 | urllib3 14 | requests 15 | beautifulsoup4 16 | -------------------------------------------------------------------------------- /scripts/fetch_pv_timeseries.py: -------------------------------------------------------------------------------- 1 | """Tool for importing timeseries PV data from PVOutput. 2 | 3 | Takes in a PVOutput system csv file, and fetches the 4 | PV system Timeseries data as a hdf file as described by the contents of 5 | input, built according to the PVOutput library hdf file spec. 6 | The output file is named according to the inputfile, with 7 | "systems" replaced with "timeseries", e.g. 8 | PVOutput_Albania_systems.csv -> PVOutput_Albania_timeseries.hdf 9 | 10 | Typical usage example: 11 | 12 | python fetch_pv_timeseries.py -s system.csv -o out --startdate 2019-07-25 --enddate 2020-07-25 13 | 14 | Requirements: 15 | 16 | Either: set the env vars 17 | - DATA_SERVICE_URL 18 | - PVOUTPUT_AUTH_SYSTEMID 19 | - PVOUTPUT_AUTH_APIKEY, 20 | pass their equivalent arguments to the command, 21 | or create and use a ~/.pvoutput.yml file as described in the PVOutput library documentation 22 | """ 23 | 24 | from pvoutput import * 25 | 26 | import click as cl 27 | import datetime as dt 28 | import sys 29 | import pandas as pd 30 | import pathlib 31 | import logging 32 | 33 | 34 | @cl.command() 35 | @cl.option( 36 | "-s", 37 | "--systemfile", 38 | "systemfile_path", 39 | envvar="SYSTEMFILE", 40 | required=True, 41 | type=cl.Path(exists=True), 42 | ) 43 | @cl.option( 44 | "-o", 45 | "--outdir", 46 | "output_directory", 47 | envvar="OUTDIR", 48 | default="/mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org", 49 | type=cl.Path(exists=False, dir_okay=True), 50 | ) 51 | @cl.option( 52 | "--startdate", "start_date", envvar="STARTDATE", default="2019-05-20", type=cl.DateTime() 53 | ) 54 | @cl.option("--enddate", "end_date", envvar="ENDDATE", default="2019-08-20", type=cl.DateTime()) 55 | @cl.option("--data_service_url", envvar="DATA_SERVICE_URL") 56 | @cl.option("--pvo_systemid", envvar="PVOUTPUT_AUTH_SYSTEMID", required=True, type=str) 57 | @cl.option("--pvo_apikey", envvar="PVOUTPUT_AUTH_APIKEY", required=True, type=str) 58 | def run( 59 | output_directory: str, 60 | systemfile_path: str, 61 | pvo_systemid: str, 62 | pvo_apikey: str, 63 | data_service_url: str, 64 | start_date: dt.datetime, 65 | end_date: dt.datetime, 66 | ): 67 | if end_date < start_date: 68 | sys.exit("End date cannot occur before start date") 69 | 70 | # Create output directory if it doesn't already exist 71 | os.makedirs(output_directory, exist_ok=True) 72 | 73 | # Instantiate PVOutput library 74 | pv: pvoutput.PVOutput = PVOutput( 75 | system_id=pvo_systemid, api_key=pvo_apikey, data_service_url=data_service_url 76 | ) 77 | 78 | # Read in input systemsfile 79 | pv_systems: pd.DataFrame = pd.read_csv(systemfile_path, index_col="system_id") 80 | 81 | # Write output file 82 | filename: str = pathlib.Path(systemfile_path).stem.replace("systems", "timeseries") + ".hdf" 83 | logging.info(f"Writing to 
{output_directory}/{filename}") 84 | pv.download_multiple_systems_to_disk( 85 | system_ids=pv_systems.index, 86 | start_date=start_date, 87 | end_date=end_date, 88 | output_filename=output_directory + "/" + filename, 89 | ) 90 | 91 | 92 | if __name__ == "__main__": 93 | run() 94 | -------------------------------------------------------------------------------- /scripts/scrape_country_codes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads all country codes from PVOutput. 3 | Prints and saves a dictionary mapping the country names to 4 | their codes. 5 | """ 6 | 7 | import json 8 | 9 | import urllib3 10 | from bs4 import BeautifulSoup 11 | 12 | COUNTRY_PAGES = "https://pvoutput.org/map.jsp?country=" 13 | MAX_COUNTRY_INT = 257 14 | 15 | 16 | def get_country_name(manager: urllib3.PoolManager, code: int) -> str: 17 | 18 | country_url = f"{COUNTRY_PAGES}{code}" 19 | 20 | response = manager.request("GET", country_url) 21 | soup = BeautifulSoup(response.data, "html.parser") 22 | 23 | title = soup.title.string 24 | 25 | return title.split("|")[0].strip() 26 | 27 | 28 | def get_all_countries() -> None: 29 | 30 | output_dict = {} 31 | 32 | manager = urllib3.PoolManager() 33 | 34 | for country_int in range(1, MAX_COUNTRY_INT + 1): 35 | country_str = get_country_name(manager, country_int) 36 | output_dict[country_str] = int(country_int) 37 | 38 | print(output_dict) 39 | str_dict = json.dumps(output_dict) 40 | 41 | with open("country_codes.txt", "w") as f: 42 | f.write(str_dict) 43 | 44 | 45 | if __name__ == "__main__": 46 | get_all_countries() 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import find_packages, setup 4 | 5 | this_directory = Path(__file__).parent 6 | install_requires = (this_directory / "requirements.txt").read_text().splitlines() 7 | long_description = (this_directory / "README.md").read_text() 8 | 9 | setup( 10 | name="pvoutput-ocf", 11 | version="0.1.33", 12 | license="MIT", 13 | packages=find_packages(), 14 | install_requires=install_requires, 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | company="Open Climate Fix Ltd", 18 | author_email="info@openclimatefix.org", 19 | ) 20 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import pickle 4 | from functools import partial 5 | 6 | import pytest 7 | 8 | from pvoutput import mapscraper as ms 9 | 10 | 11 | @pytest.fixture 12 | def data_dir(): 13 | # Taken from http://stackoverflow.com/a/6098238/732596 14 | data_dir = os.path.dirname(inspect.getfile(inspect.currentframe())) 15 | data_dir = os.path.abspath(data_dir) 16 | assert os.path.isdir(data_dir), data_dir + " does not exist." 
17 | return data_dir 18 | 19 | 20 | def get_cleaned_test_soup(data_dir): 21 | test_soup_filepath = os.path.join(data_dir, "data/mapscraper_soup.pickle") 22 | with open(test_soup_filepath, "rb") as f: 23 | test_soup = pickle.load(f) 24 | return ms.clean_soup(test_soup) 25 | 26 | 27 | @pytest.fixture() 28 | def get_test_dict_of_dfs(data_dir): 29 | dict_filepath = os.path.join(data_dir, "data/mapscraper_dict_of_dfs.pickle") 30 | with open(dict_filepath, "rb") as f: 31 | test_soup = pickle.load(f) 32 | return test_soup 33 | 34 | 35 | @pytest.fixture() 36 | def get_function_dict(data_dir): 37 | # using partials so functions only get executed when needed 38 | soup = get_cleaned_test_soup(data_dir) 39 | df = ms._process_system_size_col(soup) 40 | index = df.index 41 | keys = get_keys_for_dict() 42 | functions = ( 43 | partial(ms._process_system_size_col, soup), 44 | partial(ms._process_output_col, soup, index), 45 | partial(ms._process_generation_and_average_cols, soup, index), 46 | partial(ms._process_efficiency_col, soup, index), 47 | partial(ms._process_metadata, soup), 48 | ) 49 | function_dict = dict(zip(keys, functions)) 50 | return function_dict 51 | 52 | 53 | def get_keys_for_dict(): 54 | keys = ( 55 | "pv_system_size_metadata", 56 | "process_output_col", 57 | "process_generation_and_average_cols", 58 | "process_efficiency_col", 59 | "process_metadata", 60 | ) 61 | return keys 62 | -------------------------------------------------------------------------------- /tests/data/create_mapscraper_test_files.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sys 3 | 4 | from pvoutput import mapscraper as ms 5 | from tests.conftest import get_keys_for_dict 6 | 7 | 8 | def save_pickle_test_file(file, filename): 9 | # needed to avoid occasional RecursionError 10 | sys.setrecursionlimit(10000) 11 | with open(filename, "wb") as f: 12 | pickle.dump(file, f) 13 | 14 | 15 | def get_raw_soup(): 16 | url = ms._create_map_url(country_code=243, page_number=1, ascending=False, sort_by="capacity") 17 | return ms.get_soup(url, raw=True) 18 | 19 | 20 | def main(): 21 | raw_soup = get_raw_soup() 22 | save_pickle_test_file(raw_soup, "mapscraper_soup.pickle") 23 | soup = ms.clean_soup(raw_soup) 24 | keys = get_keys_for_dict() 25 | values = ms._process_metadata(soup, True) 26 | df_dict = dict(zip(keys, values)) 27 | save_pickle_test_file(df_dict, "mapscraper_dict_of_dfs.pickle") 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /tests/data/create_test_hdf.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | import pandas as pd 3 | 4 | FILENAME = "test.hdf" 5 | PV_SYSTEM_ID = 123 6 | 7 | 8 | def get_timeseries_df(): 9 | df = pd.DataFrame( 10 | index=pd.date_range("2019-01-01", periods=20, freq="5T"), 11 | columns=["datetime_of_API_request", "query_date", "instantaneous_power_gen_W"], 12 | ) 13 | df.index.name = "datetime" 14 | df["datetime_of_API_request"] = [pd.Timestamp("2019-02-01", tz="UTC")] * len(df) 15 | df["query_date"] = [pd.Timestamp("2019-01-01")] * len(df) 16 | df["instantaneous_power_gen_W"] = list(range(20)) 17 | return df 18 | 19 | 20 | def get_missing_dates(): 21 | df = pd.DataFrame( 22 | [ 23 | [ 24 | PV_SYSTEM_ID, 25 | pd.Timestamp("2019-01-02"), 26 | pd.Timestamp("2019-01-02"), 27 | pd.Timestamp("2019-02-01", tz="UTC"), 28 | ], 29 | [ 30 | PV_SYSTEM_ID, 31 | pd.Timestamp("2019-01-03"), 32 | pd.Timestamp("2019-01-03"), 33 | pd.Timestamp("2019-02-01", tz="UTC"), 34 | ], 35 | ], 36 | columns=[ 37 | "pv_system_id", 38 | "missing_start_date_PV_localtime", 39 | "missing_end_date_PV_localtime", 40 | "datetime_of_API_request", 41 | ], 42 | ).set_index("pv_system_id") 43 | return df 44 | 45 | 46 | def main(): 47 | timeseries = get_timeseries_df() 48 | missing_dates = get_missing_dates() 49 | with pd.HDFStore(FILENAME, mode="w") as store: 50 | store.append(key="/timeseries/{}".format(PV_SYSTEM_ID), value=timeseries, data_columns=True) 51 | store.append(key="missing_dates", value=missing_dates, data_columns=True) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /tests/data/mapscraper_dict_of_dfs.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/data/mapscraper_dict_of_dfs.pickle -------------------------------------------------------------------------------- /tests/data/mapscraper_soup.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/data/mapscraper_soup.pickle -------------------------------------------------------------------------------- /tests/data/test.hdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/data/test.hdf -------------------------------------------------------------------------------- /tests/test_daterange.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import pandas as pd 4 | 5 | from pvoutput import daterange 6 | from pvoutput.daterange import DateRange, merge_date_ranges_to_years 7 | 8 | 9 | def test_get_date_range_list(): 10 | def _get_date_range(start_date, periods): 11 | return list(pd.date_range(start_date, periods=periods, freq="D")) 12 | 13 | dates = [] 14 | for start_date, periods in [("2019-01-01", 5), ("2019-05-01", 3), ("2015-04-01", 1)]: 15 | dates.extend(_get_date_range(start_date, periods)) 16 | 17 | date_range_list = daterange.get_date_range_list(dates) 18 | assert date_range_list[0].start_date == date(2015, 4, 1) 19 | assert date_range_list[0].end_date == date(2015, 4, 1) 20 | 21 | assert date_range_list[1].start_date == date(2019, 1, 1) 22 | assert date_range_list[1].end_date == date(2019, 1, 5) 23 | 24 | assert date_range_list[2].start_date == 
date(2019, 5, 1) 25 | assert date_range_list[2].end_date == date(2019, 5, 3) 26 | 27 | assert daterange.get_date_range_list([]) == [] 28 | 29 | 30 | def test_intersection(): 31 | assert ( 32 | DateRange("2019-01-01", "2019-01-02").intersection(DateRange("2020-01-01", "2020-01-02")) 33 | is None 34 | ) 35 | 36 | assert DateRange("2019-01-01", "2019-01-10").intersection( 37 | DateRange("2019-01-01", "2019-01-02") 38 | ) == DateRange("2019-01-01", "2019-01-02") 39 | 40 | assert DateRange("2019-01-01", "2019-01-10").intersection( 41 | DateRange("2019-01-05", "2019-01-20") 42 | ) == DateRange("2019-01-05", "2019-01-10") 43 | 44 | year = DateRange("2018-01-1", "2019-01-01") 45 | dec = DateRange("2018-12-01", "2019-01-01") 46 | assert year.intersection(dec) == dec 47 | 48 | june = DateRange("2018-06-01", "2018-07-01") 49 | assert year.intersection(june) == june 50 | 51 | incomplete_overlap = DateRange("2017-07-01", "2018-02-01") 52 | assert year.intersection(incomplete_overlap) != incomplete_overlap 53 | 54 | 55 | def test_total_days(): 56 | assert DateRange("2019-01-01", "2019-01-10").total_days() == 9 57 | 58 | 59 | def test_split_into_years(): 60 | short_dr = DateRange("2019-01-01", "2019-01-10") 61 | assert short_dr.split_into_years() == [short_dr] 62 | 63 | one_year = DateRange("2019-01-01", "2020-01-01") 64 | assert one_year.split_into_years() == [one_year] 65 | 66 | year_and_half = DateRange("2019-01-01", "2020-06-01") 67 | assert year_and_half.split_into_years() == [ 68 | DateRange("2019-06-02", "2020-06-01"), 69 | DateRange("2019-01-01", "2019-06-02"), 70 | ] 71 | 72 | 73 | def test_merge_date_ranges_to_years(): 74 | jan = DateRange("2018-01-01", "2018-02-01") 75 | multiyear = DateRange("2017-01-01", "2018-02-01") 76 | old_multiyear = DateRange("2014-01-01", "2016-02-01") 77 | ancient_jan = DateRange("2010-01-01", "2010-02-01") 78 | for date_ranges, merged in [ 79 | ([], []), 80 | ([jan], [DateRange("2017-02-01", "2018-02-01")]), 81 | ( 82 | [multiyear], 83 | [DateRange("2017-02-01", "2018-02-01"), DateRange("2016-02-02", "2017-02-01")], 84 | ), 85 | ( 86 | [old_multiyear, multiyear], 87 | [ 88 | DateRange("2017-02-01", "2018-02-01"), 89 | DateRange("2016-02-02", "2017-02-01"), 90 | DateRange("2015-02-01", "2016-02-01"), 91 | DateRange("2014-02-01", "2015-02-01"), 92 | DateRange("2013-02-01", "2014-02-01"), 93 | ], 94 | ), 95 | ( 96 | [ancient_jan, old_multiyear, multiyear], 97 | [ 98 | DateRange("2017-02-01", "2018-02-01"), 99 | DateRange("2016-02-02", "2017-02-01"), 100 | DateRange("2015-02-01", "2016-02-01"), 101 | DateRange("2014-02-01", "2015-02-01"), 102 | DateRange("2013-02-01", "2014-02-01"), 103 | DateRange("2009-02-01", "2010-02-01"), 104 | ], 105 | ), 106 | ]: 107 | assert merge_date_ranges_to_years(date_ranges) == merged 108 | -------------------------------------------------------------------------------- /tests/test_grid_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pvoutput.grid_search.grid_search import GridSearch 4 | 5 | SHOW = True 6 | if "CI" in os.environ: 7 | SHOW = False 8 | 9 | 10 | def test_init(): 11 | """Test that Grid search can be initiated""" 12 | _ = GridSearch() 13 | 14 | 15 | def test_list_countries(): 16 | """Get list of countries""" 17 | grd = GridSearch() 18 | countries = grd.nat_earth.list_countries() 19 | assert len(countries) == 258 20 | 21 | 22 | def test_uk_grid(): 23 | """Example 1: Get UK grid 24 | 25 | Use this to clip to a bounding box as well as the countries selected 26 
| List as many countries as you want, or set to None for world-wide 27 | Only include search points within a certain radius of a location (see Example 3) 28 | Increase this if you'd like to consider systems "near" the target region (see Example 2) 29 | Allow some extra overlap due to inaccuracies in measuring distance 30 | EPSG:27700 is OSGB36 / British National Grid 31 | Gives a nice plot of the region and grid 32 | """ 33 | grd = GridSearch() 34 | ukgrid = grd.generate_grid( 35 | bbox=[45, -15, 58, 15], 36 | countries=["United Kingdom"], 37 | radial_clip=None, 38 | buffer=0, 39 | search_radius=24.5, 40 | local_crs_epsg=27700, 41 | show=SHOW, 42 | ) 43 | assert len(ukgrid) > 100 44 | 45 | 46 | def test_luxembourg_grid(): 47 | """Example 2: Make Luxembourg grid 48 | 49 | Include search radii within 50km of Luzembourgs border 50 | Allow some extra overlap due to inaccuracies in measuring distance 51 | EPSG:2169 is Luxembourg 1930 / Gauss 52 | 53 | """ 54 | grd = GridSearch() 55 | luxgrid = grd.generate_grid( 56 | countries=["Luxembourg"], buffer=50, search_radius=24.5, local_crs_epsg=2169, show=SHOW 57 | ) 58 | luxgrid.head() 59 | assert len(luxgrid) == 18 60 | 61 | 62 | def test_sheffield_grid(): 63 | """Make grid around Sheffield 64 | 65 | Only include search points within a 100km of the TUOS Physics Department 66 | EPSG:27700 is OSGB36 / British National Grid 67 | 68 | """ 69 | grd = GridSearch() 70 | shefgrid = grd.generate_grid( 71 | radial_clip=( 72 | 53.381, 73 | -1.486, 74 | 100.0, 75 | ), # Only include search points within a 100km of the TUOS Physics Department 76 | local_crs_epsg=27700, # EPSG:27700 is OSGB36 / British National Grid 77 | show=SHOW, 78 | ) 79 | assert len(shefgrid) == 29 80 | 81 | 82 | def test_balkan_grid(): 83 | """Plot grid around Balkan area""" 84 | grd = GridSearch() 85 | balkan_grid = grd.generate_grid( 86 | countries=[ 87 | "Bosnia and Herz.", 88 | "Croatia", 89 | "Hungary", 90 | "Romania", 91 | "Bulgaria", 92 | "North Macedonia", 93 | "Kosovo", 94 | "Albania", 95 | "Montenegro", 96 | "Serbia", 97 | ], 98 | search_radius=24.5, 99 | show=SHOW, 100 | ) 101 | assert len(balkan_grid) == 733 102 | -------------------------------------------------------------------------------- /tests/test_mapscraper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from pvoutput import mapscraper as ms 5 | from pvoutput.consts import MAP_URL 6 | 7 | 8 | def compare_function_output_to_pickle(key, function_dict, dict_of_dfs, series=False): 9 | df_from_func = function_dict[key]() 10 | test_df = dict_of_dfs[key] 11 | if series: 12 | return pd.testing.assert_series_equal(df_from_func, test_df) 13 | return pd.testing.assert_frame_equal(df_from_func, test_df, check_like=True) 14 | 15 | 16 | def test_convert_to_country_code(): 17 | assert ms._convert_to_country_code(1) == 1 18 | assert ms._convert_to_country_code("United Kingdom") == 243 19 | 20 | def _assert_raises(bad_countries, exception): 21 | for bad_country in bad_countries: 22 | with pytest.raises(exception): 23 | ms._convert_to_country_code(bad_country) 24 | pytest.fail( 25 | "Failed to raise {} for country={}".format(exception.__name__, bad_country) 26 | ) 27 | 28 | _assert_raises([-1, -100, 1000, "blah"], ValueError) 29 | 30 | 31 | def test_create_map_url(): 32 | assert ms._create_map_url() == MAP_URL 33 | assert ms._create_map_url(country_code=1) == MAP_URL + "?country=1" 34 | assert ms._create_map_url(page_number=2) == MAP_URL + "?p=2" 35 | 
--------------------------------------------------------------------------------
/tests/test_mapscraper.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | from pvoutput import mapscraper as ms
5 | from pvoutput.consts import MAP_URL
6 |
7 |
8 | def compare_function_output_to_pickle(key, function_dict, dict_of_dfs, series=False):
9 |     df_from_func = function_dict[key]()
10 |     test_df = dict_of_dfs[key]
11 |     if series:
12 |         return pd.testing.assert_series_equal(df_from_func, test_df)
13 |     return pd.testing.assert_frame_equal(df_from_func, test_df, check_like=True)
14 |
15 |
16 | def test_convert_to_country_code():
17 |     assert ms._convert_to_country_code(1) == 1
18 |     assert ms._convert_to_country_code("United Kingdom") == 243
19 |
20 |     def _assert_raises(bad_countries, exception):
21 |         for bad_country in bad_countries:
22 |             with pytest.raises(exception):
23 |                 ms._convert_to_country_code(bad_country)
24 |                 pytest.fail(
25 |                     "Failed to raise {} for country={}".format(exception.__name__, bad_country)
26 |                 )
27 |
28 |     _assert_raises([-1, -100, 1000, "blah"], ValueError)
29 |
30 |
31 | def test_create_map_url():
32 |     assert ms._create_map_url() == MAP_URL
33 |     assert ms._create_map_url(country_code=1) == MAP_URL + "?country=1"
34 |     assert ms._create_map_url(page_number=2) == MAP_URL + "?p=2"
35 |     assert ms._create_map_url(ascending=True) == MAP_URL + "?d=asc"
36 |     assert ms._create_map_url(ascending=False) == MAP_URL + "?d=desc"
37 |     assert ms._create_map_url(sort_by="efficiency") == MAP_URL + "?o=gss"
38 |     with pytest.raises(ValueError):
39 |         ms._create_map_url(sort_by="blah")
40 |
41 |
42 | def test_pv_system_size_metadata(get_function_dict, get_test_dict_of_dfs):
43 |     assert (
44 |         compare_function_output_to_pickle(
45 |             "pv_system_size_metadata", get_function_dict, get_test_dict_of_dfs
46 |         )
47 |         is None
48 |     )
49 |
50 |
51 | def test_process_output_col(get_function_dict, get_test_dict_of_dfs):
52 |     assert (
53 |         compare_function_output_to_pickle(
54 |             "process_output_col", get_function_dict, get_test_dict_of_dfs, series=True
55 |         )
56 |         is None
57 |     )
58 |
59 |
60 | def test_process_generation_and_average_cols(get_function_dict, get_test_dict_of_dfs):
61 |     assert (
62 |         compare_function_output_to_pickle(
63 |             "process_generation_and_average_cols", get_function_dict, get_test_dict_of_dfs
64 |         )
65 |         is None
66 |     )
67 |
68 |
69 | def test_process_efficiency_col(get_function_dict, get_test_dict_of_dfs):
70 |     assert (
71 |         compare_function_output_to_pickle(
72 |             "process_efficiency_col", get_function_dict, get_test_dict_of_dfs, series=True
73 |         )
74 |         is None
75 |     )
76 |
77 |
78 | def test_process_metadata(get_function_dict, get_test_dict_of_dfs):
79 |     assert (
80 |         compare_function_output_to_pickle(
81 |             "process_metadata", get_function_dict, get_test_dict_of_dfs
82 |         )
83 |         is None
84 |     )
85 |
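A short standalone sketch, not part of the repository, of the two mapscraper helpers exercised above; the expected URL for country code 243 is inferred by analogy with the country_code=1 case and is an assumption, not something the tests assert.

# Minimal sketch (not part of the repository): country-code lookup plus URL building.
from pvoutput import mapscraper as ms
from pvoutput.consts import MAP_URL

uk_code = ms._convert_to_country_code("United Kingdom")  # 243, per the test above
url = ms._create_map_url(country_code=uk_code)
assert url == MAP_URL + "?country=243"  # assumed, by analogy with the country_code=1 case
print(url)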
--------------------------------------------------------------------------------
/tests/test_process.py:
--------------------------------------------------------------------------------
1 | from datetime import date
2 | from io import StringIO
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import pytest
7 |
8 | from pvoutput.prcoess import process_batch_status, process_system_status
9 |
10 |
11 | def test_process_system_status():
12 |     pv_system_status_text = "1234;07:45,21,255,1,2;"
13 |     one_status = process_system_status(
14 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
15 |     )
16 |     assert len(one_status) == 1
17 |
18 |
19 | def test_process_system_status_2():
20 |     # note that the second entry has a missing data value
21 |     pv_system_status_text = "1234;07:45,21,255,1,5;" "07:50,22,257,2;" "07:55,23,256,3,4"
22 |
23 |     one_status = process_system_status(
24 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
25 |     )
26 |     assert len(one_status) == 3
27 |     assert (one_status["system_id"] == 1234).all()
28 |
29 |
30 | def test_process_system_status_none():
31 |
32 |     one_status = process_system_status(
33 |         pv_system_status_text="no status found", date=date(2022, 1, 1)
34 |     )
35 |     assert len(one_status) == 0
36 |
37 |
38 | def test_process_system_status_less_columns_two_data_points():
39 |     # both entries are missing their trailing data values
40 |     pv_system_status_text = "1234;07:45,21,255;" "07:45,22,256"
41 |     one_status = process_system_status(
42 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
43 |     )
44 |     assert len(one_status) == 2
45 |
46 |
47 | def test_process_system_status_less_columns_one_data_points():
48 |     # this entry is missing its trailing data values
49 |     pv_system_status_text = "1234;07:45,21,255"
50 |     one_status = process_system_status(
51 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
52 |     )
53 |     assert len(one_status) == 1
54 |     assert np.isnan(one_status["temperature_C"][0])
55 |
56 |
57 | def test_process_batch_status():
58 |     # Response text copied from
59 |     # https://pvoutput.org/help.html#dataservice-getbatchstatus
60 |     response_text = """
61 | 20140330;07:35,2,24;07:40,4,24;07:45,6,24;07:50,8,24;07:55,13,60;08:00,24,132
62 | 20140329;07:35,2,24;07:40,4,24;07:45,6,24;07:50,8,24;07:55,13,60;08:00,24,132
63 | 20140328;07:35,2,24;07:40,4,24;07:45,6,24;07:50,8,24;07:55,13,60;08:00,24,132"""
64 |
65 |     correct_interpretation_csv = """
66 | datetime,cumulative_energy_gen_Wh,instantaneous_power_gen_W,temperature_C,voltage
67 | 2014-03-28 07:35:00,2.0,24.0,,
68 | 2014-03-28 07:40:00,4.0,24.0,,
69 | 2014-03-28 07:45:00,6.0,24.0,,
70 | 2014-03-28 07:50:00,8.0,24.0,,
71 | 2014-03-28 07:55:00,13.0,60.0,,
72 | 2014-03-28 08:00:00,24.0,132.0,,
73 | 2014-03-29 07:35:00,2.0,24.0,,
74 | 2014-03-29 07:40:00,4.0,24.0,,
75 | 2014-03-29 07:45:00,6.0,24.0,,
76 | 2014-03-29 07:50:00,8.0,24.0,,
77 | 2014-03-29 07:55:00,13.0,60.0,,
78 | 2014-03-29 08:00:00,24.0,132.0,,
79 | 2014-03-30 07:35:00,2.0,24.0,,
80 | 2014-03-30 07:40:00,4.0,24.0,,
81 | 2014-03-30 07:45:00,6.0,24.0,,
82 | 2014-03-30 07:50:00,8.0,24.0,,
83 | 2014-03-30 07:55:00,13.0,60.0,,
84 | 2014-03-30 08:00:00,24.0,132.0,,"""
85 |
86 |     df = process_batch_status(response_text)
87 |     correct_df = pd.read_csv(
88 |         StringIO(correct_interpretation_csv), parse_dates=["datetime"], index_col="datetime"
89 |     )
90 |     pd.testing.assert_frame_equal(df, correct_df)
91 |
92 |     empty_df = process_batch_status("")
93 |     assert empty_df.empty, "DataFrame should be empty but it was:\n{}\n".format(empty_df)
94 |
95 |     with pytest.raises(NotImplementedError):
96 |         process_batch_status("20140330;07:35,2,24,2,24,23.1,230.3")
97 |
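A small standalone sketch, not part of the repository, of feeding a raw status string through process_system_status as the tests above do; the two-entry string is made up for illustration and only columns the tests themselves reference are checked.

# Minimal sketch (not part of the repository). Note the module really is spelled "prcoess".
from datetime import date

from pvoutput.prcoess import process_system_status

raw = "1234;07:45,21,255,1,2;" "07:50,22,257,2,3"  # hypothetical two-entry status string
status_df = process_system_status(pv_system_status_text=raw, date=date(2022, 1, 1))
assert (status_df["system_id"] == 1234).all()
print(status_df)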
["missing_start_date_PV_localtime", "missing_end_date_PV_localtime"] 58 | pd.testing.assert_frame_equal( 59 | date_ranges[columns], 60 | pd.DataFrame( 61 | [ 62 | [dr1[0], dr1[-1]], 63 | [dr2[0], dr2[-1]], 64 | ], 65 | columns=columns, 66 | ), 67 | ) 68 | 69 | 70 | def test_date_to_pvoutput_str(): 71 | VALID_DATE_STR = "20190101" 72 | assert pvoutput.date_to_pvoutput_str(VALID_DATE_STR) == VALID_DATE_STR 73 | ts = pd.Timestamp(VALID_DATE_STR) 74 | assert pvoutput.date_to_pvoutput_str(ts) == VALID_DATE_STR 75 | 76 | 77 | def test_check_date(): 78 | assert pvoutput._check_date("20190101") is None 79 | with pytest.raises(ValueError): 80 | pvoutput._check_date("2010") 81 | with pytest.raises(ValueError): 82 | pvoutput._check_date("2010-01-02") 83 | 84 | 85 | def test_check_pv_system_status(): 86 | def _make_timeseries(start, end): 87 | index = pd.date_range(start, end, freq="5T") 88 | n = len(index) 89 | timeseries = pd.DataFrame(np.zeros(n), index=index) 90 | return timeseries 91 | 92 | DATE = date(2019, 1, 1) 93 | good_timeseries = _make_timeseries("2019-01-01 00:00", "2019-01-02 00:00") 94 | pvoutput.check_pv_system_status(good_timeseries, DATE) 95 | 96 | bad_timeseries = _make_timeseries("2019-01-01 00:00", "2019-01-03 00:00") 97 | with pytest.raises(ValueError): 98 | pvoutput.check_pv_system_status(bad_timeseries, DATE) 99 | 100 | bad_timeseries2 = _make_timeseries("2019-01-02 00:00", "2019-01-03 00:00") 101 | with pytest.raises(ValueError): 102 | pvoutput.check_pv_system_status(bad_timeseries2, DATE) 103 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import date 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from pvoutput import utils 9 | from pvoutput.daterange import DateRange 10 | 11 | PV_SYSTEM = 123 12 | 13 | 14 | def test_get_missing_dates_for_id(data_dir): 15 | test_hdf = os.path.join(data_dir, "data/test.hdf") 16 | missing_dates = utils.get_missing_dates_for_id(test_hdf, PV_SYSTEM) 17 | np.testing.assert_array_equal(missing_dates, [date(2019, 1, 2), date(2019, 1, 3)]) 18 | 19 | 20 | def test_get_system_ids_in_store(data_dir): 21 | test_hdf = os.path.join(data_dir, "data/test.hdf") 22 | system_ids = utils.get_system_ids_in_store(test_hdf) 23 | np.testing.assert_array_equal(system_ids, [PV_SYSTEM]) 24 | 25 | 26 | def test_get_date_ranges_to_download(data_dir): 27 | test_hdf = os.path.join(data_dir, "data/test.hdf") 28 | date_ranges = utils.get_date_ranges_to_download(test_hdf, PV_SYSTEM, "2018-01-01", "2019-01-10") 29 | # 2018-01-02 and 2018-01-03 are already known to be missing. 30 | np.testing.assert_array_equal( 31 | date_ranges, 32 | [ 33 | DateRange(start_date=date(2018, 1, 1), end_date=date(2018, 12, 31)), 34 | DateRange(start_date=date(2019, 1, 4), end_date=date(2019, 1, 10)), 35 | ], 36 | ) 37 | 38 | 39 | def test_datetime_list_to_dates(): 40 | np.testing.assert_array_equal( 41 | utils.datetime_list_to_dates(pd.Timestamp("2019-01-01")), [date(2019, 1, 1)] 42 | ) 43 | 44 | np.testing.assert_array_equal( 45 | utils.datetime_list_to_dates([pd.Timestamp("2019-01-01"), pd.Timestamp("2019-01-02")]), 46 | [date(2019, 1, 1), date(2019, 1, 2)], 47 | ) 48 | --------------------------------------------------------------------------------