├── .all-contributorsrc ├── .bumpversion.cfg ├── .github └── workflows │ ├── pytest.yml │ └── release.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .ruff.toml ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── docs └── dataset.md ├── environment.yml ├── examples ├── analyse_PV_data_for_9th_Aug_2019.ipynb ├── analyse_metadata_for_UK.ipynb ├── animate_PV_yield_map.ipynb ├── compute_grid_points_for_UK.ipynb ├── download_pv_timeseries.ipynb ├── get_all_rss_systems.py ├── get_all_systems_in_region.ipynb ├── get_metadata.ipynb ├── query_API_for_all_UK_grid_points.ipynb └── quick_start.ipynb ├── infrastructure └── docker │ ├── Dockerfile_dev │ └── Dockerfile_prod ├── pvoutput ├── __init__.py ├── consts.py ├── daterange.py ├── exceptions.py ├── grid_search │ ├── __init__.py │ ├── app.py │ ├── clip.py │ ├── grid_search.py │ └── natural_earth.py ├── mapscraper.py ├── prcoess.py ├── pvoutput.py └── utils.py ├── requirements.txt ├── scripts ├── fetch_pv_timeseries.py └── scrape_country_codes.py ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── data ├── create_mapscraper_test_files.py ├── create_test_hdf.py ├── mapscraper_dict_of_dfs.pickle ├── mapscraper_soup.pickle └── test.hdf ├── test_daterange.py ├── test_grid_search.py ├── test_mapscraper.py ├── test_process.py ├── test_pvoutput.py └── test_utils.py /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "JackKelly", 10 | "name": "Jack Kelly", 11 | "avatar_url": "https://avatars.githubusercontent.com/u/460756?v=4", 12 | "profile": "http://jack-kelly.com", 13 | "contributions": [ 14 | "code" 15 | ] 16 | }, 17 | { 18 | "login": "ssmssam", 19 | "name": "Sam Murphy-Sugrue", 20 | "avatar_url": "https://avatars.githubusercontent.com/u/39378848?v=4", 21 | "profile": "https://github.com/ssmssam", 22 | "contributions": [ 23 | "code" 24 | ] 25 | }, 26 | { 27 | "login": "gabrieltseng", 28 | "name": "Gabriel Tseng", 29 | "avatar_url": "https://avatars.githubusercontent.com/u/29063740?v=4", 30 | "profile": "https://gabrieltseng.github.io/", 31 | "contributions": [ 32 | "code" 33 | ] 34 | }, 35 | { 36 | "login": "JamieTaylor-TUOS", 37 | "name": "Jamie Taylor", 38 | "avatar_url": "https://avatars.githubusercontent.com/u/12187350?v=4", 39 | "profile": "http://www.solar.sheffield.ac.uk/", 40 | "contributions": [ 41 | "code" 42 | ] 43 | }, 44 | { 45 | "login": "peterdudfield", 46 | "name": "Peter Dudfield", 47 | "avatar_url": "https://avatars.githubusercontent.com/u/34686298?v=4", 48 | "profile": "https://github.com/peterdudfield", 49 | "contributions": [ 50 | "infra" 51 | ] 52 | }, 53 | { 54 | "login": "vnshanmukh", 55 | "name": "Shanmukh Chava", 56 | "avatar_url": "https://avatars.githubusercontent.com/u/67438038?v=4", 57 | "profile": "https://github.com/vnshanmukh", 58 | "contributions": [ 59 | "code" 60 | ] 61 | }, 62 | { 63 | "login": "Antsthebul", 64 | "name": "Antsthebul", 65 | "avatar_url": "https://avatars.githubusercontent.com/u/56587872?v=4", 66 | "profile": "https://github.com/Antsthebul", 67 | "contributions": [ 68 | "code" 69 | ] 70 | }, 71 | { 72 | "login": "rachtsingh", 73 | "name": "Rachit Singh", 74 | "avatar_url": "https://avatars.githubusercontent.com/u/1606892?v=4", 75 | "profile": "http://www.rachitsingh.com", 76 | "contributions": [ 77 | "data", 78 | "code" 79 | ] 80 | }, 81 | { 82 | "login": "devsjc", 83 | "name": "devsjc", 84 | 
"avatar_url": "https://avatars.githubusercontent.com/u/47188100?v=4", 85 | "profile": "https://github.com/devsjc", 86 | "contributions": [ 87 | "code" 88 | ] 89 | } 90 | ], 91 | "contributorsPerLine": 7, 92 | "projectName": "pvoutput", 93 | "projectOwner": "openclimatefix", 94 | "repoType": "github", 95 | "repoHost": "https://github.com", 96 | "skipCi": true, 97 | "commitConvention": "angular" 98 | } 99 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | commit = True 3 | tag = True 4 | current_version = 0.1.33 5 | message = Bump version: {current_version} → {new_version} [skip ci] 6 | 7 | [bumpversion:file:setup.py] 8 | search = version="{current_version}" 9 | replace = version="{new_version}" 10 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | jobs: 5 | call-run-python-tests: 6 | uses: openclimatefix/.github/.github/workflows/python-test.yml@main 7 | with: 8 | # pytest-cov looks at this folder 9 | pytest_cov_dir: "pvoutput" 10 | # these packages are installed. They are needed for 'cartopy' 11 | sudo_apt_install: "libgeos-dev libproj-dev" 12 | # these packages are installed. They are needed for 'cartopy' 13 | brew_install: "c-blosc hdf5 geos proj" 14 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Bump version and auto-release 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | call-run-python-release: 8 | uses: openclimatefix/.github/.github/workflows/python-release.yml@v1.7.2 9 | secrets: 10 | token: ${{ secrets.PYPI_API_TOKEN }} 11 | PAT_TOKEN: ${{ secrets.PAT_TOKEN }} 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # JetBrains 84 | .idea 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | # Project-specific 130 | .pvoutput.yml 131 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v5.0.0 7 | hooks: 8 | # list of supported hooks: https://pre-commit.com/hooks.html 9 | - id: trailing-whitespace 10 | - id: end-of-file-fixer 11 | - id: debug-statements 12 | - id: detect-private-key 13 | 14 | # python code formatting/linting 15 | - repo: https://github.com/astral-sh/ruff-pre-commit 16 | # Ruff version. 17 | rev: "v0.11.5" 18 | hooks: 19 | - id: ruff 20 | args: [--fix] 21 | - repo: https://github.com/psf/black 22 | rev: 25.1.0 23 | hooks: 24 | - id: black 25 | args: [--line-length, "100"] 26 | # yaml formatting 27 | - repo: https://github.com/pre-commit/mirrors-prettier 28 | rev: v4.0.0-alpha.8 29 | hooks: 30 | - id: prettier 31 | types: [yaml] 32 | -------------------------------------------------------------------------------- /.ruff.toml: -------------------------------------------------------------------------------- 1 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 2 | select = ["E", "F", "D", "I"] 3 | ignore = ["D200","D202","D210","D212","D415","D105",] 4 | 5 | # Allow autofix for all enabled rules (when `--fix`) is provided. 6 | fixable = ["A", "B", "C", "D", "E", "F", "I"] 7 | unfixable = [] 8 | 9 | # Exclude a variety of commonly ignored directories. 
10 | exclude = [ 11 | ".bzr", 12 | ".direnv", 13 | ".eggs", 14 | ".git", 15 | ".hg", 16 | ".mypy_cache", 17 | ".nox", 18 | ".pants.d", 19 | ".pytype", 20 | ".ruff_cache", 21 | ".svn", 22 | ".tox", 23 | ".venv", 24 | "__pypackages__", 25 | "_build", 26 | "buck-out", 27 | "build", 28 | "dist", 29 | "node_modules", 30 | "venv", 31 | "tests", 32 | ] 33 | 34 | # Same as Black. 35 | line-length = 100 36 | 37 | # Allow unused variables when underscore-prefixed. 38 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 39 | 40 | # Assume Python 3.10. 41 | target-version = "py310" 42 | fix = false 43 | # Group violations by containing file. 44 | format = "github" 45 | ignore-init-module-imports = true 46 | 47 | [mccabe] 48 | # Unlike Flake8, default to a complexity level of 10. 49 | max-complexity = 10 50 | 51 | [pydocstyle] 52 | # Use Google-style docstrings. 53 | convention = "google" 54 | 55 | [per-file-ignores] 56 | "__init__.py" = ["F401", "E402"] 57 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial # required for Python >= 3.7 2 | language: python 3 | python: 3.7 4 | install: pip install -e git+https://github.com/openclimatefix/pvoutput#egg=pvoutput 5 | script: py.test 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2019 Open Climate Fix 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![All Contributors](https://img.shields.io/badge/all_contributors-9-orange.svg?style=flat-square)](#contributors-) 3 | 4 | 5 | [![codecov](https://codecov.io/gh/openclimatefix/pvoutput/branch/main/graph/badge.svg?token=GTQDR2ZZ2S)](https://codecov.io/gh/openclimatefix/pvoutput) 6 | 7 | Download historical solar photovoltaic data from [PVOutput.org](https://pvoutput.org). 8 | 9 | This code is a work-in-progress. The aim is to provide both a Python library for interacting with [PVOutput.org's API](https://pvoutput.org/help.html#api), and a set of scripts for downloading lots of data :) 10 | 11 | # Installation 12 | 13 | ```bash 14 | $ pip install pvoutput-ocf 15 | ``` 16 | 17 | ## Register with PVOutput.org 18 | 19 | You need to get an API key *and* a system ID from PVOutput.org. 20 | 21 | If you don't own a physical PV system, click the "energy consumption only" box 22 | when registering on PVOutput. If you don't include a 23 | system ID, then you'll get a "401 Unauthorized" response from the PVOutput API. 24 | 25 | You can pass the API key and system ID into the `PVOutput` constructor. 26 | Or, create a `~/.pvoutput.yml` file which looks like: 27 | 28 | ```yaml 29 | api_key: 30 | system_id: 31 | ``` 32 | 33 | The default location of the `.pvoutput.yml` is the user's home directory, expanded from `~`. This can be overridden by setting the `PVOUTPUT_CONFIG` environment variable. 34 | 35 | e.g. `export PVOUTPUT_CONFIG="/my/preferred/location/.pvoutput.yml"` 36 | 37 | Alternatively, you can set `API_KEY`, `SYSTEM_ID` and `DATA_SERVICE_URL` (see below) as environmental variables. 38 | 39 | ### API quotas and paid subscriptions 40 | Please see [here](https://pvoutput.org/help/data_services.html) for update info. 41 | 42 | #### Free 43 | 44 | For free, PVOutput.org gives you 60 API requests per hour. In a single API request you can download one day of data for one PV system. (See PVOutput's docs for more info about [rate limits](https://pvoutput.org/help/api_specification.html#rate-limits).) 45 | 46 | #### Donate 47 | [Donating to PVOutput.org](https://pvoutput.org/help/donations.html#donations) increases your API quota to 300 requests per hour. 48 | 49 | #### Paid 50 | To get more historical data, you can pay $800 Australian dollars for a year's 'Live System History' subscription for a single country ([more info here](https://pvoutput.org/help/data_services.html). And [here's PVOutput.org's full price list](https://pvoutput.org/services.jsp)). 51 | This allows you to use the [`get batch status`](https://pvoutput.org/help/data_services.html#get-batch-status-service) API to download 900 PV-system-*years* per hour. 52 | 53 | If you have subscribed to PVOutput's data service then either 54 | - add `data_service_url` to your configuration file (`~/.pvoutput.yml`) or 55 | - pass `data_service_url` to the `PVOutput` constructor. 
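For reference, here is a minimal sketch of how these settings can be passed in from Python. The keyword arguments and the `get_status` call follow the Quick Start notebook and may change between versions, and the system ID and date are arbitrary examples, so treat this as illustrative rather than definitive:

```python
from pvoutput import PVOutput

# api_key and system_id can instead come from ~/.pvoutput.yml or from the
# API_KEY / SYSTEM_ID environment variables described above.
pv = PVOutput(
    api_key="YOUR_API_KEY",        # placeholder, not a real key
    system_id="YOUR_SYSTEM_ID",    # placeholder, not a real system ID
    data_service_url="https://pvoutput.org/",  # only needed with a paid data-service subscription
)

# Download one day of data for one PV system (an arbitrary example ID).
# Each call like this consumes one API request from your hourly quota.
timeseries = pv.get_status(pv_system_id=10033, date="2019-08-09")
print(timeseries.head())
```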
56 | 57 | The `data_service_url` should end in `.org` (note the `data_service_url` doesn't include the `/service/r2` part of the URL) 58 | For example: `data_service_url: https://pvoutput.org/` 59 | 60 | 61 | ## Install pvoutput Python library 62 | 63 | ```bash 64 | pip install -e git+https://github.com/openclimatefix/pvoutput.git@main#egg=pvoutput-ocf 65 | ``` 66 | 67 | # Usage 68 | 69 | See the [Quick Start notebook](examples/quick_start.ipynb). 70 | 71 | ## Contributors ✨ 72 | 73 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 |
Jack Kelly
Jack Kelly

💻
Sam Murphy-Sugrue
Sam Murphy-Sugrue

💻
Gabriel Tseng
Gabriel Tseng

💻
Jamie Taylor
Jamie Taylor

💻
Peter Dudfield
Peter Dudfield

🚇
Shanmukh Chava
Shanmukh Chava

💻
Antsthebul
Antsthebul

💻
Rachit Singh
Rachit Singh

🔣 💻
devsjc
devsjc

💻
95 | 96 | 97 | 98 | 99 | 100 | 101 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 102 | -------------------------------------------------------------------------------- /docs/dataset.md: -------------------------------------------------------------------------------- 1 | ## `UK_PV_timeseries.hdf` 2 | 3 | ### `missing_dates` table 4 | 5 | Sometimes we query PVOutput.api for a particular date and PV system ID, and PVOutput.org returns no data. The `missing_dates` table records these pairs of PV system IDs and dates, so we don't retry these missing dates (and hence chew through our API quota!) 6 | 7 | Columns: 8 | 9 | - `pv_system_id`: index column, integer 10 | - `missing_start_date_PV_localtime` and `missing_end_date_PV_localtime`: The start and end of the date range of missing dates for this system ID. `pd.HDFStore` doesn't support `date` columns, so these are actual `pd.Timestamp` objects. 11 | - `datetime_of_API_request`: For data retrieved on or after 2019-08-06, this contains the UTC datetime of the API request. For data retrieved between 2019-08-05 and 2019-08-06, this has been manually backfilled with '2019-08-05 00:00'. For data retrieved before 2019-08-05, this columns contains `NaT` - these rows should be treated with some suspicion, because my data retrieval code may have been malformatting the date string for the PVOutput.org API, and hence may contain some 'missing dates' which aren't actually missing! A tell-tale might be if there are duplicated rows. 12 | 13 | ### `metadata` table 14 | 15 | ### `timeseries/` tables 16 | 17 | Columns: 18 | - `datetime`: index column, pd.DatetimeIndex, [localtime to the PV system](https://forum.pvoutput.org/t/clarification-are-date-times-in-local-or-utc/570/2). 19 | - `datetime_of_API_request`: The datetime at which we sent the API request. Will be `NaT` for data retrieved before about 2019-08-06 13:00 UTC. 20 | - `query_date`: The date (in localtime to the PV system) used in the query to the PVOutput.org API. Will be `NaT` for data retrieved before about 2019-08-06 13:00 UTC. 21 | - ... 
other columns contain data from PVOutput.org 22 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pvoutput 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python>=3.7 6 | - pytest 7 | - pyyaml 8 | - pytables 9 | - pandas 10 | - matplotlib 11 | - jupyterlab 12 | - urllib3 13 | - requests 14 | - beautifulsoup4 15 | -------------------------------------------------------------------------------- /examples/get_all_rss_systems.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/examples/get_all_rss_systems.py -------------------------------------------------------------------------------- /examples/get_metadata.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 51, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pandas as pd\n", 11 | "import time\n", 12 | "from datetime import datetime\n", 13 | "from pvoutput import *" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 23, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "INPUT_PV_LIST_FILENAME = '/home/jack/data/pvoutput.org/processed/UK_PV_listing_metadata.hdf'\n", 23 | "OUTPUT_METADATA_FILENAME = '/home/jack/data/pvoutput.org/processed/UK_PV_metadata.csv'\n", 24 | "PV_STATS_FILENAME = '/home/jack/data/pvoutput.org/processed/UK_PV_stats.csv'" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "pv_systems = pd.read_hdf(INPUT_PV_LIST_FILENAME, 'metadata')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 42, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | "
Array TiltInverterLocationOrientationPanelsShadesystem_capacitysystem_nameOutputsGenerationAverageEfficiencysystem_capacity_kwefficiency_kWh_per_kWsystem_1_tilt_degreessystem_2_tilt_degrees
system_id
2696511.0°Enphase M215United Kingdom CV47South 0.0°336x250W Q Cells Pro-G3 250Low84.000kWmfl_phc1813 days399.238MWh220.208kWh2.654kWh/kW84.002.65411.0NaN
24768NaNaurora trioUnited Kingdom OX7South 180.0°1050x240W qidongNo252.000kWmfl_qfa446 days307.029MWh688.405kWh2.732kWh/kW252.002.732NaNNaN
1154215.0°SMA TRI-Power 1700 TLUnited Kingdom PE11South 180.0°200x250W Emmvee ES 230 M60 BNo50.000kWWray Farms Solar System2437 days293.684MWh120.510kWh2.477kWh/kW50.002.47715.0NaN
6699140.0°UnknownUnited Kingdom HR8South 180.0° / South West 225.0°152x325W + 80x325W UnknownNo75.400kWLedbury Community Hospital1434 days279.902MWh195.190kWh2.603kWh/kW75.402.60340.0NaN
511615.0°Fronius CL36; Datamanager 2.0United Kingdom OX7South 180.0°182x235W Q.Base-G2 235No42.770kWmfl_scf2538 days267.470MWh105.386kWh2.493kWh/kW42.772.49315.0NaN
\n", 197 | "
" 198 | ], 199 | "text/plain": [ 200 | " Array Tilt Inverter Location \\\n", 201 | "system_id \n", 202 | "26965 11.0° Enphase M215 United Kingdom CV47 \n", 203 | "24768 NaN aurora trio United Kingdom OX7 \n", 204 | "11542 15.0° SMA TRI-Power 1700 TL United Kingdom PE11 \n", 205 | "66991 40.0° Unknown United Kingdom HR8 \n", 206 | "5116 15.0° Fronius CL36; Datamanager 2.0 United Kingdom OX7 \n", 207 | "\n", 208 | " Orientation Panels \\\n", 209 | "system_id \n", 210 | "26965 South 0.0° 336x250W Q Cells Pro-G3 250 \n", 211 | "24768 South 180.0° 1050x240W qidong \n", 212 | "11542 South 180.0° 200x250W Emmvee ES 230 M60 B \n", 213 | "66991 South 180.0° / South West 225.0° 152x325W + 80x325W Unknown \n", 214 | "5116 South 180.0° 182x235W Q.Base-G2 235 \n", 215 | "\n", 216 | " Shade system_capacity system_name Outputs \\\n", 217 | "system_id \n", 218 | "26965 Low 84.000kW mfl_phc 1813 days \n", 219 | "24768 No 252.000kW mfl_qfa 446 days \n", 220 | "11542 No 50.000kW Wray Farms Solar System 2437 days \n", 221 | "66991 No 75.400kW Ledbury Community Hospital 1434 days \n", 222 | "5116 No 42.770kW mfl_scf 2538 days \n", 223 | "\n", 224 | " Generation Average Efficiency system_capacity_kw \\\n", 225 | "system_id \n", 226 | "26965 399.238MWh 220.208kWh 2.654kWh/kW 84.00 \n", 227 | "24768 307.029MWh 688.405kWh 2.732kWh/kW 252.00 \n", 228 | "11542 293.684MWh 120.510kWh 2.477kWh/kW 50.00 \n", 229 | "66991 279.902MWh 195.190kWh 2.603kWh/kW 75.40 \n", 230 | "5116 267.470MWh 105.386kWh 2.493kWh/kW 42.77 \n", 231 | "\n", 232 | " efficiency_kWh_per_kW system_1_tilt_degrees system_2_tilt_degrees \n", 233 | "system_id \n", 234 | "26965 2.654 11.0 NaN \n", 235 | "24768 2.732 NaN NaN \n", 236 | "11542 2.477 15.0 NaN \n", 237 | "66991 2.603 40.0 NaN \n", 238 | "5116 2.493 15.0 NaN " 239 | ] 240 | }, 241 | "execution_count": 42, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "pv_systems.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Retrieve metadata using get_pv_metadata" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 68, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Get list of systems we got from the PVOutput.org API search\n", 264 | "pv_sys_api_search = pd.read_csv(\n", 265 | " '/home/jack/data/pvoutput.org/raw/uk_pv_systems.csv',\n", 266 | " index_col='system_id',\n", 267 | " usecols=['system_id'])" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 69, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "2559 systems already processed.\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "if os.path.exists(OUTPUT_METADATA_FILENAME):\n", 285 | " output_metadata = pd.read_csv(OUTPUT_METADATA_FILENAME, index_col='system_id', usecols=['system_id'])\n", 286 | " systems_already_processed = output_metadata.index\n", 287 | " header = False\n", 288 | "else:\n", 289 | " systems_already_processed = []\n", 290 | " header = True\n", 291 | " \n", 292 | "print(len(systems_already_processed), 'systems already processed.')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 73, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "1471 PV systems left to process.\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "pv_systems_to_process = list(\n", 310 | " 
(set(pv_systems.index).union(pv_sys_api_search.index)) - \n", 311 | " set(systems_already_processed))\n", 312 | "print(len(pv_systems_to_process), 'PV systems left to process.')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 67, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | " 0 of 1971 | ID = 26572\n", 325 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 326 | "Waiting 54 minutes...\n", 327 | "Done waiting! Retrying...\n", 328 | " 300 of 1971 | ID = 3074\n", 329 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 330 | "Waiting 56 minutes...\n", 331 | "Done waiting! Retrying...\n", 332 | " 600 of 1971 | ID = 4185\n", 333 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 334 | "Waiting 57 minutes...\n", 335 | "Done waiting! Retrying...\n", 336 | " 900 of 1971 | ID = 37689\n", 337 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 338 | "Waiting 56 minutes...\n", 339 | "Done waiting! Retrying...\n", 340 | " 1200 of 1971 | ID = 30248\n", 341 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 342 | "Waiting 57 minutes...\n", 343 | "Done waiting! Retrying...\n", 344 | " 1500 of 1971 | ID = 6555\n", 345 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 346 | "Waiting 56 minutes...\n", 347 | "Done waiting! Retrying...\n", 348 | " 1800 of 1971 | ID = 40277\n", 349 | "Bad status code returned: 403, Forbidden 403: Exceeded 300 requests per hour\n", 350 | "Waiting 57 minutes...\n", 351 | "Done waiting! Retrying...\n", 352 | " 1970 of 1971 | ID = 57336" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "n = len(pv_systems_to_process)\n", 358 | "for i, pv_system_id in enumerate(pv_systems_to_process):\n", 359 | " print('\\r', '{:>4d}'.format(i), 'of', n, '| ID =', pv_system_id, end='', flush=True)\n", 360 | " pv_metadata = get_pv_metadata(pv_system_id).to_frame().T.set_index('system_id')\n", 361 | " pv_metadata.to_csv(\n", 362 | " OUTPUT_METADATA_FILENAME,\n", 363 | " mode='a',\n", 364 | " header=header)\n", 365 | " header = False" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 95, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "system_id,system_name,system_size_watts,postcode,number_of_panels,panel_power_watts,panel_brand,num_inverters,inverter_power_watts,inverter_brand,orientation,array_tilt_degrees,shade,install_date,latitude,longitude,status_interval_minutes,number_of_panels_secondary,panel_power_watts_secondary,orientation_secondary,array_tilt_degrees_secondary\n", 378 | "19397,AndyT's,3900,BS22,20,195,ZN Shine 195w,1,3500,Kaco 4202,S,30.0,No,2011-11-21 00:00:00,51.36,-2.92,5,0,0,,\n", 379 | "8195,Kilmarnock Roof,3750,KA3,15,250,Sanyo 250 HIT,1,4000,Omniksol,S,30.0,No,2011-11-07 00:00:00,55.64,-4.49,10,0,0,,\n", 380 | "8200,Flat 5,3430,E8,14,245,,1,3000,sb3000,S,25.0,Low,2011-12-12 00:00:00,51.54,-0.06,5,0,0,,\n", 381 | "8204,Sooper-Dooper Solar,2940,GU2,12,245,SunTech STP245S-20/Wd,1,3000,Kaco Powador 3002,S,19.0,No,2012-05-11 00:00:00,51.24,-0.59,10,0,0,,\n", 382 | "8205,58GPR,4000,BS48,16,250,Sanyo component Europe GmbH,1,50,Sma sunny boysb3800v,S,,No,2011-11-10 00:00:00,51.42,-2.74,10,0,0,,\n", 383 | "32783,olaf-UK,3780,B92,14,270,canadian solar CS6P-270MM,1,3600,SMA Sunny Boy 
3600TL,S,25.0,No,2014-10-15 00:00:00,52.43,-1.77,5,0,0,,\n", 384 | "8208,48 St Saviours,4000,PR5,16,250,Sharp,1,4000,SMA,SW,35.0,No,2012-02-21 00:00:00,53.73,-2.65,10,0,0,,\n", 385 | "40978,Sma 2Kw,2000,CF31,8,250,Hyundai,1,2000,Sma 2000HF,S,30.0,No,2011-12-01 00:00:00,51.5,-3.57,5,0,0,,\n", 386 | "24599,LongfellowPV,3750,MK16,15,250,Yingli YL250C-30b,1,3600,Fronius IG TL 3.6,SW,30.0,Low,2013-05-28 00:00:00,52.083376,-0.729613,5,0,0,,\n" 387 | ] 388 | } 389 | ], 390 | "source": [ 391 | "!head $OUTPUT_METADATA_FILENAME" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "## Retrieve metadata using get_statistics" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# get stats\n", 408 | "if os.path.exists(PV_STATS_FILENAME):\n", 409 | " header = False\n", 410 | " stats_processed = pd.read_csv(PV_STATS_FILENAME, index_col='system_id', usecols=['system_id'])\n", 411 | "else:\n", 412 | " stats_processed = []\n", 413 | " header = True\n", 414 | " \n", 415 | "pv_systems_to_get_stats = set(pv_systems_filtered.index).union(pv_sys_api_search.index)\n", 416 | "\n", 417 | "print(len(pv_systems_to_get_stats), 'total PV systems')\n", 418 | "print(len(stats_processed), 'system IDs already loaded')\n", 419 | "stats_to_process = set(pv_systems_to_get_stats) - set(stats_processed.index)\n", 420 | "# re-order\n", 421 | "# stats_to_process = pd.Series(1, index=stats_to_process).reindex(pv_systems_filtered.index).dropna().index\n", 422 | "print(len(stats_to_process), 'system IDs to load')\n", 423 | " \n", 424 | "for i, system_id in enumerate(stats_to_process):\n", 425 | " print('\\r', i, system_id, end=' ', flush=True)\n", 426 | " try:\n", 427 | " pv_stats = get_pv_statistic(system_id)\n", 428 | " except NoStatusFound:\n", 429 | " print('No status found for', system_id)\n", 430 | " # Create a blank row\n", 431 | " index = ['system_id'] + list(range(1, 11))\n", 432 | " pv_stats = pd.Series(\n", 433 | " [system_id] + ([''] * 10),\n", 434 | " index=index)\n", 435 | " pv_stats['system_id'] = int(pv_stats['system_id'])\n", 436 | " pv_stats['stats_downloaded_on_utc'] = datetime.utcnow()\n", 437 | " pv_stats = pv_stats.to_frame().T.set_index('system_id')\n", 438 | " with open(PV_STATS_FILENAME, mode='a') as fh:\n", 439 | " pv_stats.to_csv(fh, header=header)\n", 440 | " header = False" 441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python 3", 447 | "language": "python", 448 | "name": "python3" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.7.3" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 2 465 | } 466 | -------------------------------------------------------------------------------- /infrastructure/docker/Dockerfile_dev: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /pvoutput 4 | 5 | RUN apt-get -qq update \ 6 | && apt-get -qq install -y --no-install-recommends \ 7 | git \ 8 | curl \ 9 | git \ 10 | wget \ 11 | libproj-dev \ 12 | proj-data \ 13 | proj-bin \ 14 | libgeos-dev \ 15 | libgdal-dev \ 16 | python3-gdal \ 17 | gdal-bin \ 18 | && apt-get autoclean && apt-get autoremove \ 19 | > /dev/null 
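# Example of building and running this development image from the repository
# root; the image tag "pvoutput-dev" is just an illustrative name:
#   docker build -f infrastructure/docker/Dockerfile_dev -t pvoutput-dev .
#   docker run --rm -it -p 1234:1234 -v "$(pwd)":/pvoutput pvoutput-dev
# The Jupyter server started by the CMD at the end of this file is then
# reachable on http://localhost:1234.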
20 | 21 | COPY requirements.txt /pvoutput/requirements.txt 22 | 23 | RUN pip install -U pip && pip install --no-cache-dir -r /pvoutput/requirements.txt > /dev/null 24 | 25 | EXPOSE 1234 26 | 27 | CMD ["jupyter", "notebook", "--allow-root", "--ip", "0.0.0.0", "--port", "1234", "--no-browser"] 28 | -------------------------------------------------------------------------------- /infrastructure/docker/Dockerfile_prod: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /pvoutput 4 | 5 | RUN pip install -e git+https://github.com/openclimatefix/pvoutput#egg=pvoutput > /dev/null 6 | 7 | COPY examples/ /pvoutput/examples/ 8 | 9 | EXPOSE 1234 10 | 11 | CMD ["jupyter", "notebook", "--allow-root", "--ip", "0.0.0.0", "--port", "1234", "--no-browser", "/pvoutput/examples/quick_start.ipynb"] 12 | -------------------------------------------------------------------------------- /pvoutput/__init__.py: -------------------------------------------------------------------------------- 1 | """Init PVoutput library""" 2 | 3 | from .pvoutput import * # noqa 4 | 5 | __version__ = 0.1 6 | -------------------------------------------------------------------------------- /pvoutput/consts.py: -------------------------------------------------------------------------------- 1 | """Constants used in this repo""" 2 | 3 | import os 4 | from datetime import timedelta 5 | from urllib.parse import urljoin 6 | 7 | BASE_URL = "https://pvoutput.org" 8 | MAP_URL = urljoin(BASE_URL, "map.jsp") 9 | REGIONS_URL = urljoin(BASE_URL, "region.jsp") 10 | 11 | # Country codes used by PVOutput.org on, for example, 12 | # https://pvoutput.org/map.jsp. Taken from 13 | # https://pvoutput.org/help.html#api-addsystem. 14 | 15 | PV_OUTPUT_COUNTRY_CODES = { 16 | "New South Wales": 1, 17 | "Afghanistan": 2, 18 | "Akrotiri": 3, 19 | "Albania": 4, 20 | "Algeria": 5, 21 | "American Samoa": 6, 22 | "Andorra": 7, 23 | "Angola": 8, 24 | "Anguilla": 9, 25 | "Antarctica": 10, 26 | "Antigua and Barbuda": 11, 27 | "Arctic Ocean": 12, 28 | "Argentina": 13, 29 | "Armenia": 14, 30 | "Aruba": 15, 31 | "Ashmore and Cartier Islands": 16, 32 | "Atlantic Ocean": 17, 33 | "Austria": 18, 34 | "Azerbaijan": 19, 35 | "Bahamas, The": 20, 36 | "Bahrain": 21, 37 | "Bangladesh": 22, 38 | "Barbados": 23, 39 | "Belarus": 24, 40 | "Belgium": 25, 41 | "Belize": 26, 42 | "Benin": 27, 43 | "Bermuda": 28, 44 | "Bhutan": 29, 45 | "Bolivia": 30, 46 | "Bosnia and Herzegovina": 31, 47 | "Botswana": 32, 48 | "Bouvet Island": 33, 49 | "Brazil": 34, 50 | "British Indian Ocean Territory": 35, 51 | "British Virgin Islands": 36, 52 | "Brunei": 37, 53 | "Bulgaria": 38, 54 | "Burkina Faso": 39, 55 | "Burma": 40, 56 | "Burundi": 41, 57 | "Cambodia": 42, 58 | "Cameroon": 43, 59 | "Canada": 44, 60 | "Cape Verde": 45, 61 | "Cayman Islands": 46, 62 | "Central African Republic": 47, 63 | "Chad": 48, 64 | "Chile": 49, 65 | "China": 50, 66 | "Christmas Island": 51, 67 | "Clipperton Island": 52, 68 | "Cocos (Keeling) Islands": 53, 69 | "Colombia": 54, 70 | "Comoros": 55, 71 | "Congo, Democratic Republic of the": 56, 72 | "Congo, Republic of the": 57, 73 | "Cook Islands": 58, 74 | "Coral Sea Islands": 59, 75 | "Costa Rica": 60, 76 | "Cote d'Ivoire": 61, 77 | "Croatia": 62, 78 | "Cuba": 63, 79 | "Curacao": 64, 80 | "Cyprus": 65, 81 | "Czech Republic": 66, 82 | "Denmark": 67, 83 | "Dhekelia": 68, 84 | "Djibouti": 69, 85 | "Dominica": 70, 86 | "Dominican Republic": 71, 87 | "Ecuador": 72, 88 | "Egypt": 73, 89 | "El Salvador": 74, 90 | 
"Equatorial Guinea": 75, 91 | "Eritrea": 76, 92 | "Estonia": 77, 93 | "Ethiopia": 78, 94 | "Falkland Islands": 79, 95 | "Faroe Islands": 80, 96 | "Fiji": 81, 97 | "Finland": 82, 98 | "France": 83, 99 | "French Polynesia": 84, 100 | "French Southern and Antarctic Lands": 85, 101 | "Gabon": 86, 102 | "Gambia, The": 87, 103 | "Gaza Strip": 88, 104 | "Georgia": 89, 105 | "Germany": 90, 106 | "Ghana": 91, 107 | "Gibraltar": 92, 108 | "Greece": 93, 109 | "Greenland": 94, 110 | "Grenada": 95, 111 | "Guam": 96, 112 | "Guatemala": 97, 113 | "Guernsey": 98, 114 | "Guinea": 99, 115 | "Guinea-Bissau": 100, 116 | "Guyana": 101, 117 | "Haiti": 102, 118 | "Heard Island and McDonald Islands": 103, 119 | "Holy See (Vatican City)": 104, 120 | "Honduras": 105, 121 | "Hong Kong": 106, 122 | "Hungary": 107, 123 | "Iceland": 108, 124 | "India": 109, 125 | "Indian Ocean": 110, 126 | "Indonesia": 111, 127 | "Iran": 112, 128 | "Iraq": 113, 129 | "Ireland": 114, 130 | "Isle of Man": 115, 131 | "Israel": 116, 132 | "Italy": 117, 133 | "Jamaica": 118, 134 | "Jan Mayen": 119, 135 | "Japan": 120, 136 | "Jersey": 121, 137 | "Jordan": 122, 138 | "Kazakhstan": 123, 139 | "Kenya": 124, 140 | "Kiribati": 125, 141 | "Korea, North": 126, 142 | "Korea, South": 127, 143 | "Kosovo": 128, 144 | "Kuwait": 129, 145 | "Kyrgyzstan": 130, 146 | "Laos": 131, 147 | "Latvia": 132, 148 | "Lebanon": 133, 149 | "Lesotho": 134, 150 | "Liberia": 135, 151 | "Libya": 136, 152 | "Liechtenstein": 137, 153 | "Lithuania": 138, 154 | "Luxembourg": 139, 155 | "Macau": 140, 156 | "Macedonia": 141, 157 | "Madagascar": 142, 158 | "Malawi": 143, 159 | "Malaysia": 144, 160 | "Maldives": 145, 161 | "Mali": 146, 162 | "Malta": 147, 163 | "Marshall Islands": 148, 164 | "Mauritania": 149, 165 | "Mauritius": 150, 166 | "Mayotte": 151, 167 | "Mexico": 152, 168 | "Micronesia": 153, 169 | "Moldova": 154, 170 | "Monaco": 155, 171 | "Mongolia": 156, 172 | "Montenegro": 157, 173 | "Montserrat": 158, 174 | "Morocco": 159, 175 | "Mozambique": 160, 176 | "Namibia": 161, 177 | "Nauru": 162, 178 | "Navassa Island": 163, 179 | "Nepal": 164, 180 | "Netherlands": 165, 181 | "New Caledonia": 166, 182 | "New Zealand": 167, 183 | "Nicaragua": 168, 184 | "Niger": 169, 185 | "Nigeria": 170, 186 | "Niue": 171, 187 | "Norfolk Island": 172, 188 | "Northern Mariana Islands": 173, 189 | "Norway": 174, 190 | "Oman": 175, 191 | "Pakistan": 176, 192 | "Palau": 177, 193 | "Panama": 178, 194 | "Papua New Guinea": 179, 195 | "Paracel Islands": 180, 196 | "Paraguay": 181, 197 | "Peru": 182, 198 | "Philippines": 183, 199 | "Pitcairn Islands": 184, 200 | "Poland": 185, 201 | "Portugal": 186, 202 | "Puerto Rico": 187, 203 | "Qatar": 188, 204 | "Romania": 189, 205 | "Russia": 190, 206 | "Rwanda": 191, 207 | "Saint Barthelemy": 192, 208 | "Saint Helena, Ascension, and Tristan da Cunha": 193, 209 | "Saint Kitts and Nevis": 194, 210 | "Saint Lucia": 195, 211 | "Saint Martin": 196, 212 | "Saint Pierre and Miquelon": 197, 213 | "Saint Vincent and the Grenadines": 198, 214 | "Samoa": 199, 215 | "San Marino": 200, 216 | "Sao Tome and Principe": 201, 217 | "Saudi Arabia": 202, 218 | "Senegal": 203, 219 | "Serbia": 204, 220 | "Seychelles": 205, 221 | "Sierra Leone": 206, 222 | "Singapore": 207, 223 | "Sint Maarten": 208, 224 | "Slovakia": 209, 225 | "Slovenia": 210, 226 | "Solomon Islands": 211, 227 | "Somalia": 212, 228 | "South Africa": 213, 229 | "South Georgia and South Sandwich Is.": 214, 230 | "Southern Ocean": 215, 231 | "Spain": 216, 232 | "Spratly Islands": 217, 233 | "Sri Lanka": 218, 234 | 
"Sudan": 219, 235 | "Suriname": 220, 236 | "Svalbard": 221, 237 | "Swaziland": 222, 238 | "Sweden": 223, 239 | "Switzerland": 224, 240 | "Syria": 225, 241 | "Taiwan": 226, 242 | "Tajikistan": 227, 243 | "Tanzania": 228, 244 | "Thailand": 229, 245 | "Timor-Leste": 230, 246 | "Togo": 231, 247 | "Tokelau": 232, 248 | "Tonga": 233, 249 | "Trinidad and Tobago": 234, 250 | "Tunisia": 235, 251 | "Turkey": 236, 252 | "Turkmenistan": 237, 253 | "Turks and Caicos Islands": 238, 254 | "Tuvalu": 239, 255 | "Uganda": 240, 256 | "Ukraine": 241, 257 | "United Arab Emirates": 242, 258 | "United Kingdom": 243, 259 | "United States": 244, 260 | "Uruguay": 245, 261 | "Uzbekistan": 246, 262 | "Vanuatu": 247, 263 | "Venezuela": 248, 264 | "Vietnam": 249, 265 | "Virgin Islands": 250, 266 | "Wake Island": 251, 267 | "Wallis and Futuna": 252, 268 | "West Bank": 253, 269 | "Western Sahara": 254, 270 | "Yemen": 255, 271 | "Zambia": 256, 272 | "Zimbabwe": 257, 273 | } 274 | 275 | PV_OUTPUT_MAP_COLUMN_NAMES = { 276 | "timeseries_duration": "c", 277 | "average_generation_per_day": "avg", 278 | "efficiency": "gss", 279 | "power_generation": "atg", 280 | "capacity": "ss", 281 | "address": "o", 282 | "name": "sn", 283 | } 284 | 285 | 286 | ONE_DAY = timedelta(days=1) 287 | 288 | PV_OUTPUT_DATE_FORMAT = "%Y%m%d" 289 | CONFIG_FILENAME = os.environ.get("PVOUTPUT_CONFIG", os.path.expanduser("~/.pvoutput.yml")) 290 | RATE_LIMIT_PARAMS_TO_API_HEADERS = { 291 | "rate_limit_remaining": "X-Rate-Limit-Remaining", 292 | "rate_limit_total": "X-Rate-Limit-Limit", 293 | "rate_limit_reset_time": "X-Rate-Limit-Reset", 294 | } 295 | -------------------------------------------------------------------------------- /pvoutput/daterange.py: -------------------------------------------------------------------------------- 1 | """Date Range Class""" 2 | 3 | from dataclasses import dataclass 4 | from datetime import date, datetime, timedelta 5 | from typing import Iterable, List, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | @dataclass 12 | class DateRange: 13 | """Date Range Class""" 14 | 15 | start_date: date 16 | end_date: date 17 | 18 | def __init__(self, start_date, end_date): 19 | """Init""" 20 | self.start_date = safe_convert_to_date(start_date) 21 | self.end_date = safe_convert_to_date(end_date) 22 | 23 | def intersection(self, other): 24 | """ 25 | Get intersection of this DateRange and other 26 | """ 27 | new_start = max(self.start_date, other.start_date) 28 | new_end = min(self.end_date, other.end_date) 29 | if new_start < new_end: 30 | return DateRange(new_start, new_end) 31 | 32 | def date_range(self) -> np.array: 33 | """ 34 | Make date range 35 | """ 36 | return pd.date_range(self.start_date, self.end_date, freq="D").date 37 | 38 | def total_days(self) -> int: 39 | """ 40 | Find the total number of days 41 | """ 42 | return ( 43 | np.timedelta64(self.end_date - self.start_date) 44 | .astype("timedelta64[D]") 45 | .astype(np.float32) 46 | ) 47 | 48 | def split_into_years(self) -> List: 49 | """ 50 | Split start and end dates into list of years 51 | 52 | """ 53 | duration = self.end_date - self.start_date 54 | num_years = duration / timedelta(days=365) 55 | if num_years <= 1: 56 | return [self] 57 | else: 58 | end_date = self.end_date 59 | new_date_ranges = [] 60 | for year_back in range(np.ceil(num_years).astype(int)): 61 | start_date = end_date - timedelta(days=365) 62 | if start_date < self.start_date: 63 | start_date = self.start_date 64 | new_date_ranges.append(DateRange(start_date, end_date)) 65 | 
end_date = start_date 66 | return new_date_ranges 67 | 68 | 69 | def get_date_range_list(dates: Iterable[date]) -> List[DateRange]: 70 | """ 71 | Get data range lists for dates 72 | 73 | Args: 74 | dates: list of dates 75 | 76 | Returns: list of date ranges 77 | """ 78 | if not dates: 79 | return [] 80 | dates = np.array(dates) 81 | dates = np.sort(dates) 82 | dates_diff = np.diff(dates) 83 | location_of_gaps = np.where(dates_diff > timedelta(days=1))[0] 84 | index_of_last_date = len(dates) - 1 85 | location_of_gaps = list(location_of_gaps) 86 | location_of_gaps.append(index_of_last_date) 87 | 88 | start_i = 0 89 | date_range_list = [] 90 | for end_i in location_of_gaps: 91 | date_range = DateRange(start_date=dates[start_i], end_date=dates[end_i]) 92 | date_range_list.append(date_range) 93 | start_i = end_i + 1 94 | 95 | return date_range_list 96 | 97 | 98 | def safe_convert_to_date(dt: Union[datetime, date, str]) -> date: 99 | """ 100 | Convert datetime to date 101 | 102 | Args: 103 | dt: datetime, date or string 104 | 105 | Returns: date 106 | """ 107 | if isinstance(dt, str): 108 | dt = pd.Timestamp(dt) 109 | if isinstance(dt, datetime): 110 | return dt.date() 111 | if isinstance(dt, date): 112 | return dt 113 | 114 | 115 | def merge_date_ranges_to_years(date_ranges: Iterable[DateRange]) -> List[DateRange]: 116 | """ 117 | Merge date ranges to years 118 | 119 | Args: 120 | date_ranges: List of DateRanges, in ascending chronological order. 121 | 122 | Returns: 123 | List of DateRanges, each representing a year, in descending order. 124 | """ 125 | if not date_ranges: 126 | return [] 127 | 128 | # Split multi-year date ranges 129 | date_ranges_split = [] 130 | for date_range in date_ranges[::-1]: 131 | date_ranges_split.extend(date_range.split_into_years()) 132 | 133 | years_to_download = [] 134 | for date_range in date_ranges_split: 135 | if years_to_download: 136 | intersection = date_range.intersection(years_to_download[-1]) 137 | if intersection == date_range: 138 | # date_range falls within the last year to retrieve, 139 | # so we can ignore this date_range 140 | continue 141 | elif intersection is None: 142 | # No overlap 143 | date_to = date_range.end_date 144 | else: 145 | # Overlap 146 | date_to = intersection.start_date 147 | 148 | else: 149 | date_to = date_range.end_date 150 | 151 | date_from = date_to - timedelta(days=365) 152 | years_to_download.append(DateRange(date_from, date_to)) 153 | 154 | return years_to_download 155 | -------------------------------------------------------------------------------- /pvoutput/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom Exeception classes""" 2 | 3 | import requests 4 | 5 | 6 | class BadStatusCode(Exception): 7 | """Bad status code excepction""" 8 | 9 | def __init__(self, response: requests.Response, message: str = ""): 10 | """Init""" 11 | self.response = response 12 | super(BadStatusCode, self).__init__(message) 13 | 14 | def __str__(self) -> str: 15 | """String method""" 16 | string = super(BadStatusCode, self).__str__() 17 | string += "Status code: {}; ".format(self.response.status_code) 18 | string += "Response content: {}; ".format(self.response.content) 19 | string += "Response headers: {}; ".format(self.response.headers) 20 | return string 21 | 22 | 23 | class NoStatusFound(BadStatusCode): 24 | """Exeception for when no status code is found""" 25 | 26 | pass 27 | 28 | 29 | class RateLimitExceeded(BadStatusCode): 30 | """Class for rate limit is exceeded""" 31 | 32 | 
pass 33 | -------------------------------------------------------------------------------- /pvoutput/grid_search/__init__.py: -------------------------------------------------------------------------------- 1 | """Init for grid saerch module""" 2 | 3 | from .grid_search import GridSearch 4 | -------------------------------------------------------------------------------- /pvoutput/grid_search/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate gridded lat/lon coordinates that can be used for fixed radius searches across a region. 3 | 4 | Provides both an importable method and a CLI. 5 | 6 | .. code:: console 7 | 8 | $ python app.py -h 9 | 10 | - Jamie Taylor 11 | - First Authored: 2021-11-16 12 | """ 13 | 14 | import argparse 15 | import logging 16 | import os 17 | import sys 18 | from typing import Optional 19 | 20 | from pvoutput.grid_search.grid_search import GridSearch 21 | from pvoutput.grid_search.natural_earth import NaturalEarth 22 | 23 | 24 | def parse_options(): 25 | """Parse command line options.""" 26 | parser = argparse.ArgumentParser( 27 | description=("This is a command line interface (CLI) for " "the grid_search module."), 28 | epilog="Jamie Taylor, 2021-11-16", 29 | ) 30 | parser.add_argument( 31 | "--bbox", 32 | metavar="", 33 | dest="bbox", 34 | action="store", 35 | type=str, 36 | required=False, 37 | default=None, 38 | help="Specify a bounding box to search. Can be used in conjunction with " "--countries", 39 | ) 40 | parser.add_argument( 41 | "--countries", 42 | metavar="[,[,...]]", 43 | dest="countries", 44 | action="store", 45 | type=str, 46 | required=False, 47 | default=None, 48 | help="Specify a list of countries, searching only grid points that fall " 49 | "within these countries' boundaries. Specify one or more countries, " 50 | "separated by commas (default is all). Country names must match those " 51 | "used in the Natural Earth dataset (HINT: run this code with the " 52 | "--list-countries option to list them). This option can be used in " 53 | "conjunction with --bbox, in which case the search will only include " 54 | "grid points within both the bounding box and the countries list.", 55 | ) 56 | parser.add_argument( 57 | "--radial-clip", 58 | metavar="", 59 | dest="radial_clip", 60 | action="store", 61 | type=str, 62 | required=False, 63 | default=None, 64 | help="Specify a radius to clip to. Can be used in conjunction with --bbox " 65 | "and --countries. Pass the latitude, longitude and radius as a " 66 | "comma-separated string. Radius should be in km.", 67 | ) 68 | parser.add_argument( 69 | "--list-countries", 70 | dest="list_countries", 71 | action="store_true", 72 | required=False, 73 | help="List the country names that can be used for the " "--countries option.", 74 | ) 75 | parser.add_argument( 76 | "--buffer", 77 | metavar="", 78 | dest="buffer", 79 | action="store", 80 | type=float, 81 | required=False, 82 | default=0.0, 83 | help="Specify a buffer/tolerance for including grid points i.e. include " 84 | "grid points that fall within kilometers of the target " 85 | "boundary. 
Default is 0km.", 86 | ) 87 | parser.add_argument( 88 | "--search-radius", 89 | metavar="", 90 | dest="search_radius", 91 | action="store", 92 | type=float, 93 | required=False, 94 | default=25.0, 95 | help="Specify the radial search limit around each grid point in " "kilometers.", 96 | ) 97 | parser.add_argument( 98 | "--local-crs-epsg", 99 | metavar="", 100 | dest="local_crs_epsg", 101 | action="store", 102 | type=int, 103 | required=False, 104 | default=4087, 105 | help="Optionally provide the EPSG code of a local co-ordinate Reference " 106 | "System (CRS) for improved accuracy. e.g. set to 27700 (OSGB36 / " 107 | "British National Grid) if searching the British Isles.", 108 | ) 109 | parser.add_argument( 110 | "--cache-dir", 111 | metavar="", 112 | dest="cache_dir", 113 | action="store", 114 | type=str, 115 | required=False, 116 | default=None, 117 | help="Specify a directory to use for caching downloaded boundary files.", 118 | ) 119 | parser.add_argument( 120 | "--show", 121 | dest="show", 122 | action="store_true", 123 | required=False, 124 | help="Set this flag to show a plot of the grid.", 125 | ) 126 | parser.add_argument( 127 | "-o", 128 | "--outfile", 129 | metavar="", 130 | dest="outfile", 131 | action="store", 132 | type=str, 133 | required=False, 134 | help="Specify a filename to save the grid to.", 135 | ) 136 | options = parser.parse_args() 137 | 138 | def handle_options(options): 139 | """Validate command line args and pre-process where necessary.""" 140 | if options.bbox is not None: 141 | options.bbox = list(map(lambda x: float(x.strip()), options.bbox.split(","))) 142 | if options.radial_clip is not None: 143 | options.radial_clip = list( 144 | map(lambda x: float(x.strip()), options.radial_clip.split(",")) 145 | ) 146 | if options.cache_dir is not None: 147 | if not os.path.isdir(options.cache_dir): 148 | logging.error(f"The cache_dir '{options.cache_dir}' does not exist.") 149 | raise ValueError(f"The cache_dir '{options.cache_dir}' does not exist.") 150 | if options.countries: 151 | options.countries = list(map(lambda x: str(x.strip()), options.countries.split(","))) 152 | earth = NaturalEarth(options.cache_dir) 153 | _, countries = earth.get_hires_world_boundaries() 154 | for country in options.countries: 155 | if country not in countries: 156 | logging.error(f"The country '{country}' is invalid.") 157 | raise ValueError(f"The country '{country}' is invalid.") 158 | if options.outfile is not None and os.path.exists(options.outfile): 159 | check = query_yes_no( 160 | f"The output file '{options.outfile}' already exists, results " 161 | "will be overwritten, do you want to continue?", 162 | "no", 163 | ) 164 | if check is False: 165 | print("Quitting...") 166 | sys.exit(0) 167 | return options 168 | 169 | return handle_options(options) 170 | 171 | 172 | def main(): 173 | """Run main app""" 174 | options = parse_options() 175 | grd = GridSearch(cache_dir=options.cache_dir) 176 | if options.list_countries: 177 | grd.nat_earth.list_countries() 178 | sys.exit() 179 | grd.generate_grid( 180 | bbox=options.bbox, 181 | countries=options.countries, 182 | radial_clip=options.radial_clip, 183 | buffer=options.buffer, 184 | search_radius=options.search_radius, 185 | local_crs_epsg=options.local_crs_epsg, 186 | save_to=options.outfile, 187 | show=options.show, 188 | ) 189 | 190 | 191 | if __name__ == "__main__": 192 | fmt = "%(asctime)s [%(levelname)s] - %(message)s (%(filename)s:%(funcName)s)" 193 | datefmt = "%Y-%m-%dT%H:%M:%SZ" 194 | logging.basicConfig(format=fmt, 
datefmt=datefmt, level=os.environ.get("LOGLEVEL", "INFO")) 195 | main() 196 | 197 | 198 | def query_yes_no(question: str, default: Optional[str] = "yes") -> bool: 199 | """Ask a yes/no question via input() and return the answer as boolean. 200 | 201 | Args: 202 | question: 203 | The question presented to the user. 204 | default: 205 | The presumed answer if the user just hits . It must be "yes" (the default), "no" 206 | or None (meaning an answer is required of the user). 207 | 208 | Returns: 209 | Return value is True for "yes" or False for "no". 210 | """ 211 | valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} 212 | if default is None: 213 | prompt = " [y/n] " 214 | elif default == "yes": 215 | prompt = " [Y/n] " 216 | elif default == "no": 217 | prompt = " [y/N] " 218 | else: 219 | raise ValueError("invalid default answer: '%s'" % default) 220 | while True: 221 | sys.stdout.write(question + prompt) 222 | choice = input().lower() 223 | if default is not None and choice == "": 224 | return valid[default] 225 | elif choice in valid: 226 | return valid[choice] 227 | else: 228 | sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n") 229 | -------------------------------------------------------------------------------- /pvoutput/grid_search/clip.py: -------------------------------------------------------------------------------- 1 | """Clipping function for coordinates""" 2 | 3 | from typing import Iterable, Optional, Tuple, Union 4 | 5 | import geopandas as gpd 6 | import pandas as pd 7 | from shapely.geometry import Point, Polygon 8 | 9 | 10 | def clip_to_radius( 11 | coords: Union[pd.DataFrame, gpd.GeoDataFrame], 12 | latitude: float, 13 | longitude: float, 14 | radius: Optional[float] = None, 15 | search_radius: Optional[float] = None, 16 | local_crs_epsg: int = 4087, 17 | ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: 18 | """Clip coordinates to a radius. 19 | 20 | Remove any coordinates which do not lie within x. 21 | 22 | Args: 23 | coords: 24 | A pandas DataFrame or geopandas GeoDataFrame of coordinates with columns: latitude, 25 | longitude. 26 | latitude: 27 | Latitude of the center of the radial search. 28 | longitude: 29 | Longitude of the center of the radial search. 30 | radius: 31 | Set the radial search limit in km. 32 | search_radius: 33 | Optionally set the radial search limit around each grid point in kilometers. If set, the 34 | code will consider coords to be included if any part of the search radius overlaps the 35 | outter radius. 36 | local_crs_epsg: 37 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 38 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 39 | British Isles. 40 | 41 | Returns: 42 | As per `coords` but containing only the subset of the input coordinates which fall within 43 | `radius` km of the lat/lon. 
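    Typical usage example (illustrative values; assumes `coords` is a GeoDataFrame
    of point geometries with its CRS set):
        nearby = clip_to_radius(coords, latitude=52.0, longitude=-1.5, radius=50)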
44 | """ 45 | center = Point(longitude, latitude) 46 | radius_ = ( 47 | gpd.GeoSeries(center) 48 | .set_crs("EPSG:4326") 49 | .to_crs(f"EPSG:{local_crs_epsg}") 50 | .buffer(radius * 1000.0)[0] 51 | ) 52 | if search_radius is None: 53 | coords["selected"] = coords.within(radius_) 54 | else: 55 | coords_ = coords.to_crs(f"EPSG:{local_crs_epsg}").buffer(search_radius * 1000.0) 56 | coords["selected"] = coords_.intersects(radius_) 57 | return coords.loc[coords["selected"]].drop(columns="selected") 58 | 59 | 60 | def bounding_box_from_radius( 61 | latitude: float, longitude: float, radius: float, local_crs_epsg: int = 4087 62 | ) -> Tuple[float]: 63 | """Convert a radial search around a given lat/lon to a bounding box. 64 | 65 | Args: 66 | latitude: 67 | Latitude of the center of the radial search. 68 | longitude: 69 | Longitude of the center of the radial search. 70 | radius: 71 | Set the radial search limit in km. 72 | local_crs_epsg: 73 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 74 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 75 | British Isles. 76 | 77 | Returns: 78 | Four element tuple defining a bounding box: (min_lat, min_lon, max_lat, max_lon). 79 | """ 80 | center = Point(longitude, latitude) 81 | search_radius = ( 82 | gpd.GeoSeries(center) 83 | .set_crs("EPSG:4326") 84 | .to_crs(f"EPSG:{local_crs_epsg}") 85 | .buffer(radius * 1000.0) 86 | ) 87 | bounds = search_radius.to_crs("EPSG:4326").bounds.loc[0].to_numpy() 88 | return bounds[1], bounds[0], bounds[3], bounds[2] 89 | 90 | 91 | def buffer_bounding_box_bounds( 92 | bbox: Optional[Iterable[float]] = None, buffer: float = 0, local_crs_epsg: int = 4087 93 | ) -> Tuple[float]: 94 | """Buffer a bounding box by a distance in km. 95 | 96 | Args: 97 | bbox: 98 | Four element iterable defining a bounding box: [min_lat, min_lon, max_lat, max_lon]. 99 | buffer: 100 | Optionally buffer the country boundaries before clipping, in kilometers. 101 | local_crs_epsg: 102 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 103 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 104 | British Isles. 105 | 106 | Returns: 107 | Four element tuple defining a bounding box: (min_lat, min_lon, max_lat, max_lon). 108 | """ 109 | bbox_ = buffer_bounding_box(bbox, buffer, local_crs_epsg) 110 | new_bounds = bbox_.to_crs("EPSG:4326").bounds.loc[0].to_numpy() 111 | return new_bounds[1], new_bounds[0], new_bounds[3], new_bounds[2] 112 | 113 | 114 | def buffer_bounding_box( 115 | bbox: Optional[Iterable[float]] = None, buffer: float = 0, local_crs_epsg: int = 4087 116 | ) -> gpd.GeoSeries: 117 | """Buffer a bounding box by a distance in km. 118 | 119 | Args: 120 | bbox: 121 | Four element iterable defining a bounding box: [min_lat, min_lon, max_lat, max_lon]. 122 | buffer: 123 | Optionally buffer the country boundaries before clipping, in kilometers. 124 | local_crs_epsg: 125 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 126 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 127 | British Isles. 128 | 129 | Returns: 130 | A geopandas GeoSeries containing the buffered geometry. 
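    Typical usage example (illustrative values; roughly a UK bounding box buffered
    by 10 km):
        buffered = buffer_bounding_box(bbox=[49.9, -8.2, 60.9, 1.8], buffer=10)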
131 | """ 132 | bbox_ = Polygon( 133 | [(bbox[1], bbox[0]), (bbox[1], bbox[2]), (bbox[3], bbox[2]), (bbox[3], bbox[0])] 134 | ) 135 | bbox_ = ( 136 | gpd.GeoSeries(bbox_) 137 | .set_crs("EPSG:4326") 138 | .to_crs(f"EPSG:{local_crs_epsg}") 139 | .buffer(buffer * 1000.0) 140 | ) 141 | return bbox_ 142 | 143 | 144 | def clip_to_bbox( 145 | coords: Union[pd.DataFrame, gpd.GeoDataFrame], 146 | bbox: Iterable[float], 147 | buffer: float = 0, 148 | search_radius: Optional[float] = None, 149 | local_crs_epsg: int = 4087, 150 | ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: 151 | """Clip coordinates to bounding box. 152 | 153 | Remove any coordinates which do not lie inside a bounding box. 154 | 155 | Args: 156 | coords: 157 | A pandas DataFrame or geopandas GeoDataFrame of coordinates with columns: latitude, 158 | longitude. 159 | bbox: 160 | Four element iterable defining a bounding box: [min_lat, min_lon, max_lat, max_lon]. 161 | buffer: 162 | Optionally buffer the country boundaries before clipping, in kilometers. 163 | search_radius: 164 | Optionally set the radial search limit around each grid point in kilometers. If set, the 165 | code will consider coords to be included if any part of the search radius overlaps the 166 | bounding box. 167 | local_crs_epsg: 168 | Optionally provide the EPSG code of a local Co-ordinate Reference System (CRS) for 169 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching the 170 | British Isles. 171 | 172 | Returns: 173 | As per `coords` but containing only the subset of the input coordinates which fall within 174 | `buffer` km of the bounding box. 175 | """ 176 | if search_radius is None: 177 | # TODO why is bounds not used 178 | bounds = buffer_bounding_box_bounds(bbox, buffer, local_crs_epsg) # noqa 179 | coords["selected"] = (bbox[0] <= coords.latitude <= bbox[2]) & ( 180 | bbox[1] <= coords.longitude <= bbox[3] 181 | ) 182 | else: 183 | bbox_ = buffer_bounding_box(bbox, buffer, local_crs_epsg)[0] 184 | coords_ = coords.to_crs(f"EPSG:{local_crs_epsg}").buffer(search_radius * 1000.0) 185 | coords["selected"] = coords_.intersects(bbox_) 186 | return coords.loc[coords["selected"]].drop(columns="selected") 187 | 188 | 189 | def clip_to_countries( 190 | coords: gpd.GeoDataFrame, 191 | world: gpd.GeoDataFrame, 192 | countries: Iterable[str], 193 | buffer: float = 0, 194 | search_radius: Optional[float] = None, 195 | ) -> gpd.GeoDataFrame: 196 | """Clip coordinates to country boundaries. 197 | 198 | Given a set of coordinates, some country boundary definitions, a list of countries, and a buffer 199 | distance, return the coords which fall within `buffer` kilometers of the listed countries' 200 | boundaries. 201 | 202 | Args: 203 | coords: 204 | A geopandas GeoDataFrame containing latitudes, longitudes and geometries for a set of 205 | coordinates. 206 | world: 207 | A geopandas GeoDataFrame of world boundaries geomteries, as returned by 208 | `get_world_boundaries()`. 209 | countries: 210 | A list of country names to clip the coords to. 211 | buffer: 212 | Optionally buffer the country boundaries before clipping, in kilometers. 213 | search_radius: 214 | Optionally set the radial search limit around each grid point in kilometers. If set, the 215 | code will consider coords to be included if any part of the search radius overlaps the 216 | country. 217 | 218 | Returns: 219 | As per `coords` but containing only the subset of the input coordinates which fall within 220 | `buffer` km of the given countries. 
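    Typical usage example (illustrative; assumes `world` comes from
    `NaturalEarth.get_hires_world_boundaries()` and `coords` shares its CRS):
        uk_coords = clip_to_countries(coords, world, countries=["United Kingdom"], buffer=5)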
221 | """ 222 | countries_ = world[world.name.isin(countries)] 223 | countries_ = countries_.dissolve().buffer(buffer * 1000.0)[0] 224 | if search_radius is None: 225 | coords["selected"] = coords.within(countries_) 226 | else: 227 | # Consider points outside the selected region whose search radius overlaps the region 228 | coords_ = coords.buffer(search_radius * 1000.0) 229 | coords["selected"] = coords_.intersects(countries_) 230 | return coords.loc[coords["selected"]].drop(columns="selected") 231 | -------------------------------------------------------------------------------- /pvoutput/grid_search/grid_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Grid Search Class used to make grid of latitude and longitude coordinates 3 | """ 4 | 5 | from typing import Iterable, Optional 6 | 7 | import geopandas as gpd 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import pandas as pd 11 | from pyproj import Transformer 12 | 13 | from pvoutput.grid_search.clip import ( 14 | bounding_box_from_radius, 15 | buffer_bounding_box_bounds, 16 | clip_to_bbox, 17 | clip_to_countries, 18 | clip_to_radius, 19 | ) 20 | from pvoutput.grid_search.natural_earth import NaturalEarth 21 | 22 | 23 | class GridSearch: 24 | """A class for generating a gridded search.""" 25 | 26 | def __init__(self, cache_dir: str = None) -> None: 27 | """Initialise. 28 | 29 | Args: 30 | cache_dir: Optionally provide a location to cache boundary 31 | definition files locally and avoid unnecsessary downloads. 32 | """ 33 | self.nat_earth = NaturalEarth(cache_dir) 34 | 35 | def plot_grid( 36 | self, 37 | coords: gpd.GeoDataFrame, 38 | countries: Iterable[str], 39 | bbox: Optional[Iterable[float]] = None, 40 | local_crs_epsg: int = 4087, 41 | filename: Optional[str] = None, 42 | ) -> None: 43 | """Plot grid coordinates. 44 | 45 | Plot grid coordinates over world boundaries with selected countries highlighted. 46 | 47 | Args: 48 | coords: A geopandas GeoDataFrame containing latitudes, longitudes and geometries 49 | for a set of coordinates. 50 | countries: A list of country names to clip the coords to. 51 | bbox: Optionally pass a four element iterable defining a bounding box: 52 | [min_lat, min_lon, max_lat, max_lon]. This will be used to set the scale of the 53 | plot. 54 | local_crs_epsg: Optionally provide the EPSG code of a local 55 | Co-ordinate Reference System (CRS) for improved accuracy. 56 | e.g. set to 27700 (OSGB36 / British National Grid) if searching 57 | the British Isles. The default is EPSG:4087 (a.k.a WGS 84 / World Equidistant 58 | Cylindrical), which works globally but with less accuracy. 59 | filename: Optionally pass a filename (relative or absolute) to save the plot to. 60 | Image format should be set using the file extension (i.e. .jpeg, .png or .svg). 
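        Typical usage example (illustrative; `grd` is a GridSearch instance):
            grd.plot_grid(coords, countries=["United Kingdom"], filename="grid.png")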
61 | """ 62 | world, _ = self.nat_earth.get_hires_world_boundaries() 63 | world.to_crs(f"EPSG:{local_crs_epsg}", inplace=True) 64 | coords.to_crs(f"EPSG:{local_crs_epsg}", inplace=True) 65 | if bbox is None: 66 | bbox = [ 67 | coords.latitude.min(), 68 | coords.longitude.min(), 69 | coords.latitude.max(), 70 | coords.longitude.max(), 71 | ] 72 | f, ax = plt.subplots() 73 | world.plot( 74 | ax=ax, color="palegreen", edgecolor="black", linewidth=1, label="World", zorder=1 75 | ) 76 | if countries is not None: 77 | selected = world[world.name.isin(countries)] 78 | selected.geometry.boundary.plot( 79 | ax=ax, color=None, edgecolor="gold", label="Selected countries", zorder=2 80 | ) 81 | coords.plot(ax=ax, marker="o", color="red", markersize=2, label="Grid", zorder=3) 82 | xmin = coords.geometry.bounds.minx.min() 83 | xmax = coords.geometry.bounds.maxx.max() 84 | ymin = coords.geometry.bounds.miny.min() 85 | ymax = coords.geometry.bounds.maxy.max() 86 | xpadding = (xmax - xmin) / 8 87 | ypadding = (ymax - ymin) / 8 88 | ax.set_xlim(xmin - xpadding, xmax + xpadding) 89 | ax.set_ylim(ymin - ypadding, ymax + ypadding) 90 | ax.axes.xaxis.set_visible(False) 91 | ax.axes.yaxis.set_visible(False) 92 | plt.legend(prop={"size": 6}) 93 | plt.show() 94 | if filename is not None: 95 | plt.savefig(filename, dpi=300) 96 | 97 | def generate_grid( 98 | self, 99 | bbox: Optional[Iterable[float]] = None, 100 | countries: Optional[Iterable[str]] = None, 101 | radial_clip: Optional[Iterable[float]] = None, 102 | buffer: float = 0, 103 | search_radius: float = 25, 104 | local_crs_epsg: int = 4087, 105 | save_to: Optional[str] = None, 106 | show: bool = False, 107 | ): 108 | """Use hexagonal tiling to generate a grid search with minimal overlap. 109 | 110 | Create a set of gridded coordinates which use hexagonal tiling as an efficient way to 111 | conduct a fixed-radius search of a region defined by a bounding box and/or country borders. 112 | 113 | Args: 114 | bbox: 115 | Optionally pass a four element iterable defining a bounding box: 116 | [min_lat, min_lon, max_lat, max_lon]. 117 | countries: 118 | Optionally pass a list of country names to clip the coords to. 119 | radial_clip: 120 | Optionally set a radial boundary to clip to. Pass a three element iterable 121 | containing: (, , ). 122 | buffer: 123 | Optionally buffer the bounding box and country boundaries before clipping, in 124 | kilometers. 125 | search_radius: 126 | Optionally set the radial search limit around each grid point in kilometers. 127 | Defaults to 25km. 128 | local_crs_epsg: 129 | Optionally provide the EPSG code of a local co-ordinate Reference System (CRS) for 130 | improved accuracy. e.g. set to 27700 (OSGB36 / British National Grid) if searching 131 | the British Isles. The default is EPSG:4087 (a.k.a WGS 84 / World Equidistant 132 | Cylindrical), which works globally but with poor accuracy in some locations. 133 | save_to: 134 | Optionally specify a filename to save the grid co-ordinates to (CSV). 135 | show: 136 | Set to True to show a plot of the grid. 137 | 138 | Returns: 139 | A pandas DataFrame containing co-ordinates for the grid with columns: latitude, 140 | longitude. 
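        Typical usage example (illustrative; country names must match the Natural
        Earth dataset, which `NaturalEarth.list_countries()` can print):
            grd = GridSearch()
            coords = grd.generate_grid(countries=["United Kingdom"], search_radius=25)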
141 | """ 142 | # get countries 143 | world, all_countries = self.nat_earth.get_hires_world_boundaries() 144 | countries = all_countries if countries is None else countries 145 | 146 | # create bounding box 147 | if bbox is None: 148 | if radial_clip is None: 149 | bbox = ( 150 | world[world.name.isin(countries)] 151 | .dissolve() 152 | .buffer(buffer * 1000.0) 153 | .to_crs("EPSG:4326") 154 | .bounds.to_numpy()[0] 155 | ) 156 | bbox = [bbox[1], bbox[0], bbox[3], bbox[2]] 157 | else: 158 | bbox = bounding_box_from_radius( 159 | radial_clip[0], radial_clip[1], radial_clip[2], local_crs_epsg 160 | ) 161 | bbox = buffer_bounding_box_bounds(bbox, buffer) 162 | bounds = [np.round(b, 5) for b in bbox] 163 | 164 | # create x and y bounds 165 | search_radius_m = search_radius * 1000.0 166 | wgs84_to_projected = Transformer.from_crs(4326, local_crs_epsg, always_xy=True) 167 | projected_to_wgs84 = Transformer.from_crs(local_crs_epsg, 4326, always_xy=True) 168 | xmin, ymin = wgs84_to_projected.transform(bounds[1], bounds[0]) 169 | xmax, ymax = wgs84_to_projected.transform(bounds[3], bounds[2]) 170 | y_interval = search_radius_m * np.cos(np.radians(30)) 171 | x_interval = search_radius_m * 3 172 | x_offset = 0 173 | 174 | # create coordinates 175 | coords = [] 176 | for y in np.arange(ymin - y_interval * 3, ymax + y_interval + search_radius_m, y_interval): 177 | xmin_ = xmin - search_radius_m - x_offset 178 | xmax_ = xmax + x_interval + search_radius_m + x_offset 179 | for x in np.arange(xmin_, xmax_, x_interval): 180 | coords.append(projected_to_wgs84.transform(x, y)) 181 | if x_offset == 0: 182 | x_offset = search_radius_m * 1.5 183 | else: 184 | x_offset = 0 185 | coords = pd.DataFrame(coords, columns=["longitude", "latitude"]) 186 | coords = gpd.GeoDataFrame( 187 | coords, geometry=gpd.points_from_xy(coords.longitude, coords.latitude) 188 | ) 189 | coords = coords.set_crs("EPSG:4326").to_crs("EPSG:4087") 190 | coords = clip_to_countries( 191 | coords=coords, 192 | world=world, 193 | countries=countries, 194 | buffer=buffer, 195 | search_radius=search_radius, 196 | ) 197 | coords = clip_to_bbox( 198 | coords=coords, 199 | bbox=bbox, 200 | buffer=buffer, 201 | search_radius=search_radius, 202 | local_crs_epsg=local_crs_epsg, 203 | ) 204 | if radial_clip is not None: 205 | coords = clip_to_radius( 206 | coords=coords, 207 | latitude=radial_clip[0], 208 | longitude=radial_clip[1], 209 | radius=radial_clip[2], 210 | search_radius=search_radius, 211 | local_crs_epsg=local_crs_epsg, 212 | ) 213 | 214 | # show plot 215 | if show: 216 | self.plot_grid( 217 | coords, 218 | countries, 219 | bbox, 220 | ) 221 | 222 | # save plots 223 | if save_to is not None: 224 | coords.to_csv( 225 | save_to, float_format="%.5f", index=False, columns=["longitude", "latitude"] 226 | ) 227 | return coords[["latitude", "longitude"]].reset_index(drop=True) 228 | -------------------------------------------------------------------------------- /pvoutput/grid_search/natural_earth.py: -------------------------------------------------------------------------------- 1 | """Retrieve Natural Earth world boundaries.""" 2 | 3 | import logging 4 | import os 5 | from io import BytesIO 6 | from typing import Tuple 7 | 8 | import geopandas as gpd 9 | import requests 10 | from numpy.typing import NDArray 11 | 12 | 13 | class NaturalEarth: 14 | """Retrieve Natural Earth world boundaries.""" 15 | 16 | def __init__(self, cache_dir: str = None) -> None: 17 | """Initialise. 
18 | 19 | Args: 20 | cache_dir: 21 | Optionally provide a location to cache boundary definition files locally and avoid 22 | unnecsessary downloads. 23 | 24 | Raises: 25 | ValueError: If the cache_dir does not exist. 26 | """ 27 | self.cache_dir = cache_dir 28 | if self.cache_dir is not None: 29 | if not os.path.isdir(cache_dir): 30 | logging.error("The cache_dir does not exist.") 31 | raise ValueError("The cache_dir does not exist.") 32 | self.world_hires = None 33 | self.countries_hires = None 34 | self.world_lores = None 35 | self.countries_lores = None 36 | 37 | def get_hires_world_boundaries(self) -> Tuple[gpd.GeoDataFrame, NDArray]: 38 | """Load high res world boundaries. 39 | 40 | Download the high resolution country boundaries GIS file from the Natural Earth website and 41 | optionally cache locally. 42 | 43 | Returns: 44 | A tuple containing (`world`, `countries`). `world` is a Geopandas GeoDataFrame with 45 | geometries and metadata for all country borders. `countries` is a list of unique country 46 | names for which geometries exist. Boundaries will be in the EPSG:4087 projected CRS. 47 | 48 | Typical usage example: 49 | world, countries = get_world_boundaries() 50 | """ 51 | if self.world_hires is not None and self.countries_hires is not None: 52 | return self.world_hires, self.countries_hires 53 | if self.cache_dir: 54 | cache_file = os.path.join(self.cache_dir, "ne_10m_admin_0_countries.zip") 55 | else: 56 | cache_file = None 57 | if cache_file is not None and os.path.isfile(cache_file): 58 | data = cache_file 59 | else: 60 | headers = { 61 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " 62 | "(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" 63 | } 64 | url = ( 65 | "https://www.naturalearthdata.com/http//www.naturalearthdata.com/" 66 | "download/10m/cultural/ne_10m_admin_0_countries.zip" 67 | ) 68 | req = requests.get(url, headers=headers) 69 | data = BytesIO(req.content) 70 | if cache_file is not None: 71 | with open(cache_file, "wb") as fid: 72 | fid.write(req.content) 73 | self.world_hires = gpd.read_file(data).to_crs("EPSG:4087") 74 | cols2keep = {"NAME": "name", "CONTINENT": "continent", "geometry": "geometry"} 75 | self.world_hires = self.world_hires[list(cols2keep.keys())].rename(columns=cols2keep) 76 | self.countries_hires = self.world_hires.name.unique() 77 | return self.world_hires, self.countries_hires 78 | 79 | def get_lores_world_boundaries(self) -> Tuple[gpd.GeoDataFrame, NDArray]: 80 | """Load low resolution world boundaries. 81 | 82 | Load the low res world boundaries GIS file (`naturalearth_lowres`) from Geopandas datasets. 83 | Useful for visualisations and/or to speed up computation. 84 | 85 | Returns: 86 | A tuple containing (`world`, `countries`). `world` is a Geopandas GeoDataFrame with 87 | geometries and metadata for all country borders. `countries` is a list of unique country 88 | names for which geometries exist. Boundaries will be in the EPSG:4087 projected CRS. 
89 | 90 | Typical usage example: 91 | world, countries = get_world_boundaries() 92 | """ 93 | if self.world_lores is not None and self.countries_lores is not None: 94 | return self.world_lores, self.countries_lores 95 | self.world_lores = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).to_crs( 96 | "EPSG:4087" 97 | ) 98 | self.world_lores.drop(columns=["pop_est", "iso_a3", "gdp_md_est"], inplace=True) 99 | self.countries = self.world_lores.name.unique() 100 | return self.world_lores, self.countries_lores 101 | 102 | def list_countries(self, res: str = "hires") -> Tuple[gpd.GeoDataFrame, NDArray]: 103 | """Print a list of country names to stdout. 104 | 105 | Print a list of country names whose geometries are available in the world boundaries GIS 106 | file. 107 | 108 | Args: 109 | res: 110 | Optionally switch between 'hires' and 'lores', although in theory both country lists 111 | should be identical, there may be some countries that are not included in the lores 112 | boundaries due to downsampling of borders. Names may also have changed between the 113 | two (e.g. Macedonia -> North Macedonia). 114 | 115 | Raises: 116 | ValueError: If `res` is not one of: 'lores', 'hires'. 117 | """ 118 | if res == "hires": 119 | _, countries = self.get_hires_world_boundaries() 120 | elif res == "lores": 121 | _, countries = self.get_lores_world_boundaries() 122 | else: 123 | logging.error("The `res` arg should be one of: 'lores', 'hires'.") 124 | raise ValueError("The `res` arg should be one of: 'lores', 'hires'.") 125 | countries.sort() 126 | print(f"Available countries are:\n{', '.join(countries)}") 127 | 128 | return countries 129 | -------------------------------------------------------------------------------- /pvoutput/mapscraper.py: -------------------------------------------------------------------------------- 1 | """Code for scraping for pv systems""" 2 | 3 | import re 4 | from copy import copy 5 | from typing import Iterable, Optional, Union 6 | 7 | import pandas as pd 8 | import requests 9 | from bs4 import BeautifulSoup 10 | 11 | from pvoutput.consts import ( 12 | MAP_URL, 13 | PV_OUTPUT_COUNTRY_CODES, 14 | PV_OUTPUT_MAP_COLUMN_NAMES, 15 | REGIONS_URL, 16 | ) 17 | 18 | _MAX_NUM_PAGES = 1024 19 | 20 | 21 | def get_pv_systems_for_country( 22 | country: Union[str, int], 23 | ascending: Optional[bool] = None, 24 | sort_by: Optional[str] = None, 25 | max_pages: int = _MAX_NUM_PAGES, 26 | region: Optional[str] = None, 27 | ) -> pd.DataFrame: 28 | """ 29 | Get all pv systems for on country 30 | 31 | Args: 32 | country: either a string such as 'United Kingdom' 33 | (see consts.PV_OUTPUT_COUNTRY_CODES for all recognised strings), 34 | or a PVOutput.org country code, in the range [1, 257]. 35 | ascending: if True, ask PVOutput.org to sort results by ascending. 36 | If False, sort by descending. If None, use PVOutput.org's default 37 | sort order. 38 | sort_by: The column to ask PVOutput.org to sort by. One of: 39 | timeseries_duration, 40 | average_generation_per_day, 41 | efficiency, 42 | power_generation, 43 | capacity, 44 | address, 45 | name 46 | max_pages: The maximum number of search pages to scrape. 
47 | region: Optional input, #TODO 48 | 49 | Returns: pd.DataFrame with index system_id (int) and these columns: 50 | name, system_DC_capacity_W, panel, inverter, address, orientation, 51 | array_tilt_degrees, shade, timeseries_duration, 52 | total_energy_gen_Wh, average_daily_energy_gen_Wh 53 | average_efficiency_kWh_per_kW 54 | """ 55 | country_code = _convert_to_country_code(country) 56 | regions = [region] if region else get_regions_for_country(country_code) 57 | all_metadata = [] 58 | for region in regions: 59 | for page_number in range(max_pages): 60 | print( 61 | "\rReading page {:2d} for region: {}".format(page_number, region), 62 | end="", 63 | flush=True, 64 | ) 65 | url = _create_map_url( 66 | country_code=country_code, 67 | page_number=page_number, 68 | ascending=ascending, 69 | sort_by=sort_by, 70 | region=region, 71 | ) 72 | soup = get_soup(url) 73 | if _page_is_blank(soup): 74 | break 75 | metadata = _process_metadata(soup) 76 | metadata["region"] = region 77 | all_metadata.append(metadata) 78 | 79 | if not _page_has_next_link(soup): 80 | break 81 | 82 | return pd.concat(all_metadata) 83 | 84 | 85 | # ########### LOAD HTML ################### 86 | 87 | 88 | def _create_map_url( 89 | country_code: Optional[int] = None, 90 | page_number: Optional[int] = None, 91 | ascending: Optional[bool] = None, 92 | sort_by: Optional[str] = None, 93 | region: Optional[str] = None, 94 | ) -> str: 95 | """ 96 | Create a map URL 97 | 98 | Args: 99 | country_code: Country code 100 | page_number: Get this page number of the search results. Zero-indexed. 101 | The first page is page 0, the second page is page 1, etc. 102 | ascending: option for ascending or descending 103 | sort_by: sort results by (optional) 104 | region: region of country (optional) 105 | 106 | """ 107 | _check_country_code(country_code) 108 | 109 | if ascending is None: 110 | sort_order = None 111 | else: 112 | sort_order = "asc" if ascending else "desc" 113 | 114 | if sort_by is None: 115 | sort_by_pv_output_col_name = None 116 | else: 117 | try: 118 | sort_by_pv_output_col_name = PV_OUTPUT_MAP_COLUMN_NAMES[sort_by] 119 | except KeyError: 120 | raise ValueError("sort_by must be one of {}".format(PV_OUTPUT_MAP_COLUMN_NAMES.keys())) 121 | 122 | url_params = { 123 | "country": country_code, 124 | "p": page_number, 125 | "d": sort_order, 126 | "o": sort_by_pv_output_col_name, 127 | "region": region, 128 | } 129 | 130 | url_params_list = [ 131 | "{}={}".format(key, value) for key, value in url_params.items() if value is not None 132 | ] 133 | query_string = "&".join(url_params_list) 134 | url = copy(MAP_URL) 135 | if query_string: 136 | url += "?" + query_string 137 | return url 138 | 139 | 140 | def _raise_country_error(country, msg=""): 141 | country_codes = PV_OUTPUT_COUNTRY_CODES.values() 142 | raise ValueError( 143 | "Wrong value country='{}'. {}country must be an integer country" 144 | " code in the range [{}, {}], or one of {}.".format( 145 | country, 146 | msg, 147 | min(country_codes), 148 | max(country_codes), 149 | ", ".join(PV_OUTPUT_COUNTRY_CODES.keys()), 150 | ) 151 | ) 152 | 153 | 154 | def _check_country_code(country_code: Union[None, int]): 155 | if country_code is None: 156 | return 157 | country_codes = PV_OUTPUT_COUNTRY_CODES.values() 158 | if not min(country_codes) <= country_code <= max(country_codes): 159 | _raise_country_error(country_code, "country outside of valid range! 
") 160 | 161 | 162 | def _convert_to_country_code(country: Union[str, int]) -> int: 163 | if isinstance(country, str): 164 | try: 165 | return PV_OUTPUT_COUNTRY_CODES[country] 166 | except KeyError: 167 | _raise_country_error(country) 168 | 169 | elif isinstance(country, int): 170 | _check_country_code(country) 171 | return country 172 | 173 | 174 | def _page_has_next_link(soup: BeautifulSoup): 175 | return bool(soup.find_all("a", text="Next")) 176 | 177 | 178 | # ############ PROCESS HTML ######################### 179 | 180 | 181 | def _process_metadata(soup: BeautifulSoup, return_constituents=False) -> pd.DataFrame: 182 | pv_system_size_metadata = _process_system_size_col(soup) 183 | index = pv_system_size_metadata.index 184 | pv_systems_metadata = [ 185 | pv_system_size_metadata, 186 | _process_output_col(soup, index), 187 | _process_generation_and_average_cols(soup, index), 188 | _process_efficiency_col(soup, index), 189 | ] 190 | 191 | df = pd.concat(pv_systems_metadata, axis="columns") 192 | df = _convert_metadata_cols_to_numeric(df) 193 | df["system_DC_capacity_W"] = df["capacity_kW"] * 1e3 194 | del df["capacity_kW"] 195 | if return_constituents: 196 | pv_systems_metadata.append(df) 197 | return tuple(pv_systems_metadata) 198 | return df 199 | 200 | 201 | def _process_system_size_col(soup: BeautifulSoup) -> pd.DataFrame: 202 | pv_system_size_col = soup.find_all("a", href=re.compile(r"display\.jsp\?sid=")) 203 | metadata = [] 204 | for row in pv_system_size_col: 205 | metadata_for_row = {} 206 | 207 | # Get system ID 208 | href = row.attrs["href"] 209 | p = re.compile(r"^display\.jsp\?sid=(\d+)$") 210 | href_match = p.match(href) 211 | metadata_for_row["system_id"] = href_match.group(1) 212 | 213 | # Process title (lots of metadata in here!) 214 | title, title_meta = row.attrs["title"].split("|") 215 | 216 | # Name and capacity 217 | p = re.compile(r"(.*) (\d+\.\d+kW)") 218 | title_match = p.match(title) 219 | metadata_for_row["name"] = title_match.group(1) 220 | metadata_for_row["capacity"] = title_match.group(2) 221 | 222 | # Other key-value pairs: 223 | key_value = title_meta.split("
") 224 | key_value_dict = {} 225 | for line in key_value: 226 | key_value_split = line.split(":") 227 | key = key_value_split[0].strip() 228 | # Some values have a colon(!) 229 | value = ":".join(key_value_split[1:]).strip() 230 | key_value_dict[key] = value 231 | metadata_for_row.update(key_value_dict) 232 | 233 | # Some cleaning 234 | # Remove from Location 235 | location = metadata_for_row["Location"] 236 | p = re.compile(r"()?(.*)") 237 | img_groups = p.search(location).groups() 238 | if img_groups[0] is not None: 239 | metadata_for_row["Location"] = img_groups[1].strip() 240 | 241 | metadata.append(metadata_for_row) 242 | 243 | df = pd.DataFrame(metadata) 244 | df["system_id"] = pd.to_numeric(df["system_id"]) 245 | df = df.set_index("system_id") 246 | df.columns = [col_name.lower() for col_name in df.columns] 247 | df.rename( 248 | { 249 | "location": "address", 250 | "panels": "panel", 251 | "array tilt": "array_tilt_degrees", 252 | "capacity": "capacity_kW", 253 | }, 254 | axis="columns", 255 | inplace=True, 256 | ) 257 | return df 258 | 259 | 260 | def _remove_str_and_convert_to_numeric(series: pd.Series, string_to_remove: str) -> pd.Series: 261 | series = series.str.replace(string_to_remove, "") 262 | return pd.to_numeric(series) 263 | 264 | 265 | def _convert_metadata_cols_to_numeric(df: pd.DataFrame) -> pd.DataFrame: 266 | for col_name, string_to_remove in [ 267 | # ('array_tilt_degrees', '°'), 268 | ("capacity_kW", "kW"), 269 | ("average_efficiency_kWh_per_kW", "kWh/kW"), 270 | ]: 271 | df[col_name] = _remove_str_and_convert_to_numeric(df[col_name], string_to_remove) 272 | 273 | return df 274 | 275 | 276 | def _process_output_col(soup: BeautifulSoup, index: Optional[Iterable] = None) -> pd.Series: 277 | 278 | # get all data 279 | outputs_col = soup.find_all(text=re.compile(r"\d Days")) 280 | 281 | # format data as strings 282 | outputs_col = [str(col) for col in outputs_col] 283 | 284 | # make into pandas Series 285 | duration = pd.Series(outputs_col, name="timeseries_duration", index=index) 286 | 287 | # change to timedeltas 288 | return pd.to_timedelta(duration.astype("unicode")) 289 | 290 | 291 | def _convert_energy_to_numeric_watt_hours(series: pd.Series) -> pd.Series: 292 | data = [] 293 | for unit, multiplier in [("kWh", 1e3), ("MWh", 1e6)]: 294 | selection = series[series.str.contains(unit)] 295 | selection = selection.str.replace(unit, "") 296 | selection = selection.str.replace(",", "") 297 | selection = pd.to_numeric(selection) 298 | selection *= multiplier 299 | data.append(selection) 300 | return pd.concat(data) 301 | 302 | 303 | def _process_generation_and_average_cols( 304 | soup: BeautifulSoup, index: Optional[Iterable] = None 305 | ) -> pd.DataFrame: 306 | # _soup = deepcopy(soup) 307 | [s.decompose() for s in soup.select("a")] 308 | generation_and_average_cols = soup.find_all(text=re.compile(r"\d[Mk]Wh$")) 309 | generation_col = generation_and_average_cols[0::2] 310 | average_col = generation_and_average_cols[1::2] 311 | df = pd.DataFrame( 312 | {"total_energy_gen_Wh": generation_col, "average_daily_energy_gen_Wh": average_col}, 313 | index=index, 314 | ) 315 | 316 | for col_name in df.columns: 317 | df[col_name] = _convert_energy_to_numeric_watt_hours(df[col_name]) 318 | 319 | return df 320 | 321 | 322 | def _process_efficiency_col(soup: BeautifulSoup, index: Optional[Iterable] = None) -> pd.Series: 323 | efficiency_col = soup.find_all(text=re.compile(r"\dkWh/kW")) 324 | return pd.Series(efficiency_col, name="average_efficiency_kWh_per_kW", index=index) 325 | 
326 | 327 | def _page_is_blank(soup: BeautifulSoup) -> bool: 328 | # Pages can still be blank even if the previous page has a Next Button 329 | pv_system_size_col = soup.find_all("a", href=re.compile(r"display\.jsp\?sid=")) 330 | return not bool(pv_system_size_col) 331 | 332 | 333 | def get_soup(url, raw=False, parser="html.parser"): 334 | """ 335 | Get soupt from url 336 | 337 | Args: 338 | url: URL 339 | raw: option for raw, defaulted to False 340 | parser: parser for BeautifulSoup 341 | 342 | """ 343 | response = requests.get(url) 344 | soup = BeautifulSoup(response.text, parser) 345 | if raw: 346 | return soup 347 | return clean_soup(soup) 348 | 349 | 350 | def clean_soup(soup): 351 | """Function to clean scraped soup object. 352 | 353 | Note that the downloaded soup could change over time. 354 | Args: 355 | soup: bs4.BeautifulSoup 356 | 357 | Returns: 358 | bs4.BeautifulSoup 359 | 360 | """ 361 | for script in soup.find_all("script", src=False): 362 | script.decompose() 363 | return soup 364 | 365 | 366 | def get_regions_for_country(country_code: int): 367 | """ 368 | Get regions for on countruy 369 | 370 | Args: 371 | country_code: the country code 372 | 373 | Returns: list of regions 374 | """ 375 | region_list = [] 376 | url = f"{REGIONS_URL}?country={country_code}" 377 | soup = get_soup(url, parser="lxml") 378 | region_tags = soup.find_all("a", href=re.compile(r"map\.jsp\?country=")) 379 | for row in region_tags: 380 | href = row.attrs["href"] 381 | p = re.compile(r"^map\.jsp\?country=" + str(country_code) + r"®ion=(\w+.*)$") 382 | href_match = p.match(href) 383 | region = href_match.group(1) 384 | region_list.append(region) 385 | return region_list 386 | -------------------------------------------------------------------------------- /pvoutput/prcoess.py: -------------------------------------------------------------------------------- 1 | """Function to process data""" 2 | 3 | import logging 4 | from io import StringIO 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def process_system_status(pv_system_status_text, date) -> pd.DataFrame: 13 | """ 14 | Process raw system status 15 | 16 | Args: 17 | pv_system_status_text: string of system data, like: 18 | "1234;07:45,21,255,1,2;07:50,21,255,1;07:50,21,255,1,2" 19 | date: The date this data is from 20 | 21 | Returns: dataframe of data 22 | """ 23 | 24 | # See https://pvoutput.org/help/data_services.html#data-services-get-system-status 25 | columns = [ 26 | "cumulative_energy_gen_Wh", 27 | "instantaneous_power_gen_W", 28 | "temperature_C", 29 | "voltage", 30 | ] 31 | if pv_system_status_text == "no status found": 32 | logger.debug("Text was empty so return empty dataframe") 33 | return pd.DataFrame(columns=columns + ["system_id", "datetime"]) 34 | 35 | # get system id 36 | system_id = int(pv_system_status_text.split(";")[0]) 37 | pv_system_status_text = ";".join(pv_system_status_text.split(";")[1:]) 38 | 39 | try: 40 | one_pv_system_status = pd.read_csv( 41 | StringIO(pv_system_status_text), 42 | lineterminator=";", 43 | names=["time"] + columns, 44 | dtype={col: np.float64 for col in columns}, 45 | ).sort_index() 46 | 47 | except Exception as e: 48 | 49 | # this can happen if there is only one data value and it doesnt contain all 5 columns. 
50 | # if there is many rows of data, then it seems fine 51 | if pv_system_status_text.count(";") != 0: 52 | # the data contains more than one row, so lets raise the error 53 | raise e 54 | 55 | # how many columns does it have 56 | n_columns = pv_system_status_text.count(",") + 1 57 | 58 | one_pv_system_status = pd.read_csv( 59 | StringIO(pv_system_status_text), 60 | lineterminator=";", 61 | names=["time"] + columns[: n_columns - 1], 62 | dtype={col: np.float64 for col in columns}, 63 | ).sort_index() 64 | 65 | missing_columns = [c for c in columns if c not in one_pv_system_status.columns] 66 | one_pv_system_status[missing_columns] = np.NAN 67 | 68 | # process dataframe 69 | one_pv_system_status["system_id"] = system_id 70 | 71 | # format date 72 | one_pv_system_status["date"] = pd.to_datetime(date) 73 | one_pv_system_status = join_date_time(one_pv_system_status) 74 | 75 | return one_pv_system_status 76 | 77 | 78 | def join_date_time(one_pv_system_status: pd.DataFrame, time_format="%H:%M:%S"): 79 | """ 80 | Join date and time columns toegther 81 | 82 | Args: 83 | one_pv_system_status: dataframe with 'date' and 'time' 84 | time_format: format of time 85 | 86 | Returns: dataframe with column datetime 87 | """ 88 | 89 | # fix midnight 90 | fix_midnight_index = one_pv_system_status["time"] == "24:00" 91 | one_pv_system_status.loc[fix_midnight_index, "time"] = "00:00" 92 | 93 | # format time 94 | one_pv_system_status["time"] = pd.to_datetime(one_pv_system_status["time"]).dt.strftime( 95 | time_format 96 | ) 97 | one_pv_system_status["time"] = pd.to_timedelta(one_pv_system_status["time"]) 98 | 99 | # format date 100 | one_pv_system_status["date"] = pd.to_datetime(one_pv_system_status["date"].astype(str)) 101 | 102 | # make datetime 103 | one_pv_system_status["datetime"] = one_pv_system_status["date"] + one_pv_system_status["time"] 104 | one_pv_system_status.drop(columns=["date", "time"], inplace=True) 105 | one_pv_system_status.sort_values(by="datetime", inplace=True) 106 | 107 | one_pv_system_status.set_index("datetime", inplace=True, drop=True) 108 | 109 | return one_pv_system_status 110 | 111 | 112 | def process_batch_status(pv_system_status_text) -> pd.DataFrame: 113 | """ 114 | Process batch status text 115 | 116 | Args: 117 | pv_system_status_text: text to be procssed 118 | 119 | Returns: dataframe of data 120 | 121 | """ 122 | # See https://pvoutput.org/help.html#dataservice-getbatchstatus 123 | 124 | # PVOutput uses a non-standard format for the data. The text 125 | # needs some processing before it can be read as a CSV. 
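    # Illustrative example (made-up values): a raw line such as
    #     "20190809;07:45,100,250,11.1,230;07:50,120,260,11.2,231"
    # is expanded below into one CSV row per reading:
    #     "20190809,07:45,100,250,11.1,230"
    #     "20190809,07:50,120,260,11.2,231"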
126 | processed_lines = [] 127 | for line in pv_system_status_text.split("\n"): 128 | line_sections = line.split(";") 129 | date = line_sections[0] 130 | time_and_data = line_sections[1:] 131 | processed_line = [ 132 | "{date},{payload}".format(date=date, payload=payload) for payload in time_and_data 133 | ] 134 | processed_lines.extend(processed_line) 135 | 136 | if processed_lines: 137 | first_line = processed_lines[0] 138 | num_cols = len(first_line.split(",")) 139 | if num_cols >= 8: 140 | raise NotImplementedError("Handling of consumption data is not implemented!") 141 | 142 | processed_text = "\n".join(processed_lines) 143 | del processed_lines 144 | 145 | columns = ["cumulative_energy_gen_Wh", "instantaneous_power_gen_W", "temperature_C", "voltage"] 146 | 147 | pv_system_status = pd.read_csv( 148 | StringIO(processed_text), 149 | names=["date", "time"] + columns, 150 | # parse_dates={"datetime": ["date", "time"]}, 151 | # index_col=["datetime"], 152 | dtype={col: np.float64 for col in columns}, 153 | ).sort_index() 154 | 155 | pv_system_status = join_date_time(pv_system_status) 156 | 157 | logger.info(pv_system_status) 158 | 159 | return pv_system_status 160 | -------------------------------------------------------------------------------- /pvoutput/pvoutput.py: -------------------------------------------------------------------------------- 1 | """Main PV Output class to get data from pvoutput.org""" 2 | 3 | import logging 4 | import os 5 | import time 6 | import warnings 7 | from datetime import date, datetime, timedelta 8 | from io import StringIO 9 | from typing import Dict, Iterable, List, Optional, Union 10 | from urllib.parse import urljoin 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import requests 15 | import tables 16 | 17 | from pvoutput.consts import ( 18 | BASE_URL, 19 | CONFIG_FILENAME, 20 | ONE_DAY, 21 | PV_OUTPUT_DATE_FORMAT, 22 | RATE_LIMIT_PARAMS_TO_API_HEADERS, 23 | ) 24 | from pvoutput.daterange import DateRange, merge_date_ranges_to_years 25 | from pvoutput.exceptions import NoStatusFound, RateLimitExceeded 26 | from pvoutput.prcoess import process_batch_status, process_system_status 27 | from pvoutput.utils import ( 28 | _get_param_from_config_file, 29 | _get_response, 30 | _print_and_log, 31 | get_date_ranges_to_download, 32 | sort_and_de_dupe_pv_system, 33 | system_id_to_hdf_key, 34 | ) 35 | 36 | _LOG = logging.getLogger("pvoutput") 37 | 38 | 39 | class PVOutput: 40 | """ 41 | Main PV Output class 42 | 43 | Attributes: 44 | api_key 45 | system_id 46 | rate_limit_remaining 47 | rate_limit_total 48 | rate_limit_reset_time 49 | data_service_url 50 | """ 51 | 52 | def __init__( 53 | self, 54 | api_key: str = os.getenv("API_KEY"), 55 | system_id: str = os.getenv("SYSTEM_ID"), 56 | config_filename: Optional[str] = CONFIG_FILENAME, 57 | data_service_url: Optional[str] = os.getenv("DATA_SERVICE_URL"), 58 | ): 59 | """ 60 | Init 61 | 62 | Args: 63 | api_key: Your API key from PVOutput.org. 64 | system_id: Your system ID from PVOutput.org. If you don't have a 65 | PV system then you can register with PVOutput.org and select 66 | the 'energy consumption only' box. 67 | config_filename: Optional, the filename of the .yml config file. 68 | data_service_url: Optional. If you have subscribed to 69 | PVOutput.org's data service then add the data service URL here. 70 | This string must end in '.org'. 
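        Typical usage example (illustrative credentials):
            pv = PVOutput(api_key="YOUR_API_KEY", system_id="12345")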
71 | """ 72 | self.api_key = api_key 73 | self.system_id = system_id 74 | self.rate_limit_remaining = None 75 | self.rate_limit_total = None 76 | self.rate_limit_reset_time = None 77 | self.data_service_url = data_service_url 78 | 79 | # Set from config file if None 80 | for param_name in ["api_key", "system_id"]: 81 | if getattr(self, param_name) is None: 82 | try: 83 | param_value_from_config = _get_param_from_config_file( 84 | param_name, config_filename 85 | ) 86 | except Exception as e: 87 | msg = ( 88 | "Error loading configuration parameter {param_name}" 89 | " from config file {filename}. Either pass" 90 | " {param_name} into PVOutput constructor, or create" 91 | " config file {filename}. {exception}".format( 92 | param_name=param_name, filename=CONFIG_FILENAME, exception=e 93 | ) 94 | ) 95 | print(msg) 96 | _LOG.exception(msg) 97 | raise 98 | setattr(self, param_name, param_value_from_config) 99 | # Convert to strings 100 | setattr(self, param_name, str(getattr(self, param_name))) 101 | 102 | # Check for data_service_url 103 | if self.data_service_url is None: 104 | try: 105 | self.data_service_url = _get_param_from_config_file( 106 | "data_service_url", config_filename 107 | ) 108 | except KeyError: 109 | pass 110 | except FileNotFoundError: 111 | pass 112 | 113 | if self.data_service_url is not None: 114 | if not self.data_service_url.strip("/").endswith(".org"): 115 | raise ValueError("data_service_url must end in '.org'") 116 | 117 | def search( 118 | self, 119 | query: str, 120 | lat: Optional[float] = None, 121 | lon: Optional[float] = None, 122 | include_country: bool = True, 123 | **kwargs, 124 | ) -> pd.DataFrame: 125 | """Search for PV systems. 126 | 127 | Some quirks of the PVOutput.org API: 128 | - The maximum number of results returned by PVOutput.org is 30. 129 | If the number of returned results is 30, then there is no 130 | indication of whether there are exactly 30 search results, 131 | or if there are more than 30. Also, there is no way to 132 | request additional 'pages' of search results. 133 | - The maximum search radius is 25km 134 | 135 | Args: 136 | query: string, see https://pvoutput.org/help.html#search 137 | e.g. '5km'. 138 | lat: float, e.g. 52.0668589 139 | lon: float, e.g. -1.3484038 140 | include_country: bool, whether or not to include the country name 141 | with the returned postcode. 142 | 143 | Returns: 144 | pd.DataFrame, one row per search results. Index is PV system ID. 145 | Columns: 146 | name, 147 | system_DC_capacity_W, 148 | address, # If `include_country` is True then address is 149 | # 'country> ', 150 | # else address is ''. 
151 | orientation, 152 | num_outputs, 153 | last_output, 154 | panel, 155 | inverter, 156 | distance_km, 157 | latitude, 158 | longitude 159 | """ 160 | api_params = {"q": query, "country": int(include_country)} 161 | 162 | if lat is not None and lon is not None: 163 | api_params["ll"] = "{:f},{:f}".format(lat, lon) 164 | 165 | pv_systems_text = self._api_query(service="search", api_params=api_params, **kwargs) 166 | 167 | pv_systems = pd.read_csv( 168 | StringIO(pv_systems_text), 169 | names=[ 170 | "name", 171 | "system_DC_capacity_W", 172 | "address", 173 | "orientation", 174 | "num_outputs", 175 | "last_output", 176 | "system_id", 177 | "panel", 178 | "inverter", 179 | "distance_km", 180 | "latitude", 181 | "longitude", 182 | ], 183 | index_col="system_id", 184 | ) 185 | 186 | return pv_systems 187 | 188 | def get_status( 189 | self, 190 | pv_system_id: int, 191 | date: Union[str, datetime], 192 | historic: bool = True, 193 | timezone: Optional[str] = None, 194 | **kwargs, 195 | ) -> pd.DataFrame: 196 | """Get PV system status (e.g. power generation) for one day. 197 | 198 | The returned DataFrame will be empty if the PVOutput API 199 | returns 'status 400: No status found'. 200 | 201 | Args: 202 | pv_system_id: int 203 | date: str in format YYYYMMDD; or datetime 204 | (localtime of the PV system) 205 | timezone: the timezone of the systems. This will be used to add to the datetime. 206 | If None, it is not added 207 | 208 | Returns: 209 | pd.DataFrame: 210 | index: datetime (DatetimeIndex, localtime of the PV system) 211 | columns: (all np.float64): 212 | cumulative_energy_gen_Wh, 213 | energy_efficiency_kWh_per_kW, 214 | instantaneous_power_gen_W, 215 | average_power_gen_W, 216 | power_gen_normalised, 217 | energy_consumption_Wh, 218 | power_demand_W, 219 | temperature_C, 220 | voltage 221 | """ 222 | _LOG.info("system_id %d: Requesting system status for %s", pv_system_id, date) 223 | date = date_to_pvoutput_str(date) 224 | _check_date(date) 225 | 226 | api_params = { 227 | "d": date, # date, YYYYMMDD, localtime of the PV system 228 | "h": int(historic is True), # We want historical data. 229 | "limit": 288, # API limit is 288 (num of 5-min periods per day). 230 | "ext": 0, # Extended data; we don't want extended data. 231 | "sid1": pv_system_id, # SystemID. 232 | } 233 | 234 | try: 235 | pv_system_status_text = self._api_query( 236 | service="getstatus", api_params=api_params, **kwargs 237 | ) 238 | except NoStatusFound: 239 | _LOG.info("system_id %d: No status found for date %s", pv_system_id, date) 240 | pv_system_status_text = "" 241 | 242 | # See https://pvoutput.org/help.html#api-getstatus but make sure 243 | # you read the 'History Query' subsection, as a historical query 244 | # has slightly different return columns compared to a non-historical 245 | # query! 
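        # Illustrative only: the raw response is a ";"-separated list of records,
        # each of the form "YYYYMMDD,HH:MM,<value columns>", which is why it is
        # parsed below with lineterminator=";".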
246 | columns = ( 247 | [ 248 | "cumulative_energy_gen_Wh", 249 | "energy_efficiency_kWh_per_kW", 250 | "instantaneous_power_gen_W", 251 | "average_power_gen_W", 252 | "power_gen_normalised", 253 | "energy_consumption_Wh", 254 | "power_demand_W", 255 | "temperature_C", 256 | "voltage", 257 | ] 258 | if historic 259 | else [ 260 | "cumulative_energy_gen_Wh", 261 | "instantaneous_power_gen_W", 262 | "energy_consumption_Wh", 263 | "power_demand_W", 264 | "power_gen_normalised", 265 | "temperature_C", 266 | "voltage", 267 | ] 268 | ) 269 | 270 | pv_system_status = pd.read_csv( 271 | StringIO(pv_system_status_text), 272 | lineterminator=";", 273 | names=["date", "time"] + columns, 274 | parse_dates={"datetime": ["date", "time"]}, 275 | index_col=["datetime"], 276 | dtype={col: np.float64 for col in columns}, 277 | ).sort_index() 278 | 279 | # add timezone 280 | if timezone is not None: 281 | pv_system_status = pv_system_status.tz_localize(timezone).tz_convert("UTC") 282 | 283 | return pv_system_status 284 | 285 | def get_system_status( 286 | self, 287 | pv_system_ids: List[int], 288 | date: Union[str, datetime], 289 | timezone: Optional[str] = None, 290 | **kwargs, 291 | ) -> pd.DataFrame: 292 | """Get Batch of PV system status (e.g. power generation) for one day, for multiple systems 293 | 294 | The returned DataFrame will be empty if the PVOutput API 295 | returns 'status 400: No status found'. 296 | 297 | Args: 298 | pv_system_ids: list of ints. 299 | If you have a subscription service then multiple (up to 50) 300 | pv systems status can be queries at once 301 | date: str in format YYYYMMDD; or datetime 302 | (localtime of the PV system) 303 | timezone: the timezone of the systems. This will be used to add to the datetime. 304 | If None, it is not added 305 | 306 | Returns: 307 | pd.DataFrame: 308 | columns: (all np.float64): 309 | system_id, 310 | datetime, 311 | instantaneous_power_gen_W, 312 | cumulative_energy_gen_Wh, 313 | instantaneous_power_gen_W, 314 | energy_consumption_Wh", 315 | temperature_C, 316 | voltage, 317 | """ 318 | _LOG.info(f"system_ids {pv_system_ids}: Requesting batch system status for %s", date) 319 | date = date_to_pvoutput_str(date) 320 | _check_date(date) 321 | 322 | # join the system ids with a column 323 | all_pv_system_id = ",".join([str(idx) for idx in pv_system_ids]) 324 | 325 | api_params = { 326 | "dt": date, # date, YYYYMMDD, localtime of the PV system 327 | "sid1": all_pv_system_id, # SystemID. 328 | } 329 | 330 | try: 331 | pv_system_status_text = self._api_query( 332 | service="getsystemstatus", api_params=api_params, **kwargs 333 | ) 334 | 335 | except NoStatusFound: 336 | _LOG.info(f"system_id {all_pv_system_id}: No status found for date %s", date) 337 | pv_system_status_text = "no status found" 338 | 339 | # each pv system is on a new line 340 | pv_systems_status_text = pv_system_status_text.split("\n") 341 | 342 | pv_system_status = [] 343 | for pv_system_status_text in pv_systems_status_text: 344 | 345 | try: 346 | one_pv_system_status = process_system_status( 347 | pv_system_status_text=pv_system_status_text, date=date 348 | ) 349 | except Exception as e: 350 | _LOG.error( 351 | f"Could not change raw text into dataframe. 
Raw text is {pv_system_status_text}" 352 | ) 353 | raise e 354 | 355 | pv_system_status.append(one_pv_system_status) 356 | 357 | pv_system_status = pd.concat(pv_system_status) 358 | pv_system_status.reset_index(inplace=True) 359 | 360 | # add timezone 361 | if timezone is not None: 362 | pv_system_status["datetime"] = ( 363 | pd.DatetimeIndex(pv_system_status["datetime"]) 364 | .tz_localize(timezone) 365 | .tz_convert("UTC") 366 | ) 367 | 368 | return pv_system_status 369 | 370 | def get_batch_status( 371 | self, 372 | pv_system_id: int, 373 | date_to: Optional[Union[str, datetime]] = None, 374 | max_retries: Optional[int] = 1000, 375 | **kwargs, 376 | ) -> Union[None, pd.DataFrame]: 377 | """Get batch PV system status (e.g. power generation). 378 | 379 | The returned DataFrame will be empty if the PVOutput API 380 | returns 'status 400: No status found'. 381 | 382 | Data returned is limited to the last 366 days per request. 383 | To retrieve older data, use the date_to parameter. 384 | 385 | The PVOutput getbatchstatus API is asynchronous. When it's first 386 | called, it replies to say 'accepted'. This function will then 387 | wait a minute and call the API again to see if the data is ready. 388 | Set `max_retries` to 1 if you want to return immediately, even 389 | if data isn't ready yet (and hence this function will return None) 390 | 391 | https://pvoutput.org/help.html#dataservice-getbatchstatus 392 | 393 | Args: 394 | pv_system_id: int 395 | date_to: str in format YYYYMMDD; or datetime 396 | (localtime of the PV system). The returned timeseries will 397 | include 366 days of data: from YYYY-1MMDD to YYYYMMDD inclusive 398 | max_retries: int, number of times to retry after receiving 399 | a '202 Accepted' request. Set `max_retries` to 1 if you want 400 | to return immediately, even if data isn't ready yet (and hence 401 | this function will return None). 402 | 403 | Returns: 404 | None (if data isn't ready after retrying max_retries times) or 405 | pd.DataFrame: 406 | index: datetime (DatetimeIndex, localtime of the PV system) 407 | columns: (all np.float64): 408 | cumulative_energy_gen_Wh, 409 | instantaneous_power_gen_W, 410 | temperature_C, 411 | voltage 412 | """ 413 | api_params = {"sid1": pv_system_id} 414 | 415 | _set_date_param(date_to, api_params, "dt") 416 | 417 | for retry in range(max_retries): 418 | try: 419 | pv_system_status_text = self._api_query( 420 | service="getbatchstatus", api_params=api_params, use_data_service=True, **kwargs 421 | ) 422 | except NoStatusFound: 423 | _LOG.info("system_id %d: No status found for date_to %s", pv_system_id, date_to) 424 | pv_system_status_text = "" 425 | break 426 | 427 | if "Accepted 202" in pv_system_status_text: 428 | if retry == 0: 429 | _print_and_log("Request accepted.") 430 | if retry < max_retries - 1: 431 | _print_and_log("Sleeping for 1 second.") 432 | time.sleep(1) 433 | else: 434 | _print_and_log( 435 | "Call get_batch_status again in a minute to see if" " results are ready." 436 | ) 437 | else: 438 | break 439 | else: 440 | return 441 | 442 | return process_batch_status(pv_system_status_text) 443 | 444 | def get_metadata(self, pv_system_id: int, **kwargs) -> pd.Series: 445 | """Get metadata for a single PV system. 446 | 447 | Args: 448 | pv_system_id: int 449 | 450 | Returns: 451 | pd.Series. 
Index is: 452 | name, 453 | system_DC_capacity_W, 454 | address, 455 | num_panels, 456 | panel_capacity_W_each, 457 | panel_brand, 458 | num_inverters, 459 | inverter_capacity_W, 460 | inverter_brand, 461 | orientation, 462 | array_tilt_degrees, 463 | shade, 464 | install_date, 465 | latitude, 466 | longitude, 467 | status_interval_minutes, 468 | secondary_num_panels, 469 | secondary_panel_capacity_W_each, 470 | secondary_orientation, 471 | secondary_array_tilt_degrees 472 | """ 473 | pv_metadata_text = self._api_query( 474 | service="getsystem", 475 | api_params={ 476 | "array2": 1, # Provide data about secondary array, if present. 477 | "tariffs": 0, 478 | "teams": 0, 479 | "est": 0, 480 | "donations": 0, 481 | "sid1": pv_system_id, # SystemID 482 | "ext": 0, # Include extended data? 483 | }, 484 | **kwargs, 485 | ) 486 | 487 | _LOG.debug(f"getting metadata for {pv_system_id}") 488 | 489 | pv_metadata = pd.read_csv( 490 | StringIO(pv_metadata_text), 491 | lineterminator=";", 492 | names=[ 493 | "name", 494 | "system_DC_capacity_W", 495 | "address", 496 | "num_panels", 497 | "panel_capacity_W_each", 498 | "panel_brand", 499 | "num_inverters", 500 | "inverter_capacity_W", 501 | "inverter_brand", 502 | "orientation", 503 | "array_tilt_degrees", 504 | "shade", 505 | "install_date", 506 | "latitude", 507 | "longitude", 508 | "status_interval_minutes", 509 | "secondary_num_panels", 510 | "secondary_panel_capacity_W_each", 511 | "secondary_orientation", 512 | "secondary_array_tilt_degrees", 513 | ], 514 | parse_dates=["install_date"], 515 | nrows=1, 516 | ).squeeze() 517 | pv_metadata["system_id"] = pv_system_id 518 | pv_metadata.name = pv_system_id 519 | return pv_metadata 520 | 521 | def get_statistic( 522 | self, 523 | pv_system_id: int, 524 | date_from: Optional[Union[str, date]] = None, 525 | date_to: Optional[Union[str, date]] = None, 526 | **kwargs, 527 | ) -> pd.DataFrame: 528 | """Get summary stats for a single PV system. 529 | 530 | Args: 531 | pv_system_id: int 532 | date_from 533 | date_to 534 | 535 | Returns: 536 | pd.DataFrame: 537 | total_energy_gen_Wh, 538 | energy_exported_Wh, 539 | average_daily_energy_gen_Wh, 540 | minimum_daily_energy_gen_Wh, 541 | maximum_daily_energy_gen_Wh, 542 | average_efficiency_kWh_per_kW, 543 | num_outputs, # The number of days for which there's >= 1 val. 
544 | actual_date_from, 545 | actual_date_to, 546 | record_efficiency_kWh_per_kW, 547 | record_efficiency_date, 548 | query_date_from, 549 | query_date_to 550 | """ 551 | if date_from and not date_to: 552 | date_to = pd.Timestamp.now().date() 553 | if date_to and not date_from: 554 | date_from = pd.Timestamp("1900-01-01").date() 555 | 556 | api_params = { 557 | "c": 0, # consumption and import 558 | "crdr": 0, # credits / debits 559 | "sid1": pv_system_id, # SystemID 560 | } 561 | 562 | _set_date_param(date_from, api_params, "df") 563 | _set_date_param(date_to, api_params, "dt") 564 | 565 | try: 566 | pv_metadata_text = self._api_query( 567 | service="getstatistic", api_params=api_params, **kwargs 568 | ) 569 | except NoStatusFound: 570 | pv_metadata_text = "" 571 | 572 | columns = [ 573 | "total_energy_gen_Wh", 574 | "energy_exported_Wh", 575 | "average_daily_energy_gen_Wh", 576 | "minimum_daily_energy_gen_Wh", 577 | "maximum_daily_energy_gen_Wh", 578 | "average_efficiency_kWh_per_kW", 579 | "num_outputs", 580 | "actual_date_from", 581 | "actual_date_to", 582 | "record_efficiency_kWh_per_kW", 583 | "record_efficiency_date", 584 | ] 585 | date_cols = ["actual_date_from", "actual_date_to", "record_efficiency_date"] 586 | numeric_cols = set(columns) - set(date_cols) 587 | pv_metadata = pd.read_csv( 588 | StringIO(pv_metadata_text), 589 | names=columns, 590 | dtype={col: np.float32 for col in numeric_cols}, 591 | parse_dates=date_cols, 592 | ) 593 | if pv_metadata.empty: 594 | data = {col: np.float32(np.NaN) for col in numeric_cols} 595 | data.update({col: pd.NaT for col in date_cols}) 596 | pv_metadata = pd.DataFrame(data, index=[pv_system_id]) 597 | else: 598 | pv_metadata.index = [pv_system_id] 599 | 600 | pv_metadata["query_date_from"] = pd.Timestamp(date_from) if date_from else pd.NaT 601 | pv_metadata["query_date_to"] = pd.Timestamp(date_to) if date_to else pd.Timestamp.now() 602 | return pv_metadata 603 | 604 | def _get_statistic_with_cache( 605 | self, 606 | store_filename: str, 607 | pv_system_id: int, 608 | date_from: Optional[Union[str, date]] = None, 609 | date_to: Optional[Union[str, date]] = None, 610 | **kwargs, 611 | ) -> pd.Series: 612 | """ 613 | Get Statistic using cache 614 | 615 | Will try to get stats from store_filename['statistics']. If this 616 | fails, or if date_to > query_date_to, or if 617 | date_from < query_date_from, then will call the API. Note that the aim 618 | of this function is just to find the relevant actual_date_from and 619 | actual_date_to, so this function does not respect the other params. 
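For example (hypothetical dates): if the cached statistics row was produced by a
query covering 2019-01-01 to 2019-06-01, then passing date_to=2019-08-01 triggers
a fresh API call, whereas a request that falls inside the cached query dates simply
returns the cached row.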
620 | 
621 | Args: 
622 | store_filename: cache filename 
623 | pv_system_id: pv system id 
624 | date_from: the start date we want statistics from 
625 | date_to: the end date we want statistics from 
626 | **kwargs: 
627 | 
628 | Returns: Pandas data series holding various statistics 
629 | 
630 | """ 
631 | if date_from: 
632 | date_from = pd.Timestamp(date_from).date() 
633 | if date_to: 
634 | date_to = pd.Timestamp(date_to).date() 
635 | 
636 | def _get_fresh_statistic(): 
637 | _LOG.info("pv_system %d: Getting fresh statistic.", pv_system_id) 
638 | stats = self.get_statistic(pv_system_id, **kwargs) 
639 | with pd.HDFStore(store_filename, mode="a") as store: 
640 | try: 
641 | store.remove(key="statistics", where="index=pv_system_id") 
642 | except KeyError: 
643 | pass 
644 | store.append(key="statistics", value=stats) 
645 | return stats 
646 | 
647 | try: 
648 | stats = pd.read_hdf(store_filename, key="statistics", where="index=pv_system_id") 
649 | except (FileNotFoundError, KeyError): 
650 | return _get_fresh_statistic() 
651 | 
652 | if stats.empty: 
653 | return _get_fresh_statistic() 
654 | 
655 | query_date_from = stats.iloc[0]["query_date_from"] 
656 | query_date_to = stats.iloc[0]["query_date_to"] 
657 | 
658 | if ( 
659 | not pd.isnull(date_from) 
660 | and not pd.isnull(query_date_from) 
661 | and date_from < query_date_from.date() 
662 | ): 
663 | return _get_fresh_statistic() 
664 | 
665 | if not pd.isnull(date_to) and date_to > query_date_to.date(): 
666 | return _get_fresh_statistic() 
667 | 
668 | return stats 
669 | 
670 | def download_multiple_systems_to_disk( 
671 | self, 
672 | system_ids: Iterable[int], 
673 | start_date: datetime, 
674 | end_date: datetime, 
675 | output_filename: str, 
676 | timezone: Optional[str] = None, 
677 | min_data_availability: Optional[float] = 0.5, 
678 | use_get_batch_status_if_available: Optional[bool] = True, 
679 | ): 
680 | """Download multiple PV system IDs to disk. 
681 | 
682 | Data is saved to `output_filename` in HDF5 format. The exact data 
683 | format is documented in 
684 | https://github.com/openclimatefix/pvoutput/blob/master/docs/dataset.md 
685 | 
686 | This function is designed to be run for days (!) downloading 
687 | gigabytes of PV data :) As such, this function can be safely 
688 | interrupted and re-started. All the state required to re-start 
689 | is stored in the HDF5 file. 
690 | 
691 | Add appropriate handlers to the Python logger `pvoutput` to see progress. 
692 | 
693 | Args: 
694 | system_ids: List of PV system IDs to download. 
695 | start_date: Start of date range to download. 
696 | end_date: End of date range to download. 
697 | output_filename: HDF5 filename to write data to. 
698 | timezone: String representation of timezone of timeseries data. 
699 | e.g. 'Europe/London'. 
700 | min_data_availability: A float in the range [0, 1]. 1 means only 
701 | accept PV systems which have no days of missing data. 0 means 
702 | accept all PV systems, no matter if they have missing data. 
703 | Note that the data availability is measured against the date 
704 | range for which the PV system has data available, not from 
705 | the date range passed into this function. 
706 | use_get_batch_status_if_available: Bool. If true then will use 
707 | PVOutput's getbatchstatus API (which must be paid for, and 
708 | `data_service_url` must be set in `~/.pvoutput.yml` or when 
709 | initialising the PVOutput object). 
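Example (a sketch; the system IDs, dates and output filename are hypothetical,
and API credentials are assumed to be configured in `~/.pvoutput.yml`):

    pv = PVOutput()
    pv.download_multiple_systems_to_disk(
        system_ids=[10033, 10020],
        start_date=datetime(2019, 1, 1),
        end_date=datetime(2019, 12, 31),
        output_filename="uk_pv_timeseries.hdf",
        timezone="Europe/London",
    )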
710 | """ 
711 | n = len(system_ids) 
712 | for i, pv_system_id in enumerate(system_ids): 
713 | _LOG.info("**********************") 
714 | msg = "system_id {:d}: {:d} of {:d} ({:%})".format(pv_system_id, i + 1, n, (i + 1) / n) 
715 | _LOG.info(msg) 
716 | print("\r", msg, end="", flush=True) 
717 | 
718 | # Sorted list of DateRange objects. For each DateRange, 
719 | # we need to download from start_date to end_date inclusive. 
720 | date_ranges_to_download = get_date_ranges_to_download( 
721 | output_filename, pv_system_id, start_date, end_date 
722 | ) 
723 | 
724 | # How much data is actually available? 
725 | date_ranges_to_download = self._filter_date_range( 
726 | output_filename, pv_system_id, date_ranges_to_download, min_data_availability 
727 | ) 
728 | 
729 | if not date_ranges_to_download: 
730 | _LOG.info("system_id %d: No data left to download :)", pv_system_id) 
731 | continue 
732 | 
733 | _LOG.info( 
734 | "system_id %d: Will download these date ranges: %s", 
735 | pv_system_id, 
736 | date_ranges_to_download, 
737 | ) 
738 | 
739 | if use_get_batch_status_if_available: 
740 | if self.data_service_url: 
741 | self._download_multiple_using_get_batch_status( 
742 | output_filename, pv_system_id, date_ranges_to_download, timezone 
743 | ) 
744 | else: 
745 | raise ValueError("data_service_url is not set!") 
746 | else: 
747 | self._download_multiple_using_get_status( 
748 | output_filename, pv_system_id, date_ranges_to_download, timezone 
749 | ) 
750 | 
751 | def get_insolation_forecast( 
752 | self, 
753 | date: Union[str, datetime], 
754 | pv_system_id: Optional[int] = None, 
755 | timezone: Optional[str] = None, 
756 | lat: Optional[float] = None, 
757 | lon: Optional[float] = None, 
758 | **kwargs, 
759 | ): 
760 | """Get insolation forecast data 
761 | 
762 | This is for a given site, or a given location defined by 
763 | longitude and latitude. 
764 | 
765 | This is the estimated output for the site 
766 | based on ideal weather conditions. It also factors in site age (reducing 
767 | output by 1% each year), shade and orientation. Donation mode must be enabled. 
768 | See https://pvoutput.org/help.html#api-getinsolation 
769 | 
770 | Args: 
771 | date: str in format YYYYMMDD; or datetime 
772 | (localtime of the PV system) 
773 | pv_system_id: int 
774 | timezone: str 
775 | lat: float e.g. -27.4676 
776 | lon: float e.g. 153.0279 
777 | **kwargs: 
778 | 
779 | 
780 | Returns: dataframe of the insolation forecast 
781 | 
782 | """ 
783 | date = date_to_pvoutput_str(date) 
784 | _check_date(date, prediction=True) 
785 | api_params = { 
786 | "d": date, # date, YYYYMMDD, localtime of the PV system 
787 | "sid1": pv_system_id, # SystemID. 
788 | "tz": timezone, # defaults to configured timezone of system otherwise GMT 789 | } 790 | if lat is not None and lon is not None: 791 | api_params["ll"] = "{:f},{:f}".format(lat, lon) 792 | 793 | try: 794 | pv_insolation_text = self._api_query( 795 | service="getinsolation", api_params=api_params, **kwargs 796 | ) 797 | except NoStatusFound: 798 | _LOG.info("system_id %d: No status found for date %s", pv_system_id, date) 799 | pv_insolation_text = "" 800 | 801 | columns = ["predicted_power_gen_W", "predicted_cumulative_energy_gen_Wh"] 802 | pv_insolation = pd.read_csv( 803 | StringIO(pv_insolation_text), 804 | lineterminator=";", 805 | names=["time"] + columns, 806 | dtype={col: np.float64 for col in columns}, 807 | ).sort_index() 808 | pv_insolation.index = pd.to_datetime( 809 | date + " " + pv_insolation.time, format="%Y-%m-%d %H:%M" 810 | ) 811 | pv_insolation.drop("time", axis=1, inplace=True) 812 | return pv_insolation 813 | 814 | def _filter_date_range( 815 | self, 816 | store_filename: str, 817 | system_id: int, 818 | date_ranges: Iterable[DateRange], 819 | min_data_availability: Optional[float] = 0.5, 820 | ) -> List[DateRange]: 821 | """Check getstatistic to see if system_id has data for all date ranges. 822 | 823 | Args: 824 | system_id: PV system ID. 825 | store_filename: HDF5 filename to cache statistics to / from. 826 | date_ranges: List of DateRange objects. 827 | min_data_availability: A float in the range [0, 1]. 1 means only 828 | accept PV systems which have no days of missing data. 0 means 829 | accept all PV systems, no matter if they have missing data. 830 | """ 831 | if not date_ranges: 832 | return date_ranges 833 | 834 | stats = self._get_statistic_with_cache( 835 | store_filename, 836 | system_id, 837 | date_to=date_ranges[-1].end_date, 838 | wait_if_rate_limit_exceeded=True, 839 | ).squeeze() 840 | 841 | if pd.isnull(stats["actual_date_from"]) or pd.isnull(stats["actual_date_to"]): 842 | _LOG.info("system_id %d: Stats say there is no data!", system_id) 843 | return [] 844 | 845 | timeseries_date_range = DateRange(stats["actual_date_from"], stats["actual_date_to"]) 846 | 847 | data_availability = stats["num_outputs"] / (timeseries_date_range.total_days() + 1) 848 | 849 | if data_availability < min_data_availability: 850 | _LOG.info( 851 | "system_id %d: Data availability too low! 
Only %.0f %%.", 852 | system_id, 853 | data_availability * 100, 854 | ) 855 | return [] 856 | 857 | new_date_ranges = [] 858 | for date_range in date_ranges: 859 | new_date_range = date_range.intersection(timeseries_date_range) 860 | if new_date_range: 861 | new_date_ranges.append(new_date_range) 862 | return new_date_ranges 863 | 864 | def _download_multiple_using_get_batch_status( 865 | self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None 866 | ): 867 | years = merge_date_ranges_to_years(date_ranges_to_download) 868 | dates_to = [year.end_date for year in years] 869 | total_rows = self._download_multiple_worker( 870 | output_filename, pv_system_id, dates_to, timezone, use_get_status=False 871 | ) 872 | 873 | # Re-load data, sort, remove duplicate indicies, append back 874 | if total_rows: 875 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 876 | sort_and_de_dupe_pv_system(store, pv_system_id) 877 | 878 | def _download_multiple_using_get_status( 879 | self, output_filename, pv_system_id, date_ranges_to_download, timezone: Optional[str] = None 880 | ): 881 | for date_range in date_ranges_to_download: 882 | dates = date_range.date_range() 883 | self._download_multiple_worker( 884 | output_filename, pv_system_id, dates, timezone, use_get_status=True 885 | ) 886 | 887 | def _download_multiple_worker( 888 | self, output_filename, pv_system_id, dates, timezone, use_get_status 889 | ) -> int: 890 | """ 891 | Download data with multiple workers 892 | 893 | Returns: 894 | total number of rows downloaded 895 | """ 896 | total_rows = 0 897 | for date_to_load in dates: 898 | _LOG.info("system_id %d: Requesting date: %s", pv_system_id, date_to_load) 899 | datetime_of_api_request = pd.Timestamp.utcnow() 900 | if use_get_status: 901 | timeseries = self.get_status( 902 | pv_system_id, date_to_load, wait_if_rate_limit_exceeded=True 903 | ) 904 | else: 905 | timeseries = self.get_batch_status(pv_system_id, date_to=date_to_load) 906 | if timeseries.empty: 907 | _LOG.info( 908 | "system_id %d: Got empty timeseries back for %s", pv_system_id, date_to_load 909 | ) 910 | if use_get_status: 911 | _append_missing_date_range( 912 | output_filename, 913 | pv_system_id, 914 | date_to_load, 915 | date_to_load, 916 | datetime_of_api_request, 917 | ) 918 | else: 919 | _append_missing_date_range( 920 | output_filename, 921 | pv_system_id, 922 | date_to_load - timedelta(days=365), 923 | date_to_load, 924 | datetime_of_api_request, 925 | ) 926 | else: 927 | total_rows += len(timeseries) 928 | _LOG.info(f"Adding timezone {timezone} to {total_rows} rows") 929 | timeseries = timeseries.tz_localize(timezone) 930 | _LOG.info( 931 | "system_id: %d: %d rows retrieved: %s to %s", 932 | pv_system_id, 933 | len(timeseries), 934 | timeseries.index[0], 935 | timeseries.index[-1], 936 | ) 937 | if use_get_status: 938 | check_pv_system_status(timeseries, date_to_load) 939 | else: 940 | _record_gaps( 941 | output_filename, 942 | pv_system_id, 943 | date_to_load, 944 | timeseries, 945 | datetime_of_api_request, 946 | ) 947 | timeseries["datetime_of_API_request"] = datetime_of_api_request 948 | timeseries["query_date"] = pd.Timestamp(date_to_load) 949 | key = system_id_to_hdf_key(pv_system_id) 950 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 951 | with warnings.catch_warnings(): 952 | warnings.simplefilter("ignore", tables.NaturalNameWarning) 953 | store.append(key=key, value=timeseries, data_columns=True) 954 | 955 | _LOG.info("system_id %d: %d total 
rows downloaded", pv_system_id, total_rows) 956 | return total_rows 957 | 958 | def _api_query( 959 | self, 960 | service: str, 961 | api_params: Dict, 962 | wait_if_rate_limit_exceeded: bool = False, 963 | use_data_service: bool = False, 964 | ) -> str: 965 | """Send API request to PVOutput.org and return content text. 966 | 967 | Args: 968 | service: string, e.g. 'search' or 'getstatus' 969 | api_params: dict 970 | wait_if_rate_limit_exceeded: bool 971 | use_data_service: bool 972 | 973 | Raises: 974 | NoStatusFound 975 | RateLimitExceeded 976 | """ 977 | get_response_func = ( 978 | self._get_data_service_response if use_data_service else self._get_api_response 979 | ) 980 | 981 | try: 982 | response = get_response_func(service, api_params) 983 | except Exception as e: 984 | _LOG.exception(e) 985 | raise 986 | 987 | try: 988 | return self._process_api_response(response) 989 | except RateLimitExceeded: 990 | msg = "PVOutput.org API rate limit exceeded!" " Rate limit will be reset at {}".format( 991 | self.rate_limit_reset_time 992 | ) 993 | _print_and_log(msg) 994 | if wait_if_rate_limit_exceeded: 995 | self.wait_for_rate_limit_reset() 996 | return self._api_query(service, api_params, wait_if_rate_limit_exceeded=False) 997 | 998 | raise RateLimitExceeded(response, msg) 999 | 1000 | def _get_api_response(self, service: str, api_params: Dict) -> requests.Response: 1001 | """ 1002 | Get the non-data service (free) response from pvoutput.org 1003 | 1004 | Args: 1005 | service: string, e.g. 'search', 'getstatus' 1006 | api_params: dict 1007 | """ 1008 | self._check_api_params() 1009 | # Create request headers 1010 | headers = { 1011 | "X-Rate-Limit": "1", 1012 | "X-Pvoutput-Apikey": self.api_key, 1013 | "X-Pvoutput-SystemId": self.system_id, 1014 | } 1015 | 1016 | api_url = urljoin(BASE_URL, "service/r2/{}.jsp".format(service)) 1017 | 1018 | return _get_response(api_url, api_params, headers) 1019 | 1020 | def _get_data_service_response(self, service: str, api_params: Dict) -> requests.Response: 1021 | """ 1022 | Get the data service response from pvoutput.org 1023 | 1024 | Args: 1025 | service: string, e.g. 
'getbatchstatus' 1026 | api_params: dict 1027 | """ 1028 | self._check_api_params() 1029 | if self.data_service_url is None: 1030 | raise ValueError("data_service_url must be set to use the data service!") 1031 | 1032 | headers = {"X-Rate-Limit": "1"} 1033 | api_params = api_params.copy() 1034 | api_params["key"] = self.api_key 1035 | api_params["sid"] = self.system_id 1036 | 1037 | api_url = urljoin(self.data_service_url, "data/r2/{}.jsp".format(service)) 1038 | 1039 | return _get_response(api_url, api_params, headers) 1040 | 1041 | def _check_api_params(self): 1042 | # Check we have relevant login details: 1043 | for param_name in ["api_key", "system_id"]: 1044 | if getattr(self, param_name) is None: 1045 | raise ValueError("Please set the {} parameter.".format(param_name)) 1046 | 1047 | def _set_rate_limit_params(self, headers): 1048 | for param_name, header_key in RATE_LIMIT_PARAMS_TO_API_HEADERS.items(): 1049 | header_value = int(headers[header_key]) 1050 | setattr(self, param_name, header_value) 1051 | 1052 | self.rate_limit_reset_time = pd.Timestamp.utcfromtimestamp(self.rate_limit_reset_time) 1053 | self.rate_limit_reset_time = self.rate_limit_reset_time.tz_localize("utc") 1054 | 1055 | _LOG.debug("%s", self.rate_limit_info()) 1056 | 1057 | def rate_limit_info(self) -> Dict: 1058 | """Get the rate limit information""" 1059 | info = {} 1060 | for param_name in RATE_LIMIT_PARAMS_TO_API_HEADERS: 1061 | info[param_name] = getattr(self, param_name) 1062 | return info 1063 | 1064 | def _process_api_response(self, response: requests.Response) -> str: 1065 | """Turns an API response into text. 1066 | 1067 | Args: 1068 | response: from _get_api_response() 1069 | 1070 | Returns: 1071 | content of the response. 1072 | 1073 | Raises: 1074 | UnicodeDecodeError 1075 | NoStatusFound 1076 | RateLimitExceeded 1077 | """ 1078 | if response.status_code == 400: 1079 | raise NoStatusFound(response=response) 1080 | 1081 | if response.status_code != 403: 1082 | try: 1083 | response.raise_for_status() 1084 | except Exception as e: 1085 | msg = "Bad status code! Response content = {}. Exception = {}".format( 1086 | response.content, e 1087 | ) 1088 | _LOG.exception(msg) 1089 | raise e.__class__(msg) 1090 | 1091 | self._set_rate_limit_params(response.headers) 1092 | 1093 | # Did we overshoot our quota? 1094 | if response.status_code == 403 and self.rate_limit_remaining <= 0: 1095 | raise RateLimitExceeded(response=response) 1096 | 1097 | try: 1098 | content = response.content.decode("latin1").strip() 1099 | except Exception as e: 1100 | msg = "Error decoding this string: {}\n{}".format(response.content, e) 1101 | _LOG.exception(msg) 1102 | raise 1103 | 1104 | # If we get to here then the content is valid :) 1105 | return content 1106 | 1107 | def wait_for_rate_limit_reset(self, do_sleeping: bool = True) -> int: 1108 | """ 1109 | Wait for reset limit 1110 | 1111 | Args: 1112 | do_sleeping: bool to do the sleeping, or not. 
1113 | 1114 | Returns: The number of seconds needed to sleep 1115 | """ 1116 | utc_now = pd.Timestamp.utcnow() 1117 | timedelta_to_wait = self.rate_limit_reset_time - utc_now 1118 | timedelta_to_wait += timedelta(minutes=3) # Just for safety 1119 | secs_to_wait = timedelta_to_wait.total_seconds() 1120 | retry_time_utc = utc_now + timedelta_to_wait 1121 | 1122 | # good to have the retry time in local so that user see 'their' time 1123 | # retry_time_local = retry_time_utc.tz_convert(tz=datetime.now(tzlocal()).tzname()) 1124 | retry_time_local = retry_time_utc 1125 | _print_and_log( 1126 | "Waiting {:.0f} seconds. Will retry at {} UTC".format(secs_to_wait, retry_time_local) 1127 | ) 1128 | if do_sleeping: 1129 | time.sleep(secs_to_wait) 1130 | 1131 | return secs_to_wait 1132 | 1133 | 1134 | def date_to_pvoutput_str(date: Union[str, datetime]) -> str: 1135 | """Convert datetime to date string for PVOutput.org in YYYYMMDD format.""" 1136 | if isinstance(date, str): 1137 | try: 1138 | datetime.strptime(date, PV_OUTPUT_DATE_FORMAT) 1139 | except ValueError: 1140 | return pd.Timestamp(date).strftime(PV_OUTPUT_DATE_FORMAT) 1141 | else: 1142 | return date 1143 | return date.strftime(PV_OUTPUT_DATE_FORMAT) 1144 | 1145 | 1146 | def _check_date(date: str, prediction=False): 1147 | """Check that date string 1148 | 1149 | 1. conforms to YYYYMMDD format, 1150 | 2. that the date isn't in the future. 1151 | 1152 | Raises: 1153 | ValueError if the date is 'bad'. 1154 | """ 1155 | dt = datetime.strptime(date, PV_OUTPUT_DATE_FORMAT) 1156 | if dt > datetime.now() and not prediction: 1157 | raise ValueError( 1158 | "" 1159 | "date should not be in the future. Got {}. Current date is {}.".format( 1160 | date, datetime.now() 1161 | ) 1162 | ) 1163 | 1164 | 1165 | def _set_date_param(dt, api_params, key): 1166 | if dt is not None: 1167 | dt = date_to_pvoutput_str(dt) 1168 | _check_date(dt) 1169 | api_params[key] = dt 1170 | 1171 | 1172 | def check_pv_system_status(pv_system_status: pd.DataFrame, requested_date: date): 1173 | """Checks the DataFrame returned by get_pv_system_status. 1174 | 1175 | Args: 1176 | pv_system_status: DataFrame returned by get_pv_system_status 1177 | requested_date: date. 1178 | 1179 | Raises: 1180 | ValueError if the DataFrame is incorrect. 1181 | """ 1182 | if not isinstance(pv_system_status, pd.DataFrame): 1183 | raise ValueError("pv_system_status must be a dataframe") 1184 | if not pv_system_status.empty: 1185 | index = pv_system_status.index 1186 | for d in [index[0], index[-1]]: 1187 | if not requested_date <= d.date() <= requested_date + ONE_DAY: 1188 | raise ValueError( 1189 | "A date in the index is outside the expected range." 
1190 | " Date from index={}, requested_date={}".format(d, requested_date) 1191 | ) 1192 | 1193 | 1194 | def _append_missing_date_range( 1195 | output_filename, pv_system_id, missing_start_date, missing_end_date, datetime_of_api_request 1196 | ): 1197 | 1198 | data = { 1199 | "missing_start_date_PV_localtime": pd.Timestamp(missing_start_date), 1200 | "missing_end_date_PV_localtime": pd.Timestamp(missing_end_date), 1201 | "datetime_of_API_request": datetime_of_api_request, 1202 | } 1203 | new_missing_date_range = pd.DataFrame(data, index=[pv_system_id]) 1204 | new_missing_date_range.index.name = "pv_system_id" 1205 | _LOG.info( 1206 | "system_id %d: Recording missing date range from %s to %s", 1207 | pv_system_id, 1208 | missing_start_date, 1209 | missing_end_date, 1210 | ) 1211 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 1212 | store.append(key="missing_dates", value=new_missing_date_range, data_columns=True) 1213 | 1214 | 1215 | def _record_gaps(output_filename, pv_system_id, date_to, timeseries, datetime_of_api_request): 1216 | dates_of_data = ( 1217 | timeseries["instantaneous_power_gen_W"].dropna().resample("D").mean().dropna().index.date 1218 | ) 1219 | dates_requested = pd.date_range(date_to - timedelta(days=365), date_to, freq="D").date 1220 | missing_dates = set(dates_requested) - set(dates_of_data) 1221 | missing_date_ranges = _convert_consecutive_dates_to_date_ranges(list(missing_dates)) 1222 | _LOG.info( 1223 | "system_id %d: %d missing date ranges found: \n%s", 1224 | pv_system_id, 1225 | len(missing_date_ranges), 1226 | missing_date_ranges, 1227 | ) 1228 | if len(missing_date_ranges) == 0: 1229 | return 1230 | # Convert to from date objects to pd.Timestamp objects, because HDF5 1231 | # doesn't like to store date objects. 
1232 | missing_date_ranges = missing_date_ranges.astype("datetime64") 1233 | missing_date_ranges["pv_system_id"] = pv_system_id 1234 | missing_date_ranges["datetime_of_API_request"] = datetime_of_api_request 1235 | missing_date_ranges.set_index("pv_system_id", inplace=True) 1236 | with pd.HDFStore(output_filename, mode="a", complevel=9) as store: 1237 | store.append(key="missing_dates", value=missing_date_ranges, data_columns=True) 1238 | 1239 | 1240 | def _convert_consecutive_dates_to_date_ranges(missing_dates): 1241 | new_missing = [] 1242 | missing_dates = np.sort(np.unique(missing_dates)) 1243 | if len(missing_dates) == 0: 1244 | return pd.DataFrame(new_missing) 1245 | 1246 | gaps = np.diff(missing_dates).astype("timedelta64[D]").astype(int) > 1 1247 | gaps = np.where(gaps)[0] 1248 | 1249 | start_date = missing_dates[0] 1250 | for gap_i in gaps: 1251 | end_date = missing_dates[gap_i] 1252 | new_missing.append( 1253 | { 1254 | "missing_start_date_PV_localtime": start_date, 1255 | "missing_end_date_PV_localtime": end_date, 1256 | } 1257 | ) 1258 | start_date = missing_dates[gap_i + 1] 1259 | 1260 | end_date = missing_dates[-1] 1261 | new_missing.append( 1262 | {"missing_start_date_PV_localtime": start_date, "missing_end_date_PV_localtime": end_date} 1263 | ) 1264 | 1265 | return pd.DataFrame(new_missing) 1266 | -------------------------------------------------------------------------------- /pvoutput/utils.py: -------------------------------------------------------------------------------- 1 | """Util functions""" 2 | 3 | import logging 4 | import os 5 | import sys 6 | import warnings 7 | from datetime import date, datetime 8 | from typing import Dict, Iterable, List, Union 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import requests 13 | import tables 14 | import yaml 15 | from requests.adapters import HTTPAdapter 16 | from urllib3.util.retry import Retry 17 | 18 | from pvoutput.consts import CONFIG_FILENAME 19 | from pvoutput.daterange import DateRange, get_date_range_list 20 | 21 | _LOG = logging.getLogger("pvoutput") 22 | 23 | 24 | def _get_param_from_config_file(param_name, config_filename=CONFIG_FILENAME): 25 | with open(config_filename, mode="r") as fh: 26 | config_data = yaml.load(fh, Loader=yaml.Loader) 27 | try: 28 | value = config_data[param_name] 29 | except KeyError as e: 30 | print("Config file", config_filename, "does not contain a", param_name, "parameter.", e) 31 | raise 32 | return value 33 | 34 | 35 | def get_logger(filename=None, mode="a", level=logging.DEBUG, stream_handler=False): 36 | """ 37 | Get a logger 38 | 39 | Args: 40 | filename: get file handler filename 41 | mode: file handler mode 42 | level: logging level 43 | stream_handler: option to make a stream handler aswell 44 | 45 | Returns: logger 46 | """ 47 | if filename is None: 48 | filename = _get_param_from_config_file("log_filename") 49 | logger = logging.getLogger("pvoutput") 50 | logger.setLevel(level) 51 | logger.handlers = [logging.FileHandler(filename=filename, mode=mode)] 52 | if stream_handler: 53 | logger.handlers.append(logging.StreamHandler(sys.stdout)) 54 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 55 | for handler in logger.handlers: 56 | handler.setFormatter(formatter) 57 | 58 | # Attach urllib3's logger to our logger. 
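# Re-parenting these third-party loggers (below) makes their records propagate
# through the `pvoutput` handlers configured above, so HTTP retry and request
# warnings land in the same log file.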
59 | loggers_to_attach = ["urllib3", "requests"] 60 | for logger_name_to_attach in loggers_to_attach: 61 | logger_to_attach = logging.getLogger(logger_name_to_attach) 62 | logger_to_attach.parent = logger 63 | logger_to_attach.propagate = True 64 | 65 | return logger 66 | 67 | 68 | def _get_session_with_retry() -> requests.Session: 69 | max_retry_counts = dict( 70 | connect=720, # How many connection-related errors to retry on. 71 | # Set high because sometimes the network goes down for a 72 | # few hours at a time. 73 | # 720 x Retry.MAX_BACKOFF (120 s) = 86,400 s = 24 hrs 74 | read=20, # How many times to retry on read errors. 75 | status=20, # How many times to retry on bad status codes. 76 | ) 77 | retries = Retry( 78 | total=max(max_retry_counts.values()), 79 | backoff_factor=0.5, 80 | status_forcelist=[500, 502, 503, 504], 81 | **max_retry_counts 82 | ) 83 | session = requests.Session() 84 | session.mount("http://", HTTPAdapter(max_retries=retries)) 85 | session.mount("https://", HTTPAdapter(max_retries=retries)) 86 | return session 87 | 88 | 89 | def _get_response(api_url: str, api_params: Dict, headers: Dict) -> requests.Response: 90 | api_params_str = "&".join(["{}={}".format(key, value) for key, value in api_params.items()]) 91 | full_api_url = "{}?{}".format(api_url, api_params_str) 92 | session = _get_session_with_retry() 93 | response = session.get(full_api_url, headers=headers) 94 | _LOG.debug("response: status_code=%d; headers=%s", response.status_code, response.headers) 95 | return response 96 | 97 | 98 | def _print_and_log(msg: str, level: int = logging.INFO): 99 | _LOG.log(level, msg) 100 | print(msg) 101 | 102 | 103 | def get_system_ids_in_store(store_filename: str) -> List[int]: 104 | """ 105 | Get system ids in the hdf store 106 | 107 | Args: 108 | store_filename: hdf file name 109 | 110 | Returns: list of systems ids 111 | """ 112 | if not os.path.exists(store_filename): 113 | return [] 114 | with pd.HDFStore(store_filename, mode="r") as store: 115 | pv_system_ids = list(store.walk("/timeseries"))[0][2] 116 | return pd.to_numeric(pv_system_ids) 117 | 118 | 119 | def get_date_ranges_to_download( 120 | store_filename: str, 121 | system_id: int, 122 | start_date: Union[str, datetime], 123 | end_date: Union[str, datetime], 124 | ) -> List[DateRange]: 125 | """ 126 | Get the date ranges that we need downloaded 127 | 128 | If system_id in store, check if it already has data from 129 | start_date to end_date, taking into consideration missing_dates. 130 | 131 | Returns: list of DateRange objects 132 | For each DateRange we need to download from 133 | start_date to end_date inclusive. 
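For example (hypothetical store contents): if the store already holds data for
2019-01-03, then asking for 2019-01-01 to 2019-01-05 returns two DateRanges,
2019-01-01 to 2019-01-02 and 2019-01-04 to 2019-01-05.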
134 | """ 
135 | dates_to_download = list(pd.date_range(start_date, end_date, freq="D")) 
136 | dates_to_download = datetime_list_to_dates(dates_to_download) 
137 | dates_already_downloaded = get_dates_already_downloaded(store_filename, system_id) 
138 | dates_to_download = set(dates_to_download) - set(dates_already_downloaded) 
139 | missing_dates_for_id = get_missing_dates_for_id(store_filename, system_id) 
140 | dates_to_download -= set(missing_dates_for_id) 
141 | return get_date_range_list(list(dates_to_download)) 
142 | 
143 | 
144 | def get_missing_dates_for_id(store_filename: str, system_id: int) -> List: 
145 | """ 
146 | Get missing dates for one PV system id 
147 | 
148 | Args: 
149 | store_filename: filename of hdf store 
150 | system_id: system id 
151 | 
152 | Returns: list of missing dates 
153 | """ 
154 | if not os.path.exists(store_filename): 
155 | return [] 
156 | 
157 | with pd.HDFStore(store_filename, mode="r") as store: 
158 | try: 
159 | missing_dates_for_id = store.select( 
160 | key="missing_dates", 
161 | where="index=system_id", 
162 | columns=["missing_start_date_PV_localtime", "missing_end_date_PV_localtime"], 
163 | ) 
164 | except Exception as e: 
165 | _LOG.debug(e) 
166 | return [] 
167 | 
168 | missing_dates = [] 
169 | for _, row in missing_dates_for_id.iterrows(): 
170 | missing_date_range = pd.date_range( 
171 | row["missing_start_date_PV_localtime"], row["missing_end_date_PV_localtime"], freq="D" 
172 | ).tolist() 
173 | missing_dates.extend(missing_date_range) 
174 | 
175 | missing_dates = np.sort(np.unique(missing_dates)) 
176 | missing_dates = datetime_list_to_dates(missing_dates) 
177 | print() 
178 | _LOG.info("system_id %d: %d missing dates already found", system_id, len(missing_dates)) 
179 | return missing_dates 
180 | 
181 | 
182 | def datetime_list_to_dates(datetimes: Iterable[datetime]) -> Iterable[date]: 
183 | """ 
184 | Change datetime list to dates 
185 | 
186 | Args: 
187 | datetimes: list of datetimes 
188 | 
189 | Returns: datetime index of dates 
190 | """ 
191 | if not isinstance(datetimes, Iterable): 
192 | datetimes = [datetimes] 
193 | return pd.DatetimeIndex(datetimes).date 
194 | 
195 | 
196 | def get_dates_already_downloaded(store_filename, system_id) -> set: 
197 | """ 
198 | Get the dates that have already been downloaded 
199 | 
200 | Args: 
201 | store_filename: filename of hdf file 
202 | system_id: one system id 
203 | 
204 | Returns: set of datetimes already downloaded 
205 | 
206 | """ 
207 | if not os.path.exists(store_filename): 
208 | return set([]) 
209 | 
210 | with pd.HDFStore(store_filename, mode="r") as store: 
211 | key = system_id_to_hdf_key(system_id) 
212 | try: 
213 | datetimes = store.select(key=key, columns=["datetime", "query_date"]) 
214 | except KeyError: 
215 | return set([]) 
216 | else: 
217 | query_dates = datetime_list_to_dates(datetimes["query_date"].dropna()) 
218 | return set(datetimes.index.date).union(query_dates) 
219 | 
220 | 
221 | def system_id_to_hdf_key(system_id: int) -> str: 
222 | """ 
223 | Change system id to a hdf key 
224 | 
225 | Args: 
226 | system_id: system id 
227 | 
228 | Returns: key 
229 | """ 
230 | return "/timeseries/{:d}".format(system_id) 
231 | 
232 | 
233 | def sort_and_de_dupe_pv_system(store, pv_system_id): 
234 | """ 
235 | Sort and de-duplicate one PV system's timeseries 
236 | 
237 | Args: 
238 | store: store of pv systems 
239 | pv_system_id: one pv system id 
240 | 
241 | """ 
242 | key = system_id_to_hdf_key(pv_system_id) 
243 | timeseries = store[key] 
244 | timeseries.sort_index(inplace=True) 
245 | timeseries = timeseries[~timeseries.index.duplicated()] 
246 | store.remove(key) 
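# The old node is removed above and the cleaned, sorted timeseries is appended
# back under the same key below; HDF5 table nodes can't easily be edited in
# place, so re-writing the node is how the duplicated rows get dropped.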
247 | with warnings.catch_warnings(): 248 | warnings.simplefilter("ignore", tables.NaturalNameWarning) 249 | store.append(key, timeseries, data_columns=True) 250 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pyproj 4 | pyshp 5 | shapely 6 | cython>0.15.1 7 | geopandas 8 | pytest 9 | pyyaml 10 | tables 11 | matplotlib 12 | jupyter 13 | urllib3 14 | requests 15 | beautifulsoup4 16 | -------------------------------------------------------------------------------- /scripts/fetch_pv_timeseries.py: -------------------------------------------------------------------------------- 1 | """Tool for importing timeseries PV data from PVOutput. 2 | 3 | Takes in a PVOutput system csv file, and fetches the 4 | PV system Timeseries data as a hdf file as described by the contents of 5 | input, built according to the PVOutput library hdf file spec. 6 | The output file is named according to the inputfile, with 7 | "systems" replaced with "timeseries", e.g. 8 | PVOutput_Albania_systems.csv -> PVOutput_Albania_timeseries.hdf 9 | 10 | Typical usage example: 11 | 12 | python fetch_pv_timeseries.py -s system.csv -o out --startdate 2019-07-25 --enddate 2020-07-25 13 | 14 | Requirements: 15 | 16 | Either: set the env vars 17 | - DATA_SERVICE_URL 18 | - PVOUTPUT_AUTH_SYSTEMID 19 | - PVOUTPUT_AUTH_APIKEY, 20 | pass their equivalent arguments to the command, 21 | or create and use a ~/.pvoutput.yml file as described in the PVOutput library documentation 22 | """ 23 | 24 | from pvoutput import * 25 | 26 | import click as cl 27 | import datetime as dt 28 | import sys 29 | import pandas as pd 30 | import pathlib 31 | import logging 32 | 33 | 34 | @cl.command() 35 | @cl.option( 36 | "-s", 37 | "--systemfile", 38 | "systemfile_path", 39 | envvar="SYSTEMFILE", 40 | required=True, 41 | type=cl.Path(exists=True), 42 | ) 43 | @cl.option( 44 | "-o", 45 | "--outdir", 46 | "output_directory", 47 | envvar="OUTDIR", 48 | default="/mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org", 49 | type=cl.Path(exists=False, dir_okay=True), 50 | ) 51 | @cl.option( 52 | "--startdate", "start_date", envvar="STARTDATE", default="2019-05-20", type=cl.DateTime() 53 | ) 54 | @cl.option("--enddate", "end_date", envvar="ENDDATE", default="2019-08-20", type=cl.DateTime()) 55 | @cl.option("--data_service_url", envvar="DATA_SERVICE_URL") 56 | @cl.option("--pvo_systemid", envvar="PVOUTPUT_AUTH_SYSTEMID", required=True, type=str) 57 | @cl.option("--pvo_apikey", envvar="PVOUTPUT_AUTH_APIKEY", required=True, type=str) 58 | def run( 59 | output_directory: str, 60 | systemfile_path: str, 61 | pvo_systemid: str, 62 | pvo_apikey: str, 63 | data_service_url: str, 64 | start_date: dt.datetime, 65 | end_date: dt.datetime, 66 | ): 67 | if end_date < start_date: 68 | sys.exit("End date cannot occur before start date") 69 | 70 | # Create output directory if it doesn't already exist 71 | os.makedirs(output_directory, exist_ok=True) 72 | 73 | # Instantiate PVOutput library 74 | pv: pvoutput.PVOutput = PVOutput( 75 | system_id=pvo_systemid, api_key=pvo_apikey, data_service_url=data_service_url 76 | ) 77 | 78 | # Read in input systemsfile 79 | pv_systems: pd.DataFrame = pd.read_csv(systemfile_path, index_col="system_id") 80 | 81 | # Write output file 82 | filename: str = pathlib.Path(systemfile_path).stem.replace("systems", "timeseries") + ".hdf" 83 | logging.info(f"Writing to 
{output_directory}/{filename}") 84 | pv.download_multiple_systems_to_disk( 85 | system_ids=pv_systems.index, 86 | start_date=start_date, 87 | end_date=end_date, 88 | output_filename=output_directory + "/" + filename, 89 | ) 90 | 91 | 92 | if __name__ == "__main__": 93 | run() 94 | -------------------------------------------------------------------------------- /scripts/scrape_country_codes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads all country codes from PVOutput. 3 | Prints and saves a dictionary mapping the country names to 4 | their codes. 5 | """ 6 | 7 | import json 8 | 9 | import urllib3 10 | from bs4 import BeautifulSoup 11 | 12 | COUNTRY_PAGES = "https://pvoutput.org/map.jsp?country=" 13 | MAX_COUNTRY_INT = 257 14 | 15 | 16 | def get_country_name(manager: urllib3.PoolManager, code: int) -> str: 17 | 18 | country_url = f"{COUNTRY_PAGES}{code}" 19 | 20 | response = manager.request("GET", country_url) 21 | soup = BeautifulSoup(response.data, "html.parser") 22 | 23 | title = soup.title.string 24 | 25 | return title.split("|")[0].strip() 26 | 27 | 28 | def get_all_countries() -> None: 29 | 30 | output_dict = {} 31 | 32 | manager = urllib3.PoolManager() 33 | 34 | for country_int in range(1, MAX_COUNTRY_INT + 1): 35 | country_str = get_country_name(manager, country_int) 36 | output_dict[country_str] = int(country_int) 37 | 38 | print(output_dict) 39 | str_dict = json.dumps(output_dict) 40 | 41 | with open("country_codes.txt", "w") as f: 42 | f.write(str_dict) 43 | 44 | 45 | if __name__ == "__main__": 46 | get_all_countries() 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import find_packages, setup 4 | 5 | this_directory = Path(__file__).parent 6 | install_requires = (this_directory / "requirements.txt").read_text().splitlines() 7 | long_description = (this_directory / "README.md").read_text() 8 | 9 | setup( 10 | name="pvoutput-ocf", 11 | version="0.1.33", 12 | license="MIT", 13 | packages=find_packages(), 14 | install_requires=install_requires, 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | company="Open Climate Fix Ltd", 18 | author_email="info@openclimatefix.org", 19 | ) 20 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import pickle 4 | from functools import partial 5 | 6 | import pytest 7 | 8 | from pvoutput import mapscraper as ms 9 | 10 | 11 | @pytest.fixture 12 | def data_dir(): 13 | # Taken from http://stackoverflow.com/a/6098238/732596 14 | data_dir = os.path.dirname(inspect.getfile(inspect.currentframe())) 15 | data_dir = os.path.abspath(data_dir) 16 | assert os.path.isdir(data_dir), data_dir + " does not exist." 
17 | return data_dir 18 | 19 | 20 | def get_cleaned_test_soup(data_dir): 21 | test_soup_filepath = os.path.join(data_dir, "data/mapscraper_soup.pickle") 22 | with open(test_soup_filepath, "rb") as f: 23 | test_soup = pickle.load(f) 24 | return ms.clean_soup(test_soup) 25 | 26 | 27 | @pytest.fixture() 28 | def get_test_dict_of_dfs(data_dir): 29 | dict_filepath = os.path.join(data_dir, "data/mapscraper_dict_of_dfs.pickle") 30 | with open(dict_filepath, "rb") as f: 31 | test_soup = pickle.load(f) 32 | return test_soup 33 | 34 | 35 | @pytest.fixture() 36 | def get_function_dict(data_dir): 37 | # using partials so functions only get executed when needed 38 | soup = get_cleaned_test_soup(data_dir) 39 | df = ms._process_system_size_col(soup) 40 | index = df.index 41 | keys = get_keys_for_dict() 42 | functions = ( 43 | partial(ms._process_system_size_col, soup), 44 | partial(ms._process_output_col, soup, index), 45 | partial(ms._process_generation_and_average_cols, soup, index), 46 | partial(ms._process_efficiency_col, soup, index), 47 | partial(ms._process_metadata, soup), 48 | ) 49 | function_dict = dict(zip(keys, functions)) 50 | return function_dict 51 | 52 | 53 | def get_keys_for_dict(): 54 | keys = ( 55 | "pv_system_size_metadata", 56 | "process_output_col", 57 | "process_generation_and_average_cols", 58 | "process_efficiency_col", 59 | "process_metadata", 60 | ) 61 | return keys 62 | -------------------------------------------------------------------------------- /tests/data/create_mapscraper_test_files.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sys 3 | 4 | from pvoutput import mapscraper as ms 5 | from tests.conftest import get_keys_for_dict 6 | 7 | 8 | def save_pickle_test_file(file, filename): 9 | # needed to avoid occasional RecursionError 10 | sys.setrecursionlimit(10000) 11 | with open(filename, "wb") as f: 12 | pickle.dump(file, f) 13 | 14 | 15 | def get_raw_soup(): 16 | url = ms._create_map_url(country_code=243, page_number=1, ascending=False, sort_by="capacity") 17 | return ms.get_soup(url, raw=True) 18 | 19 | 20 | def main(): 21 | raw_soup = get_raw_soup() 22 | save_pickle_test_file(raw_soup, "mapscraper_soup.pickle") 23 | soup = ms.clean_soup(raw_soup) 24 | keys = get_keys_for_dict() 25 | values = ms._process_metadata(soup, True) 26 | df_dict = dict(zip(keys, values)) 27 | save_pickle_test_file(df_dict, "mapscraper_dict_of_dfs.pickle") 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /tests/data/create_test_hdf.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | import pandas as pd 3 | 4 | FILENAME = "test.hdf" 5 | PV_SYSTEM_ID = 123 6 | 7 | 8 | def get_timeseries_df(): 9 | df = pd.DataFrame( 10 | index=pd.date_range("2019-01-01", periods=20, freq="5T"), 11 | columns=["datetime_of_API_request", "query_date", "instantaneous_power_gen_W"], 12 | ) 13 | df.index.name = "datetime" 14 | df["datetime_of_API_request"] = [pd.Timestamp("2019-02-01", tz="UTC")] * len(df) 15 | df["query_date"] = [pd.Timestamp("2019-01-01")] * len(df) 16 | df["instantaneous_power_gen_W"] = list(range(20)) 17 | return df 18 | 19 | 20 | def get_missing_dates(): 21 | df = pd.DataFrame( 22 | [ 23 | [ 24 | PV_SYSTEM_ID, 25 | pd.Timestamp("2019-01-02"), 26 | pd.Timestamp("2019-01-02"), 27 | pd.Timestamp("2019-02-01", tz="UTC"), 28 | ], 29 | [ 30 | PV_SYSTEM_ID, 31 | pd.Timestamp("2019-01-03"), 32 | pd.Timestamp("2019-01-03"), 33 | pd.Timestamp("2019-02-01", tz="UTC"), 34 | ], 35 | ], 36 | columns=[ 37 | "pv_system_id", 38 | "missing_start_date_PV_localtime", 39 | "missing_end_date_PV_localtime", 40 | "datetime_of_API_request", 41 | ], 42 | ).set_index("pv_system_id") 43 | return df 44 | 45 | 46 | def main(): 47 | timeseries = get_timeseries_df() 48 | missing_dates = get_missing_dates() 49 | with pd.HDFStore(FILENAME, mode="w") as store: 50 | store.append(key="/timeseries/{}".format(PV_SYSTEM_ID), value=timeseries, data_columns=True) 51 | store.append(key="missing_dates", value=missing_dates, data_columns=True) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /tests/data/mapscraper_dict_of_dfs.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/data/mapscraper_dict_of_dfs.pickle -------------------------------------------------------------------------------- /tests/data/mapscraper_soup.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/data/mapscraper_soup.pickle -------------------------------------------------------------------------------- /tests/data/test.hdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/pvoutput/383f57baf1488c0f14d4812acc0f4857152b3cb1/tests/data/test.hdf -------------------------------------------------------------------------------- /tests/test_daterange.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import pandas as pd 4 | 5 | from pvoutput import daterange 6 | from pvoutput.daterange import DateRange, merge_date_ranges_to_years 7 | 8 | 9 | def test_get_date_range_list(): 10 | def _get_date_range(start_date, periods): 11 | return list(pd.date_range(start_date, periods=periods, freq="D")) 12 | 13 | dates = [] 14 | for start_date, periods in [("2019-01-01", 5), ("2019-05-01", 3), ("2015-04-01", 1)]: 15 | dates.extend(_get_date_range(start_date, periods)) 16 | 17 | date_range_list = daterange.get_date_range_list(dates) 18 | assert date_range_list[0].start_date == date(2015, 4, 1) 19 | assert date_range_list[0].end_date == date(2015, 4, 1) 20 | 21 | assert date_range_list[1].start_date == date(2019, 1, 1) 22 | assert date_range_list[1].end_date == date(2019, 1, 5) 23 | 24 | assert date_range_list[2].start_date == 
date(2019, 5, 1) 25 | assert date_range_list[2].end_date == date(2019, 5, 3) 26 | 27 | assert daterange.get_date_range_list([]) == [] 28 | 29 | 30 | def test_intersection(): 31 | assert ( 32 | DateRange("2019-01-01", "2019-01-02").intersection(DateRange("2020-01-01", "2020-01-02")) 33 | is None 34 | ) 35 | 36 | assert DateRange("2019-01-01", "2019-01-10").intersection( 37 | DateRange("2019-01-01", "2019-01-02") 38 | ) == DateRange("2019-01-01", "2019-01-02") 39 | 40 | assert DateRange("2019-01-01", "2019-01-10").intersection( 41 | DateRange("2019-01-05", "2019-01-20") 42 | ) == DateRange("2019-01-05", "2019-01-10") 43 | 44 | year = DateRange("2018-01-1", "2019-01-01") 45 | dec = DateRange("2018-12-01", "2019-01-01") 46 | assert year.intersection(dec) == dec 47 | 48 | june = DateRange("2018-06-01", "2018-07-01") 49 | assert year.intersection(june) == june 50 | 51 | incomplete_overlap = DateRange("2017-07-01", "2018-02-01") 52 | assert year.intersection(incomplete_overlap) != incomplete_overlap 53 | 54 | 55 | def test_total_days(): 56 | assert DateRange("2019-01-01", "2019-01-10").total_days() == 9 57 | 58 | 59 | def test_split_into_years(): 60 | short_dr = DateRange("2019-01-01", "2019-01-10") 61 | assert short_dr.split_into_years() == [short_dr] 62 | 63 | one_year = DateRange("2019-01-01", "2020-01-01") 64 | assert one_year.split_into_years() == [one_year] 65 | 66 | year_and_half = DateRange("2019-01-01", "2020-06-01") 67 | assert year_and_half.split_into_years() == [ 68 | DateRange("2019-06-02", "2020-06-01"), 69 | DateRange("2019-01-01", "2019-06-02"), 70 | ] 71 | 72 | 73 | def test_merge_date_ranges_to_years(): 74 | jan = DateRange("2018-01-01", "2018-02-01") 75 | multiyear = DateRange("2017-01-01", "2018-02-01") 76 | old_multiyear = DateRange("2014-01-01", "2016-02-01") 77 | ancient_jan = DateRange("2010-01-01", "2010-02-01") 78 | for date_ranges, merged in [ 79 | ([], []), 80 | ([jan], [DateRange("2017-02-01", "2018-02-01")]), 81 | ( 82 | [multiyear], 83 | [DateRange("2017-02-01", "2018-02-01"), DateRange("2016-02-02", "2017-02-01")], 84 | ), 85 | ( 86 | [old_multiyear, multiyear], 87 | [ 88 | DateRange("2017-02-01", "2018-02-01"), 89 | DateRange("2016-02-02", "2017-02-01"), 90 | DateRange("2015-02-01", "2016-02-01"), 91 | DateRange("2014-02-01", "2015-02-01"), 92 | DateRange("2013-02-01", "2014-02-01"), 93 | ], 94 | ), 95 | ( 96 | [ancient_jan, old_multiyear, multiyear], 97 | [ 98 | DateRange("2017-02-01", "2018-02-01"), 99 | DateRange("2016-02-02", "2017-02-01"), 100 | DateRange("2015-02-01", "2016-02-01"), 101 | DateRange("2014-02-01", "2015-02-01"), 102 | DateRange("2013-02-01", "2014-02-01"), 103 | DateRange("2009-02-01", "2010-02-01"), 104 | ], 105 | ), 106 | ]: 107 | assert merge_date_ranges_to_years(date_ranges) == merged 108 | -------------------------------------------------------------------------------- /tests/test_grid_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pvoutput.grid_search.grid_search import GridSearch 4 | 5 | SHOW = True 6 | if "CI" in os.environ: 7 | SHOW = False 8 | 9 | 10 | def test_init(): 11 | """Test that Grid search can be initiated""" 12 | _ = GridSearch() 13 | 14 | 15 | def test_list_countries(): 16 | """Get list of countries""" 17 | grd = GridSearch() 18 | countries = grd.nat_earth.list_countries() 19 | assert len(countries) == 258 20 | 21 | 22 | def test_uk_grid(): 23 | """Example 1: Get UK grid 24 | 25 | Use this to clip to a bounding box as well as the countries selected 26 
| List as many countries as you want, or set to None for world-wide 27 | Only include search points within a certain radius of a location (see Example 3) 28 | Increase this if you'd like to consider systems "near" the target region (see Example 2) 29 | Allow some extra overlap due to inaccuracies in measuring distance 30 | EPSG:27700 is OSGB36 / British National Grid 31 | Gives a nice plot of the region and grid 32 | """ 33 | grd = GridSearch() 34 | ukgrid = grd.generate_grid( 35 | bbox=[45, -15, 58, 15], 36 | countries=["United Kingdom"], 37 | radial_clip=None, 38 | buffer=0, 39 | search_radius=24.5, 40 | local_crs_epsg=27700, 41 | show=SHOW, 42 | ) 43 | assert len(ukgrid) > 100 44 | 45 | 46 | def test_luxembourg_grid(): 47 | """Example 2: Make Luxembourg grid 48 | 49 | Include search radii within 50km of Luzembourgs border 50 | Allow some extra overlap due to inaccuracies in measuring distance 51 | EPSG:2169 is Luxembourg 1930 / Gauss 52 | 53 | """ 54 | grd = GridSearch() 55 | luxgrid = grd.generate_grid( 56 | countries=["Luxembourg"], buffer=50, search_radius=24.5, local_crs_epsg=2169, show=SHOW 57 | ) 58 | luxgrid.head() 59 | assert len(luxgrid) == 18 60 | 61 | 62 | def test_sheffield_grid(): 63 | """Make grid around Sheffield 64 | 65 | Only include search points within a 100km of the TUOS Physics Department 66 | EPSG:27700 is OSGB36 / British National Grid 67 | 68 | """ 69 | grd = GridSearch() 70 | shefgrid = grd.generate_grid( 71 | radial_clip=( 72 | 53.381, 73 | -1.486, 74 | 100.0, 75 | ), # Only include search points within a 100km of the TUOS Physics Department 76 | local_crs_epsg=27700, # EPSG:27700 is OSGB36 / British National Grid 77 | show=SHOW, 78 | ) 79 | assert len(shefgrid) == 29 80 | 81 | 82 | def test_balkan_grid(): 83 | """Plot grid around Balkan area""" 84 | grd = GridSearch() 85 | balkan_grid = grd.generate_grid( 86 | countries=[ 87 | "Bosnia and Herz.", 88 | "Croatia", 89 | "Hungary", 90 | "Romania", 91 | "Bulgaria", 92 | "North Macedonia", 93 | "Kosovo", 94 | "Albania", 95 | "Montenegro", 96 | "Serbia", 97 | ], 98 | search_radius=24.5, 99 | show=SHOW, 100 | ) 101 | assert len(balkan_grid) == 733 102 | -------------------------------------------------------------------------------- /tests/test_mapscraper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from pvoutput import mapscraper as ms 5 | from pvoutput.consts import MAP_URL 6 | 7 | 8 | def compare_function_output_to_pickle(key, function_dict, dict_of_dfs, series=False): 9 | df_from_func = function_dict[key]() 10 | test_df = dict_of_dfs[key] 11 | if series: 12 | return pd.testing.assert_series_equal(df_from_func, test_df) 13 | return pd.testing.assert_frame_equal(df_from_func, test_df, check_like=True) 14 | 15 | 16 | def test_convert_to_country_code(): 17 | assert ms._convert_to_country_code(1) == 1 18 | assert ms._convert_to_country_code("United Kingdom") == 243 19 | 20 | def _assert_raises(bad_countries, exception): 21 | for bad_country in bad_countries: 22 | with pytest.raises(exception): 23 | ms._convert_to_country_code(bad_country) 24 | pytest.fail( 25 | "Failed to raise {} for country={}".format(exception.__name__, bad_country) 26 | ) 27 | 28 | _assert_raises([-1, -100, 1000, "blah"], ValueError) 29 | 30 | 31 | def test_create_map_url(): 32 | assert ms._create_map_url() == MAP_URL 33 | assert ms._create_map_url(country_code=1) == MAP_URL + "?country=1" 34 | assert ms._create_map_url(page_number=2) == MAP_URL + "?p=2" 35 | 
--------------------------------------------------------------------------------
/tests/test_mapscraper.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | from pvoutput import mapscraper as ms
5 | from pvoutput.consts import MAP_URL
6 |
7 |
8 | def compare_function_output_to_pickle(key, function_dict, dict_of_dfs, series=False):
9 |     df_from_func = function_dict[key]()
10 |     test_df = dict_of_dfs[key]
11 |     if series:
12 |         return pd.testing.assert_series_equal(df_from_func, test_df)
13 |     return pd.testing.assert_frame_equal(df_from_func, test_df, check_like=True)
14 |
15 |
16 | def test_convert_to_country_code():
17 |     assert ms._convert_to_country_code(1) == 1
18 |     assert ms._convert_to_country_code("United Kingdom") == 243
19 |
20 |     def _assert_raises(bad_countries, exception):
21 |         for bad_country in bad_countries:
22 |             with pytest.raises(exception):
23 |                 ms._convert_to_country_code(bad_country)
24 |                 pytest.fail(
25 |                     "Failed to raise {} for country={}".format(exception.__name__, bad_country)
26 |                 )
27 |
28 |     _assert_raises([-1, -100, 1000, "blah"], ValueError)
29 |
30 |
31 | def test_create_map_url():
32 |     assert ms._create_map_url() == MAP_URL
33 |     assert ms._create_map_url(country_code=1) == MAP_URL + "?country=1"
34 |     assert ms._create_map_url(page_number=2) == MAP_URL + "?p=2"
35 |     assert ms._create_map_url(ascending=True) == MAP_URL + "?d=asc"
36 |     assert ms._create_map_url(ascending=False) == MAP_URL + "?d=desc"
37 |     assert ms._create_map_url(sort_by="efficiency") == MAP_URL + "?o=gss"
38 |     with pytest.raises(ValueError):
39 |         ms._create_map_url(sort_by="blah")
40 |
41 |
42 | def test_pv_system_size_metadata(get_function_dict, get_test_dict_of_dfs):
43 |     assert (
44 |         compare_function_output_to_pickle(
45 |             "pv_system_size_metadata", get_function_dict, get_test_dict_of_dfs
46 |         )
47 |         is None
48 |     )
49 |
50 |
51 | def test_process_output_col(get_function_dict, get_test_dict_of_dfs):
52 |     assert (
53 |         compare_function_output_to_pickle(
54 |             "process_output_col", get_function_dict, get_test_dict_of_dfs, series=True
55 |         )
56 |         is None
57 |     )
58 |
59 |
60 | def test_process_generation_and_average_cols(get_function_dict, get_test_dict_of_dfs):
61 |     assert (
62 |         compare_function_output_to_pickle(
63 |             "process_generation_and_average_cols", get_function_dict, get_test_dict_of_dfs
64 |         )
65 |         is None
66 |     )
67 |
68 |
69 | def test_process_efficiency_col(get_function_dict, get_test_dict_of_dfs):
70 |     assert (
71 |         compare_function_output_to_pickle(
72 |             "process_efficiency_col", get_function_dict, get_test_dict_of_dfs, series=True
73 |         )
74 |         is None
75 |     )
76 |
77 |
78 | def test_process_metadata(get_function_dict, get_test_dict_of_dfs):
79 |     assert (
80 |         compare_function_output_to_pickle(
81 |             "process_metadata", get_function_dict, get_test_dict_of_dfs
82 |         )
83 |         is None
84 |     )
85 |
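A short standalone sketch, not part of the repository, of the two mapscraper helpers exercised above; the expected URL for country code 243 is inferred by analogy with the country_code=1 case and is an assumption, not something the tests assert.

# Minimal sketch (not part of the repository): country-code lookup plus URL building.
from pvoutput import mapscraper as ms
from pvoutput.consts import MAP_URL

uk_code = ms._convert_to_country_code("United Kingdom")  # 243, per the test above
url = ms._create_map_url(country_code=uk_code)
assert url == MAP_URL + "?country=243"  # assumed, by analogy with the country_code=1 case
print(url)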
--------------------------------------------------------------------------------
/tests/test_process.py:
--------------------------------------------------------------------------------
1 | from datetime import date
2 | from io import StringIO
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import pytest
7 |
8 | from pvoutput.prcoess import process_batch_status, process_system_status
9 |
10 |
11 | def test_process_system_status():
12 |     pv_system_status_text = "1234;07:45,21,255,1,2;"
13 |     one_status = process_system_status(
14 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
15 |     )
16 |     assert len(one_status) == 1
17 |
18 |
19 | def test_process_system_status_2():
20 |     # note that the second entry has a missing data value
21 |     pv_system_status_text = "1234;07:45,21,255,1,5;" "07:50,22,257,2;" "07:55,23,256,3,4"
22 |
23 |     one_status = process_system_status(
24 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
25 |     )
26 |     assert len(one_status) == 3
27 |     assert (one_status["system_id"] == 1234).all()
28 |
29 |
30 | def test_process_system_status_none():
31 |
32 |     one_status = process_system_status(
33 |         pv_system_status_text="no status found", date=date(2022, 1, 1)
34 |     )
35 |     assert len(one_status) == 0
36 |
37 |
38 | def test_process_system_status_less_columns_two_data_points():
39 |     # both entries are missing their trailing data values
40 |     pv_system_status_text = "1234;07:45,21,255;" "07:45,22,256"
41 |     one_status = process_system_status(
42 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
43 |     )
44 |     assert len(one_status) == 2
45 |
46 |
47 | def test_process_system_status_less_columns_one_data_points():
48 |     # this entry is missing its trailing data values
49 |     pv_system_status_text = "1234;07:45,21,255"
50 |     one_status = process_system_status(
51 |         pv_system_status_text=pv_system_status_text, date=date(2022, 1, 1)
52 |     )
53 |     assert len(one_status) == 1
54 |     assert np.isnan(one_status["temperature_C"][0])
55 |
56 |
57 | def test_process_batch_status():
58 |     # Response text copied from
59 |     # https://pvoutput.org/help.html#dataservice-getbatchstatus
60 |     response_text = """
61 | 20140330;07:35,2,24;07:40,4,24;07:45,6,24;07:50,8,24;07:55,13,60;08:00,24,132
62 | 20140329;07:35,2,24;07:40,4,24;07:45,6,24;07:50,8,24;07:55,13,60;08:00,24,132
63 | 20140328;07:35,2,24;07:40,4,24;07:45,6,24;07:50,8,24;07:55,13,60;08:00,24,132"""
64 |
65 |     correct_interpretation_csv = """
66 | datetime,cumulative_energy_gen_Wh,instantaneous_power_gen_W,temperature_C,voltage
67 | 2014-03-28 07:35:00,2.0,24.0,,
68 | 2014-03-28 07:40:00,4.0,24.0,,
69 | 2014-03-28 07:45:00,6.0,24.0,,
70 | 2014-03-28 07:50:00,8.0,24.0,,
71 | 2014-03-28 07:55:00,13.0,60.0,,
72 | 2014-03-28 08:00:00,24.0,132.0,,
73 | 2014-03-29 07:35:00,2.0,24.0,,
74 | 2014-03-29 07:40:00,4.0,24.0,,
75 | 2014-03-29 07:45:00,6.0,24.0,,
76 | 2014-03-29 07:50:00,8.0,24.0,,
77 | 2014-03-29 07:55:00,13.0,60.0,,
78 | 2014-03-29 08:00:00,24.0,132.0,,
79 | 2014-03-30 07:35:00,2.0,24.0,,
80 | 2014-03-30 07:40:00,4.0,24.0,,
81 | 2014-03-30 07:45:00,6.0,24.0,,
82 | 2014-03-30 07:50:00,8.0,24.0,,
83 | 2014-03-30 07:55:00,13.0,60.0,,
84 | 2014-03-30 08:00:00,24.0,132.0,,"""
85 |
86 |     df = process_batch_status(response_text)
87 |     correct_df = pd.read_csv(
88 |         StringIO(correct_interpretation_csv), parse_dates=["datetime"], index_col="datetime"
89 |     )
90 |     pd.testing.assert_frame_equal(df, correct_df)
91 |
92 |     empty_df = process_batch_status("")
93 |     assert empty_df.empty, "DataFrame should be empty but it was:\n{}\n".format(empty_df)
94 |
95 |     with pytest.raises(NotImplementedError):
96 |         process_batch_status("20140330;07:35,2,24,2,24,23.1,230.3")
97 |
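A small standalone sketch, not part of the repository, of feeding a raw status string through process_system_status as the tests above do; the two-entry string is made up for illustration and only columns the tests themselves reference are checked.

# Minimal sketch (not part of the repository). Note the module really is spelled "prcoess".
from datetime import date

from pvoutput.prcoess import process_system_status

raw = "1234;07:45,21,255,1,2;" "07:50,22,257,2,3"  # hypothetical two-entry status string
status_df = process_system_status(pv_system_status_text=raw, date=date(2022, 1, 1))
assert (status_df["system_id"] == 1234).all()
print(status_df)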
["missing_start_date_PV_localtime", "missing_end_date_PV_localtime"] 58 | pd.testing.assert_frame_equal( 59 | date_ranges[columns], 60 | pd.DataFrame( 61 | [ 62 | [dr1[0], dr1[-1]], 63 | [dr2[0], dr2[-1]], 64 | ], 65 | columns=columns, 66 | ), 67 | ) 68 | 69 | 70 | def test_date_to_pvoutput_str(): 71 | VALID_DATE_STR = "20190101" 72 | assert pvoutput.date_to_pvoutput_str(VALID_DATE_STR) == VALID_DATE_STR 73 | ts = pd.Timestamp(VALID_DATE_STR) 74 | assert pvoutput.date_to_pvoutput_str(ts) == VALID_DATE_STR 75 | 76 | 77 | def test_check_date(): 78 | assert pvoutput._check_date("20190101") is None 79 | with pytest.raises(ValueError): 80 | pvoutput._check_date("2010") 81 | with pytest.raises(ValueError): 82 | pvoutput._check_date("2010-01-02") 83 | 84 | 85 | def test_check_pv_system_status(): 86 | def _make_timeseries(start, end): 87 | index = pd.date_range(start, end, freq="5T") 88 | n = len(index) 89 | timeseries = pd.DataFrame(np.zeros(n), index=index) 90 | return timeseries 91 | 92 | DATE = date(2019, 1, 1) 93 | good_timeseries = _make_timeseries("2019-01-01 00:00", "2019-01-02 00:00") 94 | pvoutput.check_pv_system_status(good_timeseries, DATE) 95 | 96 | bad_timeseries = _make_timeseries("2019-01-01 00:00", "2019-01-03 00:00") 97 | with pytest.raises(ValueError): 98 | pvoutput.check_pv_system_status(bad_timeseries, DATE) 99 | 100 | bad_timeseries2 = _make_timeseries("2019-01-02 00:00", "2019-01-03 00:00") 101 | with pytest.raises(ValueError): 102 | pvoutput.check_pv_system_status(bad_timeseries2, DATE) 103 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import date 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from pvoutput import utils 9 | from pvoutput.daterange import DateRange 10 | 11 | PV_SYSTEM = 123 12 | 13 | 14 | def test_get_missing_dates_for_id(data_dir): 15 | test_hdf = os.path.join(data_dir, "data/test.hdf") 16 | missing_dates = utils.get_missing_dates_for_id(test_hdf, PV_SYSTEM) 17 | np.testing.assert_array_equal(missing_dates, [date(2019, 1, 2), date(2019, 1, 3)]) 18 | 19 | 20 | def test_get_system_ids_in_store(data_dir): 21 | test_hdf = os.path.join(data_dir, "data/test.hdf") 22 | system_ids = utils.get_system_ids_in_store(test_hdf) 23 | np.testing.assert_array_equal(system_ids, [PV_SYSTEM]) 24 | 25 | 26 | def test_get_date_ranges_to_download(data_dir): 27 | test_hdf = os.path.join(data_dir, "data/test.hdf") 28 | date_ranges = utils.get_date_ranges_to_download(test_hdf, PV_SYSTEM, "2018-01-01", "2019-01-10") 29 | # 2018-01-02 and 2018-01-03 are already known to be missing. 30 | np.testing.assert_array_equal( 31 | date_ranges, 32 | [ 33 | DateRange(start_date=date(2018, 1, 1), end_date=date(2018, 12, 31)), 34 | DateRange(start_date=date(2019, 1, 4), end_date=date(2019, 1, 10)), 35 | ], 36 | ) 37 | 38 | 39 | def test_datetime_list_to_dates(): 40 | np.testing.assert_array_equal( 41 | utils.datetime_list_to_dates(pd.Timestamp("2019-01-01")), [date(2019, 1, 1)] 42 | ) 43 | 44 | np.testing.assert_array_equal( 45 | utils.datetime_list_to_dates([pd.Timestamp("2019-01-01"), pd.Timestamp("2019-01-02")]), 46 | [date(2019, 1, 1), date(2019, 1, 2)], 47 | ) 48 | --------------------------------------------------------------------------------