├── .github └── workflows │ └── test.yml ├── .gitignore ├── .tx └── config ├── CHANGELOG.rst ├── CONTRIBUTING.rst ├── MANIFEST.in ├── README.rst ├── ckanext ├── __init__.py └── harvest │ ├── __init__.py │ ├── assets │ ├── styles │ │ ├── harvest.css │ │ ├── harvest.less │ │ ├── less │ │ ├── mixins.less │ │ └── variables.less │ └── webassets.yml │ ├── cli.py │ ├── controllers │ ├── __init__.py │ └── view.py │ ├── harvesters │ ├── __init__.py │ ├── base.py │ └── ckanharvester.py │ ├── helpers.py │ ├── i18n │ ├── ckanext-harvest.pot │ └── sv │ │ └── LC_MESSAGES │ │ ├── ckanext-harvest.mo │ │ └── ckanext-harvest.po │ ├── interfaces.py │ ├── log.py │ ├── logic │ ├── __init__.py │ ├── action │ │ ├── __init__.py │ │ ├── create.py │ │ ├── delete.py │ │ ├── get.py │ │ ├── patch.py │ │ └── update.py │ ├── auth │ │ ├── __init__.py │ │ ├── create.py │ │ ├── delete.py │ │ ├── get.py │ │ ├── patch.py │ │ └── update.py │ ├── dictization.py │ ├── schema.py │ └── validators.py │ ├── migration │ └── harvest │ │ ├── README │ │ ├── alembic.ini │ │ ├── env.py │ │ ├── script.py.mako │ │ └── versions │ │ ├── 3b4894672727_create_harvest_tables.py │ │ └── 75d650dfd519_add_cascade_to_harvest_tables.py │ ├── model │ └── __init__.py │ ├── plugin.py │ ├── public │ └── ckanext │ │ └── harvest │ │ ├── images │ │ └── icons │ │ │ ├── source_delete.png │ │ │ ├── source_edit.png │ │ │ ├── source_new.png │ │ │ ├── source_refresh.png │ │ │ └── source_view.png │ │ ├── javascript │ │ ├── extra_fields.js │ │ ├── resource.config │ │ └── webassets.yml │ │ └── style.css │ ├── queue.py │ ├── templates │ ├── admin │ │ └── base.html │ ├── base.html │ ├── emails │ │ ├── error_email.txt │ │ └── summary_email.txt │ ├── snippets │ │ ├── add_source_button.html │ │ ├── job_details.html │ │ ├── job_error_summary.html │ │ ├── package_list_empty.html │ │ ├── search_result_text.html │ │ ├── source_item.html │ │ └── source_list.html │ └── source │ │ ├── about.html │ │ ├── admin.html │ │ ├── admin_base.html │ │ ├── 
base.html │ │ ├── edit.html │ │ ├── job │ │ ├── list.html │ │ └── read.html │ │ ├── new.html │ │ ├── new_source_form.html │ │ ├── read.html │ │ ├── read_base.html │ │ └── search.html │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── factories.py │ ├── fixtures.py │ ├── harvesters │ │ ├── __init__.py │ │ ├── mock_ckan.py │ │ ├── test_base.py │ │ └── test_ckanharvester.py │ ├── lib.py │ ├── test_action.py │ ├── test_blueprint.py │ ├── test_queue.py │ ├── test_queue2.py │ └── test_timeouts.py │ ├── utils.py │ └── views.py ├── config └── supervisor │ └── ckan_harvesting.conf ├── conftest.py ├── dev-requirements.txt ├── docs └── admin-tab.png ├── pip-requirements.txt ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── test.ini /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | jobs: 4 | lint: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: actions/setup-python@v5 9 | with: 10 | python-version: '3.9' 11 | - name: Install requirements 12 | run: pip install flake8 pycodestyle 13 | - name: Check syntax 14 | run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan 15 | - name: Run flake8 16 | run: flake8 . 
--count --max-line-length=127 --statistics --exclude ckan 17 | 18 | test: 19 | needs: lint 20 | strategy: 21 | matrix: 22 | include: 23 | - ckan-version: "2.11" 24 | ckan-image: "ckan/ckan-dev:2.11-py3.10" 25 | - ckan-version: "2.10" 26 | ckan-image: "ckan/ckan-dev:2.10-py3.10" 27 | fail-fast: false 28 | 29 | name: CKAN ${{ matrix.ckan-version }} 30 | runs-on: ubuntu-latest 31 | container: 32 | image: ${{ matrix.ckan-image }} 33 | options: --user root 34 | services: 35 | solr: 36 | image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 37 | postgres: 38 | image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} 39 | env: 40 | POSTGRES_USER: postgres 41 | POSTGRES_PASSWORD: postgres 42 | POSTGRES_DB: postgres 43 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 44 | redis: 45 | image: redis:3 46 | env: 47 | CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test 48 | CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test 49 | CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test 50 | CKAN_SOLR_URL: http://solr:8983/solr/ckan 51 | CKAN_REDIS_URL: redis://redis:6379/1 52 | 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: Install requirements 56 | run: | 57 | pip install -r requirements.txt 58 | pip install -r dev-requirements.txt 59 | pip install -e . 
60 | # Replace default path to CKAN core config file with the one on the container 61 | sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini 62 | - name: Setup extension (CKAN >= 2.9) 63 | run: | 64 | ckan -c test.ini db init 65 | ckan -c test.ini db pending-migrations --apply 66 | - name: Run tests 67 | run: pytest --ckan-ini=test.ini --cov=ckanext.harvest --disable-warnings ckanext/harvest/tests 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.pyc 3 | *.egg-info 4 | *.orig 5 | .coverage 6 | build 7 | *.egg 8 | .DS_Store 9 | dist 10 | development.ini 11 | *.sw? 12 | *~ 13 | node_modules 14 | *.project 15 | .eggs 16 | .idea/ 17 | .vscode/ 18 | 19 | -------------------------------------------------------------------------------- /.tx/config: -------------------------------------------------------------------------------- 1 | [main] 2 | host = https://www.transifex.com 3 | 4 | [ckanext-harvest.ckanext-harvestpot] 5 | file_filter = i18n//LC_MESSAGES/ckanext-harvest.po 6 | source_file = i18n/ckanext-harvest.pot 7 | source_lang = en 8 | type = PO 9 | 10 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ######### 2 | Changelog 3 | ######### 4 | 5 | All notable changes to this project will be documented in this file. 
6 | 7 | The format is based on `Keep a Changelog `_ 8 | and this project adheres to `Semantic Versioning `_ 9 | 10 | *********** 11 | 1.6.1_ - 2025-01-14 12 | *********** 13 | 14 | Changed 15 | ------- 16 | 17 | - CKAN 2.9 is no longer maintained #559 18 | 19 | Fixed 20 | ------- 21 | - Update manifest to include alembic configuration #558 22 | 23 | 24 | *********** 25 | 1.6.0_ - 2024-10-31 26 | *********** 27 | 28 | Changed 29 | ------- 30 | 31 | - CKAN 2.11 support #551 32 | - Switched to alembic migrations #540 33 | - Support for SQLAlchemy 2 #553 34 | - Use pyproject.toml file #554 35 | - Add tab for harvest sources in sysadmin page 36 | 37 | Fixed 38 | ------- 39 | 40 | - Clean up harvest source clear command, fix revisions exception #556 41 | - Convert boolean values to bools #544 42 | 43 | 44 | *********** 45 | 1.5.6_ - 2023-06-26 46 | *********** 47 | 48 | Fixed 49 | ------- 50 | 51 | - Fix url endpoint for job_show #534 52 | 53 | *********** 54 | 1.5.5_ - 2023-06-05 55 | *********** 56 | 57 | Fixed 58 | ------- 59 | 60 | - Fix display of harvest job errors #533 61 | 62 | *********** 63 | 1.5.4_ - 2023-05-23 64 | *********** 65 | 66 | Fixed 67 | ------- 68 | 69 | - Fix a problem with data-dictization when using sqlalchemy 1.4+ #529 70 | 71 | *********** 72 | 1.5.3_ - 2023-04-03 73 | *********** 74 | 75 | Fixed 76 | ------- 77 | 78 | - Fix asset path in MANIFEST.in #525 79 | 80 | *********** 81 | 1.5.2_ - 2023-03-28 82 | *********** 83 | 84 | Fixed 85 | ------- 86 | 87 | - Fix URL endpoints: from ``harvest.object_show`` to ``harvester.object_show`` #524 88 | 89 | *********** 90 | 1.5.1_ - 2023-03-22 91 | *********** 92 | 93 | Fixed 94 | ------- 95 | 96 | - Fix ``url_for`` routing to point to harvester blueprint #523 97 | 98 | *********** 99 | 1.5.0_ - 2023-03-16 100 | *********** 101 | 102 | Changed 103 | ------- 104 | 105 | - Added unescape for email text body to avoid encoded characters #517 106 | - Pick the right harvest_object_id if there are multiple 
#519 107 | - Do not duplicate harvest_extras if they exist in root schema #521 108 | - Use 403 when actions are forbidden, not 401 #522 109 | - Drop support for old versions #520 110 | 111 | Breaking Changes 112 | ---------------- 113 | - ``h.bootstrap_version()`` no longer exists since it is no longer needed to inject CSS classes 114 | - Support for old Pylons route syntax has been removed. Example: calling ``url_for("harvest_read")`` will no longer work. URLs for ``ckanext-harvest`` need to respect Flask's syntax: ``url_for("harvest.read")``, etc 115 | 116 | *********** 117 | 1.4.2_ - 2023-01-12 118 | *********** 119 | 120 | Changed 121 | ------- 122 | 123 | - Add DB index harvest_error_harvest_object_id_idx #514 124 | - Remove pyopenssl requirement c87309a 125 | - Add CSRF protection to new source form #516 126 | 127 | *********** 128 | 1.4.1_ - 2022-09-20 129 | *********** 130 | 131 | Changed 132 | ------- 133 | 134 | - Use requirements.txt instead of pip-requirements.txt (still working via symlink) 8ed1eca 135 | 136 | Fixed 137 | ----- 138 | 139 | - Bump pyopenssl requirement to avoid requirements error on install 98edcd3 140 | - Fixes unicode error in Python 2 #502 141 | - Fixes in email notification sending #499, #505 142 | - Fix pagination for Dataset list on source page #504 143 | 144 | *********** 145 | 1.4.0_ - 2022-04-20 146 | *********** 147 | 148 | Changed 149 | ------- 150 | 151 | - Add ckan.harvest.not_overwrite_fields #472 152 | - Support for Bootstrap 5 templates #490 153 | - Support for CKAN 2.10 #492 #496 154 | 155 | Fixed 156 | ----- 157 | 158 | - Fix JSONDecode error #489 159 | - Check if email exists before sending notification #498 160 | 161 | 162 | *********** 163 | 1.3.4_ - 2022-01-24 164 | *********** 165 | 166 | Changed 167 | ------- 168 | 169 | - Changes function calls to `render_jinja2` over to `render` as the former is 170 | no longer used. 
#459 171 | - Set the default value for MQ_TYPE to redis #463 172 | - Add option `keep-current` to `clearsource_history` command #484 173 | 174 | Fixed 175 | ----- 176 | 177 | - Fix JSON serialization for Python3 #450 178 | - Make `Reharvest` and `Clear` buttons work again #452 179 | - Fix error when running run-test #466 180 | - Fix timeout calculation #482 181 | - Fix harvest extras for packages #458 182 | 183 | 184 | *********** 185 | 1.3.3_ - 2021-03-26 186 | *********** 187 | 188 | Changed 189 | ------- 190 | 191 | - Migrate tests from Travis CI to GitHub Actions 192 | - Optimize last error free job detection #437 193 | 194 | Fixed 195 | ----- 196 | - Improve timeout detection #431 197 | - Check if Redis key is available #432 198 | - Include webassets.yml in MANIFEST 199 | 200 | 201 | *********** 202 | 1.3.2_ - 2020-10-08 203 | *********** 204 | 205 | Changed 206 | ------- 207 | 208 | - Calculate timeouts based on last finished object instead of job creation time #418 209 | 210 | Fixed 211 | ----- 212 | 213 | - Fix resubmitting harvest objects to Redis fetch queue #421 214 | 215 | 216 | *********** 217 | 1.3.1_ - 2020-09-01 218 | *********** 219 | 220 | Changed 221 | ------- 222 | 223 | - Abort failed jobs CLI command #398 224 | 225 | Fixed 226 | ----- 227 | 228 | - Fix Redis conflict with core workers 229 | - Fix harvest source list reference 230 | - Fix and improve test suite, remove nose tests 231 | 232 | 233 | *********** 234 | 1.3.0_ - 2020-06-04 235 | *********** 236 | 237 | Changed 238 | ------- 239 | 240 | - Support for Python 3 #392 241 | - Add option for job timeout #403 242 | - Add support for limiting number of results and filtering by organization in harvest_source_list #403 243 | 244 | Fixed 245 | ----- 246 | 247 | - Fix support for different Redis client libraries #403 248 | - Fix force_import option in run_test command #402 249 | - Fix show object #395 250 | - Fix handling of exceptions in controller #390 251 | 252 | 253 | *********** 254 | 
1.2.1_ - 2020-01-22 255 | *********** 256 | 257 | Changed 258 | ------- 259 | 260 | - Support ``not modified`` status for objects #385 261 | - New ``force-import`` flag for the ``run_test`` command #385 262 | 263 | Fixed 264 | ----- 265 | 266 | - Get message from harvest_object_error-dict #381 267 | - Fix Admin link appearing to non authorized users #389 268 | - Capture Redis Exceptions #385 269 | 270 | ******************* 271 | 1.2.0_ - 2019-11-01 272 | ******************* 273 | 274 | Changed 275 | ------- 276 | - Apply flake8 to be PEP-8 compliant #354 277 | - Use ckantoolkit to clean up imports #358 278 | - Add hook to extend the package dict in CKAN harvester 279 | - Use CKAN core ckan.redis.url setting if present 280 | - Remove database migration code targeting ancient versions #376 281 | (In the unlikely event that you need to upgrade from one 282 | of the previous DB versions just apply the changes removed 283 | on the linked PR manually) 284 | 285 | Fixed 286 | ----- 287 | - harvest_source_type_exists validator should not fail if Harvester has no ``info()`` method #338 288 | - Fix SSL problems for old versions of Python 2.7.x #344 289 | - Add an 'owner_org' to the v3 package migration #348 290 | - Fix harvest request exceptions #357 291 | - Fix wrong toolkit reference 8e862c8 292 | - Mark early errored jobs as finished 5ad6d86 293 | - Resubmit awaiting objects in the DB not on Redis 5ffe6d4 294 | 295 | ******************* 296 | 1.1.4_ - 2018-10-26 297 | ******************* 298 | Fixed 299 | ----- 300 | - Fix nav link 301 | 302 | ******************* 303 | 1.1.3_ - 2018-10-26 304 | ******************* 305 | Fixed 306 | ----- 307 | - Reduce usage of c vars (CKAN 2.9) 308 | 309 | ******************* 310 | 1.1.2_ - 2018-10-25 311 | ******************* 312 | Added 313 | ----- 314 | - Send harvest-error-mails to organization-admins #329 315 | - CKAN Harvester option to include/exclude groups #323 316 | - Use Redis password from configuration when present #332 317 
| - Support for CKAN 2.9 318 | 319 | Fixed 320 | ----- 321 | - Ensures the AND operator for fq in solr #335 322 | - Fix styling issues on Bootstrap 3 323 | 324 | ******************* 325 | 1.1.1_ - 2018-06-13 326 | ******************* 327 | Added 328 | ----- 329 | - Move CKANHarvester._last_error_free_job to HarvesterBase.last_error_free_job #305 330 | - Add the CSS classes for FontAwesome 4.x #313 331 | - Add config option for dataset name append type #327 332 | - Send error mail to admin when harvesting fails #244 333 | 334 | Changed 335 | ------- 336 | - Readme test tip ckan parameter #318 337 | 338 | Fixed 339 | ----- 340 | - Fix handling of ``clean_tags`` options for tag lists and dicts #304 341 | - Don't delete all solr documents/fail to index harvesters when harvest config blank #315 342 | - Fix print statements to be Py3 friendly #328 343 | 344 | ******************* 345 | 1.1.0_ - 2017-11-07 346 | ******************* 347 | Added 348 | ----- 349 | - Button on harvest admin page to abort running jobs #296 350 | 351 | Changed 352 | ------- 353 | - Test improvements for harvester config #288 354 | - Use package_search API for count of datasets #298 355 | - Catch sqlalchemy.exc.DatabaseError instead of sqlalchemy.exc.OperationalError in ``gather_callback`` #301 356 | 357 | Fixed 358 | ------- 359 | - Fix default_extras initialization #290 360 | - Travis build (postgres service, checkout of correct CKAN branch, libcommons-fileupload) #297 361 | 362 | ******************* 363 | 1.0.0_ - 2017-03-30 364 | ******************* 365 | Added 366 | ----- 367 | - Includes i18n directory in package. 368 | - Adds a new ``clearsource_history`` command/operation. 
369 | - Adds new parameter ``return_last_job_status`` to ``harvest_source_list`` 370 | - Documentation for logs API 371 | 372 | Changed 373 | ------- 374 | - ``gather_stage`` returns an empty list instead of None if errors occurred 375 | - Change ``redirect`` calls to ``h.redirect_to`` 376 | 377 | Fixed 378 | ----- 379 | - Fix namespace package declarations 380 | - Only purge own data when calling ``queue_purge`` with redis 381 | - Fix ``default_groups`` behavior 382 | 383 | ******************* 384 | 0.0.5_ - 2016-05-23 385 | ******************* 386 | Added 387 | ----- 388 | - Adds ``HarvestLog`` to log to database 389 | - Adds a new ``clean_harvest_log`` command to clean the log table 390 | 391 | Removed 392 | ------- 393 | - This release removes support for CKAN <= 2.0 394 | 395 | ******************* 396 | 0.0.4_ - 2015-12-11 397 | ******************* 398 | Added 399 | ----- 400 | - Adds ``_find_existing_package`` method to allow harvesters extending the ``HarvesterBase`` to implement their own logic to find an existing package 401 | - Adds support for ``ITranslation`` interface 402 | - Adds special CSS class to datetimes in frontend to enable localisation to the user's timezone 403 | 404 | Changed 405 | ------- 406 | - Make statistics keys consistent across all actions 407 | 408 | Removed 409 | ------- 410 | - Remove ``harvest_source_for_a_dataset`` action 411 | 412 | ******************* 413 | 0.0.3_ - 2015-11-20 414 | ******************* 415 | Fixed 416 | ----- 417 | - Fixed queues tests 418 | 419 | 420 | ******************* 421 | 0.0.2_ - 2015-11-20 422 | ******************* 423 | Changed 424 | ------- 425 | - Namespace redis keys to avoid conflicts between CKAN instances 426 | 427 | 428 | ******************* 429 | 0.0.1_ - 2015-11-20 430 | ******************* 431 | Added 432 | ----- 433 | - Adds clear source as a command 434 | - Adds specific exceptions instead of having only the generic ``Exception`` 435 | 436 | Fixed 437 | ----- 438 | - Catch 'no harvest job' 
exception 439 | 440 | ********** 441 | Categories 442 | ********** 443 | - ``Added`` for new features. 444 | - ``Changed`` for changes in existing functionality. 445 | - ``Deprecated`` for once-stable features removed in upcoming releases. 446 | - ``Removed`` for deprecated features removed in this release. 447 | - ``Fixed`` for any bug fixes. 448 | - ``Security`` to invite users to upgrade in case of vulnerabilities. 449 | 450 | .. _Unreleased: https://github.com/ckan/ckanext-harvest/compare/v1.6.0...HEAD 451 | .. _1.6.0: https://github.com/ckan/ckanext-harvest/compare/v1.5.6...v1.6.0 452 | .. _1.5.6: https://github.com/ckan/ckanext-harvest/compare/v1.5.5...v1.5.6 453 | .. _1.5.5: https://github.com/ckan/ckanext-harvest/compare/v1.5.4...v1.5.5 454 | .. _1.5.4: https://github.com/ckan/ckanext-harvest/compare/v1.5.3...v1.5.4 455 | .. _1.5.3: https://github.com/ckan/ckanext-harvest/compare/v1.5.2...v1.5.3 456 | .. _1.5.2: https://github.com/ckan/ckanext-harvest/compare/v1.5.1...v1.5.2 457 | .. _1.5.1: https://github.com/ckan/ckanext-harvest/compare/v1.5.0...v1.5.1 458 | .. _1.5.0: https://github.com/ckan/ckanext-harvest/compare/v1.4.2...v1.5.0 459 | .. _1.4.2: https://github.com/ckan/ckanext-harvest/compare/v1.4.1...v1.4.2 460 | .. _1.4.1: https://github.com/ckan/ckanext-harvest/compare/v1.4.0...v1.4.1 461 | .. _1.4.0: https://github.com/ckan/ckanext-harvest/compare/v1.3.4...v1.4.0 462 | .. _1.3.4: https://github.com/ckan/ckanext-harvest/compare/v1.3.3...v1.3.4 463 | .. _1.3.3: https://github.com/ckan/ckanext-harvest/compare/v1.3.2...v1.3.3 464 | .. _1.3.2: https://github.com/ckan/ckanext-harvest/compare/v1.3.1...v1.3.2 465 | .. _1.3.1: https://github.com/ckan/ckanext-harvest/compare/v1.3.0...v1.3.1 466 | .. _1.3.0: https://github.com/ckan/ckanext-harvest/compare/v1.2.1...v1.3.0 467 | .. _1.2.1: https://github.com/ckan/ckanext-harvest/compare/v1.2.0...v1.2.1 468 | .. _1.2.0: https://github.com/ckan/ckanext-harvest/compare/v1.1.4...v1.2.0 469 | .. 
_1.1.4: https://github.com/ckan/ckanext-harvest/compare/v1.1.3...v1.1.4 470 | .. _1.1.3: https://github.com/ckan/ckanext-harvest/compare/v1.1.2...v1.1.3 471 | .. _1.1.2: https://github.com/ckan/ckanext-harvest/compare/v1.1.1...v1.1.2 472 | .. _1.1.1: https://github.com/ckan/ckanext-harvest/compare/v1.1.0...v1.1.1 473 | .. _1.1.0: https://github.com/ckan/ckanext-harvest/compare/v1.0.0...v1.1.0 474 | .. _1.0.0: https://github.com/ckan/ckanext-harvest/compare/v0.0.5...v1.0.0 475 | .. _0.0.5: https://github.com/ckan/ckanext-harvest/compare/v0.0.4...v0.0.5 476 | .. _0.0.4: https://github.com/ckan/ckanext-harvest/compare/v0.0.3...v0.0.4 477 | .. _0.0.3: https://github.com/ckan/ckanext-harvest/compare/v0.0.2...v0.0.3 478 | .. _0.0.2: https://github.com/ckan/ckanext-harvest/compare/v0.0.1...v0.0.2 479 | .. _0.0.1: https://github.com/ckan/ckanext-harvest/compare/ckan-1.6...v0.0.1 480 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | #################################### 2 | How to contribute to ckanext-harvest 3 | #################################### 4 | 5 | For contributing to ckanext-harvest or its documentation, follow the same guidelines that apply to CKAN core, described in the `contributing guidelines `_. 6 | 7 | **Did you find a bug?** 8 | ----------------------- 9 | 10 | * **Ensure the bug was not already reported** by searching on GitHub under `Issues `_. 11 | 12 | * If you're unable to find an open issue addressing the problem, `open a new one `_. Be sure to include a **title and clear description**, as much relevant information as possible. 13 | 14 | **Did you write a patch that fixes a bug?** 15 | ------------------------------------------- 16 | 17 | * Open a new GitHub pull request with the patch. 18 | 19 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 
20 | 21 | * Make sure to **update the CHANGELOG.rst** in the "Unreleased" section with your bugfix 22 | 23 | **Do you intend to add a new feature or change an existing one?** 24 | ----------------------------------------------------------------- 25 | 26 | * Open a new issue on Github and start writing code 27 | 28 | * If you are unsure about the change, wait for feedback on the issue or post to the `ckan-dev mailinglist `_ 29 | 30 | * Make sure to **update the CHANGELOG.rst** in the "Unreleased" section with your change 31 | 32 | **Do you have questions about the source code?** 33 | ------------------------------------------------ 34 | 35 | * Ask any question about how to use ckanext-harvest on the `ckan-dev mailinglist `_ 36 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include ckanext/harvest/templates * 2 | recursive-include ckanext/harvest/assets * 3 | recursive-include ckanext/harvest/public * 4 | recursive-include ckanext/harvest/i18n * 5 | recursive-include ckanext/harvest/migration * 6 | 7 | -------------------------------------------------------------------------------- /ckanext/__init__.py: -------------------------------------------------------------------------------- 1 | # this is a namespace package 2 | try: 3 | import pkg_resources 4 | pkg_resources.declare_namespace(__name__) 5 | except ImportError: 6 | import pkgutil 7 | __path__ = pkgutil.extend_path(__path__, __name__) 8 | -------------------------------------------------------------------------------- /ckanext/harvest/__init__.py: -------------------------------------------------------------------------------- 1 | # this is a namespace package 2 | try: 3 | import pkg_resources 4 | pkg_resources.declare_namespace(__name__) 5 | except ImportError: 6 | import pkgutil 7 | __path__ = pkgutil.extend_path(__path__, __name__) 8 | 
-------------------------------------------------------------------------------- /ckanext/harvest/assets/styles/harvest.css: -------------------------------------------------------------------------------- 1 | header.with-filter { 2 | clear: both; 3 | overflow: hidden; 4 | } 5 | header.with-filter h1 { 6 | margin-top: 0; 7 | } 8 | [data-diff] { 9 | color: #000; 10 | background-color: #DDD; 11 | text-shadow: none; 12 | font-weight: normal; 13 | } 14 | [data-diff="error"] { 15 | background-color: #b55457; 16 | } 17 | [data-diff="added"] { 18 | background-color: #9ee592; 19 | } 20 | [data-diff="updated"] { 21 | background-color: #c5aaff; 22 | } 23 | [data-diff="deleted"] { 24 | background-color: #e7a4a6; 25 | } 26 | .harvest-error-summary .count { 27 | text-align: right; 28 | } 29 | .harvest-error-list h5 { 30 | margin-top: 0; 31 | } 32 | .harvest-error-list .error { 33 | padding-left: 20px; 34 | } 35 | .harvest-types label.radio { 36 | font-weight: normal; 37 | margin-bottom: 10px; 38 | } 39 | .harvest-types label.radio input { 40 | top: 3px; 41 | } 42 | 43 | #source-new.bs2 .control-label { 44 | width: 125px; 45 | } 46 | -------------------------------------------------------------------------------- /ckanext/harvest/assets/styles/harvest.less: -------------------------------------------------------------------------------- 1 | @import 'mixins.less'; 2 | @import 'variables.less'; 3 | 4 | header.with-filter { 5 | clear: both; 6 | overflow: hidden; 7 | h1 { 8 | margin-top: 0; 9 | } 10 | } 11 | 12 | [data-diff] { 13 | color: #000; 14 | background-color: #DDD; 15 | text-shadow: none; 16 | font-weight: normal; 17 | } 18 | [data-diff="added"] { 19 | background-color: @diffAdded; 20 | } 21 | [data-diff="updated"] { 22 | background-color: @diffUpdated; 23 | } 24 | [data-diff="deleted"] { 25 | background-color: @diffDeleted; 26 | } 27 | 28 | .harvest-error-summary { 29 | .count { 30 | text-align: right; 31 | } 32 | } 33 | 34 | .harvest-error-list { 35 | h5 { 36 | margin-top: 
0; 37 | } 38 | .error { 39 | padding-left: 20px; 40 | } 41 | } 42 | 43 | .harvest-types label.radio { 44 | font-weight: normal; 45 | margin-bottom: 10px; 46 | input { 47 | top: 3px; 48 | } 49 | } 50 | 51 | #source-new { 52 | .controls { 53 | margin-left: 135px; 54 | } 55 | .control-label { 56 | width: 125px; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /ckanext/harvest/assets/styles/less: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | // This file is only used to generate the harvest.css 4 | 5 | var path = require('path'), 6 | nodeWatch = require('nodewatch'), 7 | exec = require('child_process').exec, 8 | watch = path.join(__dirname), 9 | lastArg = process.argv.slice().pop(); 10 | 11 | function now() { 12 | return new Date().toISOString().replace('T', ' ').substr(0, 19); 13 | } 14 | 15 | function compile(event, filename) { 16 | var start = Date.now(); 17 | 18 | exec('`npm bin`/lessc ' + __dirname + '/harvest.less > ' + __dirname + '/harvest.css', function (err, stdout, stderr) { 19 | var duration = Date.now() - start; 20 | 21 | if (err) { 22 | console.log('An error occurred running the less command:'); 23 | console.log(err.message); 24 | } 25 | else if (stderr || stdout) { 26 | console.log(stdout, stderr); 27 | } else { 28 | console.log('[%s] recompiled in %sms', now(), duration); 29 | } 30 | }); 31 | } 32 | 33 | nodeWatch.add(watch).onChange(compile); 34 | compile(); 35 | -------------------------------------------------------------------------------- /ckanext/harvest/assets/styles/mixins.less: -------------------------------------------------------------------------------- 1 | .clearfix() { 2 | 3 | } 4 | 5 | .border-radius(@radius) { 6 | -webkit-border-radius: @radius; 7 | -moz-border-radius: @radius; 8 | border-radius: @radius; 9 | } 10 | 11 | .box-shadow(@shadowA, @shadowB:X, ...){ 12 | @props: ~`"@{arguments}".replace(/[\[\]]|\,\sX/g, '')`; 
13 | -webkit-box-shadow: @props; 14 | -moz-box-shadow: @props; 15 | box-shadow: @props; 16 | } 17 | -------------------------------------------------------------------------------- /ckanext/harvest/assets/styles/variables.less: -------------------------------------------------------------------------------- 1 | @borderColor: #DDD; 2 | @hoverColor: #F6F6F6; 3 | 4 | @diffAdded: #9EE592; 5 | @diffUpdated: #C5AAFF; 6 | @diffDeleted: #E7A4A6; 7 | -------------------------------------------------------------------------------- /ckanext/harvest/assets/webassets.yml: -------------------------------------------------------------------------------- 1 | harvest_css: 2 | output: ckanext-harvest/%(version)s_harvest_css.css 3 | contents: 4 | - styles/harvest.css 5 | -------------------------------------------------------------------------------- /ckanext/harvest/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | 5 | import ckantoolkit as tk 6 | import click 7 | 8 | import ckanext.harvest.utils as utils 9 | from ckanext.harvest.logic import HarvestJobExists 10 | 11 | 12 | def get_commands(): 13 | return [harvester] 14 | 15 | 16 | @click.group() 17 | def harvester(): 18 | """Harvests remotely mastered metadata. 19 | """ 20 | pass 21 | 22 | 23 | @harvester.group() 24 | def source(): 25 | """Manage harvest sources 26 | """ 27 | pass 28 | 29 | 30 | @source.command() 31 | @click.argument(u"name") 32 | @click.argument(u"url") 33 | @click.argument(u"type") 34 | @click.argument(u"title", required=False) 35 | @click.argument(u"active", type=tk.asbool, default=True) 36 | @click.argument(u"owner_org", required=False) 37 | @click.argument(u"frequency", default=u"MANUAL") 38 | @click.argument(u"config", required=False) 39 | def create(name, url, type, title, active, owner_org, frequency, config): 40 | """Create new harvest source. 
41 | """ 42 | try: 43 | result = utils.create_harvest_source( 44 | name, url, type, title, active, owner_org, frequency, config 45 | ) 46 | except tk.ValidationError as e: 47 | tk.error_shout(u"Validation error:") 48 | for field, err in e.error_summary.items(): 49 | tk.error_shout("\t{}: {}".format(field, err)) 50 | raise click.Abort() 51 | click.echo(result) 52 | 53 | 54 | @source.command() 55 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME") 56 | @click.pass_context 57 | def show(ctx, id): 58 | """Shows a harvest source. 59 | """ 60 | flask_app = ctx.meta["flask_app"] 61 | 62 | try: 63 | with flask_app.test_request_context(): 64 | result = utils.show_harvest_source(id) 65 | except tk.ObjectNotFound: 66 | tk.error_shout(u"Source <{}> not found.".format(id)) 67 | raise click.Abort() 68 | click.echo(result) 69 | 70 | 71 | @source.command() 72 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME") 73 | @click.pass_context 74 | def remove(ctx, id): 75 | """Remove (deactivate) a harvester source, whilst leaving any related 76 | datasets, jobs and objects. 77 | 78 | """ 79 | flask_app = ctx.meta["flask_app"] 80 | 81 | with flask_app.test_request_context(): 82 | utils.remove_harvest_source(id) 83 | click.secho("Removed harvest source: {0}".format(id), fg="green") 84 | 85 | 86 | @source.command() 87 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME") 88 | @click.pass_context 89 | def clear(ctx, id): 90 | """Clears all datasets, jobs and objects related to a harvest source, 91 | but keeps the source itself. 
92 | 93 | """ 94 | flask_app = ctx.meta["flask_app"] 95 | 96 | with flask_app.test_request_context(): 97 | utils.clear_harvest_source(id) 98 | click.secho("Cleared harvest source: {0}".format(id), fg="green") 99 | 100 | 101 | @source.command() 102 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME", required=False) 103 | @click.option( 104 | "-k", 105 | "--keep-current", 106 | default=False 107 | ) 108 | @click.pass_context 109 | def clear_history(ctx, id, keep_current): 110 | """If no source id is given the history for all harvest sources 111 | (maximum is 1000) will be cleared. 112 | 113 | Clears all jobs and objects related to a harvest source, but keeps 114 | the source itself. The datasets imported from the harvest source 115 | will NOT be deleted!!! If a source id is given, it only clears 116 | the history of the harvest source with the given source id. 117 | 118 | """ 119 | flask_app = ctx.meta["flask_app"] 120 | 121 | with flask_app.test_request_context(): 122 | result = utils.clear_harvest_source_history(id, bool(keep_current)) 123 | click.secho(result, fg="green") 124 | 125 | 126 | @harvester.command() 127 | @click.argument("all", required=False) 128 | @click.pass_context 129 | def sources(ctx, all): 130 | """Lists harvest sources. 131 | 132 | If 'all' is defined, it also shows the Inactive sources 133 | 134 | """ 135 | flask_app = ctx.meta["flask_app"] 136 | 137 | with flask_app.test_request_context(): 138 | result = utils.list_sources(bool(all)) 139 | click.echo(result) 140 | 141 | 142 | @harvester.command() 143 | @click.argument("id", metavar="SOURCE_ID_OR_NAME") 144 | @click.pass_context 145 | def job(ctx, id): 146 | """Create new harvest job and runs it (puts it on the gather queue). 
@harvester.command()
@click.pass_context
def jobs(ctx):
    """Lists harvest jobs.

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        result = utils.list_jobs()
    click.echo(result)


@harvester.command()
@click.argument("id", metavar="SOURCE_OR_JOB_ID")
@click.pass_context
def job_abort(ctx, id):
    """Marks a job as "Aborted" so that the source can be restarted afresh.

    It ensures that the job's harvest objects status are also marked
    finished. You should ensure that neither the job nor its objects
    are currently in the gather/fetch queues.

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        try:
            result = utils.abort_job(id)
        except tk.ObjectNotFound:
            tk.error_shout(u"Job not found.")
            # ctx.abort() raises, so the echo below is never reached with
            # ``result`` unbound.
            ctx.abort()

    click.echo(result)


@harvester.command()
# NOTE(review): ``default=False`` also makes click infer a boolean type for
# this argument, so only boolean-like values are accepted. It is left as-is
# because the type utils.abort_failed_jobs expects is not visible here;
# confirm and declare an explicit ``type=``.
@click.argument("life_span", default=False, required=False)
@click.option(
    "-i",
    "--include",
    # An explicit string type is required: with ``default=False`` and no
    # type, click infers a boolean option and rejects source ids.
    default=None,
    type=str,
    help="""If source_id provided as included, then only it's failed jobs will be aborted.
    You can use comma as a separator to provide multiple source_id's""",
)
@click.option(
    "-e",
    "--exclude",
    default=None,
    type=str,
    help="""If source_id provided as excluded, all sources failed jobs, except for that
    will be aborted. You can use comma as a separator to provide multiple source_id's""",
)
@click.pass_context
def abort_failed_jobs(ctx, life_span, include, exclude):
    """Abort all jobs which are in a "limbo state" where the job has
    run with errors but the harvester run command will not mark it
    as finished, and therefore you cannot run another job.
    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        # Omitted options are still forwarded as ``False``, the value the
        # helper received before the option types were corrected.
        result = utils.abort_failed_jobs(
            life_span, include or False, exclude or False
        )
    click.echo(result)


@harvester.command()
def purge_queues():
    """Removes all jobs from fetch and gather queue.
    """
    utils.purge_queues()


@harvester.command()
def gather_consumer():
    """Starts the consumer for the gathering queue.

    """
    utils.gather_consumer()


@harvester.command()
def fetch_consumer():
    """Starts the consumer for the fetching queue.

    """
    utils.fetch_consumer()


@harvester.command()
@click.pass_context
def run(ctx):
    """Starts any harvest jobs that have been created by putting them onto
    the gather queue.

    Also checks running jobs - if finished it changes their status to
    Finished.

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        utils.run_harvester()
@harvester.command("import")
@click.pass_context
@click.argument("id", metavar="SOURCE_ID_OR_NAME", required=False)
@click.option(
    "-j",
    "--no-join-datasets",
    is_flag=True,
    help="Do not join harvest objects to existing datasets",
)
@click.option(
    "-o",
    "--harvest-object-id",
    help="Id of the harvest object to which perform the import stage",
)
@click.option(
    "-p",
    "--package-id",
    help="Id of the package whose harvest object to perform the import stage for",
)
@click.option(
    "-g",
    "--guid",
    help="Guid of the harvest object to which perform the import stage for",
)
@click.option(
    "--segments",
    help="""A string containing hex digits that represent which of
    the 16 harvest object segments to import. e.g. 15af will run segments 1,5,a,f""",
)
def import_stage(
    ctx, id, no_join_datasets, harvest_object_id, guid, package_id, segments
):
    """Perform the import stage with the last fetched objects, for a
    certain source or a single harvest object.

    Please note that no objects will be fetched from the remote
    server. It will only affect the objects already present in the
    database.

    To import a particular harvest source, specify its id as an argument.
    To import a particular harvest object use the -o option.
    To import a particular guid use the -g option.
    To import a particular package use the -p option.

    You will need to specify the -j flag in cases where the datasets
    are not yet created (e.g. first harvest, or all previous harvests
    have failed)

    The --segments flag allows to define a string containing hex
    digits that represent which of the 16 harvest object segments to
    import. e.g. 15af will run segments 1,5,a,f

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        try:
            utils.import_stage(
                id,
                no_join_datasets,
                harvest_object_id,
                guid,
                package_id,
                segments,
            )
        except tk.ObjectNotFound:
            tk.error_shout(u"Source <{}> not found.".format(id))
            # Exit non-zero, like the other commands, so scripts and cron
            # jobs can detect the failure.
            ctx.abort()


@harvester.command()
@click.pass_context
def clean_harvest_log(ctx):
    """Clean-up mechanism for the harvest log table.

    You can configure the time frame through the configuration
    parameter `ckan.harvest.log_timeframe`. The default time frame is 30
    days

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        utils.clean_harvest_log()


@harvester.command("job-all")
@click.pass_context
def job_all(ctx):
    """Create new harvest jobs for all active sources.

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        result = utils.job_all()
    click.echo(result)
@harvester.command("harvesters_info")
@click.pass_context
def harvesters_info(ctx):
    """Prints the information returned by the harvesters info helper.

    """
    # The docstring is the command's ``--help`` text; it was previously
    # blank. The helper runs inside a test request context and its return
    # value is echoed unchanged.
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        result = utils.harvesters_info()

    click.echo(result)
def get_harvest_source(source_id=None):
    '''
    Return a harvest source, looked up in this order:

    1. ``source_id`` when given, via the ``harvest_source_show`` action;
    2. ``c.pkg_dict`` when the template context has one;
    3. the id of ``c.pkg`` when present, via ``harvest_source_show``.

    Returns None when none of the above is available.
    '''
    context = {'model': model, 'session': model.Session}

    if source_id:
        lookup_id = source_id
    elif hasattr(c, 'pkg_dict'):
        # Already dictized on the template context: return it untouched.
        return c.pkg_dict
    elif hasattr(c, 'pkg'):
        lookup_id = c.pkg.id
    else:
        return None

    show_source = p.toolkit.get_action('harvest_source_show')
    return show_source(context, {'id': lookup_id})
def package_count_for_source(source_id):
    '''
    Count the datasets associated with the given harvest source id.

    Runs ``package_search`` filtered on ``harvest_source_id`` and returns
    the ``count`` entry of the result (0 when it is missing).
    '''
    query = {'fq': '+harvest_source_id:"{0}"'.format(source_id)}
    search_context = {'model': model, 'session': model.Session}
    found = logic.get_action('package_search')(search_context, query)
    return found.get('count', 0)


def harvesters_info():
    '''
    Return the result of the ``harvesters_info_show`` action, called as the
    current user (falling back to the request author).
    '''
    acting_user = p.toolkit.c.user or p.toolkit.c.author
    action_context = {'model': model, 'user': acting_user}
    return logic.get_action('harvesters_info_show')(action_context, {})
def harvester_types():
    '''
    Return the available harvesters as select-box options: a list of
    ``{'text': <translated title>, 'value': <name>}`` dicts.
    '''
    # The loop variable is deliberately not called ``h`` so it cannot be
    # confused with the module-level ``h`` helpers import.
    return [{'text': p.toolkit._(info['title']), 'value': info['name']}
            for info in harvesters_info()]


def harvest_frequencies():
    '''
    Return the update frequencies as select-box options: a list of
    ``{'text': <translated, title-cased label>, 'value': <frequency>}``.
    '''
    return [{'text': p.toolkit._(f.title()), 'value': f}
            for f in UPDATE_FREQUENCIES]


def link_for_harvest_object(id=None, guid=None, text=None):
    '''
    Build an HTML link to the page of a harvest object.

    :param id: harvest object id (ignored when ``guid`` is given)
    :param guid: harvest object guid; resolved to an id through the
        ``harvest_object_show`` action
    :param text: link text; defaults to the guid, then to the id
    :returns: a literal (safe) HTML anchor, or None when neither ``id``
        nor ``guid`` is provided
    '''
    if not id and not guid:
        return None

    # Imported locally so the block does not depend on a file-level import.
    from html import escape

    if guid:
        context = {'model': model, 'user': p.toolkit.c.user or p.toolkit.c.author}
        obj = logic.get_action('harvest_object_show')(context, {'id': guid, 'attr': 'guid'})
        # NOTE(review): attribute access assumes the action returns an
        # object; if it returns a dict this must be obj['id'] -- confirm.
        id = obj.id

    url = h.url_for('harvester.object_show', id=id)
    text = text or guid or id
    # The template previously contained only ``{text}``, so the URL was
    # computed and then dropped. Both values are escaped because the result
    # is marked as safe markup and guids come from harvested data.
    link = '<a href="{url}">{text}</a>'.format(
        url=escape(str(url), quote=True),
        text=escape(str(text)),
    )

    return p.toolkit.literal(link)


def harvest_source_extra_fields():
    '''
    Map each harvester name to the list of keys of its extra schema.

    Harvesters that do not define ``extra_schema`` are skipped.
    '''
    return {
        harvester.info()['name']: list(harvester.extra_schema().keys())
        for harvester in p.PluginImplementations(IHarvester)
        if hasattr(harvester, 'extra_schema')
    }
def validate_config(self, config):
    '''

    [optional]

    Harvesters can provide this method to validate the configuration
    entered in the form. It should return a single string, which will be
    stored in the database. Exceptions raised will be shown in the form's
    error messages.

    :param config: Config string coming from the form
    :returns: A string with the validated configuration options
    '''
def gather_stage(self, harvest_job):
    '''
    First stage of a harvest: work out what has to be fetched.

    Given a HarvestJob, an implementation is expected to:
        - collect every remote object that must be fetched at a later
          stage (e.g. for a CSW server, perform a GetRecords request);
        - create one HarvestObject per item in the database, with its
          guid and a reference to the job. HarvestObjects need a
          reference date holding the resource's last modified date;
          depending on the source type this may have to be set in a
          different stage;
        - create and store a HarvestGatherError for each problem found;
        - return the ids of all the HarvestObjects it created.

    To abort the harvest, create a HarvestGatherError and raise an
    exception. Any created HarvestObjects will be deleted.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''


def fetch_stage(self, harvest_object):
    '''
    Second stage of a harvest: retrieve one remote object.

    Given a HarvestObject, an implementation is expected to:
        - get the contents of the remote object (e.g. for a CSW server,
          perform a GetRecordById request);
        - save that content in the provided HarvestObject;
        - create and store a HarvestObjectError for each problem found.

    The return value tells the caller what to do next: True means the
    object should now be imported, "unchanged" means it did not need
    harvesting after all (no error, but do not continue to the import
    stage) and False means there were errors.

    :param harvest_object: HarvestObject object
    :returns: True if successful, 'unchanged' if nothing to import after
              all, False if not successful
    '''
class DBLogHandler(Handler):
    '''
    Logging handler that stores each formatted record as a HarvestLog row.
    '''

    def __init__(self, level=NOTSET):
        super(DBLogHandler, self).__init__(level=level)

    def emit(self, record):
        '''
        Save ``record`` (level name and formatted message) as a HarvestLog.

        Never raises. Failures go to ``Handler.handleError``, the standard
        logging hook, which reports them on stderr when
        ``logging.raiseExceptions`` is set and stays silent otherwise.
        They were previously discarded without any trace.
        '''
        try:
            level = record.levelname
            msg = self.format(record)
            obj = HarvestLog(level=level, content=msg)
            obj.save()
        except Exception:
            self.handleError(record)
import (HarvestSource, HarvestJob, HarvestObject, 12 | HarvestObjectExtra) 13 | from ckanext.harvest.logic.dictization import (harvest_job_dictize, 14 | harvest_object_dictize) 15 | from ckanext.harvest.logic.schema import harvest_object_create_schema 16 | from ckanext.harvest.logic.action.get import (harvest_source_list, 17 | harvest_job_list) 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | _validate = ckan.lib.navl.dictization_functions.validate 22 | check_access = toolkit.check_access 23 | 24 | 25 | class InactiveSource(Exception): 26 | pass 27 | 28 | 29 | def harvest_source_create(context, data_dict): 30 | ''' 31 | Creates a new harvest source 32 | 33 | This method just proxies the request to package_create, 34 | which will create a harvest_source dataset type and the 35 | HarvestSource object. All auth checks and validation will 36 | be done there. We only make sure to set the dataset type. 37 | 38 | Note that the harvest source type (ckan, waf, csw, etc) 39 | is now set via the source_type field. 40 | 41 | :param url: the URL for the harvest source 42 | :type url: string 43 | :param name: the name of the new harvest source, must be between 2 and 100 44 | characters long and contain only lowercase alphanumeric characters 45 | :type name: string 46 | :param title: the title of the dataset (optional, default: same as 47 | ``name``) 48 | :type title: string 49 | :param notes: a description of the harvest source (optional) 50 | :type notes: string 51 | :param source_type: the harvester type for this source. This must be one 52 | of the registered harvesters, eg 'ckan', 'csw', etc. 53 | :type source_type: string 54 | :param frequency: the frequency in which this harvester should run. See 55 | ``ckanext.harvest.model`` source for possible values. Default is 56 | 'MANUAL' 57 | :type frequency: string 58 | :param config: extra configuration options for the particular harvester 59 | type. Should be serialized as JSON.
(optional) 60 | :type config: string 61 | 62 | 63 | :returns: the newly created harvest source 64 | :rtype: dictionary 65 | ''' 66 | 67 | log.info('Creating harvest source: %r', data_dict) 68 | 69 | data_dict['type'] = DATASET_TYPE_NAME 70 | 71 | context['extras_as_string'] = True 72 | source = toolkit.get_action('package_create')(context, data_dict) 73 | 74 | return source 75 | 76 | 77 | def harvest_job_create(context, data_dict): 78 | ''' 79 | Creates a Harvest Job for a Harvest Source and runs it (by putting it on 80 | the gather queue) 81 | 82 | :param source_id: id of the harvest source to create a job for 83 | :type source_id: string 84 | :param run: whether to also run it or not (default: True) 85 | :type run: bool 86 | ''' 87 | log.info('Harvest job create: %r', data_dict) 88 | check_access('harvest_job_create', context, data_dict) 89 | 90 | source_id = data_dict['source_id'] 91 | run_it = data_dict.get('run', True) 92 | 93 | # Check if source exists 94 | source = HarvestSource.get(source_id) 95 | if not source: 96 | log.warn('Harvest source %s does not exist', source_id) 97 | raise toolkit.ObjectNotFound('Harvest source %s does not exist' % source_id) 98 | 99 | # Check if the source is active 100 | if not source.active: 101 | log.warn('Harvest job cannot be created for inactive source %s', 102 | source_id) 103 | raise HarvestSourceInactiveError('Can not create jobs on inactive sources') 104 | 105 | # Check if there already is an unrun or currently running job for this 106 | # source 107 | exists = _check_for_existing_jobs(context, source_id) 108 | if exists: 109 | log.warn('There is already an unrun job %r for this source %s', 110 | exists, source_id) 111 | raise HarvestJobExists('There already is an unrun job for this source') 112 | 113 | job = HarvestJob() 114 | job.source = source 115 | job.save() 116 | log.info('Harvest job saved %s', job.id) 117 | 118 | if run_it: 119 | toolkit.get_action('harvest_send_job_to_gather_queue')( 120 | context, {'id': 
job.id}) 121 | 122 | return harvest_job_dictize(job, context) 123 | 124 | 125 | def harvest_job_create_all(context, data_dict): 126 | ''' 127 | Creates a Harvest Job for all Harvest Sources and runs them (by 128 | putting them on the gather queue) 129 | 130 | :param source_id: 131 | :type param: string 132 | :param run: whether to also run the jobs or not (default: True) 133 | :type run: bool 134 | ''' 135 | 136 | log.info('Harvest job create all: %r', data_dict) 137 | check_access('harvest_job_create_all', context, data_dict) 138 | 139 | run = data_dict.get('run', True) 140 | 141 | data_dict.update({'only_active': True}) 142 | 143 | # Get all active sources 144 | sources = harvest_source_list(context, data_dict) 145 | jobs = [] 146 | # Create a new job for each, if there isn't already one 147 | for source in sources: 148 | exists = _check_for_existing_jobs(context, source['id']) 149 | if exists: 150 | log.info('Skipping source %s as it already has a pending job', 151 | source['id']) 152 | continue 153 | 154 | job = harvest_job_create( 155 | context, {'source_id': source['id'], 'run': run}) 156 | jobs.append(job) 157 | 158 | log.info('Created jobs for %s%i harvest sources', 159 | 'and run ' if run else '', len(jobs)) 160 | return jobs 161 | 162 | 163 | def _check_for_existing_jobs(context, source_id): 164 | ''' 165 | Given a source id, checks if there are jobs for this source 166 | with status 'New' or 'Running' 167 | 168 | rtype: boolean 169 | ''' 170 | data_dict = { 171 | 'source_id': source_id, 172 | 'status': u'New' 173 | } 174 | exist_new = harvest_job_list(context, data_dict) 175 | data_dict = { 176 | 'source_id': source_id, 177 | 'status': u'Running' 178 | } 179 | exist_running = harvest_job_list(context, data_dict) 180 | exist = len(exist_new + exist_running) > 0 181 | 182 | return exist 183 | 184 | 185 | def harvest_object_create(context, data_dict): 186 | ''' Create a new harvest object 187 | 188 | :type guid: string (optional) 189 | :type content: string 
(optional) 190 | :type job_id: string 191 | :type source_id: string (optional) 192 | :type package_id: string (optional) 193 | :type extras: dict (optional) 194 | ''' 195 | check_access('harvest_object_create', context, data_dict) 196 | data, errors = _validate(data_dict, harvest_object_create_schema(), 197 | context) 198 | 199 | if errors: 200 | raise toolkit.ValidationError(errors) 201 | 202 | obj = HarvestObject( 203 | guid=data.get('guid'), 204 | content=data.get('content'), 205 | job=data['job_id'], # which was validated into a HarvestJob object 206 | harvest_source_id=data.get('source_id'), 207 | package_id=data.get('package_id'), 208 | extras=[HarvestObjectExtra(key=k, value=v) 209 | for k, v in data.get('extras', {}).items()] 210 | ) 211 | 212 | obj.save() 213 | return harvest_object_dictize(obj, context) 214 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/action/delete.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ckan import plugins as p 4 | 5 | log = logging.getLogger(__name__) 6 | 7 | 8 | def harvest_source_delete(context, data_dict): 9 | '''Deletes an existing harvest source 10 | 11 | This method just proxies the request to package_delete, 12 | which will delete the actual harvest type dataset and the 13 | HarvestSource object (via the after_delete extension point). 14 | 15 | :param id: the name or id of the harvest source to delete 16 | :type id: string 17 | ''' 18 | log.info('Deleting harvest source: %r', data_dict) 19 | 20 | p.toolkit.check_access('harvest_source_delete', context, data_dict) 21 | 22 | p.toolkit.get_action('package_delete')(context, data_dict) 23 | 24 | if context.get('clear_source', False): 25 | 26 | # We need the id. The name won't work. 
27 | package_dict = p.toolkit.get_action('package_show')(context, data_dict) 28 | 29 | p.toolkit.get_action('harvest_source_clear')( 30 | context, {'id': package_dict['id']}) 31 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/action/patch.py: -------------------------------------------------------------------------------- 1 | '''API functions for partial updates of existing data in CKAN''' 2 | 3 | import logging 4 | from ckan.logic import get_action 5 | from ckanext.harvest.utils import ( 6 | DATASET_TYPE_NAME 7 | ) 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | def harvest_source_patch(context, data_dict): 13 | ''' 14 | Patch an existing harvest source 15 | 16 | This method just proxies the request to package_patch, which will update a 17 | harvest_source dataset type and the HarvestSource object. All auth checks 18 | and validation will be done there. We only make sure to set the dataset 19 | type. 20 | 21 | Note that the harvest source type (ckan, waf, csw, etc) is now set via the 22 | source_type field. 23 | 24 | All fields that are not provided will stay as they were before. 25 | 26 | :param id: the name or id of the harvest source to update 27 | :type id: string 28 | :param url: the URL for the harvest source 29 | :type url: string 30 | :param name: the name of the new harvest source, must be between 2 and 100 31 | characters long and contain only lowercase alphanumeric characters 32 | :type name: string 33 | :param title: the title of the dataset (optional, default: same as 34 | ``name``) 35 | :type title: string 36 | :param notes: a description of the harvest source (optional) 37 | :type notes: string 38 | :param source_type: the harvester type for this source. This must be one 39 | of the registered harvesters, eg 'ckan', 'csw', etc. 40 | :type source_type: string 41 | :param frequency: the frequency in which this harvester should run.
See 42 | ``ckanext.harvest.model`` source for possible values. Default is 43 | 'MANUAL' 44 | :type frequency: string 45 | :param config: extra configuration options for the particular harvester 46 | type. Should be a serialized as JSON. (optional) 47 | :type config: string 48 | 49 | :returns: the updated harvest source 50 | :rtype: dictionary 51 | ''' 52 | log.info('Patch harvest source: %r', data_dict) 53 | 54 | data_dict['type'] = DATASET_TYPE_NAME 55 | 56 | context['extras_as_string'] = True 57 | try: 58 | source = get_action('package_patch')(context, data_dict) 59 | except KeyError: 60 | raise Exception('The harvest_source_patch action is not available on ' 61 | 'this version of CKAN') 62 | 63 | return source 64 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/auth/__init__.py: -------------------------------------------------------------------------------- 1 | from ckan.plugins import toolkit as pt 2 | from ckanext.harvest import model as harvest_model 3 | 4 | 5 | def user_is_sysadmin(context): 6 | ''' 7 | Checks if the user defined in the context is a sysadmin 8 | 9 | rtype: boolean 10 | ''' 11 | model = context['model'] 12 | user = context['user'] 13 | user_obj = model.User.get(user) 14 | if not user_obj: 15 | raise pt.Objectpt.ObjectNotFound('User {0} not found').format(user) 16 | 17 | return user_obj.sysadmin 18 | 19 | 20 | def _get_object(context, data_dict, name, class_name): 21 | ''' 22 | return the named item if in the data_dict, or get it from 23 | model.class_name 24 | ''' 25 | if name not in context: 26 | id = data_dict.get('id', None) 27 | obj = getattr(harvest_model, class_name).get(id) 28 | if not obj: 29 | raise pt.ObjectNotFound 30 | else: 31 | obj = context[name] 32 | return obj 33 | 34 | 35 | def get_source_object(context, data_dict={}): 36 | return _get_object(context, data_dict, 'source', 'HarvestSource') 37 | 38 | 39 | def get_job_object(context, data_dict={}): 40 | return 
_get_object(context, data_dict, 'job', 'HarvestJob') 41 | 42 | 43 | def get_obj_object(context, data_dict={}): 44 | return _get_object(context, data_dict, 'obj', 'HarvestObject') 45 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/auth/create.py: -------------------------------------------------------------------------------- 1 | from ckan.plugins import toolkit as pt 2 | from ckanext.harvest.logic.auth import user_is_sysadmin 3 | 4 | 5 | def harvest_source_create(context, data_dict): 6 | ''' 7 | Authorization check for harvest source creation 8 | 9 | It forwards the checks to package_create, which will check for 10 | organization membership, whether if sysadmin, etc according to the 11 | instance configuration. 12 | ''' 13 | user = context.get('user') 14 | try: 15 | pt.check_access('package_create', context, data_dict) 16 | return {'success': True} 17 | except pt.NotAuthorized: 18 | return {'success': False, 19 | 'msg': pt._('User {0} not authorized to create harvest sources').format(user)} 20 | 21 | 22 | def harvest_job_create(context, data_dict): 23 | ''' 24 | Authorization check for harvest job creation 25 | 26 | It forwards the checks to package_update, ie the user can only create 27 | new jobs if she is allowed to edit the harvest source dataset. 
28 | ''' 29 | model = context['model'] 30 | source_id = data_dict['source_id'] 31 | 32 | pkg = model.Package.get(source_id) 33 | if not pkg: 34 | raise pt.ObjectNotFound(pt._('Harvest source not found')) 35 | 36 | context['package'] = pkg 37 | try: 38 | pt.check_access('package_update', context, {"id": source_id}) 39 | return {'success': True} 40 | except pt.NotAuthorized: 41 | return {'success': False, 42 | 'msg': pt._('User not authorized to create a job for source {0}').format(source_id)} 43 | 44 | 45 | def harvest_job_create_all(context, data_dict): 46 | ''' 47 | Authorization check for creating new jobs for all sources 48 | 49 | Only sysadmins can do it 50 | ''' 51 | if not user_is_sysadmin(context): 52 | return {'success': False, 'msg': pt._('Only sysadmins can create harvest jobs for all sources')} 53 | else: 54 | return {'success': True} 55 | 56 | 57 | def harvest_object_create(context, data_dict): 58 | """ 59 | Auth check for creating a harvest object 60 | 61 | only the sysadmins can create harvest objects 62 | """ 63 | # sysadmins can run all actions if we've got to this point we're not a sysadmin 64 | return {'success': False, 'msg': pt._('Only the sysadmins can create harvest objects')} 65 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/auth/delete.py: -------------------------------------------------------------------------------- 1 | from ckan.plugins import toolkit as pt 2 | 3 | 4 | def harvest_source_delete(context, data_dict): 5 | ''' 6 | Authorization check for harvest source deletion 7 | 8 | It forwards the checks to package_delete, which will check for 9 | organization membership, whether if sysadmin, etc according to the 10 | instance configuration. 
11 | ''' 12 | model = context.get('model') 13 | user = context.get('user') 14 | source_id = data_dict['id'] 15 | 16 | pkg = model.Package.get(source_id) 17 | if not pkg: 18 | raise pt.ObjectNotFound(pt._('Harvest source not found')) 19 | 20 | context['package'] = pkg 21 | 22 | try: 23 | pt.check_access('package_delete', context, data_dict) 24 | return {'success': True} 25 | except pt.NotAuthorized: 26 | return {'success': False, 27 | 'msg': pt._('User {0} not authorized to delete harvest source {1}').format(user, source_id)} 28 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/auth/get.py: -------------------------------------------------------------------------------- 1 | from ckan.plugins import toolkit as pt 2 | 3 | from ckanext.harvest.logic.auth import get_job_object 4 | 5 | 6 | def auth_allow_anonymous_access(auth_function): 7 | ''' 8 | Local version of the auth_allow_anonymous_access decorator that only 9 | calls the actual toolkit decorator if the CKAN version supports it 10 | ''' 11 | if pt.check_ckan_version(min_version='2.2'): 12 | auth_function = pt.auth_allow_anonymous_access(auth_function) 13 | 14 | return auth_function 15 | 16 | 17 | @auth_allow_anonymous_access 18 | def harvest_source_show(context, data_dict): 19 | ''' 20 | Authorization check for getting the details of a harvest source 21 | 22 | It forwards the checks to package_show, which will check for 23 | organization membership, whether if sysadmin, etc according to the 24 | instance configuration. 
25 | ''' 26 | model = context.get('model') 27 | user = context.get('user') 28 | source_id = data_dict['id'] 29 | 30 | pkg = model.Package.get(source_id) 31 | if not pkg: 32 | raise pt.ObjectNotFound(pt._('Harvest source not found')) 33 | 34 | context['package'] = pkg 35 | 36 | try: 37 | pt.check_access('package_show', context, data_dict) 38 | return {'success': True} 39 | except pt.NotAuthorized: 40 | return {'success': False, 41 | 'msg': pt._('User {0} not authorized to read harvest source {1}') 42 | .format(user, source_id)} 43 | 44 | 45 | @auth_allow_anonymous_access 46 | def harvest_source_show_status(context, data_dict): 47 | ''' 48 | Authorization check for getting the status of a harvest source 49 | 50 | It forwards the checks to harvest_source_show. 51 | ''' 52 | return harvest_source_show(context, data_dict) 53 | 54 | 55 | @auth_allow_anonymous_access 56 | def harvest_source_list(context, data_dict): 57 | ''' 58 | Authorization check for getting a list of harveste sources 59 | 60 | Everybody can do it 61 | ''' 62 | return {'success': True} 63 | 64 | 65 | def harvest_job_show(context, data_dict): 66 | ''' 67 | Authorization check for getting the details of a harvest job 68 | 69 | It forwards the checks to harvest_source_update, ie if the user can 70 | update the parent source (eg create new jobs), she can get the details 71 | for the job, including the reports 72 | ''' 73 | user = context.get('user') 74 | job = get_job_object(context, data_dict) 75 | 76 | try: 77 | pt.check_access('harvest_source_update', 78 | context, 79 | {'id': job.source.id}) 80 | return {'success': True} 81 | except pt.NotAuthorized: 82 | return {'success': False, 83 | 'msg': pt._('User {0} not authorized to see jobs from source {1}') 84 | .format(user, job.source.id)} 85 | 86 | 87 | def harvest_job_list(context, data_dict): 88 | ''' 89 | Authorization check for getting a list of jobs for a source 90 | 91 | It forwards the checks to harvest_source_update, ie if the user can 92 | update 
the parent source (eg create new jobs), she can get the list of 93 | jobs 94 | ''' 95 | user = context.get('user') 96 | source_id = data_dict['source_id'] 97 | 98 | try: 99 | pt.check_access('harvest_source_update', 100 | context, 101 | {'id': source_id}) 102 | return {'success': True} 103 | except pt.NotAuthorized: 104 | return {'success': False, 105 | 'msg': pt._('User {0} not authorized to list jobs for source {1}') 106 | .format(user, source_id)} 107 | 108 | 109 | @auth_allow_anonymous_access 110 | def harvest_object_show(context, data_dict): 111 | ''' 112 | Authorization check for getting the contents of a harvest object 113 | 114 | Everybody can do it 115 | ''' 116 | return {'success': True} 117 | 118 | 119 | def harvest_object_list(context, data_dict): 120 | ''' 121 | TODO: remove 122 | ''' 123 | return {'success': True} 124 | 125 | 126 | @auth_allow_anonymous_access 127 | def harvesters_info_show(context, data_dict): 128 | ''' 129 | Authorization check for getting information about the available 130 | harvesters 131 | 132 | Everybody can do it 133 | ''' 134 | return {'success': True} 135 | 136 | 137 | def harvest_get_notifications_recipients(context, data_dict): 138 | # Only sysadmins can access this 139 | return {'success': False} 140 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/auth/patch.py: -------------------------------------------------------------------------------- 1 | import ckanext.harvest.logic.auth.update as _update 2 | 3 | harvest_source_patch = _update.harvest_source_update 4 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/auth/update.py: -------------------------------------------------------------------------------- 1 | from ckan.plugins import toolkit as pt 2 | from ckanext.harvest.logic.auth import user_is_sysadmin 3 | 4 | 5 | def harvest_source_update(context, data_dict): 6 | ''' 7 | Authorization check for harvest 
source update 8 | 9 | It forwards the checks to package_update, which will check for 10 | organization membership, whether if sysadmin, etc according to the 11 | instance configuration. 12 | ''' 13 | model = context.get('model') 14 | user = context.get('user') 15 | source_id = data_dict['id'] 16 | 17 | pkg = model.Package.get(source_id) 18 | if not pkg: 19 | raise pt.ObjectNotFound(pt._('Harvest source not found')) 20 | 21 | context['package'] = pkg 22 | 23 | try: 24 | pt.check_access('package_update', context, data_dict) 25 | return {'success': True} 26 | except pt.NotAuthorized: 27 | return {'success': False, 28 | 'msg': pt._('User {0} not authorized to update harvest source {1}').format(user, source_id)} 29 | 30 | 31 | def harvest_sources_clear(context, data_dict): 32 | ''' 33 | Authorization check for clearing history for all harvest sources 34 | 35 | Only sysadmins can do it 36 | ''' 37 | if not user_is_sysadmin(context): 38 | return {'success': False, 'msg': pt._('Only sysadmins can clear history for all harvest jobs')} 39 | else: 40 | return {'success': True} 41 | 42 | 43 | def harvest_source_clear(context, data_dict): 44 | ''' 45 | Authorization check for clearing a harvest source 46 | 47 | It forwards to harvest_source_update 48 | ''' 49 | return harvest_source_update(context, data_dict) 50 | 51 | 52 | def harvest_objects_import(context, data_dict): 53 | ''' 54 | Authorization check reimporting all harvest objects 55 | 56 | Only sysadmins can do it 57 | ''' 58 | if not user_is_sysadmin(context): 59 | return {'success': False, 'msg': pt._('Only sysadmins can reimport all harvest objects')} 60 | else: 61 | return {'success': True} 62 | 63 | 64 | def harvest_jobs_run(context, data_dict): 65 | ''' 66 | Authorization check for running the pending harvest jobs 67 | 68 | Only sysadmins can do it 69 | ''' 70 | if not user_is_sysadmin(context): 71 | return {'success': False, 'msg': pt._('Only sysadmins can run the pending harvest jobs')} 72 | else: 73 | return 
{'success': True} 74 | 75 | 76 | def harvest_send_job_to_gather_queue(context, data_dict): 77 | ''' 78 | Authorization check for sending a job to the gather queue 79 | 80 | It forwards the checks to harvest_job_create, ie the user can only run 81 | the job if she is allowed to create the job. 82 | ''' 83 | from ckanext.harvest.logic.auth.create import harvest_job_create 84 | return harvest_job_create(context, data_dict) 85 | 86 | 87 | def harvest_job_abort(context, data_dict): 88 | ''' 89 | Authorization check for aborting a running harvest job 90 | 91 | Same permissions as running one 92 | ''' 93 | return harvest_jobs_run(context, data_dict) 94 | 95 | 96 | def harvest_sources_reindex(context, data_dict): 97 | ''' 98 | Authorization check for reindexing all harvest sources 99 | 100 | Only sysadmins can do it 101 | ''' 102 | if not user_is_sysadmin(context): 103 | return {'success': False, 'msg': pt._('Only sysadmins can reindex all harvest sources')} 104 | else: 105 | return {'success': True} 106 | 107 | 108 | def harvest_source_reindex(context, data_dict): 109 | ''' 110 | Authorization check for reindexing a harvest source 111 | 112 | It forwards to harvest_source_update 113 | ''' 114 | return harvest_source_update(context, data_dict) 115 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/dictization.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import func, text 2 | 3 | from ckan.model import Group 4 | from ckan import logic 5 | from ckanext.harvest.model import (HarvestJob, HarvestObject, 6 | HarvestGatherError, HarvestObjectError) 7 | 8 | 9 | def harvest_source_dictize(source, context, last_job_status=False): 10 | out = source.as_dict() 11 | 12 | out['publisher_title'] = u'' 13 | 14 | publisher_id = out.get('publisher_id') 15 | if publisher_id: 16 | group = Group.get(publisher_id) 17 | if group: 18 | out['publisher_title'] = group.title 19 | 20 | 
out['status'] = _get_source_status(source, context) 21 | 22 | if last_job_status: 23 | source_status = logic.get_action('harvest_source_show_status')(context, {'id': source.id}) 24 | out['last_job_status'] = source_status.get('last_job', {}) 25 | 26 | return out 27 | 28 | 29 | def harvest_job_dictize(job, context): 30 | out = job.as_dict() 31 | 32 | model = context['model'] 33 | 34 | if context.get('return_stats', True): 35 | stats = model.Session.query( 36 | HarvestObject.report_status, 37 | func.count(HarvestObject.id).label('total_objects'))\ 38 | .filter_by(harvest_job_id=job.id)\ 39 | .group_by(HarvestObject.report_status).all() 40 | out['stats'] = {'added': 0, 'updated': 0, 'not modified': 0, 41 | 'errored': 0, 'deleted': 0} 42 | for status, count in stats: 43 | out['stats'][status] = count 44 | 45 | # We actually want to check which objects had errors, because they 46 | # could have been added/updated anyway (eg bbox errors) 47 | count = model.Session.query( 48 | func.distinct(HarvestObjectError.harvest_object_id)) \ 49 | .join(HarvestObject) \ 50 | .filter(HarvestObject.harvest_job_id == job.id) \ 51 | .count() 52 | if count > 0: 53 | out['stats']['errored'] = count 54 | 55 | # Add gather errors to the error count 56 | count = model.Session.query(HarvestGatherError) \ 57 | .filter(HarvestGatherError.harvest_job_id == job.id) \ 58 | .count() 59 | if count > 0: 60 | out['stats']['errored'] = out['stats'].get('errored', 0) + count 61 | 62 | if context.get('return_error_summary', True): 63 | q = model.Session.query( 64 | HarvestObjectError.message, 65 | func.count(HarvestObjectError.message).label('error_count')) \ 66 | .join(HarvestObject) \ 67 | .filter(HarvestObject.harvest_job_id == job.id) \ 68 | .group_by(HarvestObjectError.message) \ 69 | .order_by(text('error_count desc')) \ 70 | .limit(context.get('error_summmary_limit', 20)) 71 | out['object_error_summary'] = harvest_error_dictize(q.all(), context) 72 | q = model.Session.query( 73 | 
HarvestGatherError.message, 74 | func.count(HarvestGatherError.message).label('error_count')) \ 75 | .filter(HarvestGatherError.harvest_job_id == job.id) \ 76 | .group_by(HarvestGatherError.message) \ 77 | .order_by(text('error_count desc')) \ 78 | .limit(context.get('error_summmary_limit', 20)) 79 | out['gather_error_summary'] = harvest_error_dictize(q.all(), context) 80 | 81 | return out 82 | 83 | 84 | def harvest_object_dictize(obj, context): 85 | out = obj.as_dict() 86 | out['source'] = obj.harvest_source_id 87 | out['job'] = obj.harvest_job_id 88 | 89 | if obj.package: 90 | out['package'] = obj.package.id 91 | 92 | out['errors'] = [] 93 | for error in obj.errors: 94 | out['errors'].append(error.as_dict()) 95 | 96 | out['extras'] = {} 97 | for extra in obj.extras: 98 | out['extras'][extra.key] = extra.value 99 | 100 | return out 101 | 102 | 103 | def harvest_log_dictize(obj, context): 104 | out = obj.as_dict() 105 | del out['id'] 106 | 107 | return out 108 | 109 | 110 | def harvest_error_dictize(obj, context): 111 | out = [] 112 | for elem in obj: 113 | out.append(elem._asdict()) 114 | return out 115 | 116 | 117 | def _get_source_status(source, context): 118 | ''' 119 | TODO: Deprecated, use harvest_source_show_status instead 120 | ''' 121 | 122 | out = dict() 123 | 124 | job_count = HarvestJob.filter(source=source).count() 125 | 126 | out = { 127 | 'job_count': 0, 128 | 'next_harvest': '', 129 | 'last_harvest_request': '', 130 | } 131 | 132 | if not job_count: 133 | out['msg'] = 'No jobs yet' 134 | return out 135 | else: 136 | out['job_count'] = job_count 137 | 138 | # Get next scheduled job 139 | next_job = HarvestJob.filter(source=source, status=u'New').first() 140 | if next_job: 141 | out['next_harvest'] = 'Scheduled' 142 | else: 143 | out['next_harvest'] = 'Not yet scheduled' 144 | 145 | # Get the last finished job 146 | last_job = HarvestJob.filter(source=source, status=u'Finished') \ 147 | .order_by(HarvestJob.created.desc()).first() 148 | 149 | if 
last_job: 150 | out['last_harvest_request'] = str(last_job.gather_finished) 151 | else: 152 | out['last_harvest_request'] = 'Not yet harvested' 153 | 154 | return out 155 | -------------------------------------------------------------------------------- /ckanext/harvest/logic/schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import ckan.plugins.toolkit as tk 3 | 4 | from ckan.logic.schema import default_extras_schema 5 | from ckan.logic.validators import (package_id_exists, 6 | name_validator, 7 | owner_org_validator, 8 | package_name_validator, 9 | boolean_validator, 10 | ) 11 | from ckan.logic.converters import convert_to_extras, convert_from_extras 12 | from ckantoolkit import unicode_safe 13 | 14 | from ckanext.harvest.logic.validators import ( 15 | harvest_source_url_validator, 16 | harvest_source_type_exists, 17 | harvest_source_config_validator, 18 | harvest_source_extra_validator, 19 | harvest_source_frequency_exists, 20 | dataset_type_exists, 21 | harvest_source_convert_from_config, 22 | harvest_source_id_exists, 23 | harvest_job_exists, 24 | harvest_object_extras_validator, 25 | ) 26 | ignore_missing = tk.get_validator("ignore_missing") 27 | not_empty = tk.get_validator("not_empty") 28 | ignore = tk.get_validator("ignore") 29 | if_empty_same_as = tk.get_validator("if_empty_same_as") 30 | 31 | 32 | def harvest_source_schema(): 33 | 34 | schema = { 35 | 'id': [ignore_missing, unicode_safe, package_id_exists], 36 | 'type': [dataset_type_exists, unicode_safe], 37 | 'url': [not_empty, unicode_safe, harvest_source_url_validator], 38 | 'name': [not_empty, unicode_safe, name_validator, package_name_validator], 39 | 'source_type': [not_empty, unicode_safe, harvest_source_type_exists, convert_to_extras], 40 | 'title': [if_empty_same_as("name"), unicode_safe], 41 | 'notes': [ignore_missing, unicode_safe], 42 | 'owner_org': [owner_org_validator, unicode_safe], 43 | 'private': [ignore_missing, 
boolean_validator], 44 | 'organization': [ignore_missing], 45 | 'frequency': [ignore_missing, unicode_safe, harvest_source_frequency_exists, convert_to_extras], 46 | 'state': [ignore_missing], 47 | 'config': [ignore_missing, harvest_source_config_validator, convert_to_extras], 48 | 'extras': default_extras_schema(), 49 | } 50 | 51 | extras_schema = default_extras_schema() 52 | extras_schema['__extras'] = [ignore] 53 | 54 | schema['extras'] = extras_schema 55 | 56 | return schema 57 | 58 | 59 | def harvest_source_create_package_schema(): 60 | 61 | schema = harvest_source_schema() 62 | schema['__extras'] = [harvest_source_extra_validator] 63 | schema['save'] = [ignore] 64 | schema.pop("id") 65 | 66 | return schema 67 | 68 | 69 | def harvest_source_update_package_schema(): 70 | 71 | schema = harvest_source_create_package_schema() 72 | schema['owner_org'] = [ignore_missing, owner_org_validator, unicode_safe] 73 | return schema 74 | 75 | 76 | def harvest_source_show_package_schema(): 77 | 78 | schema = harvest_source_schema() 79 | schema.update({ 80 | 'source_type': [convert_from_extras, ignore_missing], 81 | 'frequency': [convert_from_extras, ignore_missing], 82 | 'config': [convert_from_extras, harvest_source_convert_from_config, ignore_missing], 83 | 'metadata_created': [], 84 | 'metadata_modified': [], 85 | 'owner_org': [], 86 | 'creator_user_id': [], 87 | 'organization': [], 88 | 'notes': [], 89 | 'revision_id': [ignore_missing], 90 | 'revision_timestamp': [ignore_missing], 91 | 'tracking_summary': [ignore_missing], 92 | }) 93 | 94 | schema['__extras'] = [ignore] 95 | 96 | return schema 97 | 98 | 99 | def harvest_object_create_schema(): 100 | schema = { 101 | 'guid': [ignore_missing, unicode_safe], 102 | 'content': [ignore_missing, unicode_safe], 103 | 'state': [ignore_missing, unicode_safe], 104 | 'job_id': [harvest_job_exists], 105 | 'source_id': [ignore_missing, harvest_source_id_exists], 106 | 'package_id': [ignore_missing, package_id_exists], 107 | 'extras': 
def harvest_source_id_exists(value, context):
    '''Validate that ``value`` identifies an existing harvest source.

    Returns ``value`` unchanged when ``HarvestSource.get`` finds a match and
    raises ``Invalid`` otherwise. ``context`` is accepted but not used.
    '''
    if HarvestSource.get(value):
        return value
    raise Invalid('Harvest Source with id %r does not exist.' % str(value))
% str(value)) 35 | return result 36 | 37 | 38 | def _normalize_url(url): 39 | '''Strips off parameters off a URL, and an unnecessary port number, so that 40 | simple variations on a URL are ignored, to used to help avoid getting two 41 | harvesters for the same URL.''' 42 | o = urlparse(url) 43 | 44 | # Normalize port 45 | if ':' in o.netloc: 46 | parts = o.netloc.split(':') 47 | if (o.scheme == 'http' and parts[1] == '80') or \ 48 | (o.scheme == 'https' and parts[1] == '443'): 49 | netloc = parts[0] 50 | else: 51 | netloc = ':'.join(parts) 52 | else: 53 | netloc = o.netloc 54 | 55 | # Remove trailing slash 56 | path = o.path.rstrip('/') 57 | 58 | check_url = urlunparse(( 59 | o.scheme, 60 | netloc, 61 | path, 62 | None, None, None)) 63 | 64 | return check_url 65 | 66 | 67 | def harvest_source_url_validator(key, data, errors, context): 68 | '''Validate the provided harvest source URL 69 | 70 | Checks that the URL & config combination are unique to this HarvestSource. 71 | ''' 72 | 73 | package = context.get('package') 74 | 75 | if package: 76 | package_id = package.id 77 | else: 78 | package_id = data.get(key[:-1] + ('id',)) 79 | 80 | try: 81 | new_config = data.get(key[:-1] + ('config',)) 82 | except Exception: 83 | new_config = None 84 | 85 | new_url = _normalize_url(data[key]) 86 | 87 | q = model.Session.query(model.Package.id, model.Package.url) \ 88 | .filter(model.Package.type == DATASET_TYPE_NAME) 89 | 90 | if package_id: 91 | # When editing a source we need to avoid its own URL 92 | q = q.filter(model.Package.id != package_id) 93 | 94 | existing_sources = q.all() 95 | 96 | for id_, url in existing_sources: 97 | url = _normalize_url(url) 98 | conf = model.Session.query(HarvestSource.config).filter( 99 | HarvestSource.id == id_).first() 100 | if conf: 101 | conf = conf[0] 102 | else: 103 | conf = None 104 | 105 | if url == new_url and conf == new_config: 106 | raise Invalid('There already is a Harvest Source for this URL (& ' 107 | 'config): url=%s config=%s' 
def harvest_source_config_validator(key, data, errors, context):
    '''Run the matching harvester's ``validate_config`` on the config value.

    The harvester is selected by comparing the ``name`` in its ``info()``
    response with the ``source_type`` found in ``data``. If that harvester
    defines ``validate_config``, it is called with ``data[key]``; a
    non-``None`` return value replaces ``data[key]`` so that an edited
    config is stored for use during the harvest.

    Raises ``Invalid`` when ``validate_config`` raises any exception.
    No value is returned for this sort of validator/converter.
    '''
    harvester_type = data.get(('source_type',), '')
    for harvester in PluginImplementations(IHarvester):
        # Same tolerance as harvest_source_type_exists: a plugin that does
        # not provide usable info is skipped instead of aborting validation.
        try:
            info = harvester.info()
        except AttributeError:
            continue
        if not info or 'name' not in info:
            continue
        if info['name'] != harvester_type:
            continue
        if not hasattr(harvester, 'validate_config'):
            continue

        try:
            config = harvester.validate_config(data[key])
        except Exception as e:
            # Any failure inside the harvester is reported to the user as a
            # validation error; the original exception is kept as the cause.
            raise Invalid('Error parsing the configuration options: %s'
                          % e) from e
        if config is not None:
            # save an edited config, for use during the harvest
            data[key] = config
def harvest_source_convert_from_config(key, data, errors, context):
    '''Expand the JSON config stored at ``data[key]`` into top-level fields.

    Every member of the decoded JSON object is copied to ``data[(name,)]``.
    An empty config is left untouched. If the config is not valid JSON, or
    decodes to something other than an object, an error is logged and
    ``data[key]`` is set to ``None``.
    '''
    config = data[key]
    if not config:
        return

    try:
        config_dict = json.loads(config)
    except ValueError:
        log.error('Wrong JSON provided config, skipping')
        data[key] = None
        return

    # Valid JSON is not necessarily an object ("[1]", "5", "null"); only a
    # dict has members that can be promoted to fields.
    if not isinstance(config_dict, dict):
        log.error('Config JSON is not an object, skipping')
        data[key] = None
        return

    # A separate loop variable keeps the ``key`` parameter intact.
    for option, value in config_dict.items():
        data[(option,)] = value
12 | # string value is passed to dateutil.tz.gettz() 13 | # leave blank for localtime 14 | # timezone = 15 | 16 | # max length of characters to apply to the 17 | # "slug" field 18 | #truncate_slug_length = 40 19 | 20 | # set to 'true' to run the environment during 21 | # the 'revision' command, regardless of autogenerate 22 | # revision_environment = false 23 | 24 | # set to 'true' to allow .pyc and .pyo files without 25 | # a source .py file to be detected as revisions in the 26 | # versions/ directory 27 | # sourceless = false 28 | 29 | # version location specification; this defaults 30 | # to /home/sergey/projects/core/ckanext-harvest/ckanext/harvest/migration/harvest/versions. When using multiple version 31 | # directories, initial revisions must be specified with --version-path 32 | # version_locations = %(here)s/bar %(here)s/bat /home/sergey/projects/core/ckanext-harvest/ckanext/harvest/migration/harvest/versions 33 | 34 | # the output encoding used when revision files 35 | # are written from script.py.mako 36 | # output_encoding = utf-8 37 | 38 | sqlalchemy.url = driver://user:pass@localhost/dbname 39 | 40 | 41 | # Logging configuration 42 | [loggers] 43 | keys = root,sqlalchemy,alembic 44 | 45 | [handlers] 46 | keys = console 47 | 48 | [formatters] 49 | keys = generic 50 | 51 | [logger_root] 52 | level = WARN 53 | handlers = console 54 | qualname = 55 | 56 | [logger_sqlalchemy] 57 | level = WARN 58 | handlers = 59 | qualname = sqlalchemy.engine 60 | 61 | [logger_alembic] 62 | level = INFO 63 | handlers = 64 | qualname = alembic 65 | 66 | [handler_console] 67 | class = StreamHandler 68 | args = (sys.stderr,) 69 | level = NOTSET 70 | formatter = generic 71 | 72 | [formatter_generic] 73 | format = %(levelname)-5.5s [%(name)s] %(message)s 74 | datefmt = %H:%M:%S 75 | -------------------------------------------------------------------------------- /ckanext/harvest/migration/harvest/env.py: 
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    The context is configured from the ``sqlalchemy.url`` option alone
    rather than from an Engine, so no DBAPI needs to be available.
    Calls to context.execute() emit the given string to the script output.

    """
    settings = {
        "url": config.get_main_option(u"sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        # Version table is named after the directory containing this file.
        "version_table": u'{}_alembic_version'.format(name),
    }
    context.configure(**settings)

    with context.begin_transaction():
        context.run_migrations()
60 | 61 | """ 62 | connectable = engine_from_config( 63 | config.get_section(config.config_ini_section), 64 | prefix=u'sqlalchemy.', 65 | poolclass=pool.NullPool) 66 | 67 | with connectable.connect() as connection: 68 | context.configure( 69 | connection=connection, 70 | target_metadata=target_metadata, 71 | version_table=u'{}_alembic_version'.format(name) 72 | ) 73 | 74 | with context.begin_transaction(): 75 | context.run_migrations() 76 | 77 | 78 | if context.is_offline_mode(): 79 | run_migrations_offline() 80 | else: 81 | run_migrations_online() 82 | -------------------------------------------------------------------------------- /ckanext/harvest/migration/harvest/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /ckanext/harvest/migration/harvest/versions/3b4894672727_create_harvest_tables.py: -------------------------------------------------------------------------------- 1 | """create harvest tables 2 | 3 | Revision ID: 3b4894672727 4 | Revises: 5 | Create Date: 2023-11-02 15:53:02.262586 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = "3b4894672727" 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | engine = op.get_bind() 21 | inspector = sa.inspect(engine) 22 | tables = inspector.get_table_names() 23 | if "harvest_source" not in tables: 24 | op.create_table( 25 | "harvest_source", 26 | sa.Column("id", sa.UnicodeText, primary_key=True), 27 | sa.Column("url", sa.UnicodeText, nullable=False), 28 | sa.Column("title", sa.UnicodeText), 29 | sa.Column("description", sa.UnicodeText), 30 | sa.Column("config", sa.UnicodeText), 31 | sa.Column("created", sa.DateTime), 32 | sa.Column("type", sa.UnicodeText, nullable=False), 33 | sa.Column("active", sa.Boolean), 34 | sa.Column("user_id", sa.UnicodeText), 35 | sa.Column("publisher_id", sa.UnicodeText), 36 | sa.Column("frequency", sa.UnicodeText), 37 | sa.Column("next_run", sa.DateTime), 38 | ) 39 | 40 | if "harvest_job" not in tables: 41 | op.create_table( 42 | "harvest_job", 43 | sa.Column("id", sa.UnicodeText, primary_key=True), 44 | sa.Column("created", sa.DateTime), 45 | sa.Column("gather_started", sa.DateTime), 46 | sa.Column("gather_finished", sa.DateTime), 47 | sa.Column("finished", sa.DateTime), 48 | sa.Column( 49 | "source_id", 50 | sa.UnicodeText, 51 | sa.ForeignKey("harvest_source.id"), 52 | ), 53 | sa.Column("status", sa.UnicodeText, nullable=False), 54 | ) 55 | 56 | if "harvest_object" not in tables: 57 | op.create_table( 58 | "harvest_object", 59 | sa.Column("id", sa.UnicodeText, primary_key=True), 60 | sa.Column("guid", sa.UnicodeText), 61 | sa.Column("current", sa.Boolean), 62 | sa.Column("gathered", sa.DateTime), 63 | sa.Column("fetch_started", sa.DateTime), 64 | sa.Column("content", sa.UnicodeText, nullable=True), 65 | sa.Column("fetch_finished", sa.DateTime), 66 | sa.Column("import_started", sa.DateTime), 67 | sa.Column("import_finished", sa.DateTime), 68 | sa.Column("state", sa.UnicodeText), 69 | sa.Column("metadata_modified_date", sa.DateTime), 70 | 
sa.Column("retry_times", sa.Integer), 71 | sa.Column( 72 | "harvest_job_id", 73 | sa.UnicodeText, 74 | sa.ForeignKey("harvest_job.id"), 75 | ), 76 | sa.Column( 77 | "harvest_source_id", 78 | sa.UnicodeText, 79 | sa.ForeignKey("harvest_source.id"), 80 | ), 81 | sa.Column( 82 | "package_id", 83 | sa.UnicodeText, 84 | sa.ForeignKey("package.id", deferrable=True), 85 | nullable=True, 86 | ), 87 | sa.Column("report_status", sa.UnicodeText, nullable=True), 88 | ) 89 | 90 | index_names = [index["name"] for index in inspector.get_indexes("harvest_object")] 91 | if "harvest_job_id_idx" not in index_names: 92 | op.create_index("harvest_job_id_idx", "harvest_object", ["harvest_job_id"]) 93 | 94 | if "harvest_source_id_idx" not in index_names: 95 | op.create_index( 96 | "harvest_source_id_idx", "harvest_object", ["harvest_source_id"] 97 | ) 98 | 99 | if "package_id_idx" not in index_names: 100 | op.create_index("package_id_idx", "harvest_object", ["package_id"]) 101 | 102 | if "guid_idx" not in index_names: 103 | op.create_index("guid_idx", "harvest_object", ["guid"]) 104 | 105 | if "harvest_object_extra" not in tables: 106 | op.create_table( 107 | "harvest_object_extra", 108 | sa.Column("id", sa.UnicodeText, primary_key=True), 109 | sa.Column( 110 | "harvest_object_id", 111 | sa.UnicodeText, 112 | sa.ForeignKey("harvest_object.id"), 113 | ), 114 | sa.Column("key", sa.UnicodeText), 115 | sa.Column("value", sa.UnicodeText), 116 | ) 117 | 118 | index_names = [ 119 | index["name"] for index in inspector.get_indexes("harvest_object_extra") 120 | ] 121 | if "harvest_object_id_idx" not in index_names: 122 | op.create_index( 123 | "harvest_object_id_idx", "harvest_object_extra", ["harvest_object_id"] 124 | ) 125 | 126 | if "harvest_gather_error" not in tables: 127 | op.create_table( 128 | "harvest_gather_error", 129 | sa.Column("id", sa.UnicodeText, primary_key=True), 130 | sa.Column( 131 | "harvest_job_id", 132 | sa.UnicodeText, 133 | sa.ForeignKey("harvest_job.id"), 134 | ), 135 | 
def downgrade():
    """Drop every harvest table and the ``log_level`` enum type.

    ``harvest_log`` is dropped first, then the enum used by its ``level``
    column, then the remaining tables in the order listed below.
    """
    op.drop_table("harvest_log")
    sa.Enum(name="log_level").drop(op.get_bind())

    remaining_tables = (
        "harvest_object_error",
        "harvest_gather_error",
        "harvest_object_extra",
        "harvest_object",
        "harvest_job",
        "harvest_source",
    )
    for table_name in remaining_tables:
        op.drop_table(table_name)
def _recreate_fk(ondelete):
    """Drop and re-create every harvest foreign key with ``ondelete``.

    ``upgrade`` passes ``"CASCADE"`` and ``downgrade`` passes ``None``.
    Each constraint is dropped and immediately re-created under the same
    name, always referencing the ``id`` column of the target table.
    """
    # (constraint name, table, referenced table, column, extra options)
    foreign_keys = (
        ("harvest_job_source_id_fkey",
         "harvest_job", "harvest_source", "source_id", {}),
        ("harvest_object_harvest_job_id_fkey",
         "harvest_object", "harvest_job", "harvest_job_id", {}),
        ("harvest_object_harvest_source_id_fkey",
         "harvest_object", "harvest_source", "harvest_source_id", {}),
        ("harvest_object_package_id_fkey",
         "harvest_object", "package", "package_id", {"deferrable": True}),
        ("harvest_object_extra_harvest_object_id_fkey",
         "harvest_object_extra", "harvest_object", "harvest_object_id", {}),
        ("harvest_gather_error_harvest_job_id_fkey",
         "harvest_gather_error", "harvest_job", "harvest_job_id", {}),
        ("harvest_object_error_harvest_object_id_fkey",
         "harvest_object_error", "harvest_object", "harvest_object_id", {}),
    )

    for constraint, table, referent, column, options in foreign_keys:
        op.drop_constraint(constraint, table)
        op.create_foreign_key(
            constraint,
            table,
            referent,
            [column],
            ["id"],
            ondelete=ondelete,
            **options
        )
@classmethod 54 | def filter(cls, **kwds): 55 | query = Session.query(cls).autoflush(False) 56 | return query.filter_by(**kwds) 57 | 58 | 59 | class HarvestSource(BaseModel, HarvestDomainObject): 60 | """A Harvest Source is essentially a URL plus some other metadata. 61 | It must have a type (e.g. CSW) and can have a status of "active" 62 | or "inactive". The harvesting processes are not fired on inactive 63 | sources. 64 | """ 65 | 66 | __tablename__ = "harvest_source" 67 | 68 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 69 | url = Column(types.UnicodeText, nullable=False) 70 | title = Column(types.UnicodeText, default="") 71 | description = Column(types.UnicodeText, default="") 72 | config = Column(types.UnicodeText, default="") 73 | created = Column(types.DateTime, default=datetime.datetime.utcnow) 74 | type = Column(types.UnicodeText, nullable=False) 75 | active = Column(types.Boolean, default=True) 76 | user_id = Column(types.UnicodeText, default="") 77 | publisher_id = Column(types.UnicodeText, default="") 78 | frequency = Column(types.UnicodeText, default="MANUAL") 79 | next_run = Column(types.DateTime) 80 | jobs = relationship( 81 | "HarvestJob", 82 | lazy="select", 83 | back_populates="source", 84 | order_by=lambda: HarvestJob.created, 85 | ) 86 | 87 | def __repr__(self): 88 | return "" % ( 89 | self.id, 90 | self.title, 91 | self.url, 92 | self.active, 93 | ) 94 | 95 | def __str__(self): 96 | return self.__repr__().encode("ascii", "ignore") 97 | 98 | def get_jobs(self, status=None): 99 | """get the running jobs for this source""" 100 | 101 | query = Session.query(HarvestJob).filter(HarvestJob.source_id == self.id) 102 | 103 | if status is not None: 104 | query = query.filter(HarvestJob.status == status) 105 | 106 | return query.all() 107 | 108 | 109 | class HarvestJob(BaseModel, HarvestDomainObject): 110 | """A Harvesting Job is performed in two phases. 
In first place, the 111 | **gather** stage collects all the Ids and URLs that need to be fetched 112 | from the harvest source. Errors occurring in this phase 113 | (``HarvestGatherError``) are stored in the ``harvest_gather_error`` 114 | table. During the next phase, the **fetch** stage retrieves the 115 | ``HarvestedObjects`` and, if necessary, the **import** stage stores 116 | them on the database. Errors occurring in this second stage 117 | (``HarvestObjectError``) are stored in the ``harvest_object_error`` 118 | table. 119 | """ 120 | 121 | __tablename__ = "harvest_job" 122 | 123 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 124 | created = Column(types.DateTime, default=datetime.datetime.utcnow) 125 | gather_started = Column(types.DateTime) 126 | gather_finished = Column(types.DateTime) 127 | finished = Column(types.DateTime) 128 | source_id = Column(types.UnicodeText, ForeignKey("harvest_source.id")) 129 | # status: New, Running, Finished 130 | status = Column(types.UnicodeText, default="New", nullable=False) 131 | source = relationship( 132 | "HarvestSource", 133 | lazy="select", 134 | back_populates="jobs", 135 | ) 136 | 137 | def get_last_finished_object(self): 138 | """Determine the last finished object in this job 139 | Helpful to know if a job is running or not and 140 | to avoid timeouts when the source is running 141 | """ 142 | 143 | query = ( 144 | Session.query(HarvestObject) 145 | .filter(HarvestObject.harvest_job_id == self.id) 146 | .filter(HarvestObject.state == "COMPLETE") 147 | .filter(HarvestObject.import_finished.isnot(None)) 148 | .order_by(HarvestObject.import_finished.desc()) 149 | .first() 150 | ) 151 | 152 | return query 153 | 154 | def get_last_gathered_object(self): 155 | """Determine the last gathered object in this job 156 | Helpful to know if a job is running or not and 157 | to avoid timeouts when the source is running 158 | """ 159 | 160 | query = ( 161 | Session.query(HarvestObject) 162 | 
.filter(HarvestObject.harvest_job_id == self.id) 163 | .order_by(HarvestObject.gathered.desc()) 164 | .first() 165 | ) 166 | 167 | return query 168 | 169 | def get_last_action_time(self): 170 | last_object = self.get_last_finished_object() 171 | if last_object is not None: 172 | return last_object.import_finished 173 | 174 | if self.gather_finished is not None: 175 | return self.gather_finished 176 | 177 | last_gathered_object = self.get_last_gathered_object() 178 | if last_gathered_object is not None: 179 | return last_gathered_object.gathered 180 | 181 | return self.created 182 | 183 | def get_gather_errors(self): 184 | query = ( 185 | Session.query(HarvestGatherError) 186 | .filter(HarvestGatherError.harvest_job_id == self.id) 187 | .order_by(HarvestGatherError.created.desc()) 188 | ) 189 | 190 | return query.all() 191 | 192 | 193 | class HarvestObject(BaseModel, HarvestDomainObject): 194 | """A Harvest Object is created every time an element is fetched from a 195 | harvest source. Its contents can be processed and imported to ckan 196 | packages, RDF graphs, etc. 197 | 198 | """ 199 | 200 | __tablename__ = "harvest_object" 201 | 202 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 203 | # The guid is the 'identity' of the dataset, according to the source. 204 | # So if you reharvest it, then the harvester knows which dataset to 205 | # update because of this identity. The identity needs to be unique 206 | # within this CKAN. 207 | guid = Column(types.UnicodeText, default="") 208 | # When you harvest a dataset multiple times, only the latest 209 | # successfully imported harvest_object should be flagged 'current'. 210 | # The import_stage usually reads and writes it. 
211 | current = Column(types.Boolean, default=False) 212 | gathered = Column(types.DateTime, default=datetime.datetime.utcnow) 213 | fetch_started = Column(types.DateTime) 214 | content = Column(types.UnicodeText, nullable=True) 215 | fetch_finished = Column(types.DateTime) 216 | import_started = Column(types.DateTime) 217 | import_finished = Column(types.DateTime) 218 | # state: WAITING, FETCH, IMPORT, COMPLETE, ERROR 219 | state = Column(types.UnicodeText, default="WAITING") 220 | metadata_modified_date = Column(types.DateTime) 221 | retry_times = Column(types.Integer, default=0) 222 | harvest_job_id = Column(types.UnicodeText, ForeignKey("harvest_job.id")) 223 | harvest_source_id = Column(types.UnicodeText, ForeignKey("harvest_source.id")) 224 | package_id = Column( 225 | types.UnicodeText, 226 | ForeignKey("package.id", deferrable=True), 227 | nullable=True, 228 | ) 229 | # report_status: 'added', 'updated', 'not modified', 'deleted', 'errored' 230 | report_status = Column(types.UnicodeText, nullable=True) 231 | harvest_job_id_idx = Index("harvest_job_id") 232 | harvest_source_id_idx = Index("harvest_source_id") 233 | package_id_idx = Index("package_id") 234 | guid_idx = Index("guid") 235 | package = relationship( 236 | Package, 237 | lazy="select", 238 | backref="harvest_objects", 239 | ) 240 | job = relationship( 241 | HarvestJob, 242 | lazy="select", 243 | backref="objects", 244 | ) 245 | source = relationship( 246 | HarvestSource, 247 | lazy="select", 248 | backref="objects", 249 | ) 250 | 251 | 252 | class HarvestObjectExtra(BaseModel, HarvestDomainObject): 253 | """Extra key value data for Harvest objects""" 254 | 255 | __tablename__ = "harvest_object_extra" 256 | 257 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 258 | harvest_object_id = Column(types.UnicodeText, ForeignKey("harvest_object.id")) 259 | key = Column(types.UnicodeText) 260 | value = Column(types.UnicodeText) 261 | harvest_object_id_idx = Index("harvest_object_id") 
262 | object = relationship( 263 | HarvestObject, backref=backref("extras", cascade="all,delete-orphan") 264 | ) 265 | 266 | 267 | class HarvestGatherError(BaseModel, HarvestDomainObject): 268 | """Gather errors are raised during the **gather** stage of a harvesting 269 | job. 270 | """ 271 | 272 | __tablename__ = "harvest_gather_error" 273 | 274 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 275 | harvest_job_id = Column(types.UnicodeText, ForeignKey("harvest_job.id")) 276 | message = Column(types.UnicodeText) 277 | created = Column(types.DateTime, default=datetime.datetime.utcnow) 278 | 279 | job = relationship(HarvestJob, backref="gather_errors") 280 | 281 | @classmethod 282 | def create(cls, message, job): 283 | """ 284 | Helper function to create an error object and save it. 285 | """ 286 | err = cls(message=message, job=job) 287 | try: 288 | err.save() 289 | except InvalidRequestError: 290 | Session.rollback() 291 | err.save() 292 | finally: 293 | # No need to alert administrator so don't log as an error 294 | log.info(message) 295 | 296 | 297 | class HarvestObjectError(BaseModel, HarvestDomainObject): 298 | """Object errors are raised during the **fetch** or **import** stage of a 299 | harvesting job, and are referenced to a specific harvest object. 
300 | """ 301 | 302 | __tablename__ = "harvest_object_error" 303 | 304 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 305 | harvest_object_id = Column(types.UnicodeText, ForeignKey("harvest_object.id")) 306 | message = Column(types.UnicodeText) 307 | stage = Column(types.UnicodeText) 308 | line = Column(types.Integer) 309 | created = Column(types.DateTime, default=datetime.datetime.utcnow) 310 | harvest_error_harvest_object_id_idx = Index("harvest_object_id") 311 | 312 | object = relationship( 313 | HarvestObject, backref=backref("errors", cascade="all,delete-orphan") 314 | ) 315 | 316 | @classmethod 317 | def create(cls, message, object, stage="Fetch", line=None): 318 | """ 319 | Helper function to create an error object and save it. 320 | """ 321 | err = cls(message=message, object=object, stage=stage, line=line) 322 | try: 323 | err.save() 324 | except InvalidRequestError: 325 | # Clear any in-progress sqlalchemy transactions 326 | try: 327 | Session.rollback() 328 | except Exception: 329 | pass 330 | try: 331 | Session.remove() 332 | except Exception: 333 | pass 334 | err.save() 335 | finally: 336 | log_message = "{0}, line {1}".format(message, line) if line else message 337 | log.debug(log_message) 338 | 339 | 340 | class HarvestLog(BaseModel, HarvestDomainObject): 341 | """HarvestLog objects are created each time something is logged 342 | using python's standard logging module 343 | """ 344 | 345 | __tablename__ = "harvest_log" 346 | 347 | id = Column(types.UnicodeText, primary_key=True, default=make_uuid) 348 | content = Column(types.UnicodeText, nullable=False) 349 | level = Column( 350 | types.Enum("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", name="log_level"), 351 | ) 352 | created = Column(types.DateTime, default=datetime.datetime.utcnow) 353 | 354 | 355 | def harvest_object_before_insert_listener(mapper, connection, target): 356 | """ 357 | For compatibility with old harvesters, check if the source id has 358 | been set, and 
set it automatically from the job if not. 359 | """ 360 | if not target.harvest_source_id or not target.source: 361 | if not target.job: 362 | raise Exception("You must define a Harvest Job for each Harvest Object") 363 | target.harvest_source_id = target.job.source.id 364 | 365 | 366 | class PackageIdHarvestSourceIdMismatch(Exception): 367 | """ 368 | The package created for the harvest source must match the id of the 369 | harvest source 370 | """ 371 | 372 | pass 373 | 374 | 375 | def clean_harvest_log(condition): 376 | Session.query(HarvestLog).filter(HarvestLog.created <= condition).delete( 377 | synchronize_session=False 378 | ) 379 | try: 380 | Session.commit() 381 | except InvalidRequestError: 382 | Session.rollback() 383 | log.error("An error occurred while trying to clean-up the harvest log table") 384 | 385 | log.info("Harvest log table clean-up finished successfully") 386 | 387 | 388 | event.listen(HarvestObject, "before_insert", harvest_object_before_insert_listener) 389 | -------------------------------------------------------------------------------- /ckanext/harvest/public/ckanext/harvest/images/icons/source_delete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_delete.png -------------------------------------------------------------------------------- /ckanext/harvest/public/ckanext/harvest/images/icons/source_edit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_edit.png -------------------------------------------------------------------------------- /ckanext/harvest/public/ckanext/harvest/images/icons/source_new.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_new.png -------------------------------------------------------------------------------- /ckanext/harvest/public/ckanext/harvest/images/icons/source_refresh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_refresh.png -------------------------------------------------------------------------------- /ckanext/harvest/public/ckanext/harvest/images/icons/source_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_view.png -------------------------------------------------------------------------------- /ckanext/harvest/public/ckanext/harvest/javascript/extra_fields.js: -------------------------------------------------------------------------------- 1 | ckan.module('harvest-type-change', function (jQuery, _) { 2 | return { 3 | initialize: function () { 4 | var self, harvest_source_type; 5 | self = this; 6 | harvest_source_type = this.el.attr('value'); 7 | this.el.change(function(){ 8 | self.sandbox.publish('harvest-source-type-select', harvest_source_type); 9 | }) 10 | if (this.el.attr("checked") === "checked"){ 11 | self.sandbox.publish('harvest-source-type-select', harvest_source_type); 12 | } 13 | }, 14 | } 15 | }) 16 | 17 | ckan.module('harvest-extra-form-change', function (jQuery, _) { 18 | return { 19 | initialize: function () { 20 | var self, item, i, control_groups, control_group, item_name; 21 | self = this; 22 | 
self.sandbox.subscribe('harvest-source-type-select', function(source_type) { 23 | form_items = self.options.formItems; 24 | items = form_items[source_type] || []; 25 | 26 | control_groups = self.el.find('.control-group'); 27 | for (i=0;i td{ 90 | background-color: #E3E3E3 !important; 91 | padding: 3px; 92 | font-weight: bold; 93 | } 94 | 95 | #harvest-source-details th { 96 | width: 33%; 97 | } 98 | 99 | #source-new { 100 | margin-top: 30px; 101 | } 102 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/admin/base.html: -------------------------------------------------------------------------------- 1 | {% ckan_extends %} 2 | 3 | {% block content_primary_nav %} 4 | {{ super() }} 5 | {{ h.build_nav_icon('harvest.search', _('Harvest'), icon='download') }} 6 | {% endblock %} 7 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/base.html: -------------------------------------------------------------------------------- 1 | {% ckan_extends %} 2 | 3 | {% block styles %} 4 | {{ super() }} 5 | {% asset 'ckanext-harvest/harvest_css' %} 6 | {% endblock %} 7 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/emails/error_email.txt: -------------------------------------------------------------------------------- 1 | This is a failure notification of the latest harvest job set-up in {{ site_url }}. 
2 | Job URL: {{ job_url }} 3 | 4 | Harvest Source: {{ harvest_source_title }} 5 | Harvest Configuration: {{ harvest_configuration | safe }} 6 | Organization: {{ organization }} 7 | 8 | Harvest Job Id: {{ job_id }} 9 | Created: {{ job_created }} 10 | Finished: {{ job_finished }} 11 | 12 | Records in Error: {{ records_in_error }} 13 | Records Added: {{ records_added }} 14 | Records Updated: {{ records_updated }} 15 | Records Deleted: {{ records_deleted }} 16 | 17 | {{ error_summary_title }}: {{ errors|length }} errors 18 | 19 | {{ job_errors_title }}: {{ job_errors|length }} 20 | {% for error in job_errors %} 21 | - {{ error }} {% endfor %} 22 | {{ obj_errors_title }}: {{ obj_errors|length }} 23 | {% for error in obj_errors %} 24 | - {{ error }} {% endfor %} 25 | -- 26 | You are receiving this email because you are currently set-up as Administrator for {{ site_url }}. 27 | Please do not reply to this email as it was sent from a non-monitored address. 28 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/emails/summary_email.txt: -------------------------------------------------------------------------------- 1 | This is a summary of the latest harvest job set-up in {{ site_url }}. 
2 | Job URL: {{ job_url }} 3 | 4 | Harvest Source: {{ harvest_source_title }} 5 | Harvest Configuration: {{ harvest_configuration | safe }} 6 | 7 | Organization: {{ organization }} 8 | 9 | Harvest Job Id: {{ job_id }} 10 | Created: {{ job_created }} 11 | Finished: {{ job_finished }} 12 | 13 | Records in Error: {{ records_in_error }} 14 | Records Added: {{ records_added }} 15 | Records Updated: {{ records_updated }} 16 | Records Deleted: {{ records_deleted }} 17 | 18 | {{ error_summary_title }}: {{ errors|length }} errors 19 | {{ job_errors_title }}: {{ job_errors|length }} 20 | {% for error in job_errors %} 21 | - {{ error }} {% endfor %} 22 | 23 | {{ obj_errors_title }}: {{ obj_errors|length }} 24 | {% for error in obj_errors %} 25 | - {{ error }} {% endfor %} 26 | 27 | -- 28 | You are receiving this email because you are currently set-up as Administrator for {{ site_url }}. 29 | Please do not reply to this email as it was sent from a non-monitored address. 30 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/add_source_button.html: -------------------------------------------------------------------------------- 1 | {% set authorized_user = h.check_access('harvest_source_create') %} 2 | 3 | {% if authorized_user %} 4 | 5 | 6 | {{ _('Add Harvest Source') }} 7 | 8 | {% endif %} 9 | 10 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/job_details.html: -------------------------------------------------------------------------------- 1 | {# 2 | Displays information for a particular harvest job, including: 3 | 4 | * counts for added, updated, deleted or errored datasets 5 | * table with general details 6 | * table with a summary of the most common errors on this job 7 | 8 | job - dictized harvest job object 9 | 10 | Example: 11 | 12 | {% snippet 'snippets/job_details.html', job=job %} 13 | 14 | #} 15 | 16 | {% set stats = job.stats %} 
17 | 18 | {% if job.status == 'Finished' %} 19 |

20 | 21 | {% if 'errored' in stats and stats['errored'] > 0 %} 22 | {{ stats['errored'] }} 23 | {% else %} 24 | 0 25 | {% endif %} 26 | {{ _('errors') }} 27 | 28 | {% for action in ['added', 'updated', 'deleted', 'not modified'] %} 29 | 30 | {% if action in stats and stats[action] > 0 %} 31 | {{ stats[action] }} 32 | {% else %} 33 | 0 34 | {% endif %} 35 | {{ _(action) }} 36 | 37 | {% endfor %} 38 |

39 | {% endif %} 40 | 41 |

{{ _('Details') }}

42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 58 | 59 | 60 | 61 | 66 | 67 | 68 | 69 | 74 | 75 | 76 | 77 | 78 | 79 |
{{ _('Id') }}{{ job.id }}
{{ _('Created') }} 54 | 55 | {{ h.render_datetime(job.created, with_hours=True) }} 56 | 57 |
{{ _('Started') }} 62 | 63 | {{ h.render_datetime(job.gather_started, with_hours=True) }} 64 | 65 |
{{ _('Finished') }} 70 | 71 | {{ h.render_datetime(job.finished, with_hours=True) }} 72 | 73 |
{{ _('Status') }}{{ _(job.status) }}
80 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/job_error_summary.html: -------------------------------------------------------------------------------- 1 | {# 2 | Displays a table with a summary of the most common errors for a job 3 | 4 | summary - List of dicts with message and error_count 5 | 6 | Example: 7 | 8 | {% snippet 'snippets/job_error_summary.html', summary=job.object_error_summary %} 9 | 10 | #} 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | {% for error in summary %} 24 | 25 | 26 | 27 | 28 | {% endfor %} 29 | 30 |
{{ _('Count') }}{{ _('Message') }}
{{ error["error_count"] }}{{ error["message"] }}
31 | 32 | 33 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/package_list_empty.html: -------------------------------------------------------------------------------- 1 |

{% trans %}There are no datasets associated to this harvest source.{% endtrans %}

2 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/search_result_text.html: -------------------------------------------------------------------------------- 1 | {% ckan_extends %} 2 | 3 | {% if type == 'harvest' %} 4 | {% set text_query = ungettext('{number} harvest source found for "{query}"', '{number} harvest sources found for "{query}"', count) %} 5 | {% set text_query_none = _('Sorry no harvest sources found for "{query}"') %} 6 | {% set text_no_query = ungettext('{number} harvest source found', '{number} harvest sources found', count) %} 7 | {% set text_no_query_none = _('Sorry no harvest sources found') %} 8 | {%- endif -%} 9 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/source_item.html: -------------------------------------------------------------------------------- 1 | {# 2 | Displays a single harvest source result. 3 | 4 | source - A source to display. 5 | item_class - The class name to use on the list item. 6 | hide_resources - If true hides the resources (default: false). 7 | banner - If true displays a popular banner (default: false). 8 | truncate - The length to truncate the description to (default: 180). 9 | truncate_title - The length to truncate the title to (default: 80). 10 | show_organization - Boolean on whether to show the related organization 11 | 12 | Example: 13 | 14 | {% snippet 'snippets/source_item.html', source=sources[0] %} 15 | 16 | #} 17 | {% set ckan_version = h.ckan_version().split('.')[1] %} 18 | {% set truncate = truncate or 180 %} 19 | {% set truncate_title = truncate_title or 80 %} 20 | {% set title = source.title or source.name %} 21 | {% set source_type = h.get_pkg_dict_extra(source, 'source_type') %} 22 | {% set url = h.url_for('harvest_admin', id=source.name) if within_organization else h.url_for('harvest.read', id=source.name) %} 23 | 24 |
  • 25 |
    26 |

    {# Render the (truncated) title as a link. ckan_version is compared as an
       integer: 9 or higher uses the Jinja truncate filter, anything lower
       falls back to the h.truncate helper. #}
    {% if ckan_version | int >= 9 %}
      {{ h.link_to(title|truncate(truncate_title), url) }}
    {% else %}
      {{ h.link_to(h.truncate(title, truncate_title), url) }}
    {% endif %}
    {# The key must be the quoted string 'state'. An unquoted name is looked up
       as a template variable, which is not defined in this snippet, so the
       lookup always fell back to '' and neither label was ever rendered. #}
    {% if source.get('state', '').startswith('draft') %}
      {{ _('Draft') }}
    {% elif source.get('state', '').startswith('deleted') %}
      {{ _('Deleted') }}
    {% endif %}

    38 | 39 | {% if source.notes %} 40 |

    {{ source.notes }}

    41 | {% else %} 42 |

    {{ _('There is no description for this harvest source') }}

    43 | {% endif %} 44 | 45 |

    46 | {% if source.status %} 47 | {{ _('Datasets') }}: {{ source.status.total_datasets }} 48 | {% endif %} 49 | {% if not within_organization and source.organization %} 50 | — {{ _('Organization') }}: {{ h.link_to(source.organization.title or source.organization.name, h.url_for('organization.read', id=source.organization.name)) }} 51 | {% endif %} 52 |

    53 | 54 |
    55 |
  • 56 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/snippets/source_list.html: -------------------------------------------------------------------------------- 1 | {# 2 | Displays a list of harvest sources. 3 | 4 | sources - A list of harvest sources to display. 5 | list_class - The class name for the list item. 6 | item_class - The class name to use on each item. 7 | hide_resources - If true hides the resources (default: false). 8 | banner - If true displays a popular banner (default: false). 9 | truncate - The length to truncate the description to (default: 180). 10 | truncate_title - The length to truncate the title to (default: 80). 11 | 12 | Example: 13 | 14 | {% snippet 'snippets/source_list.html', sources=sources %} 15 | 16 | #} 17 | {% if sources %} 18 |
      19 | {% for source in sources %} 20 | {% snippet 'snippets/source_item.html', source=source, item_class=item_class, hide_resources=hide_resources, banner=banner, truncate=truncate, truncate_title=truncate_title, within_organization=within_organization %} 21 | {% endfor %} 22 |
    23 | {% endif %} 24 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/about.html: -------------------------------------------------------------------------------- 1 | {% extends "source/read_base.html" %} 2 | 3 | {% block primary_content_inner %} 4 |
    5 |

    {{ harvest_source.title or harvest_source.name }}

    6 | {% if harvest_source.notes %} 7 |

    {{ h.markdown_extract(harvest_source.notes)|urlize }}

    8 | {% else %} 9 |

    {{ _('There is no description for this harvest source') }}

    10 | {% endif %} 11 |
    12 | {% snippet "package/snippets/additional_info.html", pkg_dict=harvest_source %} 13 | {% endblock %} 14 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/admin.html: -------------------------------------------------------------------------------- 1 | {% extends "source/admin_base.html" %} 2 | 3 | {% block primary_content_inner %} 4 |
    5 |

    {{ _('Last Harvest Job') }}

    6 | {% if harvest_source.status and harvest_source.status.last_job %} 7 | {% snippet "snippets/job_details.html", job=harvest_source.status.last_job %} 8 | 14 | {% else %} 15 |

    {{ _('No jobs yet for this source') }}

    16 | {% endif %} 17 |
    18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/admin_base.html: -------------------------------------------------------------------------------- 1 | {% extends "source/read_base.html" %} 2 | 3 | {% block breadcrumb_content_root_selected %}{% endblock %} 4 | 5 | {% block breadcrumb_content %} 6 | {{ super() }} 7 |
  • {{ _('Admin') }}
  • 8 | {% endblock %} 9 | 10 | {% block content_action %} 11 |
    12 | {% if harvest_source.status and harvest_source.status.last_job and (harvest_source.status.last_job.status == 'New' or harvest_source.status.last_job.status == 'Running') %} 13 | Reharvest 14 | {% else %} 15 | {% set locale = h.dump_json({'content': _('This will re-run the harvesting for this source. Any updates at the source will overwrite the local datasets. Sources with a large number of datasets may take a significant amount of time to finish harvesting. Please confirm you would like us to start reharvesting.')}) %} 16 | 18 | 19 | {{ _('Reharvest') }} 20 | 21 | {% endif %} 22 | {% if harvest_source.status and harvest_source.status.last_job and (harvest_source.status.last_job.status == 'Running') %} 23 | 24 | 25 | {{ _('Stop') }} 26 | 27 | {% endif %} 28 | {% set locale = h.dump_json({'content': _('Warning: This will remove all datasets for this source, as well as all previous job reports. Are you sure you want to continue?')}) %} 29 | 31 | {{ _('Clear') }} 32 | 33 | 34 | 35 | {{ _('View harvest source') }} 36 | 37 |
    38 | {% endblock %} 39 | 40 | {% block page_header_tabs %} 41 | {{ h.build_nav_icon('harvester.admin', _('Dashboard'), id=harvest_source.name, icon='dashboard') }} 42 | {{ h.build_nav_icon('harvester.job_list', _('Jobs'), source=harvest_source.name, icon='reorder') }} 43 | {{ h.build_nav_icon(c.dataset_type ~ '.edit', _('Edit'), id=harvest_source.name, icon='edit') }} 44 | {% endblock %} 45 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/base.html: -------------------------------------------------------------------------------- 1 | {% extends "page.html" %} 2 | {% set harvest_source = harvest_source or h.get_harvest_source() %} 3 | {% if harvest_source %} 4 | {% set authorized_user = h.check_access('harvest_source_update', {'id':harvest_source.id }) %} 5 | {% else %} 6 | {% set authorized_user = h.check_access('harvest_source_create') %} 7 | {% endif %} 8 | 9 | {% block subtitle %}{{ harvest_source.title or harvest_source.name }}{% endblock %} 10 | 11 | {% block breadcrumb_content_root_selected %} class="active"{% endblock %} 12 | 13 | {% block breadcrumb_content %} 14 | {% if harvest_source.organization %} 15 | {% set org = harvest_source.organization %} 16 |
  • {{ h.nav_link(_('Organizations'), named_route='organizations_index') }}
  • 17 |
  • {{ h.nav_link(org.title or org.name|truncate(10), named_route='organization_read', id=org.name) }}
  • 18 |
  • {{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}
  • 19 | {{ h.nav_link(harvest_source.title|truncate(10), named_route='{0}_read'.format(c.dataset_type), id=harvest_source.name) }} 20 | {% else %} 21 |
  • {{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}
  • 22 | {{ h.nav_link(harvest_source.title|truncate(30), named_route='{0}_read'.format(c.dataset_type), id=harvest_source.name) }} 23 | {% endif %} 24 | {% endblock %} 25 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/edit.html: -------------------------------------------------------------------------------- 1 | {% extends "source/admin_base.html" %} 2 | 3 | {% block subtitle %}{{ _('Edit harvest source') }}{% endblock %} 4 | 5 | {% block primary_content_inner %} 6 | 7 |
    8 | {% block form %} 9 | {{- h.snippet(form_snippet, c=c, **form_vars) -}} 10 | {% endblock %} 11 |
    12 | {% endblock %} 13 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/job/list.html: -------------------------------------------------------------------------------- 1 | {% extends "source/admin_base.html" %} 2 | 3 | 4 | {% block subtitle %}{{ _('Harvest Jobs')}} - {{ super() }}{% endblock %} 5 | 6 | {% block primary_content_inner %} 7 |
    8 | 9 |

    {{ _('Harvest Jobs') }}

    10 | 11 | {% if jobs|length == 0 %} 12 |

    {{ _('No jobs yet for this source') }}

    13 | {% else %} 14 |
      15 | {% for job in jobs %} 16 |
    • 17 |
      18 |

      19 | 20 | {{ _('Job: ') }} {{ job.id }} 21 | 22 | {% if job.status != 'Finished' %} 23 | {{ job.status }} 24 | {% endif %} 25 |

      26 |

      27 | {{ _('Started:') }} 28 | 29 | {{ h.render_datetime(job.gather_started, with_hours=True) or _('Not yet') }} 30 | 31 | — 32 | {{ _('Finished:') }} 33 | 34 | {{ h.render_datetime(job.finished, with_hours=True) or _('Not yet') }} 35 | 36 |

      37 |
      38 | {% if job.status == 'Finished' %} 39 |
        40 | {% if 'errored' in job.stats and job.stats['errored'] > 0 %} 41 |
      • 42 | 43 | {{ job.stats['errored'] }} {{ _('errors') }} 44 | 45 |
      • 46 | {% endif %} 47 | {% for action in ['added', 'updated', 'deleted', 'not modified'] %} 48 |
      • 49 | 50 | {% if action in job.stats and job.stats[action] > 0 %} 51 | {{ job.stats[action] }} 52 | {% else %} 53 | 0 54 | {% endif %} 55 | {{ _(action) }} 56 | 57 |
      • 58 | {% endfor %} 59 |
      60 | {% endif %} 61 |
    • 62 | {% endfor %} 63 |
    64 | {% endif %} 65 | 66 |
    67 | {% endblock %} 68 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/job/read.html: -------------------------------------------------------------------------------- 1 | {% extends "source/admin_base.html" %} 2 | 3 | {% block subtitle %}{{ _('Job Report') }} - {{ super() }}{% endblock %} 4 | 5 | {% block primary_content_inner %} 6 |
    7 | 8 |

    9 | {{ h.nav_link(_('Back to job list'), named_route='harvester.job_list', source=harvest_source.name, class_='btn btn-default', icon='arrow-left')}} 10 |

    11 | 12 |

    {{ _('Job Report') }}

    13 | {% snippet 'snippets/job_details.html', job=job %} 14 | 15 | {% if job.status == 'Finished' %} 16 | 17 | {% if job.object_error_summary|length == 0 and job.gather_error_summary|length == 0 %} 18 |

    {{ _('Error Summary') }}

    19 |

    {{ _('No errors for this job') }}

    20 | {% else %} 21 |

    22 | {{ _('Error Summary') }} 23 | {{ _('Only the 20 most frequent errors are shown') }} 24 |

    25 | {% if job.gather_error_summary|length > 0 %} 26 |

    {{ _('Job Errors') }}

    27 | {% snippet 'snippets/job_error_summary.html', summary=job.gather_error_summary %} 28 | {% endif %} 29 | {% if job.object_error_summary|length > 0 %} 30 |

    {{ _('Document Errors') }}

    31 | {% snippet 'snippets/job_error_summary.html', summary=job.object_error_summary %} 32 | {% endif %} 33 | {% endif %} 34 | 35 | {% if job_report.gather_errors|length > 0 or job_report.object_errors.keys()|length > 0 %} 36 |

    37 | {{ _('Error Report') }} 38 |

    39 | {% if job_report.gather_errors|length > 0 %} 40 |

    {{ _('Job Errors') }}

    41 | 42 | 43 | {% for error in job_report.gather_errors %} 44 | 45 | 50 | 51 | {% endfor %} 52 | 53 |
    46 |
    47 | {{ error.message }} 48 |
    49 |
    54 | {% endif %} 55 | 56 | {% if job_report.object_errors.keys()|length > 0 %} 57 |

    {{ _('Document Errors') }} 58 | {{ job_report.object_errors.keys()|length}} {{ _('documents with errors') }} 59 |

    60 | 61 | 62 | {% for harvest_object_id in job_report.object_errors.keys() %} 63 | {% set object = job_report.object_errors[harvest_object_id] %} 64 | 65 | 87 | 88 | {% endfor %} 89 | 90 |
    66 | 67 | {% if 'original_url' in object%} 68 | 69 | {{ _('Remote content') }} 70 | 71 | {% endif %} 72 | 73 | {{ _('Local content') }} 74 | 75 | 76 | 77 |
    {{ object.guid }}
    78 | {% for error in object.errors %} 79 |
    80 | {{ error.message }} 81 | {% if error.line %} 82 | (line {{ error.line }}) 83 | {% endif %} 84 |
    85 | {% endfor %} 86 |
    91 | {% endif %} 92 | 93 | {% endif %} 94 | {% endif %} 95 | {% endblock %} 96 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/new.html: -------------------------------------------------------------------------------- 1 | {% extends "source/admin_base.html" %} 2 | 3 | {% block breadcrumb_content %} 4 |
  • {{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}
  • 5 |
  • {{ h.nav_link(_('Create Harvest Source'), named_route='{0}_new'.format(c.dataset_type)) }}
  • 6 | {% endblock %} 7 | 8 | {% block actions_content %} 9 | {% endblock %} 10 | 11 | {% block subtitle %}{{ _('Create harvest source') }}{% endblock %} 12 | 13 | {% block primary_content %} 14 |
    15 |
    16 | {{- h.snippet(form_snippet, c=c, **form_vars) -}} 17 |
    18 |
    19 | {% endblock %} 20 | 21 | {% block secondary_content %} 22 |
    23 |

    {{ _('Harvest sources') }}

    24 |
    25 |

    26 | {% trans %} 27 | Harvest sources allow importing remote metadata into this catalog. 28 | Remote sources can be other catalogs such as other CKAN instances, CSW 29 | servers or Web Accessible Folders (WAF) (depending on the actual 30 | harvesters enabled for this instance). 31 | {% endtrans %} 32 |

    33 |
    34 |
    35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/new_source_form.html: -------------------------------------------------------------------------------- 1 | {% import 'macros/form.html' as form %} 2 | 3 | {% asset 'harvest-extra-field/main' %} 4 | 5 |
    6 | 7 | {% block errors %}{{ form.errors(error_summary) }}{% endblock %} 8 | 9 | {% call form.input('url', id='field-url', label=_('URL'), value=data.url, error=errors.url, classes=['control-full', 'control-large']) %} 10 | 11 | {{ _('This should include the http:// part of the URL') }} 12 | 13 | {% endcall %} 14 | 15 | {{ h.csrf_input() if 'csrf_input' in h }} 16 | 17 | {{ form.input('title', id='field-title', label=_('Title'), placeholder=_('eg. A descriptive title'), value=data.title, error=errors.title, classes=['control-full'], attrs={'data-module': 'slug-preview-target'}) }} 18 | 19 | {% set prefix = 'harvest' %} 20 | {% set domain = h.url_for('{0}.read'.format(c.dataset_type), id='', qualified=true) %} 21 | {% set domain = domain|replace("http://", "")|replace("https://", "") %} 22 | {% set attrs = {'data-module': 'slug-preview-slug', 'data-module-prefix': domain, 'data-module-placeholder': ''} %} 23 | 24 | {{ form.prepend('name', id='field-name', label=_('Name'), prepend=prefix, placeholder=_('eg. my-dataset'), value=data.name, error=errors.name, attrs=attrs) }} 25 | 26 | {{ form.markdown('notes', id='field-notes', label=_('Description'), value=data.notes, error=errors.notes) }} 27 | 28 |
    29 | 30 |
    31 | {% for harvester in h.harvesters_info() %} 32 | {% set checked = False %} 33 | {# select first option if nothing in data #} 34 | {% if data.source_type == harvester['name'] or (not data.source_type and loop.first) %} 35 | {% set checked = True %} 36 | {% endif %} 37 | 42 | {% endfor %} 43 |
    44 |
    45 | 46 | {{ form.select('frequency', id='field-frequency', label=_('Update frequency'), options=h.harvest_frequencies(), selected=data.frequency, error=errors.frequency) }} 47 | 48 | {% block extra_config %} 49 | {{ form.textarea('config', id='field-config', label=_('Configuration'), value=data.config, error=errors.config) }} 50 | {% endblock extra_config %} 51 | 52 | {# if we have a default group then this wants remembering #} 53 | {% if data.group_id %} 54 | 55 | {% endif %} 56 | 57 | {% set dataset_is_draft = data.get('state', 'draft').startswith('draft') or data.get('state', 'none') == 'none' %} 58 | {% set dataset_has_organization = data.owner_org or data.group_id %} 59 | {% set organizations_available = h.organizations_available('create_dataset') %} 60 | {% set user_is_sysadmin = h.check_access('sysadmin') %} 61 | {% set show_organizations_selector = organizations_available and (user_is_sysadmin or dataset_is_draft) %} 62 | 63 | {% if show_organizations_selector %} 64 | {% set existing_org = data.owner_org %} 65 |
    66 | 67 |
    68 | 76 |
    77 |
    78 | {% endif %} 79 | 80 | {% if data.get('id', None) and h.check_access('harvest_source_delete', {'id': data.id}) and data.get('state', 'none') == 'deleted' %} 81 |
    82 | 83 |
    84 | 88 |
    89 |
    90 | {% endif %} 91 | 92 |

    93 | {% block delete_button %} 94 | {% if data.get('id', None) and h.check_access('harvest_source_delete', {'id': data.id}) and not data.get('state', 'none') == 'deleted' %} 95 | {% set locale_delete = h.dump_json({'content': _('This will flag the source as deleted but keep all its datasets and previous jobs. Are you sure you want to delete this harvest source?')}) %} 96 | {% set locale_clear = h.dump_json({'content': _('Warning: Apart from deleting this source, this command will remove all its datasets, as well as all previous job reports. Are you sure you want to continue?')}) %} 97 |

    115 | {% endif %} 116 | {% endblock %} 117 | 118 | 119 |

    120 | 121 | 122 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/read.html: -------------------------------------------------------------------------------- 1 | {% extends "source/read_base.html" %} 2 | 3 | {% block primary_content_inner %} 4 |
    5 |

    {{ _('Datasets') }}

    6 | {{ h.package_list_for_source(harvest_source.id) }} 7 |
    8 | {% endblock %} 9 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/read_base.html: -------------------------------------------------------------------------------- 1 | {% extends "source/base.html" %} 2 | 3 | {% block secondary_content %} 4 |
    5 |
    6 |

    {{ harvest_source.title }}

    7 | {% if harvest_source.notes %} 8 |

    9 | {{ h.markdown_extract(harvest_source.notes, 180) }} 10 | {{ h.nav_link(_('read more'), named_route='harvester.about', id=harvest_source.name) }} 11 |

    12 | {% else %} 13 |

    {{ _('There is no description for this harvest source') }}

    14 | {% endif %} 15 |
    16 |
    17 |
    {{ _('Datasets') }}
    18 |
    {{ h.package_count_for_source(harvest_source.id) }}
    19 |
    20 |
    21 |
    22 |
    23 | {% endblock %} 24 | 25 | {% block primary_content %} 26 |
    27 | {% block page_header %} 28 | 43 | {% endblock %} 44 | {% block primary_content_inner %}{% endblock %} 45 |
    46 | {% endblock %} 47 | -------------------------------------------------------------------------------- /ckanext/harvest/templates/source/search.html: -------------------------------------------------------------------------------- 1 | {% extends "page.html" %} 2 | 3 | {% block subtitle %}{{ _("Harvest sources") }}{% endblock %} 4 | 5 | 6 | {% block breadcrumb_content %} 7 |
  • {{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}
  • 8 | {% endblock %} 9 | 10 | 11 | {% block primary_content %} 12 |
    13 |
    14 | {% block page_primary_action %} 15 |
    16 | {{ h.snippet('snippets/add_source_button.html', dataset_type=c.dataset_type) }} 17 |
    18 | {% endblock %} 19 | 20 | {% set facets = { 21 | 'fields': c.fields_grouped, 22 | 'search': c.search_facets, 23 | 'titles': c.facet_titles, 24 | 'translated_fields': c.translated_fields, 25 | 'remove_field': c.remove_field } 26 | %} 27 | {% set sorting = [ 28 | (_('Relevance'), 'score desc, metadata_modified desc'), 29 | (_('Name Ascending'), 'title_string asc'), 30 | (_('Name Descending'), 'title_string desc'), 31 | (_('Last Modified'), 'metadata_modified desc'), 32 | (_('Popular'), 'views_recent desc') if g.tracking_enabled else (false, false) ] 33 | %} 34 | {% snippet 'snippets/search_form.html', type='harvest', query=c.q, sorting=sorting, sorting_selected=c.sort_by_selected, count=c.page.item_count, facets=facets, show_empty=request.args, error=c.query_error, placeholder=_("Search harvest sources...") %} 35 | 36 | {{ h.snippet('snippets/source_list.html', sources=c.page.items, show_organization=true) }} 37 | 38 |
    39 | 40 | {{ c.page.pager(q=c.q) }} 41 |
    42 | 43 | {% endblock %} 44 | 45 | 46 | 47 | {% block secondary_content %} 48 | {% for facet in c.facet_titles %} 49 | {{ h.snippet('snippets/facet_list.html', title=c.facet_titles[facet], name=facet, alternative_url=h.url_for('{0}.search'.format(c.dataset_type))) }} 50 | {% endfor %} 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /ckanext/harvest/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/tests/__init__.py -------------------------------------------------------------------------------- /ckanext/harvest/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture 5 | def clean_db(reset_db, migrate_db_for): 6 | reset_db() 7 | migrate_db_for("harvest") 8 | -------------------------------------------------------------------------------- /ckanext/harvest/tests/factories.py: -------------------------------------------------------------------------------- 1 | import factory 2 | import ckanext.harvest.model as harvest_model 3 | from ckantoolkit.tests.factories import _get_action_user_name 4 | from ckan.plugins import toolkit 5 | 6 | 7 | class HarvestSource(factory.Factory): 8 | 9 | FACTORY_FOR = harvest_model.HarvestSource 10 | 11 | class Meta: 12 | model = harvest_model.HarvestSource 13 | 14 | _return_type = 'dict' 15 | 16 | name = factory.Sequence(lambda n: 'test_source_{n}'.format(n=n)) 17 | title = factory.Sequence(lambda n: 'test title {n}'.format(n=n)) 18 | url = factory.Sequence(lambda n: 'http://{n}.test.com'.format(n=n)) 19 | source_type = 'test' # defined in test_queue.py 20 | id = '{0}_id'.format(name).lower() 21 | 22 | @classmethod 23 | def _create(cls, target_class, *args, **kwargs): 24 | if args: 25 | assert False, "Positional args aren't 
supported, use keyword args." 26 | context = {'user': _get_action_user_name(kwargs)} 27 | # If there is an existing source for this URL, and we can't create 28 | # another source with that URL, just return the original one. 29 | try: 30 | source_dict = toolkit.get_action('harvest_source_show')( 31 | context, dict(url=kwargs['url'])) 32 | except toolkit.ObjectNotFound: 33 | source_dict = toolkit.get_action('harvest_source_create')( 34 | context, kwargs) 35 | if cls._return_type == 'dict': 36 | return source_dict 37 | else: 38 | return harvest_model.HarvestSource.get(source_dict['id']) 39 | 40 | 41 | class HarvestSourceObj(HarvestSource): 42 | _return_type = 'obj' 43 | 44 | 45 | class HarvestJob(factory.Factory): 46 | 47 | FACTORY_FOR = harvest_model.HarvestJob 48 | 49 | class Meta: 50 | model = harvest_model.HarvestJob 51 | 52 | _return_type = 'dict' 53 | 54 | source = factory.SubFactory(HarvestSourceObj) 55 | 56 | @classmethod 57 | def _create(cls, target_class, *args, **kwargs): 58 | if args: 59 | assert False, "Positional args aren't supported, use keyword args." 
60 | context = {'user': _get_action_user_name(kwargs)} 61 | if 'source_id' not in kwargs: 62 | kwargs['source_id'] = kwargs['source'].id 63 | if 'run' not in kwargs: 64 | kwargs['run'] = False 65 | job_dict = toolkit.get_action('harvest_job_create')( 66 | context, kwargs) 67 | if cls._return_type == 'dict': 68 | return job_dict 69 | else: 70 | return harvest_model.HarvestJob.get(job_dict['id']) 71 | 72 | 73 | class HarvestJobObj(HarvestJob): 74 | _return_type = 'obj' 75 | 76 | 77 | class HarvestObject(factory.Factory): 78 | 79 | FACTORY_FOR = harvest_model.HarvestObject 80 | 81 | class Meta: 82 | model = harvest_model.HarvestObject 83 | 84 | _return_type = 'dict' 85 | 86 | # source = factory.SubFactory(HarvestSourceObj) 87 | job = factory.SubFactory(HarvestJobObj) 88 | 89 | @classmethod 90 | def _create(cls, target_class, *args, **kwargs): 91 | if args: 92 | assert False, "Positional args aren't supported, use keyword args." 93 | context = {'user': _get_action_user_name(kwargs)} 94 | if 'job_id' not in kwargs: 95 | kwargs['job_id'] = kwargs['job'].id 96 | kwargs['source_id'] = kwargs['job'].source.id 97 | # Remove 'job' to avoid it getting added as a HarvestObjectExtra 98 | if 'job' in kwargs: 99 | kwargs.pop('job') 100 | job_dict = toolkit.get_action('harvest_object_create')( 101 | context, kwargs) 102 | if cls._return_type == 'dict': 103 | return job_dict 104 | else: 105 | return harvest_model.HarvestObject.get(job_dict['id']) 106 | 107 | 108 | class HarvestObjectObj(HarvestObject): 109 | _return_type = 'obj' 110 | -------------------------------------------------------------------------------- /ckanext/harvest/tests/fixtures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ckanext.harvest import queue 4 | 5 | 6 | @pytest.fixture 7 | def clean_queues(): 8 | queue.purge_queues() 9 | -------------------------------------------------------------------------------- 
/ckanext/harvest/tests/harvesters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/tests/harvesters/__init__.py -------------------------------------------------------------------------------- /ckanext/harvest/tests/harvesters/test_base.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | try: 5 | from unittest.mock import patch 6 | except ImportError: 7 | from mock import patch 8 | 9 | 10 | from ckanext.harvest.harvesters.base import HarvesterBase, munge_tag 11 | from ckantoolkit.tests import factories 12 | 13 | _ensure_name_is_unique = HarvesterBase._ensure_name_is_unique 14 | 15 | 16 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') 17 | class TestGenNewName(object): 18 | 19 | def test_basic(self): 20 | assert HarvesterBase._gen_new_name('Trees') == 'trees' 21 | 22 | def test_munge(self): 23 | assert HarvesterBase._gen_new_name('Trees and branches - survey.') == 'trees-and-branches-survey' 24 | 25 | @patch.dict('ckanext.harvest.harvesters.base.config', 26 | {'ckanext.harvest.some_other_config': 'value'}) 27 | def test_without_config(self): 28 | '''Tests if the number suffix is used when no config is set.''' 29 | factories.Dataset(name='trees') 30 | assert HarvesterBase._gen_new_name('Trees') == 'trees1' 31 | 32 | @patch.dict('ckanext.harvest.harvesters.base.config', 33 | {'ckanext.harvest.default_dataset_name_append': 'number-sequence'}) 34 | def test_number_config(self): 35 | factories.Dataset(name='trees') 36 | assert HarvesterBase._gen_new_name('Trees') == 'trees1' 37 | 38 | @patch.dict('ckanext.harvest.harvesters.base.config', 39 | {'ckanext.harvest.default_dataset_name_append': 'random-hex'}) 40 | def test_random_config(self): 41 | factories.Dataset(name='trees') 42 | new_name = 
HarvesterBase._gen_new_name('Trees') 43 | 44 | assert re.match(r'trees[\da-f]{5}', new_name) 45 | 46 | @patch.dict('ckanext.harvest.harvesters.base.config', 47 | {'ckanext.harvest.default_dataset_name_append': 'random-hex'}) 48 | def test_config_override(self): 49 | '''Tests if a parameter has precedence over a config value.''' 50 | factories.Dataset(name='trees') 51 | assert HarvesterBase._gen_new_name('Trees', append_type='number-sequence') == 'trees1' 52 | 53 | 54 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') 55 | class TestEnsureNameIsUnique(object): 56 | 57 | def test_no_existing_datasets(self): 58 | factories.Dataset(name='unrelated') 59 | assert _ensure_name_is_unique('trees') == 'trees' 60 | 61 | def test_existing_dataset(self): 62 | factories.Dataset(name='trees') 63 | assert _ensure_name_is_unique('trees') == 'trees1' 64 | 65 | def test_two_existing_datasets(self): 66 | factories.Dataset(name='trees') 67 | factories.Dataset(name='trees1') 68 | assert _ensure_name_is_unique('trees') == 'trees2' 69 | 70 | def test_no_existing_datasets_and_long_name(self): 71 | assert _ensure_name_is_unique('x' * 101) == 'x' * 100 72 | 73 | def test_existing_dataset_and_long_name(self): 74 | # because PACKAGE_NAME_MAX_LENGTH = 100 75 | factories.Dataset(name='x' * 100) 76 | assert _ensure_name_is_unique('x' * 101) == 'x' * 99 + '1' 77 | 78 | def test_update_dataset_with_new_name(self): 79 | factories.Dataset(name='trees1') 80 | assert _ensure_name_is_unique('tree', existing_name='trees1') == 'tree' 81 | 82 | def test_update_dataset_but_with_same_name(self): 83 | # this can happen if you remove a trailing space from the title - the 84 | # harvester sees the title changed and thinks it should have a new 85 | # name, but clearly it can reuse its existing one 86 | factories.Dataset(name='trees') 87 | factories.Dataset(name='trees1') 88 | assert _ensure_name_is_unique('trees', existing_name='trees') == 'trees' 89 | 90 | def 
test_update_dataset_to_available_shorter_name(self): 91 | # this can be handy if, when reharvesting, you got duplicates and 92 | # managed to purge one set and through a minor title change you can now 93 | # lose the appended number. users don't like unnecessary numbers. 94 | factories.Dataset(name='trees1') 95 | assert _ensure_name_is_unique('trees', existing_name='trees1') == 'trees' 96 | 97 | def test_update_dataset_but_doesnt_change_to_other_number(self): 98 | # there's no point changing one number for another though 99 | factories.Dataset(name='trees') 100 | factories.Dataset(name='trees2') 101 | assert _ensure_name_is_unique('trees', existing_name='trees2') == 'trees2' 102 | 103 | def test_update_dataset_with_new_name_with_numbers(self): 104 | factories.Dataset(name='trees') 105 | factories.Dataset(name='trees2') 106 | factories.Dataset(name='frogs') 107 | assert _ensure_name_is_unique('frogs', existing_name='trees2') == 'frogs1' 108 | 109 | def test_existing_dataset_appending_hex(self): 110 | factories.Dataset(name='trees') 111 | name = _ensure_name_is_unique('trees', append_type='random-hex') 112 | # e.g.
'trees0b53f' 113 | assert re.match(r'trees[\da-f]{5}', name) 114 | 115 | 116 | # taken from ckan/tests/lib/test_munge.py 117 | class TestMungeTag: 118 | 119 | # (original, expected) 120 | munge_list = [ 121 | ('unchanged', 'unchanged'), 122 | # ('s', 's_'), # too short 123 | ('some spaces here', 'some-spaces--here'), 124 | ('random:other%characters&_.here', 'randomothercharactershere'), 125 | ('river-water-dashes', 'river-water-dashes'), 126 | ] 127 | 128 | def test_munge_tag(self): 129 | '''Munge a list of tags gives expected results.''' 130 | for org, exp in self.munge_list: 131 | munge = munge_tag(org) 132 | assert munge == exp 133 | 134 | def test_munge_tag_multiple_pass(self): 135 | '''Munge a list of tags muliple times gives expected results.''' 136 | for org, exp in self.munge_list: 137 | first_munge = munge_tag(org) 138 | assert first_munge == exp 139 | second_munge = munge_tag(first_munge) 140 | assert second_munge == exp 141 | 142 | def test_clean_tags_package_show(self): 143 | instance = HarvesterBase() 144 | tags_as_dict = [{u'vocabulary_id': None, 145 | u'state': u'active', 146 | u'display_name': name, 147 | u'id': u'073080c8-fef2-4743-9c9e-6216019f8b3d', 148 | u'name': name} for name, exp in self.munge_list] 149 | 150 | clean_tags = HarvesterBase._clean_tags(instance, tags_as_dict) 151 | 152 | idx = 0 153 | for _, exp in self.munge_list: 154 | tag = clean_tags[idx] 155 | assert tag['name'] == exp 156 | idx += 1 157 | 158 | def test_clean_tags_rest(self): 159 | instance = HarvesterBase() 160 | tags_as_str = [name for name, exp in self.munge_list] 161 | 162 | clean_tags = HarvesterBase._clean_tags(instance, tags_as_str) 163 | 164 | assert len(clean_tags) == len(tags_as_str) 165 | 166 | for _, exp in self.munge_list: 167 | assert exp in clean_tags 168 | -------------------------------------------------------------------------------- /ckanext/harvest/tests/harvesters/test_ckanharvester.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import copy 3 | 4 | import json 5 | try: 6 | from unittest.mock import patch, MagicMock, Mock 7 | except ImportError: 8 | from mock import patch, MagicMock, Mock 9 | import pytest 10 | from requests.exceptions import HTTPError, RequestException 11 | 12 | from ckantoolkit.tests.helpers import call_action 13 | from ckantoolkit.tests.factories import Organization, Group 14 | from ckan import model 15 | from ckan.plugins import toolkit 16 | 17 | from ckanext.harvest.harvesters.ckanharvester import ContentFetchError 18 | from ckanext.harvest.tests.factories import (HarvestSourceObj, HarvestJobObj, 19 | HarvestObjectObj) 20 | from ckanext.harvest.tests.lib import run_harvest 21 | import ckanext.harvest.model as harvest_model 22 | from ckanext.harvest.harvesters.base import HarvesterBase 23 | from ckanext.harvest.harvesters.ckanharvester import CKANHarvester 24 | 25 | from . 
import mock_ckan 26 | 27 | # Start CKAN-alike server we can test harvesting against it 28 | mock_ckan.serve() 29 | 30 | 31 | def was_last_job_considered_error_free(): 32 | last_job = model.Session.query(harvest_model.HarvestJob) \ 33 | .order_by(harvest_model.HarvestJob.created.desc()) \ 34 | .first() 35 | job = MagicMock() 36 | job.source = last_job.source 37 | job.id = '' 38 | return bool(HarvesterBase.last_error_free_job(job)) 39 | 40 | 41 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') 42 | class TestCkanHarvester(object): 43 | 44 | def test_gather_normal(self): 45 | source = HarvestSourceObj(url='http://localhost:%s/' % mock_ckan.PORT) 46 | job = HarvestJobObj(source=source) 47 | 48 | harvester = CKANHarvester() 49 | obj_ids = harvester.gather_stage(job) 50 | 51 | assert job.gather_errors == [] 52 | assert isinstance(obj_ids, list) 53 | assert len(obj_ids) == len(mock_ckan.DATASETS) 54 | harvest_object = harvest_model.HarvestObject.get(obj_ids[0]) 55 | assert harvest_object.guid == mock_ckan.DATASETS[0]['id'] 56 | assert json.loads(harvest_object.content) == mock_ckan.DATASETS[0] 57 | 58 | def test_fetch_normal(self): 59 | source = HarvestSourceObj(url='http://localhost:%s/' % mock_ckan.PORT) 60 | job = HarvestJobObj(source=source) 61 | harvest_object = HarvestObjectObj( 62 | guid=mock_ckan.DATASETS[0]['id'], 63 | job=job, 64 | content=json.dumps(mock_ckan.DATASETS[0])) 65 | 66 | harvester = CKANHarvester() 67 | result = harvester.fetch_stage(harvest_object) 68 | 69 | assert harvest_object.errors == [] 70 | assert result is True 71 | 72 | def test_import_normal(self): 73 | org = Organization() 74 | harvest_object = HarvestObjectObj( 75 | guid=mock_ckan.DATASETS[0]['id'], 76 | content=json.dumps(mock_ckan.DATASETS[0]), 77 | job__source__owner_org=org['id']) 78 | 79 | harvester = CKANHarvester() 80 | result = harvester.import_stage(harvest_object) 81 | 82 | assert harvest_object.errors == [] 83 | assert result is True 84 | assert 
harvest_object.package_id 85 | dataset = model.Package.get(harvest_object.package_id) 86 | assert dataset.name == mock_ckan.DATASETS[0]['name'] 87 | 88 | def test_harvest(self): 89 | results_by_guid = run_harvest( 90 | url='http://localhost:%s/' % mock_ckan.PORT, 91 | harvester=CKANHarvester()) 92 | 93 | result = results_by_guid['dataset1-id'] 94 | assert result['state'] == 'COMPLETE' 95 | assert result['report_status'] == 'added' 96 | assert result['dataset']['name'] == mock_ckan.DATASETS[0]['name'] 97 | assert result['errors'] == [] 98 | 99 | result = results_by_guid[mock_ckan.DATASETS[1]['id']] 100 | assert result['state'] == 'COMPLETE' 101 | assert result['report_status'] == 'added' 102 | assert result['dataset']['name'] == mock_ckan.DATASETS[1]['name'] 103 | assert result['errors'] == [] 104 | assert was_last_job_considered_error_free() 105 | 106 | def test_harvest_twice(self): 107 | run_harvest( 108 | url='http://localhost:%s/' % mock_ckan.PORT, 109 | harvester=CKANHarvester()) 110 | 111 | # change the modified date 112 | datasets = copy.deepcopy(mock_ckan.DATASETS) 113 | datasets[1]['metadata_modified'] = '2050-05-09T22:00:01.486366' 114 | with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS', 115 | datasets): 116 | results_by_guid = run_harvest( 117 | url='http://localhost:%s/' % mock_ckan.PORT, 118 | harvester=CKANHarvester()) 119 | 120 | # updated the dataset which has revisions 121 | result = results_by_guid[mock_ckan.DATASETS[1]['id']] 122 | assert result['state'] == 'COMPLETE' 123 | assert result['report_status'] == 'updated' 124 | assert result['dataset']['name'] == mock_ckan.DATASETS[1]['name'] 125 | assert result['errors'] == [] 126 | 127 | # the other dataset is unchanged and not harvested 128 | assert mock_ckan.DATASETS[0]['id'] not in result 129 | assert was_last_job_considered_error_free() 130 | 131 | def test_exclude_organizations(self): 132 | config = {'organizations_filter_exclude': ['org1']} 133 | results_by_guid = run_harvest( 
134 | url='http://localhost:%s' % mock_ckan.PORT, 135 | harvester=CKANHarvester(), 136 | config=json.dumps(config)) 137 | assert 'dataset1-id' not in results_by_guid 138 | assert mock_ckan.DATASETS[1]['id'] in results_by_guid 139 | 140 | def test_include_organizations(self): 141 | config = {'organizations_filter_include': ['org1']} 142 | results_by_guid = run_harvest( 143 | url='http://localhost:%s' % mock_ckan.PORT, 144 | harvester=CKANHarvester(), 145 | config=json.dumps(config)) 146 | assert 'dataset1-id' in results_by_guid 147 | assert mock_ckan.DATASETS[1]['id'] not in results_by_guid 148 | 149 | def test_exclude_groups(self): 150 | config = {'groups_filter_exclude': ['group1']} 151 | results_by_guid = run_harvest( 152 | url='http://localhost:%s' % mock_ckan.PORT, 153 | harvester=CKANHarvester(), 154 | config=json.dumps(config)) 155 | assert 'dataset1-id' not in results_by_guid 156 | assert mock_ckan.DATASETS[1]['id'] in results_by_guid 157 | 158 | def test_include_groups(self): 159 | config = {'groups_filter_include': ['group1']} 160 | results_by_guid = run_harvest( 161 | url='http://localhost:%s' % mock_ckan.PORT, 162 | harvester=CKANHarvester(), 163 | config=json.dumps(config)) 164 | assert 'dataset1-id' in results_by_guid 165 | assert mock_ckan.DATASETS[1]['id'] not in results_by_guid 166 | 167 | def test_remote_groups_create(self): 168 | config = {'remote_groups': 'create'} 169 | results_by_guid = run_harvest( 170 | url='http://localhost:%s' % mock_ckan.PORT, 171 | harvester=CKANHarvester(), 172 | config=json.dumps(config)) 173 | assert 'dataset1-id' in results_by_guid 174 | # Check that the remote group was created locally 175 | call_action('group_show', {}, id=mock_ckan.GROUPS[0]['id']) 176 | 177 | def test_harvest_info_in_package_show(self): 178 | results_by_guid = run_harvest( 179 | url='http://localhost:%s' % mock_ckan.PORT, 180 | harvester=CKANHarvester()) 181 | assert 'dataset1-id' in results_by_guid 182 | 183 | # Check that the dataset extras has 
the harvest_object_id, harvest_source_id, and harvest_source_title 184 | dataset = call_action('package_show', {"for_view": True}, id=mock_ckan.DATASETS[0]['id']) 185 | extras_dict = dict((e['key'], e['value']) for e in dataset['extras']) 186 | assert 'harvest_object_id' in extras_dict 187 | assert 'harvest_source_id' in extras_dict 188 | assert 'harvest_source_title' in extras_dict 189 | 190 | def test_remote_groups_only_local(self): 191 | # Create an existing group 192 | Group(id='10037fa4-e683-4a67-892a-efba815e24ad', name='group1') 193 | 194 | config = {'remote_groups': 'only_local'} 195 | results_by_guid = run_harvest( 196 | url='http://localhost:%s' % mock_ckan.PORT, 197 | harvester=CKANHarvester(), 198 | config=json.dumps(config)) 199 | assert 'dataset1-id' in results_by_guid 200 | 201 | # Check that the dataset was added to the existing local group 202 | dataset = call_action('package_show', {}, id=mock_ckan.DATASETS[0]['id']) 203 | assert dataset['groups'][0]['id'] == mock_ckan.DATASETS[0]['groups'][0]['id'] 204 | 205 | # Check that the other remote group was not created locally 206 | with pytest.raises(toolkit.ObjectNotFound): 207 | call_action('group_show', {}, id='remote-group') 208 | 209 | def test_harvest_not_modified(self): 210 | run_harvest( 211 | url='http://localhost:%s/' % mock_ckan.PORT, 212 | harvester=CKANHarvester()) 213 | 214 | results_by_guid = run_harvest( 215 | url='http://localhost:%s/' % mock_ckan.PORT, 216 | harvester=CKANHarvester()) 217 | 218 | # The metadata_modified was the same for this dataset so the import 219 | # would have returned 'unchanged' 220 | result = results_by_guid[mock_ckan.DATASETS[1]['id']] 221 | assert result['state'] == 'COMPLETE' 222 | assert result['report_status'] == 'not modified' 223 | assert 'dataset' not in result 224 | assert result['errors'] == [] 225 | assert was_last_job_considered_error_free() 226 | 227 | def test_harvest_whilst_datasets_added(self): 228 | results_by_guid = run_harvest( 229 | 
url='http://localhost:%s/datasets_added' % mock_ckan.PORT, 230 | harvester=CKANHarvester()) 231 | 232 | assert sorted(results_by_guid.keys()) == [mock_ckan.DATASETS[1]['id'], mock_ckan.DATASETS[0]['id']] 233 | 234 | def test_harvest_site_down(self): 235 | results_by_guid = run_harvest( 236 | url='http://localhost:%s/site_down' % mock_ckan.PORT, 237 | harvester=CKANHarvester()) 238 | assert not results_by_guid 239 | assert not was_last_job_considered_error_free() 240 | 241 | def test_default_tags(self): 242 | config = {'default_tags': [{'name': 'geo'}]} 243 | results_by_guid = run_harvest( 244 | url='http://localhost:%s' % mock_ckan.PORT, 245 | harvester=CKANHarvester(), 246 | config=json.dumps(config)) 247 | tags = results_by_guid['dataset1-id']['dataset']['tags'] 248 | tag_names = [tag['name'] for tag in tags] 249 | assert 'geo' in tag_names 250 | 251 | def test_default_tags_invalid(self): 252 | config = {'default_tags': ['geo']} # should be list of dicts 253 | with pytest.raises(toolkit.ValidationError) as harvest_context: 254 | run_harvest( 255 | url='http://localhost:%s' % mock_ckan.PORT, 256 | harvester=CKANHarvester(), 257 | config=json.dumps(config)) 258 | assert 'default_tags must be a list of dictionaries' in str(harvest_context.value) 259 | 260 | def test_default_groups(self): 261 | Group(name='group1') 262 | Group(name='group2') 263 | Group(name='group3') 264 | 265 | config = {'default_groups': ['group2', 'group3'], 266 | 'remote_groups': 'only_local'} 267 | tmp_c = toolkit.c 268 | try: 269 | # c.user is used by the validation (annoying), 270 | # however patch doesn't work because it's a weird 271 | # StackedObjectProxy, so we swap it manually 272 | toolkit.c = MagicMock(user='') 273 | results_by_guid = run_harvest( 274 | url='http://localhost:%s' % mock_ckan.PORT, 275 | harvester=CKANHarvester(), 276 | config=json.dumps(config)) 277 | finally: 278 | toolkit.c = tmp_c 279 | assert results_by_guid['dataset1-id']['errors'] == [] 280 | groups = 
results_by_guid['dataset1-id']['dataset']['groups'] 281 | group_names = set(group['name'] for group in groups) 282 | # group1 comes from the harvested dataset 283 | # group2 & 3 come from the default_groups 284 | assert group_names, set(('group1', 'group2' == 'group3')) 285 | 286 | def test_default_groups_invalid(self): 287 | Group(name='group2') 288 | 289 | # should be list of strings 290 | config = {'default_groups': [{'name': 'group2'}]} 291 | with pytest.raises(toolkit.ValidationError) as harvest_context: 292 | run_harvest( 293 | url='http://localhost:%s' % mock_ckan.PORT, 294 | harvester=CKANHarvester(), 295 | config=json.dumps(config)) 296 | assert 'default_groups must be a list of group names/ids' in str(harvest_context.value) 297 | 298 | def test_default_extras(self): 299 | config = { 300 | 'default_extras': { 301 | 'encoding': 'utf8', 302 | 'harvest_url': '{harvest_source_url}/dataset/{dataset_id}' 303 | } 304 | } 305 | results_by_guid = run_harvest( 306 | url='http://localhost:%s' % mock_ckan.PORT, 307 | harvester=CKANHarvester(), 308 | config=json.dumps(config)) 309 | assert results_by_guid['dataset1-id']['errors'] == [] 310 | extras = results_by_guid['dataset1-id']['dataset']['extras'] 311 | extras_dict = dict((e['key'], e['value']) for e in extras) 312 | assert extras_dict['encoding'] == 'utf8' 313 | assert extras_dict['harvest_url'] == 'http://localhost:8998/dataset/dataset1-id' 314 | 315 | def test_default_extras_invalid(self): 316 | config = { 317 | 'default_extras': 'utf8', # value should be a dict 318 | } 319 | with pytest.raises(toolkit.ValidationError) as harvest_context: 320 | run_harvest( 321 | url='http://localhost:%s' % mock_ckan.PORT, 322 | harvester=CKANHarvester(), 323 | config=json.dumps(config)) 324 | assert 'default_extras must be a dictionary' in str(harvest_context.value) 325 | 326 | @patch('ckanext.harvest.harvesters.ckanharvester.CKANHarvester.config') 327 | @patch('ckanext.harvest.harvesters.ckanharvester.requests.get', 
side_effect=RequestException('Test.value')) 328 | def test_get_content_handles_request_exception( 329 | self, mock_requests_get, mock_config 330 | ): 331 | mock_config.return_value = {} 332 | 333 | harvester = CKANHarvester() 334 | 335 | with pytest.raises(ContentFetchError) as context: 336 | harvester._get_content("http://test.example.gov.uk") 337 | 338 | assert str(context.value) == 'Request error: Test.value' 339 | 340 | class MockHTTPError(HTTPError): 341 | def __init__(self): 342 | self.response = Mock() 343 | self.response.status_code = 404 344 | self.request = Mock() 345 | self.request.url = "http://test.example.gov.uk" 346 | 347 | @patch('ckanext.harvest.harvesters.ckanharvester.CKANHarvester.config') 348 | @patch('ckanext.harvest.harvesters.ckanharvester.requests.get', side_effect=MockHTTPError()) 349 | def test_get_content_handles_http_error( 350 | self, mock_requests_get, mock_config 351 | ): 352 | mock_config.return_value = {} 353 | 354 | harvester = CKANHarvester() 355 | 356 | with pytest.raises(ContentFetchError) as context: 357 | harvester._get_content("http://test.example.gov.uk") 358 | 359 | assert str(context.value) == 'HTTP error: 404 http://test.example.gov.uk' 360 | -------------------------------------------------------------------------------- /ckanext/harvest/tests/lib.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ckanext.harvest.tests.factories import HarvestSourceObj, HarvestJobObj 4 | import ckanext.harvest.model as harvest_model 5 | from ckanext.harvest import queue 6 | from ckan.plugins import toolkit 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | def run_harvest(url, harvester, config=''): 12 | '''Runs a harvest and returns the results. 13 | This allows you to test a harvester. 14 | Queues are avoided as they are a pain in tests. 
15 | ''' 16 | # User creates a harvest source 17 | source = HarvestSourceObj(url=url, config=config, 18 | source_type=harvester.info()['name']) 19 | 20 | # User triggers a harvest, which is the creation of a harvest job. 21 | # We set run=False so that it doesn't put it on the gather queue. 22 | job = HarvestJobObj(source=source, run=False) 23 | 24 | return run_harvest_job(job, harvester) 25 | 26 | 27 | def run_harvest_job(job, harvester): 28 | # In 'harvest_job_create' it would call 'harvest_send_job_to_gather_queue' 29 | # which would do 2 things to 'run' the job: 30 | # 1. change the job status to Running 31 | job.status = 'Running' 32 | job.save() 33 | # 2. put the job on the gather queue which is consumed by 34 | # queue.gather_callback, which determines the harvester and then calls 35 | # gather_stage. We simply call the gather_stage. 36 | obj_ids = queue.gather_stage(harvester, job) 37 | if not isinstance(obj_ids, list): 38 | # gather had nothing to do or errored. Carry on to ensure the job is 39 | # closed properly 40 | obj_ids = [] 41 | 42 | # The object ids are put onto the fetch queue, consumed by 43 | # queue.fetch_callback which calls queue.fetch_and_import_stages 44 | results_by_guid = {} 45 | for obj_id in obj_ids: 46 | harvest_object = harvest_model.HarvestObject.get(obj_id) 47 | guid = harvest_object.guid 48 | 49 | # force reimport of datasets 50 | if hasattr(job, 'force_import'): 51 | if guid in job.force_import: 52 | harvest_object.force_import = True 53 | else: 54 | log.info('Skipping: %s', guid) 55 | continue 56 | 57 | results_by_guid[guid] = {'obj_id': obj_id} 58 | 59 | queue.fetch_and_import_stages(harvester, harvest_object) 60 | results_by_guid[guid]['state'] = harvest_object.state 61 | results_by_guid[guid]['report_status'] = harvest_object.report_status 62 | if harvest_object.state == 'COMPLETE' and harvest_object.package_id: 63 | results_by_guid[guid]['dataset'] = \ 64 | toolkit.get_action('package_show')( 65 | {'ignore_auth': True}, 66 | 
dict(id=harvest_object.package_id)) 67 | results_by_guid[guid]['errors'] = harvest_object.errors 68 | 69 | # Do 'harvest_jobs_run' to change the job status to 'finished' 70 | toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {}) 71 | 72 | return results_by_guid 73 | -------------------------------------------------------------------------------- /ckanext/harvest/tests/test_blueprint.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ckantoolkit import url_for 4 | from ckantoolkit.tests import factories 5 | from ckanext.harvest.tests import factories as harvest_factories 6 | 7 | 8 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') 9 | class TestBlueprint(): 10 | 11 | def test_index_page_is_rendered(self, app): 12 | 13 | source1 = harvest_factories.HarvestSource() 14 | source2 = harvest_factories.HarvestSource() 15 | 16 | response = app.get(u'/harvest') 17 | 18 | assert source1['title'] in response.body 19 | assert source2['title'] in response.body 20 | 21 | def test_new_form_is_rendered(self, app): 22 | 23 | url = url_for('harvest.new') 24 | sysadmin = factories.Sysadmin() 25 | env = {"REMOTE_USER": sysadmin['name'].encode('ascii')} 26 | 27 | response = app.get(url, extra_environ=env) 28 | 29 | assert '
    0: 147 | obj.import_finished = now - timedelta(minutes=minutes_ago) 148 | obj.save() 149 | return obj 150 | -------------------------------------------------------------------------------- /ckanext/harvest/views.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import ckantoolkit as tk 4 | from flask import Blueprint, make_response 5 | 6 | import ckanext.harvest.utils as utils 7 | 8 | # IDatasetForm provides a "harvest" blueprint for the package type harvest. 9 | # We name the extension blueprint "harvester" to avoid clashing of names. 10 | harvester = Blueprint("harvester", __name__) 11 | 12 | 13 | @harvester.before_request 14 | def before_request(): 15 | tk.c.dataset_type = utils.DATASET_TYPE_NAME 16 | 17 | 18 | def delete(id): 19 | return utils.delete_view(id) 20 | 21 | 22 | def refresh(id): 23 | return utils.refresh_view(id) 24 | 25 | 26 | def admin(id): 27 | return utils.admin_view(id) 28 | 29 | 30 | def about(id): 31 | return utils.about_view(id) 32 | 33 | 34 | def clear(id): 35 | return utils.clear_view(id) 36 | 37 | 38 | def job_list(source): 39 | return utils.job_list_view(source) 40 | 41 | 42 | def job_show_last(source): 43 | return utils.job_show_last_view(source) 44 | 45 | 46 | def job_show(source, id): 47 | return utils.job_show_view(id) 48 | 49 | 50 | def job_abort(source, id): 51 | return utils.job_abort_view(source, id) 52 | 53 | 54 | def object_show(id, ref_type): 55 | (response, content) = utils.object_show_view(id, ref_type, make_response()) 56 | response.set_data(content) 57 | return response 58 | 59 | 60 | harvester.add_url_rule( 61 | "/" + utils.DATASET_TYPE_NAME + "/delete/", 62 | view_func=delete, 63 | ) 64 | harvester.add_url_rule("/" + utils.DATASET_TYPE_NAME + "/refresh/", 65 | view_func=refresh, 66 | methods=(u'POST', u'GET')) 67 | harvester.add_url_rule( 68 | "/" + utils.DATASET_TYPE_NAME + "/admin/", 69 | view_func=admin, 70 | ) 71 | harvester.add_url_rule( 72 | 
"/" + utils.DATASET_TYPE_NAME + "/about/", 73 | view_func=about, 74 | ) 75 | harvester.add_url_rule("/" + utils.DATASET_TYPE_NAME + "/clear/", 76 | view_func=clear, 77 | methods=(u'POST', u'GET')) 78 | harvester.add_url_rule( 79 | "/" + utils.DATASET_TYPE_NAME + "//job", 80 | view_func=job_list, 81 | ) 82 | harvester.add_url_rule( 83 | "/" + utils.DATASET_TYPE_NAME + "//job/last", 84 | view_func=job_show_last, 85 | ) 86 | 87 | harvester.add_url_rule( 88 | "/" + utils.DATASET_TYPE_NAME + "//job/", 89 | view_func=job_show, 90 | ) 91 | harvester.add_url_rule( 92 | "/" + utils.DATASET_TYPE_NAME + "//job//abort", 93 | view_func=job_abort, 94 | ) 95 | harvester.add_url_rule( 96 | "/" + utils.DATASET_TYPE_NAME + "/object/", 97 | view_func=object_show, 98 | defaults={"ref_type": "object"}, 99 | ) 100 | harvester.add_url_rule( 101 | "/dataset/harvest_object/", 102 | view_func=object_show, 103 | defaults={"ref_type": "dataset"}, 104 | ) 105 | 106 | 107 | def get_blueprints(): 108 | return [harvester] 109 | -------------------------------------------------------------------------------- /config/supervisor/ckan_harvesting.conf: -------------------------------------------------------------------------------- 1 | ; =============================== 2 | ; ckan harvester example 3 | ; =============================== 4 | 5 | ; symlink or copy this file to /etc/supervisr/conf.d 6 | ; change the path/to/virtualenv below to the virtualenv ckan is in. 7 | 8 | [program:ckan_gather_consumer] 9 | 10 | ; Full Path to executable, should be path to virtural environment, 11 | ; Full path to config file too. 12 | 13 | command=/path/to/pyenv/bin/paster --plugin=ckanext-harvest harvester gather_consumer --config=/path/to/config/std.ini 14 | 15 | ; user that owns virtual environment. 
16 | user=ckan 17 | 18 | numprocs=1 19 | stdout_logfile=/var/log/ckan/std/gather_consumer.log 20 | stderr_logfile=/var/log/ckan/std/gather_consumer.log 21 | autostart=true 22 | autorestart=true 23 | startsecs=10 24 | 25 | [program:ckan_fetch_consumer] 26 | 27 | ; Full Path to executable, should be path to virtual environment, 28 | ; Full path to config file too. 29 | 30 | command=/path/to/pyenv/bin/paster --plugin=ckanext-harvest harvester fetch_consumer --config=/path/to/config/std.ini 31 | 32 | ; user that owns virtual environment. 33 | user=ckan 34 | 35 | numprocs=1 36 | stdout_logfile=/var/log/ckan/std/fetch_consumer.log 37 | stderr_logfile=/var/log/ckan/std/fetch_consumer.log 38 | autostart=true 39 | autorestart=true 40 | startsecs=10 41 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | pytest_plugins = [ 4 | u'ckanext.harvest.tests.fixtures', 5 | ] 6 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest-ckan 2 | pytest-cov 3 | factory-boy>=2 4 | mock 5 | -------------------------------------------------------------------------------- /docs/admin-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/docs/admin-tab.png -------------------------------------------------------------------------------- /pip-requirements.txt: -------------------------------------------------------------------------------- 1 | requirements.txt -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = 
"ckanext-harvest" 3 | version = "1.6.1" 4 | description = "Harvesting interface plugin for CKAN, plus harvester for other CKAN sites" 5 | authors = [ 6 | {name = "Adrià Mercader", email = "amercadero@gmail.com"} 7 | ] 8 | maintainers = [ 9 | {name = "CKAN Tech Team and contributors", email = "tech-team@ckan.org"}, 10 | {name = "Seitenbau Govdata"}, 11 | ] 12 | license = {text = "AGPL"} 13 | classifiers = [ 14 | "Intended Audience :: Developers", 15 | "Development Status :: 5 - Production/Stable", 16 | "License :: OSI Approved :: GNU Affero General Public License v3", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12" 21 | ] 22 | keywords = [ 23 | "ckan", 24 | "ckanext", 25 | "harvesting", 26 | "federation", 27 | ] 28 | dependencies = [] 29 | 30 | [project.urls] 31 | Homepage = "http://github.com/ckan/ckanext-harvest" 32 | Repository = "https://github.com/ckan/ckanext-harvest" 33 | Issues = "https://github.com/ckan/ckanext-harvest/issues" 34 | Changelog = "https://github.com/ckan/ckanext-harvest/blob/master/CHANGELOG.rst" 35 | 36 | [build-system] 37 | requires = ["setuptools"] 38 | build-backend = "setuptools.build_meta" 39 | 40 | [project.entry-points."ckan.plugins"] 41 | harvest = "ckanext.harvest.plugin:Harvest" 42 | ckan_harvester = "ckanext.harvest.harvesters:CKANHarvester" 43 | 44 | # Test plugins 45 | test_harvester = "ckanext.harvest.tests.test_queue:MockHarvester" 46 | test_harvester2 = "ckanext.harvest.tests.test_queue2:MockHarvester" 47 | test_action_harvester = "ckanext.harvest.tests.test_action:MockHarvesterForActionTests" 48 | 49 | 50 | [project.entry-points."babel.extractors"] 51 | ckan = "ckan.lib.extract:extract_ckan" 52 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 
ckantoolkit>=0.0.7 2 | pika>=1.1.0,<1.3.0 3 | redis 4 | requests>=2.11.1 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options] 2 | packages = find: 3 | namespace_packages = ckanext 4 | install_requires = 5 | include_package_data = True 6 | 7 | [extract_messages] 8 | keywords = translate isPlural 9 | add_comments = TRANSLATORS: 10 | output_file = i18n/ckanext-harvest.pot 11 | width = 80 12 | 13 | [init_catalog] 14 | domain = ckanext-harvest 15 | input_file = i18n/ckanext-harvest.pot 16 | output_dir = i18n 17 | 18 | [update_catalog] 19 | domain = ckanext-harvest 20 | input_file = i18n/ckanext-harvest.pot 21 | output_dir = i18n 22 | previous = true 23 | 24 | [compile_catalog] 25 | domain = ckanext-harvest 26 | directory = i18n 27 | statistics = true 28 | 29 | [flake8] 30 | max-line-length = 127 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | message_extractors={ 5 | 'ckanext': [ 6 | ('**.py', 'python', None), 7 | ('**.js', 'javascript', None), 8 | ('**/templates/**.html', 'ckan', None), 9 | ], 10 | } 11 | ) 12 | -------------------------------------------------------------------------------- /test.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debug = false 3 | # Uncomment and replace with the address which should receive any error reports 4 | #email_to = you@yourdomain.com 5 | smtp_server = localhost 6 | error_email_from = paste@localhost 7 | 8 | [server:main] 9 | use = egg:Paste#http 10 | host = 0.0.0.0 11 | port = 5000 12 | 13 | 14 | [app:main] 15 | use = config:../ckan/test-core.ini 16 | # Here we hard-code the database and a flag to make default tests 17 | # run fast. 
18 | ckan.plugins = harvest ckan_harvester test_harvester test_harvester2 test_action_harvester 19 | ckan.harvest.mq.type = redis 20 | ckan.legacy_templates = false 21 | # NB: other test configuration should go in test-core.ini, which is 22 | # what the postgres tests use. 23 | 24 | 25 | # Logging configuration 26 | [loggers] 27 | keys = root, ckan, sqlalchemy 28 | 29 | [handlers] 30 | keys = console, dblog 31 | 32 | [formatters] 33 | keys = generic, dblog 34 | 35 | [logger_root] 36 | level = WARN 37 | handlers = console 38 | 39 | [logger_ckan] 40 | qualname = ckan 41 | handlers = 42 | level = INFO 43 | 44 | [logger_ckan_harvester] 45 | qualname = ckanext.harvest 46 | handlers = dblog 47 | level = DEBUG 48 | 49 | [logger_sqlalchemy] 50 | handlers = 51 | qualname = sqlalchemy.engine 52 | level = WARN 53 | 54 | [handler_console] 55 | class = StreamHandler 56 | args = (sys.stdout,) 57 | level = NOTSET 58 | formatter = generic 59 | 60 | [handler_dblog] 61 | class = ckanext.harvest.log.DBLogHandler 62 | args = () 63 | level = DEBUG 64 | formatter = dblog 65 | 66 | [formatter_dblog] 67 | format = %(message)s 68 | 69 | [formatter_generic] 70 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s 71 | --------------------------------------------------------------------------------