├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── .tx
    └── config
├── CHANGELOG.rst
├── CONTRIBUTING.rst
├── MANIFEST.in
├── README.rst
├── ckanext
    ├── __init__.py
    └── harvest
    │   ├── __init__.py
    │   ├── assets
    │       ├── styles
    │       │   ├── harvest.css
    │       │   ├── harvest.less
    │       │   ├── less
    │       │   ├── mixins.less
    │       │   └── variables.less
    │       └── webassets.yml
    │   ├── cli.py
    │   ├── controllers
    │       ├── __init__.py
    │       └── view.py
    │   ├── harvesters
    │       ├── __init__.py
    │       ├── base.py
    │       └── ckanharvester.py
    │   ├── helpers.py
    │   ├── i18n
    │       ├── ckanext-harvest.pot
    │       └── sv
    │       │   └── LC_MESSAGES
    │       │       ├── ckanext-harvest.mo
    │       │       └── ckanext-harvest.po
    │   ├── interfaces.py
    │   ├── log.py
    │   ├── logic
    │       ├── __init__.py
    │       ├── action
    │       │   ├── __init__.py
    │       │   ├── create.py
    │       │   ├── delete.py
    │       │   ├── get.py
    │       │   ├── patch.py
    │       │   └── update.py
    │       ├── auth
    │       │   ├── __init__.py
    │       │   ├── create.py
    │       │   ├── delete.py
    │       │   ├── get.py
    │       │   ├── patch.py
    │       │   └── update.py
    │       ├── dictization.py
    │       ├── schema.py
    │       └── validators.py
    │   ├── migration
    │       └── harvest
    │       │   ├── README
    │       │   ├── alembic.ini
    │       │   ├── env.py
    │       │   ├── script.py.mako
    │       │   └── versions
    │       │       ├── 3b4894672727_create_harvest_tables.py
    │       │       └── 75d650dfd519_add_cascade_to_harvest_tables.py
    │   ├── model
    │       └── __init__.py
    │   ├── plugin.py
    │   ├── public
    │       └── ckanext
    │       │   └── harvest
    │       │       ├── images
    │       │           └── icons
    │       │           │   ├── source_delete.png
    │       │           │   ├── source_edit.png
    │       │           │   ├── source_new.png
    │       │           │   ├── source_refresh.png
    │       │           │   └── source_view.png
    │       │       ├── javascript
    │       │           ├── extra_fields.js
    │       │           ├── resource.config
    │       │           └── webassets.yml
    │       │       └── style.css
    │   ├── queue.py
    │   ├── templates
    │       ├── admin
    │       │   └── base.html
    │       ├── base.html
    │       ├── emails
    │       │   ├── error_email.txt
    │       │   └── summary_email.txt
    │       ├── snippets
    │       │   ├── add_source_button.html
    │       │   ├── job_details.html
    │       │   ├── job_error_summary.html
    │       │   ├── package_list_empty.html
    │       │   ├── search_result_text.html
    │       │   ├── source_item.html
    │       │   └── source_list.html
    │       └── source
    │       │   ├── about.html
    │       │   ├── admin.html
    │       │   ├── admin_base.html
    │       │   ├── base.html
    │       │   ├── edit.html
    │       │   ├── job
    │       │       ├── list.html
    │       │       └── read.html
    │       │   ├── new.html
    │       │   ├── new_source_form.html
    │       │   ├── read.html
    │       │   ├── read_base.html
    │       │   └── search.html
    │   ├── tests
    │       ├── __init__.py
    │       ├── conftest.py
    │       ├── factories.py
    │       ├── fixtures.py
    │       ├── harvesters
    │       │   ├── __init__.py
    │       │   ├── mock_ckan.py
    │       │   ├── test_base.py
    │       │   └── test_ckanharvester.py
    │       ├── lib.py
    │       ├── test_action.py
    │       ├── test_blueprint.py
    │       ├── test_queue.py
    │       ├── test_queue2.py
    │       └── test_timeouts.py
    │   ├── utils.py
    │   └── views.py
├── config
    └── supervisor
    │   └── ckan_harvesting.conf
├── conftest.py
├── dev-requirements.txt
├── docs
    └── admin-tab.png
├── pip-requirements.txt
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
└── test.ini


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   lint:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - uses: actions/checkout@v4
 8 |       - uses: actions/setup-python@v5
 9 |         with:
10 |           python-version: '3.9'
11 |       - name: Install requirements
12 |         run: pip install flake8 pycodestyle
13 |       - name: Check syntax
14 |         run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan
15 |       - name: Run flake8
16 |         run: flake8 . --count --max-line-length=127 --statistics --exclude ckan
17 | 
18 |   test:
19 |     needs: lint
20 |     strategy:
21 |       matrix:
22 |         include:
23 |           - ckan-version: "2.11"
24 |             ckan-image: "ckan/ckan-dev:2.11-py3.10"
25 |           - ckan-version: "2.10"
26 |             ckan-image: "ckan/ckan-dev:2.10-py3.10"
27 |       fail-fast: false
28 | 
29 |     name: CKAN ${{ matrix.ckan-version }}
30 |     runs-on: ubuntu-latest
31 |     container:
32 |       image: ${{ matrix.ckan-image }}
33 |       options: --user root
34 |     services:
35 |       solr:
36 |         image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9
37 |       postgres:
38 |         image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }}
39 |         env:
40 |           POSTGRES_USER: postgres
41 |           POSTGRES_PASSWORD: postgres
42 |           POSTGRES_DB: postgres
43 |         options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
44 |       redis:
45 |           image: redis:3
46 |     env:
47 |       CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test
48 |       CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test
49 |       CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test
50 |       CKAN_SOLR_URL: http://solr:8983/solr/ckan
51 |       CKAN_REDIS_URL: redis://redis:6379/1
52 | 
53 |     steps:
54 |     - uses: actions/checkout@v4
55 |     - name: Install requirements
56 |       run: |
57 |         pip install -r requirements.txt
58 |         pip install -r dev-requirements.txt
59 |         pip install -e .
60 |         # Replace default path to CKAN core config file with the one on the container
61 |         sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
62 |     - name: Setup extension (CKAN >= 2.9)
63 |       run: |
64 |         ckan -c test.ini db init
65 |         ckan -c test.ini db pending-migrations --apply
66 |     - name: Run tests
67 |       run: pytest --ckan-ini=test.ini --cov=ckanext.harvest --disable-warnings ckanext/harvest/tests
68 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | syntax: glob
 2 | *.pyc
 3 | *.egg-info
 4 | *.orig
 5 | .coverage
 6 | build
 7 | *.egg
 8 | .DS_Store
 9 | dist
10 | development.ini
11 | *.sw?
12 | *~
13 | node_modules
14 | *.project
15 | .eggs
16 | .idea/
17 | .vscode/
18 | 
19 | 


--------------------------------------------------------------------------------
/.tx/config:
--------------------------------------------------------------------------------
 1 | [main]
 2 | host = https://www.transifex.com
 3 | 
 4 | [ckanext-harvest.ckanext-harvestpot]
 5 | file_filter = i18n/<lang>/LC_MESSAGES/ckanext-harvest.po
 6 | source_file = i18n/ckanext-harvest.pot
 7 | source_lang = en
 8 | type = PO
 9 | 
10 | 


--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
  1 | #########
  2 | Changelog
  3 | #########
  4 | 
  5 | All notable changes to this project will be documented in this file.
  6 | 
  7 | The format is based on `Keep a Changelog <http://keepachangelog.com>`_
  8 | and this project adheres to `Semantic Versioning <http://semver.org/>`_
  9 | 
 10 | *******************
 11 | 1.6.1_ - 2025-01-14
 12 | *******************
 13 | 
 14 | Changed
 15 | -------
 16 | 
 17 | - CKAN 2.9 is no longer maintained #559
 18 | 
 19 | Fixed
 20 | -------
 21 | - Update manifest to include alembic configuration #558
 22 | 
 23 | 
 24 | *******************
 25 | 1.6.0_ - 2024-10-31
 26 | *******************
 27 | 
 28 | Changed
 29 | -------
 30 | 
 31 | - CKAN 2.11 support #551
 32 | - Switched to alembic migrations #540
 33 | - Support for SQLAlchemy 2 #553
 34 | - Use pyproject.toml file #554
 35 | - Add tab for harvest sources in sysadmin page
 36 | 
 37 | Fixed
 38 | -------
 39 | 
 40 | - Clean up harvest source clear command, fix revisions exception #556
 41 | - Convert boolean values to bools #544
 42 | 
 43 | 
 44 | *******************
 45 | 1.5.6_ - 2023-06-26
 46 | *******************
 47 | 
 48 | Fixed
 49 | -------
 50 | 
 51 | - Fix url endpoint for job_show #534
 52 | 
 53 | *******************
 54 | 1.5.5_ - 2023-06-05
 55 | *******************
 56 | 
 57 | Fixed
 58 | -------
 59 | 
 60 | - Fix display of harvest job errors #533
 61 | 
 62 | *******************
 63 | 1.5.4_ - 2023-05-23
 64 | *******************
 65 | 
 66 | Fixed
 67 | -------
 68 | 
 69 | - Fix a problem with data-dictization when using sqlalchemy 1.4+ #529
 70 | 
 71 | *******************
 72 | 1.5.3_ - 2023-04-03
 73 | *******************
 74 | 
 75 | Fixed
 76 | -------
 77 | 
 78 | - Fix asset path in MANIFEST.in #525
 79 | 
 80 | *******************
 81 | 1.5.2_ - 2023-03-28
 82 | *******************
 83 | 
 84 | Fixed
 85 | -------
 86 | 
 87 | - Fix URL endpoints: from ``harvest.object_show`` to ``harvester.object_show`` #524
 88 | 
 89 | *******************
 90 | 1.5.1_ - 2023-03-22
 91 | *******************
 92 | 
 93 | Fixed
 94 | -------
 95 | 
 96 | - Fix ``url_for`` routing to point to harvester blueprint #523
 97 | 
 98 | *******************
 99 | 1.5.0_ - 2023-03-16
100 | *******************
101 | 
102 | Changed
103 | -------
104 | 
105 | - Added unescape for email text body to avoid encoded characters #517
106 | - Pick the right harvest_object_id if there are multiple #519
107 | - Do not duplicate harvest_extras if exist in root schema #521
108 | - Use 403 when actions are forbidden, not 401 #522
109 | - Drop support old versions #520
110 | 
111 | Breaking Changes
112 | ----------------
113 | - ``h.bootstrap_version()`` no longer exists since it is no longer needed to inject CSS classes
114 | - Support for the old Pylons route syntax has been removed. Example: calling ``url_for("harvest_read")`` will no longer work. URLs for ``ckanext-harvest`` need to respect Flask's syntax: ``url_for("harvest.read")``, etc.
115 | 
116 | *******************
117 | 1.4.2_ - 2023-01-12
118 | *******************
119 | 
120 | Changed
121 | -------
122 | 
123 | - Add DB index harvest_error_harvest_object_id_idx #514
124 | - Remove pyopenssl requirement c87309a
125 | - Add CSRF protection to new source form #516
126 | 
127 | *******************
128 | 1.4.1_ - 2022-09-20
129 | *******************
130 | 
131 | Changed
132 | -------
133 | 
134 | - Use requirements.txt instead of pip-requirements.txt (still working via symlink) 8ed1eca
135 | 
136 | Fixed
137 | -----
138 | 
139 | - Bump pyopenssl requirement to avoid requirements error on install 98edcd3
140 | - Fixes unicode error in Python 2 #502
141 | - Fixes in email notification sending #499, #505
142 | - Fix pagination for Dataset list on source page #504
143 | 
144 | *******************
145 | 1.4.0_ - 2022-04-20
146 | *******************
147 | 
148 | Changed
149 | -------
150 | 
151 | - Add ckan.harvest.not_overwrite_fields #472
152 | - Support for Bootstrap 5 templates #490
153 | - Support for CKAN 2.10 #492 #496
154 | 
155 | Fixed
156 | -----
157 | 
158 | - Fix JSONDecode error #489
159 | - Check if email exists before sending notification #498
160 | 
161 | 
162 | *******************
163 | 1.3.4_ - 2022-01-24
164 | *******************
165 | 
166 | Changed
167 | -------
168 | 
169 | - Changes function calls to `render_jinja2` over to `render` as the former is
170 |   no longer used. #459
171 | - Set the default value for MQ_TYPE to redis #463
172 | - Add option `keep-current` to `clearsource_history` command #484
173 | 
174 | Fixed
175 | -----
176 | 
177 | - Fix JSON serialization for Python3 #450
178 | - Make `Reharvest` and `Clear` buttons work again #452
179 | - Fix error when running run-test #466
180 | - Fix timeout calculation #482
181 | - Fix harvest extras for packages #458
182 | 
183 | 
184 | *******************
185 | 1.3.3_ - 2021-03-26
186 | *******************
187 | 
188 | Changed
189 | -------
190 | 
191 | - Migrate tests from Travis CI to GitHub Actions
192 | - Optimize last error free job detection #437
193 | 
194 | Fixed
195 | -----
196 | - Improve timeout detection #431
197 | - Check if Redis key is available #432
198 | - Include webassets.yml in MANIFEST
199 | 
200 | 
201 | *******************
202 | 1.3.2_ - 2020-10-08
203 | *******************
204 | 
205 | Changed
206 | -------
207 | 
208 | - Calculate timeouts based on last finished object instead of job creation time #418
209 | 
210 | Fixed
211 | -----
212 | 
213 | - Fix resubmitting harvest objects to Redis fetch queue #421
214 | 
215 | 
216 | *******************
217 | 1.3.1_ - 2020-09-01
218 | *******************
219 | 
220 | Changed
221 | -------
222 | 
223 | - Abort failed jobs CLI command #398
224 | 
225 | Fixed
226 | -----
227 | 
228 | - Fix Redis conflict with core workers
229 | - Fix harvest source list reference
230 | - Fix and improve test suite, remove nose tests
231 | 
232 | 
233 | *******************
234 | 1.3.0_ - 2020-06-04
235 | *******************
236 | 
237 | Changed
238 | -------
239 | 
240 | - Support for Python 3 #392
241 | - Add option for job timeout #403
242 | - Add support for limiting number of results and filtering by organization in harvest_source_list #403
243 | 
244 | Fixed
245 | -----
246 | 
247 | - Fix support for different Redis client libraries #403
248 | - Fix force_import option in run_test command #402
249 | - Fix show object #395
250 | - Fix handling of exceptions in controller #390
251 | 
252 | 
253 | *******************
254 | 1.2.1_ - 2020-01-22
255 | *******************
256 | 
257 | Changed
258 | -------
259 | 
260 | - Support ``not modified`` status for objects #385
261 | - New ``force-import`` flag for the ``run_test`` command #385
262 | 
263 | Fixed
264 | -----
265 | 
266 | - Get message from harvest_object_error-dict #381
267 | - Fix Admin link appearing to non authorized users #389
268 | - Capture Redis Exceptions #385
269 | 
270 | *******************
271 | 1.2.0_ - 2019-11-01
272 | *******************
273 | 
274 | Changed
275 | -------
276 | - Apply flake8 to be PEP-8 compliant #354
277 | - Use ckantoolkit to clean up imports #358
278 | - Add hook to extend the package dict in CKAN harvester
279 | - Use CKAN core ckan.redis.url setting if present
280 | - Remove database migration code targeting ancient versions #376
281 |     (In the unlikely event that you need to upgrade from one
282 |      of the previous DB versions just apply the changes removed
283 |      on the linked PR manually)
284 | 
285 | Fixed
286 | -----
287 | - harvest_source_type_exists validator should not fail if Harvester has no ``info()`` method #338
288 | - Fix SSL problems for old versions of Python 2.7.x #344
289 | - Add an 'owner_org' to the v3 package migration #348
290 | - Fix harvest request exceptions #357
291 | - Fix wrong toolkit reference 8e862c8
292 | - Mark early errored jobs as finished 5ad6d86
293 | - Resubmit awaiting objects in the DB not on Redis 5ffe6d4
294 | 
295 | *******************
296 | 1.1.4_ - 2018-10-26
297 | *******************
298 | Fixed
299 | -----
300 | - Fix nav link
301 | 
302 | *******************
303 | 1.1.3_ - 2018-10-26
304 | *******************
305 | Fixed
306 | -----
307 | - Reduce usage of c vars (CKAN 2.9)
308 | 
309 | *******************
310 | 1.1.2_ - 2018-10-25
311 | *******************
312 | Added
313 | -----
314 | - Send harvest-error-mails to organization-admins #329
315 | - CKAN Harvester option to include/exclude groups #323
316 | - Use Redis password from configuration when present #332
317 | - Support for CKAN 2.9
318 | 
319 | Fixed
320 | -----
321 | - Ensures the AND operator for fq in solr #335
322 | - Fix styling issues on Bootstrap 3
323 | 
324 | *******************
325 | 1.1.1_ - 2018-06-13
326 | *******************
327 | Added
328 | -----
329 | - Move CKANHarvester._last_error_free_job to HarvesterBase.last_error_free_job #305
330 | - Add the CSS classes for FontAwesome 4.x #313
331 | - Add config option for dataset name append type #327
332 | - Send error mail to admin when harvesting fails #244
333 | 
334 | Changed
335 | -------
336 | - Readme test tip ckan parameter #318
337 | 
338 | Fixed
339 | -----
340 | - Fix handling of ``clean_tags`` options for tag lists and dicts #304
341 | - Don't delete all solr documents/fail to index harvesters when harvest config blank #315
342 | - Fix print statements to be Py3 friendly #328
343 | 
344 | *******************
345 | 1.1.0_ - 2017-11-07
346 | *******************
347 | Added
348 | -----
349 | - Button on harvest admin page to abort running jobs #296
350 | 
351 | Changed
352 | -------
353 | - Test improvements for harvester config #288
354 | - Use package_search API for count of datasets #298
355 | - Catch sqlalchemy.exc.DatabaseError instead of sqlalchemy.exc.OperationalError in ``gather_callback`` #301
356 | 
357 | Fixed
358 | -------
359 | - Fix default_extras initialization #290
360 | - Travis build (postgres service, checkout of correct CKAN branch, libcommons-fileupload) #297
361 | 
362 | *******************
363 | 1.0.0_ - 2017-03-30
364 | *******************
365 | Added
366 | -----
367 | - Includes i18n directory in package.
368 | - Adds a new ``clearsource_history`` command/operation.
369 | - Adds new parameter ``return_last_job_status`` to ``harvest_source_list``
370 | - Documentation for logs API
371 | 
372 | Changed
373 | -------
374 | - ``gather_stage`` returns an empty list instead of None if errors occurred
375 | - Change ``redirect`` calls to ``h.redirect_to``
376 | 
377 | Fixed
378 | -----
379 | - Fix namespace package declarations
380 | - Only purge own data when calling ``queue_purge`` with redis
381 | - Fix ``default_groups`` behavior
382 | 
383 | *******************
384 | 0.0.5_ - 2016-05-23
385 | *******************
386 | Added
387 | -----
388 | - Adds ``HarvestLog`` to log to database
389 | - Adds a new ``clean_harvest_log`` command to clean the log table
390 | 
391 | Removed
392 | -------
393 | - This release removes support for CKAN <= 2.0
394 | 
395 | *******************
396 | 0.0.4_ - 2015-12-11
397 | *******************
398 | Added
399 | -----
400 | - Adds ``_find_existing_package`` method to allow harvesters extending the ``HarvesterBase`` to implement their own logic to find an existing package
401 | - Adds support for ``ITranslation`` interface
402 | - Adds special CSS class to datetimes in frontend to enable localisation to the users timezone
403 | 
404 | Changed
405 | -------
406 | - Make statistics keys consistent across all actions
407 | 
408 | Removed
409 | -------
410 | - Remove ``harvest_source_for_a_dataset`` action
411 | 
412 | *******************
413 | 0.0.3_ - 2015-11-20
414 | *******************
415 | Fixed
416 | -----
417 | - Fixed queues tests
418 | 
419 | 
420 | *******************
421 | 0.0.2_ - 2015-11-20
422 | *******************
423 | Changed
424 | -------
425 | - Namespace redis keys to avoid conflicts between CKAN instances
426 | 
427 | 
428 | *******************
429 | 0.0.1_ - 2015-11-20
430 | *******************
431 | Added
432 | -----
433 | - Adds clear source as a command
434 | - Adds specific exceptions instead of having only the generic ``Exception``
435 | 
436 | Fixed
437 | -----
438 | - Catch 'no harvest job' exception
439 | 
440 | **********
441 | Categories
442 | **********
443 | - ``Added`` for new features.
444 | - ``Changed`` for changes in existing functionality.
445 | - ``Deprecated`` for once-stable features removed in upcoming releases.
446 | - ``Removed`` for deprecated features removed in this release.
447 | - ``Fixed`` for any bug fixes.
448 | - ``Security`` to invite users to upgrade in case of vulnerabilities.
449 | 
450 | .. _Unreleased: https://github.com/ckan/ckanext-harvest/compare/v1.6.1...HEAD
451 | .. _1.6.1: https://github.com/ckan/ckanext-harvest/compare/v1.6.0...v1.6.1
451 | .. _1.6.0: https://github.com/ckan/ckanext-harvest/compare/v1.5.6...v1.6.0
452 | .. _1.5.6: https://github.com/ckan/ckanext-harvest/compare/v1.5.5...v1.5.6
453 | .. _1.5.5: https://github.com/ckan/ckanext-harvest/compare/v1.5.4...v1.5.5
454 | .. _1.5.4: https://github.com/ckan/ckanext-harvest/compare/v1.5.3...v1.5.4
455 | .. _1.5.3: https://github.com/ckan/ckanext-harvest/compare/v1.5.2...v1.5.3
456 | .. _1.5.2: https://github.com/ckan/ckanext-harvest/compare/v1.5.1...v1.5.2
457 | .. _1.5.1: https://github.com/ckan/ckanext-harvest/compare/v1.5.0...v1.5.1
458 | .. _1.5.0: https://github.com/ckan/ckanext-harvest/compare/v1.4.2...v1.5.0
459 | .. _1.4.2: https://github.com/ckan/ckanext-harvest/compare/v1.4.1...v1.4.2
460 | .. _1.4.1: https://github.com/ckan/ckanext-harvest/compare/v1.4.0...v1.4.1
461 | .. _1.4.0: https://github.com/ckan/ckanext-harvest/compare/v1.3.4...v1.4.0
462 | .. _1.3.4: https://github.com/ckan/ckanext-harvest/compare/v1.3.3...v1.3.4
463 | .. _1.3.3: https://github.com/ckan/ckanext-harvest/compare/v1.3.2...v1.3.3
464 | .. _1.3.2: https://github.com/ckan/ckanext-harvest/compare/v1.3.1...v1.3.2
465 | .. _1.3.1: https://github.com/ckan/ckanext-harvest/compare/v1.3.0...v1.3.1
466 | .. _1.3.0: https://github.com/ckan/ckanext-harvest/compare/v1.2.1...v1.3.0
467 | .. _1.2.1: https://github.com/ckan/ckanext-harvest/compare/v1.2.0...v1.2.1
468 | .. _1.2.0: https://github.com/ckan/ckanext-harvest/compare/v1.1.4...v1.2.0
469 | .. _1.1.4: https://github.com/ckan/ckanext-harvest/compare/v1.1.3...v1.1.4
470 | .. _1.1.3: https://github.com/ckan/ckanext-harvest/compare/v1.1.2...v1.1.3
471 | .. _1.1.2: https://github.com/ckan/ckanext-harvest/compare/v1.1.1...v1.1.2
472 | .. _1.1.1: https://github.com/ckan/ckanext-harvest/compare/v1.1.0...v1.1.1
473 | .. _1.1.0: https://github.com/ckan/ckanext-harvest/compare/v1.0.0...v1.1.0
474 | .. _1.0.0: https://github.com/ckan/ckanext-harvest/compare/v0.0.5...v1.0.0
475 | .. _0.0.5: https://github.com/ckan/ckanext-harvest/compare/v0.0.4...v0.0.5
476 | .. _0.0.4: https://github.com/ckan/ckanext-harvest/compare/v0.0.3...v0.0.4
477 | .. _0.0.3: https://github.com/ckan/ckanext-harvest/compare/v0.0.2...v0.0.3
478 | .. _0.0.2: https://github.com/ckan/ckanext-harvest/compare/v0.0.1...v0.0.2
479 | .. _0.0.1: https://github.com/ckan/ckanext-harvest/compare/ckan-1.6...v0.0.1
480 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
 1 | ####################################
 2 | How to contribute to ckanext-harvest
 3 | ####################################
 4 | 
 5 | For contributing to ckanext-harvest or its documentation, follow the same guidelines that apply to CKAN core, described in the `contributing guidelines <http://docs.ckan.org/en/latest/contributing>`_.
 6 | 
 7 | **Did you find a bug?**
 8 | -----------------------
 9 | 
10 | * **Ensure the bug was not already reported** by searching on GitHub under `Issues <https://github.com/ckan/ckanext-harvest/issues>`_.
11 | 
12 | * If you're unable to find an open issue addressing the problem, `open a new one <https://github.com/ckan/ckanext-harvest/issues/new>`_. Be sure to include a **title and clear description**, as much relevant information as possible. 
13 | 
14 | **Did you write a patch that fixes a bug?**
15 | -------------------------------------------
16 | 
17 | * Open a new GitHub pull request with the patch.
18 | 
19 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
20 | 
21 | * Make sure to **update the CHANGELOG.rst** in the "Unreleased" section with your bugfix
22 | 
23 | **Do you intend to add a new feature or change an existing one?**
24 | -----------------------------------------------------------------
25 | 
26 | * Open a new issue on GitHub and start writing code
27 | 
28 | * If you are unsure about the change, wait for feedback on the issue or post to the `ckan-dev mailing list <https://lists.okfn.org/mailman/listinfo/ckan-dev>`_
29 | 
30 | * Make sure to **update the CHANGELOG.rst** in the "Unreleased" section with your change
31 | 
32 | **Do you have questions about the source code?**
33 | ------------------------------------------------
34 | 
35 | * Ask any question about how to use ckanext-harvest on the `ckan-dev mailing list <https://lists.okfn.org/mailman/listinfo/ckan-dev>`_
36 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include ckanext/harvest/templates *
2 | recursive-include ckanext/harvest/assets *
3 | recursive-include ckanext/harvest/public *
4 | recursive-include ckanext/harvest/i18n *
5 | recursive-include ckanext/harvest/migration *
6 | 
7 | 


--------------------------------------------------------------------------------
/ckanext/__init__.py:
--------------------------------------------------------------------------------
1 | # Namespace package: declares ``ckanext`` so other distributions can add subpackages to it.
2 | try:
3 |     import pkg_resources
4 |     pkg_resources.declare_namespace(__name__)  # setuptools-style namespace declaration
5 | except ImportError:  # pkg_resources could not be imported
6 |     import pkgutil
7 |     __path__ = pkgutil.extend_path(__path__, __name__)  # standard-library fallback
8 | 


--------------------------------------------------------------------------------
/ckanext/harvest/__init__.py:
--------------------------------------------------------------------------------
1 | # Namespace package: this package is declared as a namespace under its own ``__name__``.
2 | try:
3 |     import pkg_resources
4 |     pkg_resources.declare_namespace(__name__)  # setuptools-style namespace declaration
5 | except ImportError:  # pkg_resources could not be imported
6 |     import pkgutil
7 |     __path__ = pkgutil.extend_path(__path__, __name__)  # standard-library fallback
8 | 


--------------------------------------------------------------------------------
/ckanext/harvest/assets/styles/harvest.css:
--------------------------------------------------------------------------------
 1 | header.with-filter {
 2 |   clear: both;
 3 |   overflow: hidden;
 4 | }
 5 | header.with-filter h1 {
 6 |   margin-top: 0;
 7 | }
 8 | [data-diff] {
 9 |   color: #000;
10 |   background-color: #DDD;
11 |   text-shadow: none;
12 |   font-weight: normal;
13 | }
14 | [data-diff="error"] {
15 |   background-color: #b55457;
16 | }
17 | [data-diff="added"] {
18 |   background-color: #9ee592;
19 | }
20 | [data-diff="updated"] {
21 |   background-color: #c5aaff;
22 | }
23 | [data-diff="deleted"] {
24 |   background-color: #e7a4a6;
25 | }
26 | .harvest-error-summary .count {
27 |   text-align: right;
28 | }
29 | .harvest-error-list h5 {
30 |   margin-top: 0;
31 | }
32 | .harvest-error-list .error {
33 |   padding-left: 20px;
34 | }
35 | .harvest-types label.radio {
36 |   font-weight: normal;
37 |   margin-bottom: 10px;
38 | }
39 | .harvest-types label.radio input {
40 |   top: 3px;
41 | }
42 | 
43 | #source-new.bs2 .control-label {
44 |   width: 125px;
45 | }
46 | 


--------------------------------------------------------------------------------
/ckanext/harvest/assets/styles/harvest.less:
--------------------------------------------------------------------------------
 1 | @import 'mixins.less';
 2 | @import 'variables.less';
 3 | 
 4 | header.with-filter {
 5 |   clear: both;
 6 |   overflow: hidden;
 7 |   h1 {
 8 |     margin-top: 0;
 9 |   }
10 | }
11 | 
12 | [data-diff] {
13 |   color: #000;
14 |   background-color: #DDD;
15 |   text-shadow: none;
16 |   font-weight: normal;
17 | }
18 | [data-diff="added"] {
19 |   background-color: @diffAdded;
20 | }
21 | [data-diff="updated"] {
22 |   background-color: @diffUpdated;
23 | }
24 | [data-diff="deleted"] {
25 |   background-color: @diffDeleted;
26 | }
27 | 
28 | .harvest-error-summary {
29 |   .count {
30 |     text-align: right;
31 |   }
32 | }
33 | 
34 | .harvest-error-list {
35 |   h5 {
36 |     margin-top: 0;
37 |   }
38 |   .error {
39 |     padding-left: 20px;
40 |   }
41 | }
42 | 
43 | .harvest-types label.radio {
44 |   font-weight: normal;
45 |   margin-bottom: 10px;
46 |   input {
47 |     top: 3px;
48 |   }
49 | }
50 | 
51 | #source-new {
52 |   .controls {
53 |     margin-left: 135px;
54 |   }
55 |   .control-label {
56 |     width: 125px;
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/ckanext/harvest/assets/styles/less:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | 
 3 | // This file is only used to generate the harvest.css
 4 | 
 5 | var path = require('path'),
 6 |     nodeWatch = require('nodewatch'),
 7 |     exec = require('child_process').exec,
 8 |     watch = path.join(__dirname),
 9 |     lastArg = process.argv.slice().pop();
10 | 
11 | function now() {
12 |   return new Date().toISOString().replace('T', ' ').substr(0, 19);
13 | }
14 | 
15 | function compile(event, filename) {
16 |   var start = Date.now();
17 | 
18 |   exec('`npm bin`/lessc ' + __dirname + '/harvest.less > ' + __dirname + '/harvest.css', function (err, stdout, stderr) {
19 |     var duration = Date.now() - start;
20 | 
21 |     if (err) {
22 |       console.log('An error occurred running the less command:');
23 |       console.log(err.message);
24 |     }
25 |     else if (stderr || stdout) {
26 |       console.log(stdout, stderr);
27 |     } else {
28 |       console.log('[%s] recompiled in %sms', now(), duration);
29 |     }
30 |   });
31 | }
32 | 
33 | nodeWatch.add(watch).onChange(compile);
34 | compile();
35 | 


--------------------------------------------------------------------------------
/ckanext/harvest/assets/styles/mixins.less:
--------------------------------------------------------------------------------
 1 | .clearfix() {
 2 | 	
 3 | }
 4 | 
 5 | .border-radius(@radius) {
 6 | 	-webkit-border-radius: @radius;
 7 | 	-moz-border-radius: @radius;
 8 | 	border-radius: @radius;
 9 | }
10 | 
11 | .box-shadow(@shadowA, @shadowB:X, ...){
12 | 	@props: ~`"@{arguments}".replace(/[\[\]]|\,\sX/g, '')`;
13 | 	-webkit-box-shadow: @props;
14 | 	-moz-box-shadow: @props;
15 | 	box-shadow: @props;
16 | }
17 | 


--------------------------------------------------------------------------------
/ckanext/harvest/assets/styles/variables.less:
--------------------------------------------------------------------------------
1 | @borderColor: #DDD;
2 | @hoverColor: #F6F6F6;
3 | 
4 | @diffAdded: #9EE592;
5 | @diffUpdated: #C5AAFF;
6 | @diffDeleted: #E7A4A6;
7 | 


--------------------------------------------------------------------------------
/ckanext/harvest/assets/webassets.yml:
--------------------------------------------------------------------------------
1 | harvest_css:
2 |   output: ckanext-harvest/%(version)s_harvest_css.css
3 |   contents:
4 |     - styles/harvest.css
5 | 


--------------------------------------------------------------------------------
/ckanext/harvest/cli.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import print_function
  4 | 
  5 | import ckantoolkit as tk
  6 | import click
  7 | 
  8 | import ckanext.harvest.utils as utils
  9 | from ckanext.harvest.logic import HarvestJobExists
 10 | 
 11 | 
 12 | def get_commands():  # returns the list of click commands exposed by this module
 13 |     return [harvester]  # only the root ``harvester`` group; subcommands are attached to it
 14 | 
 15 | 
 16 | @click.group()
 17 | def harvester():
 18 |     """Harvests remotely mastered metadata.
 19 |     """
 20 |     pass
 21 | 
 22 | 
 23 | @harvester.group()
 24 | def source():
 25 |     """Manage harvest sources
 26 |     """
 27 |     pass
 28 | 
 29 | 
 30 | @source.command()
 31 | @click.argument(u"name")
 32 | @click.argument(u"url")
 33 | @click.argument(u"type")
 34 | @click.argument(u"title", required=False)
 35 | @click.argument(u"active", type=tk.asbool, default=True)
 36 | @click.argument(u"owner_org", required=False)
 37 | @click.argument(u"frequency", default=u"MANUAL")
 38 | @click.argument(u"config", required=False)
 39 | def create(name, url, type, title, active, owner_org, frequency, config):
 40 |     """Create new harvest source.
 41 |     """
 42 |     try:
 43 |         result = utils.create_harvest_source(
 44 |             name, url, type, title, active, owner_org, frequency, config
 45 |         )
 46 |     except tk.ValidationError as e:
 47 |         tk.error_shout(u"Validation error:")
 48 |         for field, err in e.error_summary.items():
 49 |             tk.error_shout("\t{}: {}".format(field, err))
 50 |         raise click.Abort()
 51 |     click.echo(result)
 52 | 
 53 | 
 54 | @source.command()
 55 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME")
 56 | @click.pass_context
 57 | def show(ctx, id):
 58 |     """Shows a harvest source.
 59 |     """
 60 |     flask_app = ctx.meta["flask_app"]
 61 | 
 62 |     try:
 63 |         with flask_app.test_request_context():
 64 |             result = utils.show_harvest_source(id)
 65 |     except tk.ObjectNotFound:
 66 |         tk.error_shout(u"Source <{}> not found.".format(id))
 67 |         raise click.Abort()
 68 |     click.echo(result)
 69 | 
 70 | 
 71 | @source.command()
 72 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME")
 73 | @click.pass_context
 74 | def remove(ctx, id):
 75 |     """Remove (deactivate) a harvester source, whilst leaving any related
 76 |     datasets, jobs and objects.
 77 | 
 78 |     """
 79 |     flask_app = ctx.meta["flask_app"]
 80 | 
 81 |     with flask_app.test_request_context():
 82 |         utils.remove_harvest_source(id)
 83 |     click.secho("Removed harvest source: {0}".format(id), fg="green")
 84 | 
 85 | 
 86 | @source.command()
 87 | @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME")
 88 | @click.pass_context
 89 | def clear(ctx, id):
 90 |     """Clears all datasets, jobs and objects related to a harvest source,
 91 |     but keeps the source itself.
 92 | 
 93 |     """
 94 |     flask_app = ctx.meta["flask_app"]
 95 | 
 96 |     with flask_app.test_request_context():
 97 |         utils.clear_harvest_source(id)
 98 |     click.secho("Cleared harvest source: {0}".format(id), fg="green")
 99 | 
100 | 
@source.command()
@click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME", required=False)
@click.option(
    "-k",
    "--keep-current",
    # Declare the type explicitly: when it is only guessed from the default,
    # some click releases hand the raw string through, and bool("false")
    # would then evaluate to True.
    type=click.BOOL,
    default=False,
    help="Boolean value (e.g. true or false) forwarded as the "
         "keep-current setting. Defaults to false.",
)
@click.pass_context
def clear_history(ctx, id, keep_current):
    """If no source id is given the history for all harvest sources
    (maximum is 1000) will be cleared.

    Clears all jobs and objects related to a harvest source, but keeps
    the source itself.  The datasets imported from the harvest source
    will NOT be deleted!!!  If a source id is given, it only clears
    the history of the harvest source with the given source id.

    """
    flask_app = ctx.meta["flask_app"]

    with flask_app.test_request_context():
        # keep_current is already a real bool thanks to click.BOOL; bool()
        # is kept so the helper receives a bool under every click version.
        result = utils.clear_harvest_source_history(id, bool(keep_current))
    click.secho(result, fg="green")
124 | 
125 | 
@harvester.command()
@click.argument("all", required=False)
@click.pass_context
def sources(ctx, all):
    """Lists harvest sources.

    If 'all' is defined, it also shows the Inactive sources

    """
    # Any non-empty value given for the optional argument counts as "yes".
    include_all = bool(all)

    with ctx.meta["flask_app"].test_request_context():
        listing = utils.list_sources(include_all)
    click.echo(listing)
140 | 
141 | 
@harvester.command()
@click.argument("id", metavar="SOURCE_ID_OR_NAME")
@click.pass_context
def job(ctx, id):
    """Create new harvest job and runs it (puts it on the gather queue).

    """
    app = ctx.meta["flask_app"]
    with app.test_request_context():
        try:
            new_job = utils.create_job(id)
        except HarvestJobExists as exc:
            # ctx.abort() raises, so the echo below is never reached
            # without a job result.
            tk.error_shout(exc)
            ctx.abort()
    click.echo(new_job)
157 | 
158 | 
@harvester.command()
@click.pass_context
def jobs(ctx):
    """Lists harvest jobs.

    """
    app = ctx.meta["flask_app"]
    request_context = app.test_request_context()
    with request_context:
        listing = utils.list_jobs()
    click.echo(listing)
169 | 
170 | 
@harvester.command()
@click.argument("id", metavar="SOURCE_OR_JOB_ID")
@click.pass_context
def job_abort(ctx, id):
    """Marks a job as "Aborted" so that the source can be restarted afresh.

    It ensures that the job's harvest objects status are also marked
    finished. You should ensure that neither the job nor its objects
    are currently in the gather/fetch queues.

    """
    app = ctx.meta["flask_app"]
    with app.test_request_context():
        try:
            outcome = utils.abort_job(id)
        except tk.ObjectNotFound:
            # ctx.abort() raises, so the echo below is skipped on failure.
            tk.error_shout(u"Job not found.")
            ctx.abort()

    click.echo(outcome)
191 | 
192 | 
# NOTE(review): life_span, --include and --exclude have no explicit type, so
# click derives it from default=False. How a bool default is guessed appears
# to differ between click versions -- confirm that string values are still
# accepted on the click version in use.
@harvester.command()
@click.argument("life_span", default=False, required=False)
@click.option(
    "-i",
    "--include",
    default=False,
    help="""If source_id provided as included, then only it's failed jobs will be aborted.
    You can use comma as a separator to provide multiple source_id's""",
)
@click.option(
    "-e",
    "--exclude",
    default=False,
    help="""If source_id provided as excluded, all sources failed jobs, except for that
    will be aborted. You can use comma as a separator to provide multiple source_id's""",
)
@click.pass_context
def abort_failed_jobs(ctx, life_span, include, exclude):
    """Abort all jobs which are in a "limbo state" where the job has
    run with errors but the harvester run command will not mark it
    as finished, and therefore you cannot run another job.
    """
    app = ctx.meta["flask_app"]
    request_context = app.test_request_context()
    with request_context:
        # All three values are forwarded untouched to the utils helper.
        summary = utils.abort_failed_jobs(life_span, include, exclude)
    click.echo(summary)
219 | 
220 | 
@harvester.command()
def purge_queues():
    """Removes all jobs from fetch and gather queue.
    """
    # Thin CLI wrapper: the work happens in the utils helper.
    purge = utils.purge_queues
    purge()
226 | 
227 | 
@harvester.command()
def gather_consumer():
    """Starts the consumer for the gathering queue.

    """
    # Thin CLI wrapper: the work happens in the utils helper.
    start_consumer = utils.gather_consumer
    start_consumer()
234 | 
235 | 
@harvester.command()
def fetch_consumer():
    """Starts the consumer for the fetching queue.

    """
    # Thin CLI wrapper: the work happens in the utils helper.
    start_consumer = utils.fetch_consumer
    start_consumer()
242 | 
243 | 
@harvester.command()
@click.pass_context
def run(ctx):
    """Starts any harvest jobs that have been created by putting them onto
    the gather queue.

    Also checks running jobs - if finished it changes their status to
    Finished.

    """
    app = ctx.meta["flask_app"]
    request_context = app.test_request_context()
    with request_context:
        utils.run_harvester()
257 | 
258 | 
@harvester.command()
@click.pass_context
@click.argument("id", metavar="SOURCE_ID_OR_NAME")
@click.argument("force-import", required=False, metavar="GUID")
def run_test(ctx, id, force_import=None):
    """Runs a harvest - for testing only.

    This does all the stages of the harvest (creates job, gather,
    fetch, import) without involving the web UI or the queue
    backends. This is useful for testing a harvester without having to
    fire up gather/fetch_consumer processes, as is done in production.

    """
    # Keep only the text after the last "=", so both "GUID" and
    # "something=GUID" forms end up as the bare value.
    guid = force_import.rsplit('=', 1)[-1] if force_import else force_import

    app = ctx.meta["flask_app"]
    with app.test_request_context():
        utils.run_test_harvester(id, guid)
277 | 
278 | 
@harvester.command("import")
@click.pass_context
@click.argument("id", metavar="SOURCE_ID_OR_NAME", required=False)
@click.option(
    "-j",
    "--no-join-datasets",
    is_flag=True,
    help="Do not join harvest objects to existing datasets",
)
@click.option(
    "-o",
    "--harvest-object-id",
    help="Id of the harvest object to which perform the import stage",
)
@click.option(
    "-p",
    "--package-id",
    help="Id of the package whose harvest object to perform the import stage for",
)
@click.option(
    "-g",
    "--guid",
    help="Guid of the harvest object to which perform the import stage for",
)
@click.option(
    "--segments",
    help="""A string containing hex digits that represent which of
 the 16 harvest object segments to import. e.g. 15af will run segments 1,5,a,f""",
)
def import_stage(
    ctx, id, no_join_datasets, harvest_object_id, guid, package_id, segments
):
    """Perform the import stage with the last fetched objects, for a
    certain source or a single harvest object.

    Please note that no objects will be fetched from the remote
    server. It will only affect the objects already present in the
    database.

    To import a particular harvest source, specify its id as an argument.
    To import a particular harvest object use the -o option.
    To import a particular guid use the -g option.
    To import a particular package use the -p option.

    You will need to specify the -j flag in cases where the datasets
    are not yet created (e.g. first harvest, or all previous harvests
    have failed)

    The --segments flag allows to define a string containing hex
    digits that represent which of the 16 harvest object segments to
    import. e.g. 15af will run segments 1,5,a,f

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        try:
            utils.import_stage(
                id,
                no_join_datasets,
                harvest_object_id,
                guid,
                package_id,
                segments,
            )
        except tk.ObjectNotFound:
            tk.error_shout(u"Source <{}> not found.".format(id))
            # Abort so the process exits with a failure status, matching the
            # other commands in this module that report a missing object.
            ctx.abort()
345 | 
346 | 
@harvester.command()
@click.pass_context
def clean_harvest_log(ctx):
    """Clean-up mechanism for the harvest log table.

    You can configure the time frame through the configuration
    parameter `ckan.harvest.log_timeframe`. The default time frame is 30
    days

    """
    app = ctx.meta["flask_app"]
    request_context = app.test_request_context()
    with request_context:
        utils.clean_harvest_log()
360 | 
361 | 
@harvester.command("job-all")
@click.pass_context
def job_all(ctx):
    """Create new harvest jobs for all active sources.

    """
    app = ctx.meta["flask_app"]
    request_context = app.test_request_context()
    with request_context:
        summary = utils.job_all()
    click.echo(summary)
372 | 
373 | 
@harvester.command()
@click.pass_context
def reindex(ctx):
    """Reindexes the harvest source datasets.

    """
    app = ctx.meta["flask_app"]
    request_context = app.test_request_context()
    with request_context:
        utils.reindex()
383 | 
384 | 
@harvester.command("harvesters_info")
@click.pass_context
def harvesters_info(ctx):
    """Prints information about the available harvesters.

    """
    flask_app = ctx.meta["flask_app"]
    with flask_app.test_request_context():
        # The helper's return value is echoed as-is.
        result = utils.harvesters_info()

    click.echo(result)
396 | 


--------------------------------------------------------------------------------
/ckanext/harvest/controllers/__init__.py:
--------------------------------------------------------------------------------
# Declare this package as a namespace package so that its __path__ can be
# extended with same-named packages found elsewhere on the import path.
try:
    import pkg_resources
    pkg_resources.declare_namespace(__name__)
except ImportError:
    # pkg_resources is not importable: use the pkgutil equivalent instead.
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)
7 | 


--------------------------------------------------------------------------------
/ckanext/harvest/controllers/view.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ckan.lib.base import BaseController, c
 4 | from ckan.common import response
 5 | 
 6 | import ckanext.harvest.utils as utils
 7 | 
 8 | 
 9 | class ViewController(BaseController):
10 |     def __before__(self, action, **params):
11 | 
12 |         super(ViewController, self).__before__(action, **params)
13 | 
14 |         c.dataset_type = utils.DATASET_TYPE_NAME
15 | 
16 |     def delete(self, id):
17 |         return utils.delete_view(id)
18 | 
19 |     def refresh(self, id):
20 |         return utils.refresh_view(id)
21 | 
22 |     def clear(self, id):
23 |         return utils.clear_view(id)
24 | 
25 |     def show_object(self, id, ref_type='object'):
26 |         _, content = utils.object_show_view(id, ref_type, response)
27 |         return content
28 | 
29 |     def show_job(self, id, source_dict=False, is_last=False):
30 |         return utils.job_show_view(id, source_dict, is_last)
31 | 
32 |     def about(self, id):
33 |         return utils.about_view(id)
34 | 
35 |     def admin(self, id):
36 |         return utils.admin_view(id)
37 | 
38 |     def abort_job(self, source, id):
39 |         return utils.job_abort_view(source, id)
40 | 
41 |     def show_last_job(self, source):
42 |         return utils.job_show_last_view(source)
43 | 
44 |     def list_jobs(self, source):
45 |         return utils.job_list_view(source)
46 | 


--------------------------------------------------------------------------------
/ckanext/harvest/harvesters/__init__.py:
--------------------------------------------------------------------------------
1 | from ckanext.harvest.harvesters.ckanharvester import CKANHarvester
2 | from ckanext.harvest.harvesters.base import HarvesterBase
3 | 
4 | __all__ = ['CKANHarvester', 'HarvesterBase']
5 | 


--------------------------------------------------------------------------------
/ckanext/harvest/helpers.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from ckan import logic
  3 | from ckan import model
  4 | import ckan.lib.helpers as h
  5 | import ckan.plugins as p
  6 | 
  7 | from ckanext.harvest.model import UPDATE_FREQUENCIES
  8 | from ckanext.harvest.utils import (
  9 |     DATASET_TYPE_NAME
 10 | )
 11 | from ckanext.harvest.interfaces import IHarvester
 12 | 
 13 | 
# Module-level shortcuts to the toolkit's template context and request objects.
c = p.toolkit.c
request = p.toolkit.request
 16 | 
 17 | 
 18 | def get_harvest_source(source_id=None):
 19 | 
 20 |     context = {'model': model, 'session': model.Session}
 21 |     if source_id:
 22 |         return p.toolkit.get_action('harvest_source_show')(context, {'id': source_id})
 23 |     elif hasattr(c, 'pkg_dict'):
 24 |         return c.pkg_dict
 25 |     elif hasattr(c, 'pkg'):
 26 |         return p.toolkit.get_action('harvest_source_show')(context, {'id': c.pkg.id})
 27 | 
 28 |     return None
 29 | 
 30 | 
 31 | def package_list_for_source(source_id):
 32 |     '''
 33 |     Creates a dataset list with the ones belonging to a particular harvest
 34 |     source.
 35 | 
 36 |     It calls the package_list snippet and the pager.
 37 |     '''
 38 |     limit = 20
 39 |     page = int(request.args.get('page', 1))
 40 |     fq = '+harvest_source_id:"{0}"'.format(source_id)
 41 |     search_dict = {
 42 |         'fq': fq,
 43 |         'rows': limit,
 44 |         'sort': 'metadata_modified desc',
 45 |         'start': (page - 1) * limit,
 46 |     }
 47 | 
 48 |     context = {'model': model, 'session': model.Session}
 49 |     harvest_source = get_harvest_source(source_id)
 50 |     owner_org = harvest_source.get('owner_org', '')
 51 |     if owner_org:
 52 |         user_member_of_orgs = [org['id'] for org
 53 |                                in h.organizations_available('read')]
 54 |         if (harvest_source and owner_org in user_member_of_orgs):
 55 |             context['ignore_capacity_check'] = True
 56 | 
 57 |     query = logic.get_action('package_search')(context, search_dict)
 58 | 
 59 |     base_url = h.url_for(
 60 |         '{0}.read'.format(DATASET_TYPE_NAME),
 61 |         id=harvest_source['name']
 62 |     )
 63 | 
 64 |     def pager_url(q=None, page=None):
 65 |         url = base_url
 66 |         if page:
 67 |             url += '?page={0}'.format(page)
 68 |         return url
 69 | 
 70 |     pager = h.Page(
 71 |         collection=query['results'],
 72 |         page=page,
 73 |         url=pager_url,
 74 |         item_count=query['count'],
 75 |         items_per_page=limit
 76 |     )
 77 |     pager.items = query['results']
 78 | 
 79 |     if query['results']:
 80 |         out = h.snippet('snippets/package_list.html', packages=query['results'])
 81 |         out += pager.pager()
 82 |     else:
 83 |         out = h.snippet('snippets/package_list_empty.html')
 84 | 
 85 |     return out
 86 | 
 87 | 
 88 | def package_count_for_source(source_id):
 89 |     '''
 90 |     Returns the current package count for datasets associated with the given
 91 |     source id
 92 |     '''
 93 |     fq = '+harvest_source_id:"{0}"'.format(source_id)
 94 |     search_dict = {'fq': fq}
 95 |     context = {'model': model, 'session': model.Session}
 96 |     result = logic.get_action('package_search')(context, search_dict)
 97 |     return result.get('count', 0)
 98 | 
 99 | 
def harvesters_info():
    '''
    Return the result of the ``harvesters_info_show`` action, called with
    the current user (or author) from the template context.
    '''
    acting_user = p.toolkit.c.user or p.toolkit.c.author
    context = {'model': model, 'user': acting_user}

    info_show = logic.get_action('harvesters_info_show')
    return info_show(context, {})
103 | 
104 | 
def harvester_types():
    '''
    Return one ``{'text': ..., 'value': ...}`` dict per harvester, using the
    translated title as text and the harvester name as value.
    '''
    options = []
    for info in harvesters_info():
        options.append({
            'text': p.toolkit._(info['title']),
            'value': info['name'],
        })
    return options
109 | 
110 | 
def harvest_frequencies():
    '''
    Return one ``{'text': ..., 'value': ...}`` dict per entry in
    ``UPDATE_FREQUENCIES``; the text is the translated, title-cased value.
    '''
    options = []
    for frequency in UPDATE_FREQUENCIES:
        label = p.toolkit._(frequency.title())
        options.append({'text': label, 'value': frequency})
    return options
115 | 
116 | 
def link_for_harvest_object(id=None, guid=None, text=None):
    '''
    Build an HTML link to the ``harvester.object_show`` page of a harvest
    object.

    :param id: harvest object id (ignored when ``guid`` is given)
    :param guid: harvest object guid; when given, the object is looked up
        through the ``harvest_object_show`` action to obtain its id
    :param text: link text; defaults to the guid, then to the id
    :returns: the link wrapped in ``toolkit.literal``, or None when neither
        ``id`` nor ``guid`` is provided
    '''

    if not id and not guid:
        return None

    if guid:
        context = {'model': model, 'user': p.toolkit.c.user or p.toolkit.c.author}
        obj = logic.get_action('harvest_object_show')(context, {'id': guid, 'attr': 'guid'})
        # The action is expected to return a dictized object (TODO confirm),
        # on which attribute access would raise AttributeError. Accept both
        # shapes so either kind of result works.
        id = obj['id'] if isinstance(obj, dict) else obj.id

    url = h.url_for('harvester.object_show', id=id)
    text = text or guid or id
    # NOTE(review): url and text are interpolated without HTML escaping and
    # the result is marked as a literal. If guid/text can come from remote
    # sources they should be escaped first -- confirm against callers.
    link = '<a href="{url}">{text}</a>'.format(url=url, text=text)

    return p.toolkit.literal(link)
132 | 
133 | 
def harvest_source_extra_fields():
    '''
    Map each harvester name to the list of keys of its ``extra_schema()``.

    Harvesters without an ``extra_schema`` attribute are left out.
    '''
    extra_fields = {}
    for plugin in p.PluginImplementations(IHarvester):
        if hasattr(plugin, 'extra_schema'):
            harvester_name = plugin.info()['name']
            extra_fields[harvester_name] = list(plugin.extra_schema().keys())
    return extra_fields
141 | 


--------------------------------------------------------------------------------
/ckanext/harvest/i18n/sv/LC_MESSAGES/ckanext-harvest.mo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/i18n/sv/LC_MESSAGES/ckanext-harvest.mo


--------------------------------------------------------------------------------
/ckanext/harvest/interfaces.py:
--------------------------------------------------------------------------------
  1 | from ckan.plugins.interfaces import Interface
  2 | 
  3 | 
class IHarvester(Interface):
    '''
    Common harvesting interface

    '''

    def info(self):
        '''
        Harvesting implementations must provide this method, which will return
        a dictionary containing different descriptors of the harvester. The
        returned dictionary should contain:

        * name: machine-readable name. This will be the value stored in the
          database, and the one used by ckanext-harvest to call the appropriate
          harvester.
        * title: human-readable name. This will appear in the form's select box
          in the WUI.
        * description: a small description of what the harvester does. This
          will appear on the form as a guidance to the user.

        A complete example may be::

            {
                'name': 'csw',
                'title': 'CSW Server',
                'description': "A server that implements OGC's Catalog Service "
                               "for the Web (CSW) standard"
            }

        :returns: A dictionary with the harvester descriptors
        '''

    def validate_config(self, config):
        '''

        [optional]

        Harvesters can provide this method to validate the configuration
        entered in the form. It should return a single string, which will be
        stored in the database.  Exceptions raised will be shown in the form's
        error messages.

        :param config: Config string coming from the form
        :returns: A string with the validated configuration options
        '''

    def get_original_url(self, harvest_object_id):
        '''

        [optional]

        This optional but highly recommended method allows harvesters to
        return the URL to the original remote document, given a Harvest
        Object id.
        Note that by getting the harvest object you have access to its guid
        as well as the object source, which has the URL.
        This URL will be used on error reports to help publishers link to the
        original document that has the errors. If this method is not provided
        or no URL is returned, only a link to the local copy of the remote
        document will be shown.

        Examples:
            * For a CKAN record: http://{ckan-instance}/api/rest/{guid}
            * For a WAF record: http://{waf-root}/{file-name}
            * For a CSW record: http://{csw-server}/?Request=GetElementById&Id={guid}&...

        :param harvest_object_id: HarvestObject id
        :returns: A string with the URL to the original document
        '''

    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its job. The HarvestObjects need a
              reference date with the last modified date for the resource, this
              may need to be set in a different stage depending on the type of
              source.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.
            - to abort the harvest, create a HarvestGatherError and raise an
              exception. Any created HarvestObjects will be deleted.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''

    def fetch_stage(self, harvest_object):
        '''
        The fetch stage will receive a HarvestObject object and will be
        responsible for:
            - getting the contents of the remote object (e.g. for a CSW server,
              perform a GetRecordById request).
            - saving the content in the provided HarvestObject.
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything is ok (ie the object should now be
              imported), "unchanged" if the object didn't need harvesting after
              all (ie no error, but don't continue to import stage) or False if
              there were errors.

        :param harvest_object: HarvestObject object
        :returns: True if successful, 'unchanged' if nothing to import after
                  all, False if not successful
        '''

    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g.
              create, update or delete a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package should be added to the HarvestObject.
            - setting the HarvestObject.package (if there is one)
            - setting the HarvestObject.current for this harvest:
               - True if successfully created/updated
               - False if successfully deleted
            - setting HarvestObject.current to False for previous harvest
              objects of this harvest source if the action was successful.
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - creating the HarvestObject - Package relation (if necessary)
            - returning True if the action was done, "unchanged" if the object
              didn't need harvesting after all or False if there were errors.

        NB You can run this stage repeatedly using the ``harvester import``
        CLI command.

        :param harvest_object: HarvestObject object
        :returns: True if the action was done, "unchanged" if the object didn't
                  need harvesting after all or False if there were errors.
        '''
139 | 


--------------------------------------------------------------------------------
/ckanext/harvest/log.py:
--------------------------------------------------------------------------------
 1 | from logging import Handler, NOTSET
 2 | 
 3 | from ckanext.harvest.model import HarvestLog
 4 | 
 5 | 
 6 | class DBLogHandler(Handler):
 7 |     def __init__(self, level=NOTSET):
 8 |         super(DBLogHandler, self).__init__(level=level)
 9 | 
10 |     def emit(self, record):
11 |         try:
12 |             level = record.levelname
13 |             msg = self.format(record)
14 |             obj = HarvestLog(level=level, content=msg)
15 |             obj.save()
16 |         except Exception:
17 |             pass
18 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/__init__.py:
--------------------------------------------------------------------------------
# Declare this package as a namespace package so that its __path__ can be
# extended with same-named packages found elsewhere on the import path.
try:
    import pkg_resources
    pkg_resources.declare_namespace(__name__)
except ImportError:
    # pkg_resources is not importable: use the pkgutil equivalent instead.
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)
 7 | 
 8 | 
 9 | class HarvestJobExists(Exception):
10 |     pass
11 | 
12 | 
class HarvestSourceInactiveError(Exception):
    '''Error type for operations attempted on an inactive harvest source.'''
15 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/action/__init__.py:
--------------------------------------------------------------------------------
# Declare this package as a namespace package so that its __path__ can be
# extended with same-named packages found elsewhere on the import path.
try:
    import pkg_resources
    pkg_resources.declare_namespace(__name__)
except ImportError:
    # pkg_resources is not importable: use the pkgutil equivalent instead.
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)
7 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/action/create.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | import ckan
  4 | 
  5 | from ckan.plugins import toolkit
  6 | 
  7 | from ckanext.harvest.logic import HarvestJobExists, HarvestSourceInactiveError
  8 | from ckanext.harvest.utils import (
  9 |     DATASET_TYPE_NAME
 10 | )
 11 | from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject,
 12 |                                    HarvestObjectExtra)
 13 | from ckanext.harvest.logic.dictization import (harvest_job_dictize,
 14 |                                                harvest_object_dictize)
 15 | from ckanext.harvest.logic.schema import harvest_object_create_schema
 16 | from ckanext.harvest.logic.action.get import (harvest_source_list,
 17 |                                               harvest_job_list)
 18 | 
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Local shortcuts to CKAN's dictization validate function and the toolkit's
# access check.
_validate = ckan.lib.navl.dictization_functions.validate
check_access = toolkit.check_access
 23 | 
 24 | 
 25 | class InactiveSource(Exception):
 26 |     pass
 27 | 
 28 | 
 29 | def harvest_source_create(context, data_dict):
 30 |     '''
 31 |     Creates a new harvest source
 32 | 
 33 |     This method just proxies the request to package_create,
 34 |     which will create a harvest_source dataset type and the
 35 |     HarvestSource object. All auth checks and validation will
 36 |     be done there .We only make sure to set the dataset type.
 37 | 
 38 |     Note that the harvest source type (ckan, waf, csw, etc)
 39 |     is now set via the source_type field.
 40 | 
 41 |     :param url: the URL for the harvest source
 42 |     :type url: string
 43 |     :param name: the name of the new harvest source, must be between 2 and 100
 44 |         characters long and contain only lowercase alphanumeric characters
 45 |     :type name: string
 46 |     :param title: the title of the dataset (optional, default: same as
 47 |         ``name``)
 48 |     :type title: string
 49 |     :param notes: a description of the harvest source (optional)
 50 |     :type notes: string
 51 |     :param source_type: the harvester type for this source. This must be one
 52 |         of the registerd harvesters, eg 'ckan', 'csw', etc.
 53 |     :type source_type: string
 54 |     :param frequency: the frequency in wich this harvester should run. See
 55 |         ``ckanext.harvest.model`` source for possible values. Default is
 56 |         'MANUAL'
 57 |     :type frequency: string
 58 |     :param config: extra configuration options for the particular harvester
 59 |         type. Should be a serialized as JSON. (optional)
 60 |     :type config: string
 61 | 
 62 | 
 63 |     :returns: the newly created harvest source
 64 |     :rtype: dictionary
 65 |     '''
 66 | 
 67 |     log.info('Creating harvest source: %r', data_dict)
 68 | 
 69 |     data_dict['type'] = DATASET_TYPE_NAME
 70 | 
 71 |     context['extras_as_string'] = True
 72 |     source = toolkit.get_action('package_create')(context, data_dict)
 73 | 
 74 |     return source
 75 | 
 76 | 
 77 | def harvest_job_create(context, data_dict):
 78 |     '''
 79 |     Creates a Harvest Job for a Harvest Source and runs it (by putting it on
 80 |     the gather queue)
 81 | 
 82 |     :param source_id: id of the harvest source to create a job for
 83 |     :type source_id: string
 84 |     :param run: whether to also run it or not (default: True)
 85 |     :type run: bool
 86 |     '''
 87 |     log.info('Harvest job create: %r', data_dict)
 88 |     check_access('harvest_job_create', context, data_dict)
 89 | 
 90 |     source_id = data_dict['source_id']
 91 |     run_it = data_dict.get('run', True)
 92 | 
 93 |     # Check if source exists
 94 |     source = HarvestSource.get(source_id)
 95 |     if not source:
 96 |         log.warn('Harvest source %s does not exist', source_id)
 97 |         raise toolkit.ObjectNotFound('Harvest source %s does not exist' % source_id)
 98 | 
 99 |     # Check if the source is active
100 |     if not source.active:
101 |         log.warn('Harvest job cannot be created for inactive source %s',
102 |                  source_id)
103 |         raise HarvestSourceInactiveError('Can not create jobs on inactive sources')
104 | 
105 |     # Check if there already is an unrun or currently running job for this
106 |     # source
107 |     exists = _check_for_existing_jobs(context, source_id)
108 |     if exists:
109 |         log.warn('There is already an unrun job %r for this source %s',
110 |                  exists, source_id)
111 |         raise HarvestJobExists('There already is an unrun job for this source')
112 | 
113 |     job = HarvestJob()
114 |     job.source = source
115 |     job.save()
116 |     log.info('Harvest job saved %s', job.id)
117 | 
118 |     if run_it:
119 |         toolkit.get_action('harvest_send_job_to_gather_queue')(
120 |             context, {'id': job.id})
121 | 
122 |     return harvest_job_dictize(job, context)
123 | 
124 | 
def harvest_job_create_all(context, data_dict):
    '''
    Creates a Harvest Job for all active Harvest Sources and runs them (by
    putting them on the gather queue)

    Sources that already have a 'New' or 'Running' job are skipped.

    :param run: whether to also run the jobs or not (default: True)
    :type run: bool

    Any other keys in ``data_dict`` are passed on to ``harvest_source_list``
    together with ``only_active=True``.

    :returns: the jobs that were created, one dict per job
    :rtype: list
    '''

    log.info('Harvest job create all: %r', data_dict)
    check_access('harvest_job_create_all', context, data_dict)

    run = data_dict.get('run', True)

    # Query with a copy so the caller's dict is not modified as a side effect
    list_params = dict(data_dict, only_active=True)

    # Get all active sources
    sources = harvest_source_list(context, list_params)
    jobs = []
    # Create a new job for each, if there isn't already one
    for source in sources:
        exists = _check_for_existing_jobs(context, source['id'])
        if exists:
            log.info('Skipping source %s as it already has a pending job',
                     source['id'])
            continue

        job = harvest_job_create(
            context, {'source_id': source['id'], 'run': run})
        jobs.append(job)

    log.info('Created jobs for %s%i harvest sources',
             'and run ' if run else '', len(jobs))
    return jobs
161 | 
162 | 
def _check_for_existing_jobs(context, source_id):
    '''
    Tell whether the given source has any job whose status is 'New' or
    'Running'.

    Both statuses are always queried through ``harvest_job_list``.

    rtype: boolean
    '''
    per_status = []
    for status in (u'New', u'Running'):
        query = {
            'source_id': source_id,
            'status': status
        }
        per_status.append(harvest_job_list(context, query))

    new_jobs, running_jobs = per_status
    return len(new_jobs + running_jobs) > 0
183 | 
184 | 
def harvest_object_create(context, data_dict):
    ''' Create a new harvest object

    The input is validated against ``harvest_object_create_schema`` before
    the object is built and saved.

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)

    :returns: the saved object as produced by ``harvest_object_dictize``
    :raises toolkit.ValidationError: if validation reports errors
    '''
    check_access('harvest_object_create', context, data_dict)

    schema = harvest_object_create_schema()
    data, errors = _validate(data_dict, schema, context)
    if errors:
        raise toolkit.ValidationError(errors)

    fields = {
        'guid': data.get('guid'),
        'content': data.get('content'),
        # which was validated into a HarvestJob object
        'job': data['job_id'],
        'harvest_source_id': data.get('source_id'),
        'package_id': data.get('package_id'),
        'extras': [HarvestObjectExtra(key=key, value=value)
                   for key, value in data.get('extras', {}).items()],
    }
    harvest_object = HarvestObject(**fields)
    harvest_object.save()

    return harvest_object_dictize(harvest_object, context)
214 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/action/delete.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from ckan import plugins as p
 4 | 
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def harvest_source_delete(context, data_dict):
 9 |     '''Deletes an existing harvest source
10 | 
11 |     This method just proxies the request to package_delete,
12 |     which will delete the actual harvest type dataset and the
13 |     HarvestSource object (via the after_delete extension point).
14 | 
15 |     :param id: the name or id of the harvest source to delete
16 |     :type id: string
17 |     '''
18 |     log.info('Deleting harvest source: %r', data_dict)
19 | 
20 |     p.toolkit.check_access('harvest_source_delete', context, data_dict)
21 | 
22 |     p.toolkit.get_action('package_delete')(context, data_dict)
23 | 
24 |     if context.get('clear_source', False):
25 | 
26 |         # We need the id. The name won't work.
27 |         package_dict = p.toolkit.get_action('package_show')(context, data_dict)
28 | 
29 |         p.toolkit.get_action('harvest_source_clear')(
30 |             context, {'id': package_dict['id']})
31 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/action/patch.py:
--------------------------------------------------------------------------------
 1 | '''API functions for partial updates of existing data in CKAN'''
 2 | 
 3 | import logging
 4 | from ckan.logic import get_action
 5 | from ckanext.harvest.utils import (
 6 |     DATASET_TYPE_NAME
 7 | )
 8 | 
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
10 | 
11 | 
def harvest_source_patch(context, data_dict):
    '''
    Partially update an existing harvest source

    The request is handed over to package_patch, which updates the
    harvest_source dataset type and the HarvestSource object and performs
    all auth checks and validation. This function only forces the dataset
    type and asks for extras to be handled as strings.

    The harvester type (ckan, waf, csw, etc) is set through the
    ``source_type`` field.

    Fields that are not provided stay as they were before.

    :param id: the name or id of the harvest source to update
    :type id: string
    :param url: the URL for the harvest source
    :type url: string
    :param name: the name of the new harvest source, must be between 2 and 100
        characters long and contain only lowercase alphanumeric characters
    :type name: string
    :param title: the title of the dataset (optional, default: same as
        ``name``)
    :type title: string
    :param notes: a description of the harvest source (optional)
    :type notes: string
    :param source_type: the harvester type for this source. This must be one
        of the registered harvesters, eg 'ckan', 'csw', etc.
    :type source_type: string
    :param frequency: the frequency in which this harvester should run. See
        ``ckanext.harvest.model`` source for possible values. Default is
        'MANUAL'
    :type frequency: string
    :param config: extra configuration options for the particular harvester
        type. Should be serialized as JSON. (optional)
    :type config: string

    :returns: the updated harvest source
    :rtype: dictionary
    '''
    log.info('Patch harvest source: %r', data_dict)

    data_dict['type'] = DATASET_TYPE_NAME
    context['extras_as_string'] = True

    # A KeyError raised while looking up or running package_patch is
    # reported as the action being unavailable.
    try:
        return get_action('package_patch')(context, data_dict)
    except KeyError:
        raise Exception('The harvest_source_patch action is not available on '
                        'this version of CKAN')
64 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/auth/__init__.py:
--------------------------------------------------------------------------------
 1 | from ckan.plugins import toolkit as pt
 2 | from ckanext.harvest import model as harvest_model
 3 | 
 4 | 
 5 | def user_is_sysadmin(context):
 6 |     '''
 7 |         Checks if the user defined in the context is a sysadmin
 8 | 
 9 |         rtype: boolean
10 |     '''
11 |     model = context['model']
12 |     user = context['user']
13 |     user_obj = model.User.get(user)
14 |     if not user_obj:
15 |         raise pt.Objectpt.ObjectNotFound('User {0} not found').format(user)
16 | 
17 |     return user_obj.sysadmin
18 | 
19 | 
def _get_object(context, data_dict, name, class_name):
    '''
        Return ``context[name]`` when the context has that key; otherwise
        look the object up with ``harvest_model.<class_name>.get`` using
        ``data_dict['id']``.

        Raises pt.ObjectNotFound when the lookup returns nothing.
    '''
    if name in context:
        return context[name]

    object_id = data_dict.get('id', None)
    found = getattr(harvest_model, class_name).get(object_id)
    if not found:
        raise pt.ObjectNotFound
    return found
33 | 
34 | 
def get_source_object(context, data_dict={}):
    '''
        Return the harvest source: ``context['source']`` when present,
        otherwise the HarvestSource identified by ``data_dict['id']``.
    '''
    source = _get_object(context, data_dict, 'source', 'HarvestSource')
    return source
37 | 
38 | 
def get_job_object(context, data_dict={}):
    '''
        Return the harvest job: ``context['job']`` when present, otherwise
        the HarvestJob identified by ``data_dict['id']``.
    '''
    job = _get_object(context, data_dict, 'job', 'HarvestJob')
    return job
41 | 
42 | 
def get_obj_object(context, data_dict={}):
    '''
        Return the harvest object: ``context['obj']`` when present,
        otherwise the HarvestObject identified by ``data_dict['id']``.
    '''
    harvest_object = _get_object(context, data_dict, 'obj', 'HarvestObject')
    return harvest_object
45 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/auth/create.py:
--------------------------------------------------------------------------------
 1 | from ckan.plugins import toolkit as pt
 2 | from ckanext.harvest.logic.auth import user_is_sysadmin
 3 | 
 4 | 
 5 | def harvest_source_create(context, data_dict):
 6 |     '''
 7 |         Authorization check for harvest source creation
 8 | 
 9 |         It forwards the checks to package_create, which will check for
10 |         organization membership, whether if sysadmin, etc according to the
11 |         instance configuration.
12 |     '''
13 |     user = context.get('user')
14 |     try:
15 |         pt.check_access('package_create', context, data_dict)
16 |         return {'success': True}
17 |     except pt.NotAuthorized:
18 |         return {'success': False,
19 |                 'msg': pt._('User {0} not authorized to create harvest sources').format(user)}
20 | 
21 | 
def harvest_job_create(context, data_dict):
    '''
        Auth check for creating a harvest job

        The decision is delegated to package_update on the harvest source
        dataset, ie a user may create jobs only if she may edit the source.

        Raises pt.ObjectNotFound when no package matches ``source_id``.
    '''
    model = context['model']
    source_id = data_dict['source_id']

    package = model.Package.get(source_id)
    if not package:
        raise pt.ObjectNotFound(pt._('Harvest source not found'))

    # Make the package available to the delegated check
    context['package'] = package
    try:
        pt.check_access('package_update', context, {"id": source_id})
    except pt.NotAuthorized:
        denied = pt._('User not authorized to create a job for source {0}').format(source_id)
        return {'success': False, 'msg': denied}
    return {'success': True}
43 | 
44 | 
def harvest_job_create_all(context, data_dict):
    '''
        Auth check for creating new jobs for all sources

        Succeeds only when the user in the context is a sysadmin
    '''
    if user_is_sysadmin(context):
        return {'success': True}
    return {'success': False, 'msg': pt._('Only sysadmins can create harvest jobs for all sources')}
55 | 
56 | 
def harvest_object_create(context, data_dict):
    """
        Auth check for creating a harvest object

        Always answers with a failure: only the sysadmins can create
        harvest objects
    """
    # sysadmins can run all actions if we've got to this point we're not a sysadmin
    denied = pt._('Only the sysadmins can create harvest objects')
    return {'success': False, 'msg': denied}
65 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/auth/delete.py:
--------------------------------------------------------------------------------
 1 | from ckan.plugins import toolkit as pt
 2 | 
 3 | 
 4 | def harvest_source_delete(context, data_dict):
 5 |     '''
 6 |         Authorization check for harvest source deletion
 7 | 
 8 |         It forwards the checks to package_delete, which will check for
 9 |         organization membership, whether if sysadmin, etc according to the
10 |         instance configuration.
11 |     '''
12 |     model = context.get('model')
13 |     user = context.get('user')
14 |     source_id = data_dict['id']
15 | 
16 |     pkg = model.Package.get(source_id)
17 |     if not pkg:
18 |         raise pt.ObjectNotFound(pt._('Harvest source not found'))
19 | 
20 |     context['package'] = pkg
21 | 
22 |     try:
23 |         pt.check_access('package_delete', context, data_dict)
24 |         return {'success': True}
25 |     except pt.NotAuthorized:
26 |         return {'success': False,
27 |                 'msg': pt._('User {0} not authorized to delete harvest source {1}').format(user, source_id)}
28 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/auth/get.py:
--------------------------------------------------------------------------------
  1 | from ckan.plugins import toolkit as pt
  2 | 
  3 | from ckanext.harvest.logic.auth import get_job_object
  4 | 
  5 | 
  6 | def auth_allow_anonymous_access(auth_function):
  7 |     '''
  8 |         Local version of the auth_allow_anonymous_access decorator that only
  9 |         calls the actual toolkit decorator if the CKAN version supports it
 10 |     '''
 11 |     if pt.check_ckan_version(min_version='2.2'):
 12 |         auth_function = pt.auth_allow_anonymous_access(auth_function)
 13 | 
 14 |     return auth_function
 15 | 
 16 | 
 17 | @auth_allow_anonymous_access
 18 | def harvest_source_show(context, data_dict):
 19 |     '''
 20 |         Authorization check for getting the details of a harvest source
 21 | 
 22 |         It forwards the checks to package_show, which will check for
 23 |         organization membership, whether if sysadmin, etc according to the
 24 |         instance configuration.
 25 |     '''
 26 |     model = context.get('model')
 27 |     user = context.get('user')
 28 |     source_id = data_dict['id']
 29 | 
 30 |     pkg = model.Package.get(source_id)
 31 |     if not pkg:
 32 |         raise pt.ObjectNotFound(pt._('Harvest source not found'))
 33 | 
 34 |     context['package'] = pkg
 35 | 
 36 |     try:
 37 |         pt.check_access('package_show', context, data_dict)
 38 |         return {'success': True}
 39 |     except pt.NotAuthorized:
 40 |         return {'success': False,
 41 |                 'msg': pt._('User {0} not authorized to read harvest source {1}')
 42 |                 .format(user, source_id)}
 43 | 
 44 | 
 45 | @auth_allow_anonymous_access
 46 | def harvest_source_show_status(context, data_dict):
 47 |     '''
 48 |         Authorization check for getting the status of a harvest source
 49 | 
 50 |         It forwards the checks to harvest_source_show.
 51 |     '''
 52 |     return harvest_source_show(context, data_dict)
 53 | 
 54 | 
 55 | @auth_allow_anonymous_access
 56 | def harvest_source_list(context, data_dict):
 57 |     '''
 58 |         Authorization check for getting a list of harveste sources
 59 | 
 60 |         Everybody can do it
 61 |     '''
 62 |     return {'success': True}
 63 | 
 64 | 
 65 | def harvest_job_show(context, data_dict):
 66 |     '''
 67 |         Authorization check for getting the details of a harvest job
 68 | 
 69 |         It forwards the checks to harvest_source_update, ie if the user can
 70 |         update the parent source (eg create new jobs), she can get the details
 71 |         for the job, including the reports
 72 |     '''
 73 |     user = context.get('user')
 74 |     job = get_job_object(context, data_dict)
 75 | 
 76 |     try:
 77 |         pt.check_access('harvest_source_update',
 78 |                         context,
 79 |                         {'id': job.source.id})
 80 |         return {'success': True}
 81 |     except pt.NotAuthorized:
 82 |         return {'success': False,
 83 |                 'msg': pt._('User {0} not authorized to see jobs from source {1}')
 84 |                 .format(user, job.source.id)}
 85 | 
 86 | 
 87 | def harvest_job_list(context, data_dict):
 88 |     '''
 89 |         Authorization check for getting a list of jobs for a source
 90 | 
 91 |         It forwards the checks to harvest_source_update, ie if the user can
 92 |         update the parent source (eg create new jobs), she can get the list of
 93 |         jobs
 94 |     '''
 95 |     user = context.get('user')
 96 |     source_id = data_dict['source_id']
 97 | 
 98 |     try:
 99 |         pt.check_access('harvest_source_update',
100 |                         context,
101 |                         {'id': source_id})
102 |         return {'success': True}
103 |     except pt.NotAuthorized:
104 |         return {'success': False,
105 |                 'msg': pt._('User {0} not authorized to list jobs for source {1}')
106 |                 .format(user, source_id)}
107 | 
108 | 
@auth_allow_anonymous_access
def harvest_object_show(context, data_dict):
    '''
        Auth check for reading the contents of a harvest object

        Always succeeds
    '''
    return dict(success=True)
117 | 
118 | 
def harvest_object_list(context, data_dict):
    '''
    Auth check that always succeeds.

    TODO: remove
    '''
    return dict(success=True)
124 | 
125 | 
@auth_allow_anonymous_access
def harvesters_info_show(context, data_dict):
    '''
        Auth check for reading information about the available harvesters

        Always succeeds
    '''
    return dict(success=True)
135 | 
136 | 
def harvest_get_notifications_recipients(context, data_dict):
    '''
        Auth check for getting the notification recipients

        Always answers with a failure: only sysadmins can access this
    '''
    return dict(success=False)
140 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/auth/patch.py:
--------------------------------------------------------------------------------
1 | import ckanext.harvest.logic.auth.update as _update
2 | 
# Patching a harvest source uses the very same auth function as updating one.
harvest_source_patch = _update.harvest_source_update
4 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/auth/update.py:
--------------------------------------------------------------------------------
  1 | from ckan.plugins import toolkit as pt
  2 | from ckanext.harvest.logic.auth import user_is_sysadmin
  3 | 
  4 | 
  5 | def harvest_source_update(context, data_dict):
  6 |     '''
  7 |         Authorization check for harvest source update
  8 | 
  9 |         It forwards the checks to package_update, which will check for
 10 |         organization membership, whether if sysadmin, etc according to the
 11 |         instance configuration.
 12 |     '''
 13 |     model = context.get('model')
 14 |     user = context.get('user')
 15 |     source_id = data_dict['id']
 16 | 
 17 |     pkg = model.Package.get(source_id)
 18 |     if not pkg:
 19 |         raise pt.ObjectNotFound(pt._('Harvest source not found'))
 20 | 
 21 |     context['package'] = pkg
 22 | 
 23 |     try:
 24 |         pt.check_access('package_update', context, data_dict)
 25 |         return {'success': True}
 26 |     except pt.NotAuthorized:
 27 |         return {'success': False,
 28 |                 'msg': pt._('User {0} not authorized to update harvest source {1}').format(user, source_id)}
 29 | 
 30 | 
 31 | def harvest_sources_clear(context, data_dict):
 32 |     '''
 33 |         Authorization check for clearing history for all harvest sources
 34 | 
 35 |         Only sysadmins can do it
 36 |     '''
 37 |     if not user_is_sysadmin(context):
 38 |         return {'success': False, 'msg': pt._('Only sysadmins can clear history for all harvest jobs')}
 39 |     else:
 40 |         return {'success': True}
 41 | 
 42 | 
 43 | def harvest_source_clear(context, data_dict):
 44 |     '''
 45 |         Authorization check for clearing a harvest source
 46 | 
 47 |         It forwards to harvest_source_update
 48 |     '''
 49 |     return harvest_source_update(context, data_dict)
 50 | 
 51 | 
 52 | def harvest_objects_import(context, data_dict):
 53 |     '''
 54 |         Authorization check reimporting all harvest objects
 55 | 
 56 |         Only sysadmins can do it
 57 |     '''
 58 |     if not user_is_sysadmin(context):
 59 |         return {'success': False, 'msg': pt._('Only sysadmins can reimport all harvest objects')}
 60 |     else:
 61 |         return {'success': True}
 62 | 
 63 | 
 64 | def harvest_jobs_run(context, data_dict):
 65 |     '''
 66 |         Authorization check for running the pending harvest jobs
 67 | 
 68 |         Only sysadmins can do it
 69 |     '''
 70 |     if not user_is_sysadmin(context):
 71 |         return {'success': False, 'msg': pt._('Only sysadmins can run the pending harvest jobs')}
 72 |     else:
 73 |         return {'success': True}
 74 | 
 75 | 
 76 | def harvest_send_job_to_gather_queue(context, data_dict):
 77 |     '''
 78 |         Authorization check for sending a job to the gather queue
 79 | 
 80 |         It forwards the checks to harvest_job_create, ie the user can only run
 81 |         the job if she is allowed to create the job.
 82 |     '''
 83 |     from ckanext.harvest.logic.auth.create import harvest_job_create
 84 |     return harvest_job_create(context, data_dict)
 85 | 
 86 | 
 87 | def harvest_job_abort(context, data_dict):
 88 |     '''
 89 |         Authorization check for aborting a running harvest job
 90 | 
 91 |         Same permissions as running one
 92 |     '''
 93 |     return harvest_jobs_run(context, data_dict)
 94 | 
 95 | 
 96 | def harvest_sources_reindex(context, data_dict):
 97 |     '''
 98 |         Authorization check for reindexing all harvest sources
 99 | 
100 |         Only sysadmins can do it
101 |     '''
102 |     if not user_is_sysadmin(context):
103 |         return {'success': False, 'msg': pt._('Only sysadmins can reindex all harvest sources')}
104 |     else:
105 |         return {'success': True}
106 | 
107 | 
def harvest_source_reindex(context, data_dict):
    '''
        Auth check for reindexing a harvest source

        Same answer as harvest_source_update, which it calls directly.
    '''
    decision = harvest_source_update(context, data_dict)
    return decision
115 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/dictization.py:
--------------------------------------------------------------------------------
  1 | from sqlalchemy import func, text
  2 | 
  3 | from ckan.model import Group
  4 | from ckan import logic
  5 | from ckanext.harvest.model import (HarvestJob, HarvestObject,
  6 |                                    HarvestGatherError, HarvestObjectError)
  7 | 
  8 | 
  9 | def harvest_source_dictize(source, context, last_job_status=False):
 10 |     out = source.as_dict()
 11 | 
 12 |     out['publisher_title'] = u''
 13 | 
 14 |     publisher_id = out.get('publisher_id')
 15 |     if publisher_id:
 16 |         group = Group.get(publisher_id)
 17 |         if group:
 18 |             out['publisher_title'] = group.title
 19 | 
 20 |     out['status'] = _get_source_status(source, context)
 21 | 
 22 |     if last_job_status:
 23 |         source_status = logic.get_action('harvest_source_show_status')(context, {'id': source.id})
 24 |         out['last_job_status'] = source_status.get('last_job', {})
 25 | 
 26 |     return out
 27 | 
 28 | 
def harvest_job_dictize(job, context):
    '''
    Turn a harvest job into a dict.

    Starts from ``job.as_dict()`` and, depending on flags read from the
    context (both default to True), adds:

    - ``return_stats``: a ``stats`` dict with object counts per report
      status plus an ``errored`` count
    - ``return_error_summary``: ``object_error_summary`` and
      ``gather_error_summary``, the most frequent error messages with
      their counts

    The number of summary rows is read from the context key
    ``error_summmary_limit`` (default 20).
    NOTE(review): that key is spelled with three m's in the code; callers
    have to use the exact same spelling.

    ``context['model']`` must expose the ``Session`` used for the queries.
    '''
    out = job.as_dict()

    model = context['model']

    if context.get('return_stats', True):
        # Number of harvest objects of this job, grouped by report status
        stats = model.Session.query(
            HarvestObject.report_status,
            func.count(HarvestObject.id).label('total_objects'))\
            .filter_by(harvest_job_id=job.id)\
            .group_by(HarvestObject.report_status).all()
        # Known statuses start at zero; statuses returned by the query
        # overwrite these entries or add new ones
        out['stats'] = {'added': 0, 'updated': 0, 'not modified': 0,
                        'errored': 0, 'deleted': 0}
        for status, count in stats:
            out['stats'][status] = count

        # We actually want to check which objects had errors, because they
        # could have been added/updated anyway (eg bbox errors)
        count = model.Session.query(
            func.distinct(HarvestObjectError.harvest_object_id)) \
            .join(HarvestObject) \
            .filter(HarvestObject.harvest_job_id == job.id) \
            .count()
        if count > 0:
            # Replaces whatever the per-status query reported as 'errored'
            out['stats']['errored'] = count

        # Add gather errors to the error count
        count = model.Session.query(HarvestGatherError) \
            .filter(HarvestGatherError.harvest_job_id == job.id) \
            .count()
        if count > 0:
            out['stats']['errored'] = out['stats'].get('errored', 0) + count

    if context.get('return_error_summary', True):
        # Object error messages of this job, most frequent first
        q = model.Session.query(
            HarvestObjectError.message,
            func.count(HarvestObjectError.message).label('error_count')) \
            .join(HarvestObject) \
            .filter(HarvestObject.harvest_job_id == job.id) \
            .group_by(HarvestObjectError.message) \
            .order_by(text('error_count desc')) \
            .limit(context.get('error_summmary_limit', 20))
        out['object_error_summary'] = harvest_error_dictize(q.all(), context)
        # Gather error messages of this job, most frequent first
        q = model.Session.query(
            HarvestGatherError.message,
            func.count(HarvestGatherError.message).label('error_count')) \
            .filter(HarvestGatherError.harvest_job_id == job.id) \
            .group_by(HarvestGatherError.message) \
            .order_by(text('error_count desc')) \
            .limit(context.get('error_summmary_limit', 20))
        out['gather_error_summary'] = harvest_error_dictize(q.all(), context)

    return out
 82 | 
 83 | 
 84 | def harvest_object_dictize(obj, context):
 85 |     out = obj.as_dict()
 86 |     out['source'] = obj.harvest_source_id
 87 |     out['job'] = obj.harvest_job_id
 88 | 
 89 |     if obj.package:
 90 |         out['package'] = obj.package.id
 91 | 
 92 |     out['errors'] = []
 93 |     for error in obj.errors:
 94 |         out['errors'].append(error.as_dict())
 95 | 
 96 |     out['extras'] = {}
 97 |     for extra in obj.extras:
 98 |         out['extras'][extra.key] = extra.value
 99 | 
100 |     return out
101 | 
102 | 
def harvest_log_dictize(obj, context):
    '''
    Return ``obj.as_dict()`` with its ``id`` entry removed.

    A KeyError is raised if the dict has no ``id``.
    '''
    result = obj.as_dict()
    result.pop('id')
    return result
108 | 
109 | 
def harvest_error_dictize(obj, context):
    '''
    Convert an iterable of rows into a list of dicts by calling
    ``_asdict()`` on every row, keeping the order.
    '''
    return [row._asdict() for row in obj]
115 | 
116 | 
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Build a small status dict for a source with the keys ``job_count``,
    ``next_harvest`` and ``last_harvest_request``. When the source has no
    jobs, a ``msg`` entry is added and the other fields keep their empty
    defaults.
    '''
    job_count = HarvestJob.filter(source=source).count()

    status = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        }

    if not job_count:
        status['msg'] = 'No jobs yet'
        return status

    status['job_count'] = job_count

    # A job still in status 'New' counts as the next scheduled harvest
    pending_job = HarvestJob.filter(source=source, status=u'New').first()
    status['next_harvest'] = 'Scheduled' if pending_job else 'Not yet scheduled'

    # Most recently created job that has finished
    finished_job = HarvestJob.filter(source=source, status=u'Finished') \
        .order_by(HarvestJob.created.desc()).first()
    if finished_job:
        status['last_harvest_request'] = str(finished_job.gather_finished)
    else:
        status['last_harvest_request'] = 'Not yet harvested'

    return status
155 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/schema.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import ckan.plugins.toolkit as tk
  3 | 
  4 | from ckan.logic.schema import default_extras_schema
  5 | from ckan.logic.validators import (package_id_exists,
  6 |                                    name_validator,
  7 |                                    owner_org_validator,
  8 |                                    package_name_validator,
  9 |                                    boolean_validator,
 10 |                                    )
 11 | from ckan.logic.converters import convert_to_extras, convert_from_extras
 12 | from ckantoolkit import unicode_safe
 13 | 
 14 | from ckanext.harvest.logic.validators import (
 15 |     harvest_source_url_validator,
 16 |     harvest_source_type_exists,
 17 |     harvest_source_config_validator,
 18 |     harvest_source_extra_validator,
 19 |     harvest_source_frequency_exists,
 20 |     dataset_type_exists,
 21 |     harvest_source_convert_from_config,
 22 |     harvest_source_id_exists,
 23 |     harvest_job_exists,
 24 |     harvest_object_extras_validator,
 25 | )
# Validators looked up by name through the toolkit, used by the schemas below.
ignore_missing = tk.get_validator("ignore_missing")
not_empty = tk.get_validator("not_empty")
ignore = tk.get_validator("ignore")
if_empty_same_as = tk.get_validator("if_empty_same_as")
 30 | 
 31 | 
 32 | def harvest_source_schema():
 33 | 
 34 |     schema = {
 35 |         'id': [ignore_missing, unicode_safe, package_id_exists],
 36 |         'type': [dataset_type_exists, unicode_safe],
 37 |         'url': [not_empty, unicode_safe, harvest_source_url_validator],
 38 |         'name': [not_empty, unicode_safe, name_validator, package_name_validator],
 39 |         'source_type': [not_empty, unicode_safe, harvest_source_type_exists, convert_to_extras],
 40 |         'title': [if_empty_same_as("name"), unicode_safe],
 41 |         'notes': [ignore_missing, unicode_safe],
 42 |         'owner_org': [owner_org_validator, unicode_safe],
 43 |         'private': [ignore_missing, boolean_validator],
 44 |         'organization': [ignore_missing],
 45 |         'frequency': [ignore_missing, unicode_safe, harvest_source_frequency_exists, convert_to_extras],
 46 |         'state': [ignore_missing],
 47 |         'config': [ignore_missing, harvest_source_config_validator, convert_to_extras],
 48 |         'extras': default_extras_schema(),
 49 |     }
 50 | 
 51 |     extras_schema = default_extras_schema()
 52 |     extras_schema['__extras'] = [ignore]
 53 | 
 54 |     schema['extras'] = extras_schema
 55 | 
 56 |     return schema
 57 | 
 58 | 
 59 | def harvest_source_create_package_schema():
 60 | 
 61 |     schema = harvest_source_schema()
 62 |     schema['__extras'] = [harvest_source_extra_validator]
 63 |     schema['save'] = [ignore]
 64 |     schema.pop("id")
 65 | 
 66 |     return schema
 67 | 
 68 | 
 69 | def harvest_source_update_package_schema():
 70 | 
 71 |     schema = harvest_source_create_package_schema()
 72 |     schema['owner_org'] = [ignore_missing, owner_org_validator, unicode_safe]
 73 |     return schema
 74 | 
 75 | 
 76 | def harvest_source_show_package_schema():
 77 | 
 78 |     schema = harvest_source_schema()
 79 |     schema.update({
 80 |         'source_type': [convert_from_extras, ignore_missing],
 81 |         'frequency': [convert_from_extras, ignore_missing],
 82 |         'config': [convert_from_extras, harvest_source_convert_from_config, ignore_missing],
 83 |         'metadata_created': [],
 84 |         'metadata_modified': [],
 85 |         'owner_org': [],
 86 |         'creator_user_id': [],
 87 |         'organization': [],
 88 |         'notes': [],
 89 |         'revision_id': [ignore_missing],
 90 |         'revision_timestamp': [ignore_missing],
 91 |         'tracking_summary': [ignore_missing],
 92 |     })
 93 | 
 94 |     schema['__extras'] = [ignore]
 95 | 
 96 |     return schema
 97 | 
 98 | 
 99 | def harvest_object_create_schema():
100 |     schema = {
101 |         'guid': [ignore_missing, unicode_safe],
102 |         'content': [ignore_missing, unicode_safe],
103 |         'state': [ignore_missing, unicode_safe],
104 |         'job_id': [harvest_job_exists],
105 |         'source_id': [ignore_missing, harvest_source_id_exists],
106 |         'package_id': [ignore_missing, package_id_exists],
107 |         'extras': [ignore_missing, harvest_object_extras_validator],
108 |     }
109 |     return schema
110 | 


--------------------------------------------------------------------------------
/ckanext/harvest/logic/validators.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import logging
  4 | import json
  5 | 
  6 | from ckan.lib.navl.dictization_functions import Invalid, validate
  7 | from ckan import model
  8 | from ckan.plugins import PluginImplementations
  9 | 
 10 | from ckanext.harvest.utils import (
 11 |     DATASET_TYPE_NAME
 12 | )
 13 | from ckanext.harvest.model import HarvestSource, UPDATE_FREQUENCIES, HarvestJob
 14 | from ckanext.harvest.interfaces import IHarvester
 15 | from urllib.parse import (urlparse, urlunparse)
 16 | 
 17 | log = logging.getLogger(__name__)
 18 | 
 19 | 
 20 | def harvest_source_id_exists(value, context):
 21 | 
 22 |     result = HarvestSource.get(value)
 23 | 
 24 |     if not result:
 25 |         raise Invalid('Harvest Source with id %r does not exist.' % str(value))
 26 |     return value
 27 | 
 28 | 
 29 | def harvest_job_exists(value, context):
 30 |     '''Check if a harvest job exists and returns the model if it does'''
 31 |     result = HarvestJob.get(value)
 32 | 
 33 |     if not result:
 34 |         raise Invalid('Harvest Job with id %r does not exist.' % str(value))
 35 |     return result
 36 | 
 37 | 
 38 | def _normalize_url(url):
 39 |     '''Strips off parameters off a URL, and an unnecessary port number, so that
 40 |     simple variations on a URL are ignored, to used to help avoid getting two
 41 |     harvesters for the same URL.'''
 42 |     o = urlparse(url)
 43 | 
 44 |     # Normalize port
 45 |     if ':' in o.netloc:
 46 |         parts = o.netloc.split(':')
 47 |         if (o.scheme == 'http' and parts[1] == '80') or \
 48 |            (o.scheme == 'https' and parts[1] == '443'):
 49 |             netloc = parts[0]
 50 |         else:
 51 |             netloc = ':'.join(parts)
 52 |     else:
 53 |         netloc = o.netloc
 54 | 
 55 |     # Remove trailing slash
 56 |     path = o.path.rstrip('/')
 57 | 
 58 |     check_url = urlunparse((
 59 |         o.scheme,
 60 |         netloc,
 61 |         path,
 62 |         None, None, None))
 63 | 
 64 |     return check_url
 65 | 
 66 | 
 67 | def harvest_source_url_validator(key, data, errors, context):
 68 |     '''Validate the provided harvest source URL
 69 | 
 70 |     Checks that the URL & config combination are unique to this HarvestSource.
 71 |     '''
 72 | 
 73 |     package = context.get('package')
 74 | 
 75 |     if package:
 76 |         package_id = package.id
 77 |     else:
 78 |         package_id = data.get(key[:-1] + ('id',))
 79 | 
 80 |     try:
 81 |         new_config = data.get(key[:-1] + ('config',))
 82 |     except Exception:
 83 |         new_config = None
 84 | 
 85 |     new_url = _normalize_url(data[key])
 86 | 
 87 |     q = model.Session.query(model.Package.id, model.Package.url) \
 88 |              .filter(model.Package.type == DATASET_TYPE_NAME)
 89 | 
 90 |     if package_id:
 91 |         # When editing a source we need to avoid its own URL
 92 |         q = q.filter(model.Package.id != package_id)
 93 | 
 94 |     existing_sources = q.all()
 95 | 
 96 |     for id_, url in existing_sources:
 97 |         url = _normalize_url(url)
 98 |         conf = model.Session.query(HarvestSource.config).filter(
 99 |             HarvestSource.id == id_).first()
100 |         if conf:
101 |             conf = conf[0]
102 |         else:
103 |             conf = None
104 | 
105 |         if url == new_url and conf == new_config:
106 |             raise Invalid('There already is a Harvest Source for this URL (& '
107 |                           'config): url=%s config=%s' % (new_url, new_config))
108 | 
109 |     return data[key]
110 | 
111 | 
112 | def harvest_source_type_exists(value, context):
113 |     # TODO: use new description interface
114 | 
115 |     # Get all the registered harvester types
116 |     available_types = []
117 |     for harvester in PluginImplementations(IHarvester):
118 |         try:
119 |             info = harvester.info()
120 |         except AttributeError:
121 |             continue
122 |         if not info or 'name' not in info:
123 |             log.error('Harvester %s does not provide the harvester name in '
124 |                       'the info response' % harvester)
125 |             continue
126 |         available_types.append(info['name'])
127 | 
128 |     if value not in available_types:
129 |         raise Invalid('Unknown harvester type: %s. Registered types: %r' %
130 |                       (value, available_types))
131 | 
132 |     return value
133 | 
134 | 
135 | def harvest_source_config_validator(key, data, errors, context):
136 |     harvester_type = data.get(('source_type',), '')
137 |     for harvester in PluginImplementations(IHarvester):
138 |         info = harvester.info()
139 |         if info['name'] == harvester_type:
140 |             if hasattr(harvester, 'validate_config'):
141 |                 try:
142 |                     config = harvester.validate_config(data[key])
143 |                 except Exception as e:
144 |                     raise Invalid('Error parsing the configuration options: %s'
145 |                                   % e)
146 |                 if config is not None:
147 |                     # save an edited config, for use during the harvest
148 |                     data[key] = config
149 |             # no value is returned for this sort of validator/converter
150 | 
151 | 
152 | def keep_not_empty_extras(key, data, errors, context):
153 |     extras = data.pop(key, {})
154 |     for extras_key, value in extras.items():
155 |         if value:
156 |             data[key[:-1] + (extras_key,)] = value
157 | 
158 | 
159 | def harvest_source_extra_validator(key, data, errors, context):
160 |     harvester_type = data.get(('source_type',), '')
161 |     # Validates the value under ``key`` and folds it into the JSON ``config``.
162 |     # gather all extra fields to use as whitelist of what
163 |     # can be added to top level data_dict
164 |     all_extra_fields = set()
165 |     for harvester in PluginImplementations(IHarvester):
166 |         if not hasattr(harvester, 'extra_schema'):
167 |             continue
168 |         all_extra_fields.update(harvester.extra_schema().keys())
169 |     # Only the harvester whose name equals source_type contributes validators.
170 |     extra_schema = {'__extras': [keep_not_empty_extras]}
171 |     for harvester in PluginImplementations(IHarvester):
172 |         if not hasattr(harvester, 'extra_schema'):
173 |             continue
174 |         info = harvester.info()
175 |         if not info['name'] == harvester_type:
176 |             continue
177 |         extra_schema.update(harvester.extra_schema())
178 |         break
179 |     # The ``key`` parameter is last read on the next line; later loops rebind it.
180 |     extra_data, extra_errors = validate(data.get(key, {}), extra_schema)
181 |     for key in list(extra_data.keys()):
182 |         # only allow keys that appear in at least one harvester
183 |         if key not in all_extra_fields:
184 |             extra_data.pop(key)
185 |     # Copy the accepted values to the top level of the flattened data.
186 |     for key, value in extra_data.items():
187 |         data[(key,)] = value
188 |     # Report validation errors under the same top-level keys.
189 |     for key, value in extra_errors.items():
190 |         errors[(key,)] = value
191 | 
192 |     # need to get config out of extras as __extra runs
193 |     # after rest of validation
194 |     package_extras = data.get(('extras',), [])
195 |     # for/else: config_dict falls back to {} when no 'config' extra is found.
196 |     for num, extra in enumerate(list(package_extras)):
197 |         if extra['key'] == 'config':
198 |             # remove config extra so we can add back cleanly later
199 |             package_extras.pop(num)
200 |             try:
201 |                 config_dict = json.loads(extra.get('value') or '{}')
202 |             except ValueError:
203 |                 log.error('Wrong JSON provided in config, skipping')
204 |                 config_dict = {}
205 |             break
206 |     else:
207 |         config_dict = {}
208 |     config_dict.update(extra_data)
209 |     if config_dict and not extra_errors:
210 |         config = json.dumps(config_dict)
211 |         package_extras.append(dict(key='config',
212 |                                    value=config))
213 |         data[('config',)] = config
214 |     if package_extras:
215 |         data[('extras',)] = package_extras
216 | 
217 | 
218 | def harvest_source_convert_from_config(key, data, errors, context):
219 |     config = data[key]
220 |     if config:
221 |         try:
222 |             config_dict = json.loads(config)
223 |         except ValueError:
224 |             log.error('Wrong JSON provided config, skipping')
225 |             data[key] = None
226 |             return
227 |         for key, value in config_dict.items():
228 |             data[(key,)] = value
229 | 
230 | 
231 | def harvest_source_active_validator(value, context):
232 |     if isinstance(value, str):
233 |         if value.lower() == 'true':
234 |             return True
235 |         else:
236 |             return False
237 |     return bool(value)
238 | 
239 | 
240 | def harvest_source_frequency_exists(value):
241 |     if value == '':
242 |         value = 'MANUAL'
243 |     if value.upper() not in UPDATE_FREQUENCIES:
244 |         raise Invalid('Frequency %s not recognised' % value)
245 |     return value.upper()
246 | 
247 | 
248 | def dataset_type_exists(value):
249 |     if value != DATASET_TYPE_NAME:
250 |         value = DATASET_TYPE_NAME
251 |     return value
252 | 
253 | 
254 | def harvest_object_extras_validator(value, context):
255 |     if not isinstance(value, dict):
256 |         raise Invalid('extras must be a dict')
257 |     for v in value.values():
258 |         if not isinstance(v, str):
259 |             raise Invalid('extras must be a dict of strings')
260 |     return value
261 | 


--------------------------------------------------------------------------------
/ckanext/harvest/migration/harvest/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.


--------------------------------------------------------------------------------
/ckanext/harvest/migration/harvest/alembic.ini:
--------------------------------------------------------------------------------
 1 | # A generic, single database configuration.
 2 | 
 3 | [alembic]
 4 | # path to migration scripts
 5 | script_location = %(here)s
 6 | 
 7 | # template used to generate migration files
 8 | # file_template = %%(rev)s_%%(slug)s
 9 | 
10 | # timezone to use when rendering the date
11 | # within the migration file as well as the filename.
12 | # string value is passed to dateutil.tz.gettz()
13 | # leave blank for localtime
14 | # timezone =
15 | 
16 | # max length of characters to apply to the
17 | # "slug" field
18 | #truncate_slug_length = 40
19 | 
20 | # set to 'true' to run the environment during
21 | # the 'revision' command, regardless of autogenerate
22 | # revision_environment = false
23 | 
24 | # set to 'true' to allow .pyc and .pyo files without
25 | # a source .py file to be detected as revisions in the
26 | # versions/ directory
27 | # sourceless = false
28 | 
29 | # version location specification; this defaults
30 | # to %(here)s/versions.  When using multiple version
31 | # directories, initial revisions must be specified with --version-path
32 | # version_locations = %(here)s/bar %(here)s/bat %(here)s/versions
33 | 
34 | # the output encoding used when revision files
35 | # are written from script.py.mako
36 | # output_encoding = utf-8
37 | 
38 | sqlalchemy.url = driver://user:pass@localhost/dbname
39 | 
40 | 
41 | # Logging configuration
42 | [loggers]
43 | keys = root,sqlalchemy,alembic
44 | 
45 | [handlers]
46 | keys = console
47 | 
48 | [formatters]
49 | keys = generic
50 | 
51 | [logger_root]
52 | level = WARN
53 | handlers = console
54 | qualname =
55 | 
56 | [logger_sqlalchemy]
57 | level = WARN
58 | handlers =
59 | qualname = sqlalchemy.engine
60 | 
61 | [logger_alembic]
62 | level = INFO
63 | handlers =
64 | qualname = alembic
65 | 
66 | [handler_console]
67 | class = StreamHandler
68 | args = (sys.stderr,)
69 | level = NOTSET
70 | formatter = generic
71 | 
72 | [formatter_generic]
73 | format = %(levelname)-5.5s [%(name)s] %(message)s
74 | datefmt = %H:%M:%S
75 | 


--------------------------------------------------------------------------------
/ckanext/harvest/migration/harvest/env.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import with_statement
 4 | from alembic import context
 5 | from sqlalchemy import engine_from_config, pool
 6 | from logging.config import fileConfig
 7 | 
 8 | import os
 9 | 
10 | # this is the Alembic Config object, which provides
11 | # access to the values within the .ini file in use.
12 | config = context.config
13 | 
14 | # Interpret the config file for Python logging.
15 | # This line sets up loggers basically.
16 | fileConfig(config.config_file_name)
17 | 
18 | # add your model's MetaData object here
19 | # for 'autogenerate' support
20 | # from myapp import mymodel
21 | # target_metadata = mymodel.Base.metadata
22 | target_metadata = None
23 | 
24 | # other values from the config, defined by the needs of env.py,
25 | # can be acquired:
26 | # my_important_option = config.get_main_option("my_important_option")
27 | # ... etc.
28 | 
29 | name = os.path.basename(os.path.dirname(__file__))
30 | 
31 | 
32 | def run_migrations_offline():
33 |     """Run migrations in 'offline' mode.
34 | 
35 |     This configures the context with just a URL
36 |     and not an Engine, though an Engine is acceptable
37 |     here as well.  By skipping the Engine creation
38 |     we don't even need a DBAPI to be available.
39 | 
40 |     Calls to context.execute() here emit the given string to the
41 |     script output.
42 | 
43 |     """
44 | 
45 |     url = config.get_main_option(u"sqlalchemy.url")
46 |     context.configure(
47 |         url=url, target_metadata=target_metadata, literal_binds=True,
48 |         version_table=u'{}_alembic_version'.format(name)
49 |     )
50 | 
51 |     with context.begin_transaction():
52 |         context.run_migrations()
53 | 
54 | 
55 | def run_migrations_online():
56 |     """Run migrations in 'online' mode.
57 | 
58 |     In this scenario we need to create an Engine
59 |     and associate a connection with the context.
60 | 
61 |     """
62 |     connectable = engine_from_config(
63 |         config.get_section(config.config_ini_section),
64 |         prefix=u'sqlalchemy.',
65 |         poolclass=pool.NullPool)
66 | 
67 |     with connectable.connect() as connection:
68 |         context.configure(
69 |             connection=connection,
70 |             target_metadata=target_metadata,
71 |             version_table=u'{}_alembic_version'.format(name)
72 |         )
73 | 
74 |         with context.begin_transaction():
75 |             context.run_migrations()
76 | 
77 | 
78 | if context.is_offline_mode():
79 |     run_migrations_offline()
80 | else:
81 |     run_migrations_online()
82 | 


--------------------------------------------------------------------------------
/ckanext/harvest/migration/harvest/script.py.mako:
--------------------------------------------------------------------------------
 1 | """${message}
 2 | 
 3 | Revision ID: ${up_revision}
 4 | Revises: ${down_revision | comma,n}
 5 | Create Date: ${create_date}
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 | 
18 | 
19 | def upgrade():
20 |     ${upgrades if upgrades else "pass"}
21 | 
22 | 
23 | def downgrade():
24 |     ${downgrades if downgrades else "pass"}
25 | 


--------------------------------------------------------------------------------
/ckanext/harvest/migration/harvest/versions/3b4894672727_create_harvest_tables.py:
--------------------------------------------------------------------------------
  1 | """create harvest tables
  2 | 
  3 | Revision ID: 3b4894672727
  4 | Revises:
  5 | Create Date: 2023-11-02 15:53:02.262586
  6 | 
  7 | """
  8 | from alembic import op
  9 | import sqlalchemy as sa
 10 | 
 11 | 
 12 | # revision identifiers, used by Alembic.
 13 | revision = "3b4894672727"
 14 | down_revision = None
 15 | branch_labels = None
 16 | depends_on = None
 17 | 
 18 | 
 19 | def upgrade():  # create whichever harvest tables and indexes are missing
 20 |     engine = op.get_bind()
 21 |     inspector = sa.inspect(engine)
 22 |     tables = inspector.get_table_names()
 23 |     if "harvest_source" not in tables:
 24 |         op.create_table(
 25 |             "harvest_source",
 26 |             sa.Column("id", sa.UnicodeText, primary_key=True),
 27 |             sa.Column("url", sa.UnicodeText, nullable=False),
 28 |             sa.Column("title", sa.UnicodeText),
 29 |             sa.Column("description", sa.UnicodeText),
 30 |             sa.Column("config", sa.UnicodeText),
 31 |             sa.Column("created", sa.DateTime),
 32 |             sa.Column("type", sa.UnicodeText, nullable=False),
 33 |             sa.Column("active", sa.Boolean),
 34 |             sa.Column("user_id", sa.UnicodeText),
 35 |             sa.Column("publisher_id", sa.UnicodeText),
 36 |             sa.Column("frequency", sa.UnicodeText),
 37 |             sa.Column("next_run", sa.DateTime),
 38 |         )
 39 |     # harvest_job rows reference harvest_source through source_id.
 40 |     if "harvest_job" not in tables:
 41 |         op.create_table(
 42 |             "harvest_job",
 43 |             sa.Column("id", sa.UnicodeText, primary_key=True),
 44 |             sa.Column("created", sa.DateTime),
 45 |             sa.Column("gather_started", sa.DateTime),
 46 |             sa.Column("gather_finished", sa.DateTime),
 47 |             sa.Column("finished", sa.DateTime),
 48 |             sa.Column(
 49 |                 "source_id",
 50 |                 sa.UnicodeText,
 51 |                 sa.ForeignKey("harvest_source.id"),
 52 |             ),
 53 |             sa.Column("status", sa.UnicodeText, nullable=False),
 54 |         )
 55 |     # harvest_object rows reference a job, a source and a package.
 56 |     if "harvest_object" not in tables:
 57 |         op.create_table(
 58 |             "harvest_object",
 59 |             sa.Column("id", sa.UnicodeText, primary_key=True),
 60 |             sa.Column("guid", sa.UnicodeText),
 61 |             sa.Column("current", sa.Boolean),
 62 |             sa.Column("gathered", sa.DateTime),
 63 |             sa.Column("fetch_started", sa.DateTime),
 64 |             sa.Column("content", sa.UnicodeText, nullable=True),
 65 |             sa.Column("fetch_finished", sa.DateTime),
 66 |             sa.Column("import_started", sa.DateTime),
 67 |             sa.Column("import_finished", sa.DateTime),
 68 |             sa.Column("state", sa.UnicodeText),
 69 |             sa.Column("metadata_modified_date", sa.DateTime),
 70 |             sa.Column("retry_times", sa.Integer),
 71 |             sa.Column(
 72 |                 "harvest_job_id",
 73 |                 sa.UnicodeText,
 74 |                 sa.ForeignKey("harvest_job.id"),
 75 |             ),
 76 |             sa.Column(
 77 |                 "harvest_source_id",
 78 |                 sa.UnicodeText,
 79 |                 sa.ForeignKey("harvest_source.id"),
 80 |             ),
 81 |             sa.Column(
 82 |                 "package_id",
 83 |                 sa.UnicodeText,
 84 |                 sa.ForeignKey("package.id", deferrable=True),
 85 |                 nullable=True,
 86 |             ),
 87 |             sa.Column("report_status", sa.UnicodeText, nullable=True),
 88 |         )
 89 |     # Indexes are looked up by name so that existing ones are left alone.
 90 |     index_names = [index["name"] for index in inspector.get_indexes("harvest_object")]
 91 |     if "harvest_job_id_idx" not in index_names:
 92 |         op.create_index("harvest_job_id_idx", "harvest_object", ["harvest_job_id"])
 93 | 
 94 |     if "harvest_source_id_idx" not in index_names:
 95 |         op.create_index(
 96 |             "harvest_source_id_idx", "harvest_object", ["harvest_source_id"]
 97 |         )
 98 | 
 99 |     if "package_id_idx" not in index_names:
100 |         op.create_index("package_id_idx", "harvest_object", ["package_id"])
101 | 
102 |     if "guid_idx" not in index_names:
103 |         op.create_index("guid_idx", "harvest_object", ["guid"])
104 |     # Key/value rows that reference a harvest object.
105 |     if "harvest_object_extra" not in tables:
106 |         op.create_table(
107 |             "harvest_object_extra",
108 |             sa.Column("id", sa.UnicodeText, primary_key=True),
109 |             sa.Column(
110 |                 "harvest_object_id",
111 |                 sa.UnicodeText,
112 |                 sa.ForeignKey("harvest_object.id"),
113 |             ),
114 |             sa.Column("key", sa.UnicodeText),
115 |             sa.Column("value", sa.UnicodeText),
116 |         )
117 | 
118 |     index_names = [
119 |         index["name"] for index in inspector.get_indexes("harvest_object_extra")
120 |     ]
121 |     if "harvest_object_id_idx" not in index_names:
122 |         op.create_index(
123 |             "harvest_object_id_idx", "harvest_object_extra", ["harvest_object_id"]
124 |         )
125 |     # Error messages that reference a harvest job.
126 |     if "harvest_gather_error" not in tables:
127 |         op.create_table(
128 |             "harvest_gather_error",
129 |             sa.Column("id", sa.UnicodeText, primary_key=True),
130 |             sa.Column(
131 |                 "harvest_job_id",
132 |                 sa.UnicodeText,
133 |                 sa.ForeignKey("harvest_job.id"),
134 |             ),
135 |             sa.Column("message", sa.UnicodeText),
136 |             sa.Column("created", sa.DateTime),
137 |         )
138 |     # Error messages that reference a harvest object.
139 |     if "harvest_object_error" not in tables:
140 |         op.create_table(
141 |             "harvest_object_error",
142 |             sa.Column("id", sa.UnicodeText, primary_key=True),
143 |             sa.Column(
144 |                 "harvest_object_id",
145 |                 sa.UnicodeText,
146 |                 sa.ForeignKey("harvest_object.id"),
147 |             ),
148 |             sa.Column("message", sa.UnicodeText),
149 |             sa.Column("stage", sa.UnicodeText),
150 |             sa.Column("line", sa.Integer),
151 |             sa.Column("created", sa.DateTime),
152 |         )
153 | 
154 |     index_names = [
155 |         index["name"] for index in inspector.get_indexes("harvest_object_error")
156 |     ]
157 |     if "harvest_error_harvest_object_id_idx" not in index_names:
158 |         op.create_index(
159 |             "harvest_error_harvest_object_id_idx",
160 |             "harvest_object_error",
161 |             ["harvest_object_id"],
162 |         )
163 |     # Log entries; ``level`` is limited to the five names listed below.
164 |     if "harvest_log" not in tables:
165 |         op.create_table(
166 |             "harvest_log",
167 |             sa.Column("id", sa.UnicodeText, primary_key=True),
168 |             sa.Column("content", sa.UnicodeText, nullable=False),
169 |             sa.Column(
170 |                 "level",
171 |                 sa.Enum(
172 |                     "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", name="log_level"
173 |                 ),
174 |             ),
175 |             sa.Column("created", sa.DateTime),
176 |         )
177 | 
178 | 
179 | def downgrade():
180 |     op.drop_table("harvest_log")
181 |     sa.Enum(name="log_level").drop(op.get_bind())
182 |     op.drop_table("harvest_object_error")
183 |     op.drop_table("harvest_gather_error")
184 |     op.drop_table("harvest_object_extra")
185 |     op.drop_table("harvest_object")
186 |     op.drop_table("harvest_job")
187 |     op.drop_table("harvest_source")
188 | 


--------------------------------------------------------------------------------
/ckanext/harvest/migration/harvest/versions/75d650dfd519_add_cascade_to_harvest_tables.py:
--------------------------------------------------------------------------------
  1 | """add cascade to harvest tables
  2 | 
  3 | Revision ID: 75d650dfd519
  4 | Revises: 3b4894672727
  5 | Create Date: 2023-11-02 17:13:39.995339
  6 | 
  7 | """
  8 | from alembic import op
  9 | 
 10 | 
 11 | # revision identifiers, used by Alembic.
 12 | revision = "75d650dfd519"
 13 | down_revision = "3b4894672727"
 14 | branch_labels = None
 15 | depends_on = None
 16 | 
 17 | 
 18 | def upgrade():  # re-create every harvest foreign key with ondelete="CASCADE"
 19 |     _recreate_fk("CASCADE")
 20 | 
 21 | 
 22 | def downgrade():  # re-create every harvest foreign key with ondelete=None
 23 |     _recreate_fk(None)
 24 | 
 25 | 
 26 | def _recreate_fk(ondelete):
 27 |     op.drop_constraint("harvest_job_source_id_fkey", "harvest_job")
 28 |     op.create_foreign_key(
 29 |         "harvest_job_source_id_fkey",
 30 |         "harvest_job",
 31 |         "harvest_source",
 32 |         ["source_id"],
 33 |         ["id"],
 34 |         ondelete=ondelete,
 35 |     )
 36 | 
 37 |     op.drop_constraint("harvest_object_harvest_job_id_fkey", "harvest_object")
 38 |     op.create_foreign_key(
 39 |         "harvest_object_harvest_job_id_fkey",
 40 |         "harvest_object",
 41 |         "harvest_job",
 42 |         ["harvest_job_id"],
 43 |         ["id"],
 44 |         ondelete=ondelete,
 45 |     )
 46 | 
 47 |     op.drop_constraint("harvest_object_harvest_source_id_fkey", "harvest_object")
 48 |     op.create_foreign_key(
 49 |         "harvest_object_harvest_source_id_fkey",
 50 |         "harvest_object",
 51 |         "harvest_source",
 52 |         ["harvest_source_id"],
 53 |         ["id"],
 54 |         ondelete=ondelete,
 55 |     )
 56 | 
 57 |     op.drop_constraint("harvest_object_package_id_fkey", "harvest_object")
 58 |     op.create_foreign_key(
 59 |         "harvest_object_package_id_fkey",
 60 |         "harvest_object",
 61 |         "package",
 62 |         ["package_id"],
 63 |         ["id"],
 64 |         ondelete=ondelete,
 65 |         deferrable=True,
 66 |     )
 67 | 
 68 |     op.drop_constraint(
 69 |         "harvest_object_extra_harvest_object_id_fkey", "harvest_object_extra"
 70 |     )
 71 |     op.create_foreign_key(
 72 |         "harvest_object_extra_harvest_object_id_fkey",
 73 |         "harvest_object_extra",
 74 |         "harvest_object",
 75 |         ["harvest_object_id"],
 76 |         ["id"],
 77 |         ondelete=ondelete,
 78 |     )
 79 | 
 80 |     op.drop_constraint(
 81 |         "harvest_gather_error_harvest_job_id_fkey", "harvest_gather_error"
 82 |     )
 83 |     op.create_foreign_key(
 84 |         "harvest_gather_error_harvest_job_id_fkey",
 85 |         "harvest_gather_error",
 86 |         "harvest_job",
 87 |         ["harvest_job_id"],
 88 |         ["id"],
 89 |         ondelete=ondelete,
 90 |     )
 91 | 
 92 |     op.drop_constraint(
 93 |         "harvest_object_error_harvest_object_id_fkey", "harvest_object_error"
 94 |     )
 95 |     op.create_foreign_key(
 96 |         "harvest_object_error_harvest_object_id_fkey",
 97 |         "harvest_object_error",
 98 |         "harvest_object",
 99 |         ["harvest_object_id"],
100 |         ["id"],
101 |         ondelete=ondelete,
102 |     )
103 | 


--------------------------------------------------------------------------------
/ckanext/harvest/model/__init__.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import datetime
  3 | 
  4 | from sqlalchemy import event
  5 | from sqlalchemy import Column
  6 | from sqlalchemy import ForeignKey
  7 | from sqlalchemy import types
  8 | from sqlalchemy import Index
  9 | from sqlalchemy.orm import backref, relationship
 10 | from sqlalchemy.exc import InvalidRequestError
 11 | 
 12 | from ckan.model.meta import Session
 13 | from ckan.model.types import make_uuid
 14 | from ckan.model.domain_object import DomainObject
 15 | from ckan.model.package import Package
 16 | 
 17 | try:
 18 |     from ckan.plugins.toolkit import BaseModel
 19 | except ImportError:
 20 |     # CKAN <= 2.9
 21 |     from ckan.model.meta import metadata
 22 |     from sqlalchemy.ext.declarative import declarative_base
 23 | 
 24 |     BaseModel = declarative_base(metadata=metadata)
 25 | 
 26 | 
 27 | UPDATE_FREQUENCIES = ["MANUAL", "MONTHLY", "WEEKLY", "BIWEEKLY", "DAILY", "ALWAYS"]
 28 | 
 29 | log = logging.getLogger(__name__)
 30 | 
 31 | 
 32 | class HarvestError(Exception):
 33 |     pass
 34 | 
 35 | 
 36 | class HarvestDomainObject(DomainObject):
 37 |     """Convenience methods for searching objects"""
 38 | 
 39 |     key_attr = "id"
 40 | 
 41 |     @classmethod
 42 |     def get(cls, key, default=None, attr=None):
 43 |         """Finds a single entity in the register."""
 44 |         if attr is None:
 45 |             attr = cls.key_attr
 46 |         kwds = {attr: key}
 47 |         o = cls.filter(**kwds).first()
 48 |         if o:
 49 |             return o
 50 |         else:
 51 |             return default
 52 | 
 53 |     @classmethod
 54 |     def filter(cls, **kwds):
 55 |         query = Session.query(cls).autoflush(False)
 56 |         return query.filter_by(**kwds)
 57 | 
 58 | 
 59 | class HarvestSource(BaseModel, HarvestDomainObject):
 60 |     """A Harvest Source is essentially a URL plus some other metadata.
 61 |     It must have a type (e.g. CSW) and can have a status of "active"
 62 |     or "inactive". The harvesting processes are not fired on inactive
 63 |     sources.
 64 |     """
 65 | 
 66 |     __tablename__ = "harvest_source"
 67 | 
 68 |     id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
 69 |     url = Column(types.UnicodeText, nullable=False)
 70 |     title = Column(types.UnicodeText, default="")
 71 |     description = Column(types.UnicodeText, default="")
 72 |     config = Column(types.UnicodeText, default="")
 73 |     created = Column(types.DateTime, default=datetime.datetime.utcnow)
 74 |     type = Column(types.UnicodeText, nullable=False)
 75 |     active = Column(types.Boolean, default=True)
 76 |     user_id = Column(types.UnicodeText, default="")
 77 |     publisher_id = Column(types.UnicodeText, default="")
 78 |     frequency = Column(types.UnicodeText, default="MANUAL")
 79 |     next_run = Column(types.DateTime)
 80 |     jobs = relationship(
 81 |         "HarvestJob",
 82 |         lazy="select",
 83 |         back_populates="source",
 84 |         order_by=lambda: HarvestJob.created,
 85 |     )
 86 | 
 87 |     def __repr__(self):
 88 |         return "<HarvestSource id=%s title=%s url=%s active=%r>" % (
 89 |             self.id,
 90 |             self.title,
 91 |             self.url,
 92 |             self.active,
 93 |         )
 94 | 
 95 |     def __str__(self):
 96 |         return self.__repr__().encode("ascii", "ignore")
 97 | 
 98 |     def get_jobs(self, status=None):
 99 |         """get the running jobs for this source"""
100 | 
101 |         query = Session.query(HarvestJob).filter(HarvestJob.source_id == self.id)
102 | 
103 |         if status is not None:
104 |             query = query.filter(HarvestJob.status == status)
105 | 
106 |         return query.all()
107 | 
108 | 
class HarvestJob(BaseModel, HarvestDomainObject):
    """A Harvesting Job runs in two phases.

    First the **gather** stage collects all the Ids and URLs that need to be
    fetched from the harvest source; errors raised there
    (``HarvestGatherError``) go to the ``harvest_gather_error`` table. Then
    the **fetch** stage retrieves the ``HarvestedObjects`` and, if necessary,
    the **import** stage stores them on the database; errors raised in this
    second phase (``HarvestObjectError``) go to the ``harvest_object_error``
    table.
    """

    __tablename__ = "harvest_job"

    id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
    created = Column(types.DateTime, default=datetime.datetime.utcnow)
    gather_started = Column(types.DateTime)
    gather_finished = Column(types.DateTime)
    finished = Column(types.DateTime)
    source_id = Column(types.UnicodeText, ForeignKey("harvest_source.id"))
    # status: New, Running, Finished
    status = Column(types.UnicodeText, default="New", nullable=False)
    source = relationship(
        "HarvestSource",
        lazy="select",
        back_populates="jobs",
    )

    def get_last_finished_object(self):
        """Return the most recently imported COMPLETE object of this job.

        Useful to tell whether a job is still making progress and to avoid
        timeouts while the source is running. Returns ``None`` when no
        object of this job is COMPLETE with an ``import_finished`` time.
        """
        completed = Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == self.id,
            HarvestObject.state == "COMPLETE",
            HarvestObject.import_finished.isnot(None),
        )
        newest_first = completed.order_by(HarvestObject.import_finished.desc())
        return newest_first.first()

    def get_last_gathered_object(self):
        """Return the first object of this job when ordered by ``gathered``
        descending.

        Useful to tell whether a job is still making progress and to avoid
        timeouts while the source is running. Returns ``None`` when the job
        has no objects.
        """
        job_objects = Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == self.id
        )
        return job_objects.order_by(HarvestObject.gathered.desc()).first()

    def get_last_action_time(self):
        """Return the best available timestamp of the job's latest activity.

        Preference order: import time of the last finished object, the job's
        ``gather_finished``, gather time of the last gathered object, and
        finally the job's ``created`` time.
        """
        finished_object = self.get_last_finished_object()
        if finished_object is not None:
            return finished_object.import_finished

        if self.gather_finished is not None:
            return self.gather_finished

        gathered_object = self.get_last_gathered_object()
        if gathered_object is None:
            return self.created
        return gathered_object.gathered

    def get_gather_errors(self):
        """Return this job's gather errors as a list, newest first."""
        job_errors = Session.query(HarvestGatherError).filter(
            HarvestGatherError.harvest_job_id == self.id
        )
        return job_errors.order_by(HarvestGatherError.created.desc()).all()
191 | 
192 | 
class HarvestObject(BaseModel, HarvestDomainObject):
    """A Harvest Object is created every time an element is fetched from a
    harvest source. Its contents can be processed and imported to ckan
    packages, RDF graphs, etc.

    Each object references the harvest job and harvest source it belongs to
    and, optionally, a CKAN package.
    """

    __tablename__ = "harvest_object"

    id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
    # The guid is the 'identity' of the dataset, according to the source.
    # So if you reharvest it, then the harvester knows which dataset to
    # update because of this identity. The identity needs to be unique
    # within this CKAN.
    guid = Column(types.UnicodeText, default="")
    # When you harvest a dataset multiple times, only the latest
    # successfully imported harvest_object should be flagged 'current'.
    # The import_stage usually reads and writes it.
    current = Column(types.Boolean, default=False)
    gathered = Column(types.DateTime, default=datetime.datetime.utcnow)
    fetch_started = Column(types.DateTime)
    content = Column(types.UnicodeText, nullable=True)
    fetch_finished = Column(types.DateTime)
    import_started = Column(types.DateTime)
    import_finished = Column(types.DateTime)
    # state: WAITING, FETCH, IMPORT, COMPLETE, ERROR
    state = Column(types.UnicodeText, default="WAITING")
    metadata_modified_date = Column(types.DateTime)
    retry_times = Column(types.Integer, default=0)
    harvest_job_id = Column(types.UnicodeText, ForeignKey("harvest_job.id"))
    harvest_source_id = Column(types.UnicodeText, ForeignKey("harvest_source.id"))
    package_id = Column(
        types.UnicodeText,
        ForeignKey("package.id", deferrable=True),
        nullable=True,
    )
    # report_status: 'added', 'updated', 'not modified', 'deleted', 'errored'
    report_status = Column(types.UnicodeText, nullable=True)
    # NOTE(review): each Index below is built from a single string (its name)
    # with no column expressions, and is a plain class attribute rather than
    # an entry in __table_args__ -- confirm whether these are actually
    # attached to the table or whether the migrations own the real indexes.
    harvest_job_id_idx = Index("harvest_job_id")
    harvest_source_id_idx = Index("harvest_source_id")
    package_id_idx = Index("package_id")
    guid_idx = Index("guid")
    # Linked CKAN package; the backref exposes ``Package.harvest_objects``.
    package = relationship(
        Package,
        lazy="select",
        backref="harvest_objects",
    )
    # Owning job; the backref exposes ``HarvestJob.objects``.
    job = relationship(
        HarvestJob,
        lazy="select",
        backref="objects",
    )
    # Owning source; the backref exposes ``HarvestSource.objects``.
    source = relationship(
        HarvestSource,
        lazy="select",
        backref="objects",
    )
250 | 
251 | 
class HarvestObjectExtra(BaseModel, HarvestDomainObject):
    """Extra key value data for Harvest objects"""

    __tablename__ = "harvest_object_extra"

    id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
    harvest_object_id = Column(types.UnicodeText, ForeignKey("harvest_object.id"))
    key = Column(types.UnicodeText)
    value = Column(types.UnicodeText)
    # NOTE(review): Index built from a name only, with no columns, and kept as
    # a plain class attribute -- confirm it is actually attached to the table.
    harvest_object_id_idx = Index("harvest_object_id")
    # Parent harvest object. The backref exposes ``HarvestObject.extras`` with
    # an "all,delete-orphan" cascade.
    object = relationship(
        HarvestObject, backref=backref("extras", cascade="all,delete-orphan")
    )
265 | 
266 | 
class HarvestGatherError(BaseModel, HarvestDomainObject):
    """Gather errors are raised during the **gather** stage of a harvesting
    job.
    """

    __tablename__ = "harvest_gather_error"

    id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
    harvest_job_id = Column(types.UnicodeText, ForeignKey("harvest_job.id"))
    message = Column(types.UnicodeText)
    created = Column(types.DateTime, default=datetime.datetime.utcnow)

    # Job the error belongs to; the backref exposes
    # ``HarvestJob.gather_errors``.
    job = relationship(HarvestJob, backref="gather_errors")

    @classmethod
    def create(cls, message, job):
        """
        Helper function to create an error object and save it.

        :param message: error text stored on the new row and written to the
            module logger
        :param job: the ``HarvestJob`` the error is attached to
        :returns: nothing; the created object is not returned to the caller
        """
        err = cls(message=message, job=job)
        try:
            err.save()
        except InvalidRequestError:
            # Roll the session back and retry the save once; a failure of
            # the second attempt propagates to the caller.
            Session.rollback()
            err.save()
        finally:
            # Runs whether or not the save succeeded.
            # No need to alert administrator so don't log as an error
            log.info(message)
295 | 
296 | 
class HarvestObjectError(BaseModel, HarvestDomainObject):
    """Object errors are raised during the **fetch** or **import** stage of a
    harvesting job, and are referenced to a specific harvest object.
    """

    __tablename__ = "harvest_object_error"

    id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
    harvest_object_id = Column(types.UnicodeText, ForeignKey("harvest_object.id"))
    message = Column(types.UnicodeText)
    stage = Column(types.UnicodeText)
    line = Column(types.Integer)
    created = Column(types.DateTime, default=datetime.datetime.utcnow)
    # NOTE(review): Index built from a name only, with no columns, and kept as
    # a plain class attribute -- confirm it is actually attached to the table.
    harvest_error_harvest_object_id_idx = Index("harvest_object_id")

    # Harvest object the error refers to. The backref exposes
    # ``HarvestObject.errors`` with an "all,delete-orphan" cascade.
    object = relationship(
        HarvestObject, backref=backref("errors", cascade="all,delete-orphan")
    )

    @classmethod
    def create(cls, message, object, stage="Fetch", line=None):
        """
        Helper function to create an error object and save it.

        :param message: error text stored on the new row and logged
        :param object: the ``HarvestObject`` the error is attached to
        :param stage: value stored in the ``stage`` column (default "Fetch")
        :param line: optional line number stored in the ``line`` column
        :returns: nothing; the created object is not returned to the caller
        """
        err = cls(message=message, object=object, stage=stage, line=line)
        try:
            err.save()
        except InvalidRequestError:
            # Clear any in-progress sqlalchemy transactions
            # Both clean-up calls are best effort: their own failures are
            # deliberately ignored so the retry below still happens.
            try:
                Session.rollback()
            except Exception:
                pass
            try:
                Session.remove()
            except Exception:
                pass
            # Single retry; a second failure propagates to the caller.
            err.save()
        finally:
            # Runs whether or not the save succeeded. The line number is
            # appended only when ``line`` is truthy.
            log_message = "{0}, line {1}".format(message, line) if line else message
            log.debug(log_message)
338 | 
339 | 
class HarvestLog(BaseModel, HarvestDomainObject):
    """HarvestLog objects are created each time something is logged
    using python's standard logging module
    """

    __tablename__ = "harvest_log"

    id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
    # Logged text; the only column of this table declared non-nullable.
    content = Column(types.UnicodeText, nullable=False)
    # Level name, limited to the values of the "log_level" enum type.
    level = Column(
        types.Enum("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", name="log_level"),
    )
    # Defaults to the current UTC time as returned by datetime.utcnow().
    created = Column(types.DateTime, default=datetime.datetime.utcnow)
353 | 
354 | 
def harvest_object_before_insert_listener(mapper, connection, target):
    """
    Backwards-compatibility hook for old harvesters.

    When the object being inserted already has both a source id and a
    source, nothing is changed. Otherwise the source id is copied from the
    source of the object's job; an object without a job is rejected.
    """
    if target.harvest_source_id and target.source:
        return

    job = target.job
    if not job:
        raise Exception("You must define a Harvest Job for each Harvest Object")
    target.harvest_source_id = job.source.id
364 | 
365 | 
class PackageIdHarvestSourceIdMismatch(Exception):
    """
    Signals that the package created for a harvest source does not carry
    the same id as that harvest source, which it must.
    """
373 | 
374 | 
def clean_harvest_log(condition):
    """Delete every ``HarvestLog`` row created at or before ``condition``.

    :param condition: value compared against ``HarvestLog.created``; rows
        with ``created <= condition`` are deleted

    The deletion is committed straight away. If the commit raises
    ``InvalidRequestError`` the session is rolled back and an error is
    logged; the success message is logged only when the commit went through.
    """
    Session.query(HarvestLog).filter(HarvestLog.created <= condition).delete(
        synchronize_session=False
    )
    try:
        Session.commit()
    except InvalidRequestError:
        Session.rollback()
        log.error("An error occurred while trying to clean-up the harvest log table")
    else:
        # Only report success when the commit actually succeeded.
        log.info("Harvest log table clean-up finished successfully")
386 | 
387 | 
388 | event.listen(HarvestObject, "before_insert", harvest_object_before_insert_listener)
389 | 


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/images/icons/source_delete.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_delete.png


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/images/icons/source_edit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_edit.png


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/images/icons/source_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_new.png


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/images/icons/source_refresh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_refresh.png


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/images/icons/source_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/public/ckanext/harvest/images/icons/source_view.png


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/javascript/extra_fields.js:
--------------------------------------------------------------------------------
 1 | ckan.module('harvest-type-change', function (jQuery, _) {
 2 |   return {
 3 |     initialize: function () {
 4 |       var self, harvest_source_type;
 5 |       self = this;
 6 |       harvest_source_type = this.el.attr('value');
 7 |       this.el.change(function(){
 8 |         self.sandbox.publish('harvest-source-type-select', harvest_source_type);
 9 |       })
10 |       if (this.el.attr("checked") === "checked"){
11 |         self.sandbox.publish('harvest-source-type-select', harvest_source_type);
12 |       }
13 |     },
14 |   }
15 | })
16 | 
// Shows or hides the '.control-group' elements inside this module's element
// according to the harvest source type published on the
// 'harvest-source-type-select' topic. `options.formItems` maps a source type
// to the list of input names that should stay visible for it.
ckan.module('harvest-extra-form-change', function (jQuery, _) {
  return {
    initialize: function () {
      var self = this;
      self.sandbox.subscribe('harvest-source-type-select', function (source_type) {
        // Declared locally: these used to be assigned without `var`, which
        // leaked them as globals and throws in strict mode.
        var form_items = self.options.formItems;
        // Unknown source types fall back to an empty list, hiding every group.
        var items = form_items[source_type] || [];
        var control_groups = self.el.find('.control-group');
        var i, control_group, item_name;

        for (i = 0; i < control_groups.length; i++) {
          control_group = jQuery(control_groups[i]);
          // A group is identified by the name of the first input it contains.
          item_name = control_group.find('input').attr('name');
          if (jQuery.inArray(item_name, items) === -1) {
            control_group.hide();
          } else {
            control_group.show();
          }
        }
      })
    },
  }
})
40 | 


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/javascript/resource.config:
--------------------------------------------------------------------------------
 1 | 
 2 | [depends]
 3 | 
 4 | main = base/main
 5 | 
 6 | [groups]
 7 | 
 8 | main =
 9 |     extra_fields.js
10 | 
11 | 


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/javascript/webassets.yml:
--------------------------------------------------------------------------------
1 | main:
2 |   filters: rjsmin
3 |   output: ckanext-harvest/%(version)s_harvest_extra_fieldmain.js
4 |   extra:
5 |     preload:
6 |       - base/main
7 |   contents:
8 |     - extra_fields.js
9 | 


--------------------------------------------------------------------------------
/ckanext/harvest/public/ckanext/harvest/style.css:
--------------------------------------------------------------------------------
  1 | /* Harvest styles */
  2 | 
  3 | 
  4 | 
  5 | body.index.ViewController #minornavigation {
  6 |     display: none;
  7 | }
  8 | 
  9 | body.index.ViewController #content {
 10 |     width: 100% !important;
 11 |     border: 0 !important;
 12 | }
 13 | 
 14 | body.index.ViewController .content-outer {
 15 |     width: 100% !important;
 16 | }
 17 | 
 18 | #harvest-sources{
 19 |     width: 100%;
 20 | }
 21 | 
 22 | #new-harvest-source {
 23 | 	background: transparent url("images/icons/source_new.png") no-repeat 0px 0px;
 24 |   margin-top: 30px;
 25 |   padding-left: 20px;
 26 |   margin-bottom: 10px;
 27 |   font-weight: bold;
 28 | }
 29 | 
 30 | #show-inactive-sources-content{
 31 |     float: right;
 32 |     width: 170px;
 33 | }
 34 | 
 35 | #show-inactive-sources-content input{
 36 |     float: left;
 37 | }
 38 | 
 39 | #show-inactive-sources-content label{
 40 |     float: left;
 41 |     font-weight: normal;
 42 |     margin-right: 5px;
 43 | }
 44 | 
 45 | #harvest-sources th.action{
 46 |     font-style: italic;
 47 |     width: 50px;
 48 |     text-align: middle;
 49 | }
 50 | 
 51 | #harvest-sources th.url{
 52 |     width: 200px;
 53 | }
 54 | 
 55 | #harvest-sources td.action{
 56 |     text-align: middle;
 57 | }
 58 | 
 59 | #harvest-sources .inactive{
 60 |     display:none;
 61 | }
 62 | 
 63 | #harvest-sources .inactive .state{
 64 |     color: red;
 65 | }
 66 | 
 67 | .harvester-title{
 68 |     font-weight: bold;
 69 | }
 70 | 
 71 | .source-state-active{
 72 |     font-weight:bold;
 73 | }
 74 | 
 75 | .source-state-inactive{
 76 |     font-weight:bold;
 77 |     color: red;
 78 | }
 79 | 
 80 | #harvest-source-actions {
 81 |   margin: 20px 0;
 82 | }
 83 | 
 84 | #harvest-source-actions img{
 85 |     vertical-align: middle;
 86 |     margin: 0 5px;
 87 | }
 88 | 
 89 | #harvest-sources .publisher > td{
 90 |     background-color: #E3E3E3 !important;
 91 |     padding: 3px;
 92 |     font-weight: bold;
 93 | }
 94 | 
 95 | #harvest-source-details th {
 96 |   width: 33%;
 97 | }
 98 | 
 99 | #source-new {
100 |   margin-top: 30px;
101 | }
102 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/admin/base.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 | 
3 | {% block content_primary_nav %}
4 |   {{ super() }}
5 |   {{ h.build_nav_icon('harvest.search', _('Harvest'), icon='download') }}
6 | {% endblock %}
7 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/base.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 | 
3 | {% block styles %}
4 |     {{ super() }}
5 |     {% asset 'ckanext-harvest/harvest_css' %}
6 | {% endblock %}
7 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/emails/error_email.txt:
--------------------------------------------------------------------------------
 1 | This is a failure notification of the latest harvest job set-up in {{ site_url }}. 
 2 | Job URL: {{ job_url }}
 3 | 
 4 | Harvest Source: {{ harvest_source_title }}
 5 | Harvest Configuration: {{ harvest_configuration | safe }}
 6 | Organization: {{ organization }}
 7 | 
 8 | Harvest Job Id: {{ job_id }}
 9 | Created: {{ job_created }}
10 | Finished: {{ job_finished }}
11 | 
12 | Records in Error: {{ records_in_error }}
13 | Records Added: {{ records_added }}
14 | Records Updated: {{ records_updated }}
15 | Records Deleted: {{ records_deleted }}
16 | 
17 | {{ error_summary_title }}: {{ errors|length }} errors
18 | 
19 | {{ job_errors_title }}: {{ job_errors|length }}
20 | {% for error in job_errors %}
21 |     - {{ error }} {% endfor %}
22 | {{ obj_errors_title }}: {{ obj_errors|length }}
23 | {% for error in obj_errors %}
24 |     - {{ error }} {% endfor %}
25 | --
26 | You are receiving this email because you are currently set-up as Administrator for {{ site_url }}.
27 | Please do not reply to this email as it was sent from a non-monitored address.
28 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/emails/summary_email.txt:
--------------------------------------------------------------------------------
 1 | This is a summary of the latest harvest job set-up in {{ site_url }}.
 2 | Job URL: {{ job_url }}
 3 | 
 4 | Harvest Source: {{ harvest_source_title }}
 5 | Harvest Configuration: {{ harvest_configuration | safe }}
 6 | 
 7 | Organization: {{ organization }}
 8 | 
 9 | Harvest Job Id: {{ job_id }}
10 | Created: {{ job_created }}
11 | Finished: {{ job_finished }}
12 | 
13 | Records in Error: {{ records_in_error }}
14 | Records Added: {{ records_added }}
15 | Records Updated: {{ records_updated }}
16 | Records Deleted: {{ records_deleted }}
17 | 
18 | {{ error_summary_title }}: {{ errors|length }} errors
19 | {{ job_errors_title }}: {{ job_errors|length }}
20 | {% for error in job_errors %}
21 |     - {{ error }} {% endfor %}
22 | 
23 | {{ obj_errors_title }}: {{ obj_errors|length }}
24 | {% for error in obj_errors %}
25 |     - {{ error }} {% endfor %}
26 | 
27 | --
28 | You are receiving this email because you are currently set-up as Administrator for {{ site_url }}.
29 | Please do not reply to this email as it was sent from a non-monitored address.
30 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/add_source_button.html:
--------------------------------------------------------------------------------
 1 | {% set authorized_user = h.check_access('harvest_source_create') %}
 2 | 
 3 | {% if authorized_user %}
 4 |   <a href="{{ h.url_for('{0}.new'.format(dataset_type)) }}" class="btn btn-primary">
 5 |     <i class="fa fa-plus-square icon-plus-sign-alt"></i>
 6 |       {{ _('Add Harvest Source') }}
 7 |   </a>
 8 | {% endif %}
 9 | 
10 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/job_details.html:
--------------------------------------------------------------------------------
 1 | {#
 2 | Displays information for a particular harvest job, including:
 3 | 
 4 |   * counts for added, updated, deleted or errored datasets
 5 |   * table with general details
 6 |   * table with a summary of the most common errors on this job
 7 | 
 8 | job        - dictized harvest job object
 9 | 
10 | Example:
11 | 
12 |   {% snippet 'snippets/job_details.html', job=job %}
13 | 
14 | #}
15 | 
16 | {% set stats = job.stats %}
17 | 
18 | {% if job.status == 'Finished' %}
19 |   <p>
20 |     <span class="label label-important" data-diff="error">
21 |       {% if 'errored' in stats and stats['errored'] > 0 %}
22 |         {{ stats['errored'] }}
23 |       {% else %}
24 |         0
25 |       {% endif %}
26 |       {{ _('errors') }}
27 |     </span>
28 |     {% for action in ['added', 'updated', 'deleted', 'not modified'] %}
29 |       <span class="label" data-diff="{{ action }}">
30 |         {% if action in stats and stats[action] > 0 %}
31 |           {{ stats[action] }}
32 |         {% else %}
33 |           0
34 |         {% endif %}
35 |         {{ _(action) }}
36 |       </span>
37 |     {% endfor %}
38 |   </p>
39 | {% endif %}
40 | 
41 | <h3 class="hide-heading">{{ _('Details') }}</h3>
42 | <table class="table table-striped table-bordered table-condensed">
43 |   <colgroup>
44 |     <col width="15">
45 |     <col width="85">
46 |   </colgroup>
47 |   <tr>
48 |     <th>{{ _('Id') }}</th>
49 |     <td>{{ job.id }}</td>
50 |   </tr>
51 |   <tr>
52 |     <th>{{ _('Created') }}</th>
53 |     <td>
54 |         <span class="automatic-local-datetime" data-datetime="{{ h.render_datetime(job.created, date_format='%Y-%m-%dT%H:%M:%S%z') }}">
55 |             {{ h.render_datetime(job.created, with_hours=True) }}
56 |         </span>
57 |     </td>
58 |   </tr>
59 |   <tr>
60 |     <th>{{ _('Started') }}</th>
61 |     <td>
62 |         <span class="automatic-local-datetime" data-datetime="{{ h.render_datetime(job.gather_started, date_format='%Y-%m-%dT%H:%M:%S%z') }}">
63 |             {{ h.render_datetime(job.gather_started, with_hours=True) }}
64 |         </span>
65 |     </td>
66 |   </tr>
67 |   <tr>
68 |     <th>{{ _('Finished') }}</th>
69 |     <td>
70 |         <span class="automatic-local-datetime" data-datetime="{{ h.render_datetime(job.finished, date_format='%Y-%m-%dT%H:%M:%S%z') }}">
71 |             {{ h.render_datetime(job.finished, with_hours=True) }}
72 |         </span>
73 |     </td>
74 |   </tr>
75 |   <tr>
76 |     <th>{{ _('Status') }}</th>
77 |     <td>{{ _(job.status) }}</td>
78 |   </tr>
79 | </table>
80 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/job_error_summary.html:
--------------------------------------------------------------------------------
 1 | {#
 2 | Displays a table with a summary of the most common errors for a job
 3 | 
summary              - List of dicts with message and error_count
 5 | 
 6 | Example:
 7 | 
 8 |   {% snippet 'snippets/job_error_summary.html', summary=job.object_error_summary %}
 9 | 
10 | #}
11 | <table class="table table-striped table-bordered table-condensed harvest-error-summary">
12 |   <colgroup>
13 |     <col width="8">
14 |     <col width="92">
15 |   </colgroup>
16 |   <thead>
17 |     <tr>
18 |       <th class="count">{{ _('Count') }}</th>
19 |       <th>{{ _('Message') }}</th>
20 |     </tr>
21 |   </thead>
22 |   <tbody>
23 |   {% for error in summary %}
24 |     <tr>
25 |       <td class="count">{{ error["error_count"] }}</td>
26 |       <td>{{ error["message"] }}</td>
27 |     </tr>
28 |   {% endfor %}
29 |    </tbody>
30 | </table>
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/package_list_empty.html:
--------------------------------------------------------------------------------
1 | <p class="empty">{% trans %}There are no datasets associated to this harvest source.{% endtrans %}</p>
2 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/search_result_text.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 | 
3 | {% if type == 'harvest' %}
4 |   {% set text_query = ungettext('{number} harvest source found for "{query}"', '{number} harvest sources found for "{query}"', count) %}
5 |   {% set text_query_none = _('Sorry no harvest sources found for "{query}"') %}
6 |   {% set text_no_query = ungettext('{number} harvest source found', '{number} harvest sources found', count) %}
7 |   {% set text_no_query_none = _('Sorry no harvest sources found') %}
8 | {%- endif -%}
9 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/source_item.html:
--------------------------------------------------------------------------------
 1 | {#
 2 | Displays a single harvest source result.
 3 | 
 4 | source         - A source to display.
 5 | item_class     - The class name to use on the list item.
 6 | hide_resources - If true hides the resources (default: false).
 7 | banner         - If true displays a popular banner (default: false).
truncate       - The length to truncate the description to (default: 180)
 9 | truncate_title - The length to truncate the title to (default: 80).
10 | show_organization - Boolean on whether to show the related organization
11 | 
12 | Example:
13 | 
14 |   {% snippet 'snippets/source_item.html', source=sources[0] %}
15 | 
16 | #}
{% set ckan_version = h.ckan_version().split('.')[1] %}
{% set truncate = truncate or 180 %}
{% set truncate_title = truncate_title or 80 %}
{% set title = source.title or source.name %}
{% set source_type = h.get_pkg_dict_extra(source, 'source_type') %}
{% set url = h.url_for('harvest_admin', id=source.name) if within_organization else h.url_for('harvest.read', id=source.name) %}
{# The key must be the quoted string 'state': the bare name `state` is an
   undefined template variable, so the lookup always fell back to '' and the
   Draft/Deleted labels were never rendered. #}
{% set source_state = source.get('state') or '' %}

<li class="{{ item_class or "dataset-item" }}">
  <div class="dataset-content">
    <h3 class="dataset-heading">
      {% if ckan_version | int >= 9 %}
        {{ h.link_to(title|truncate(truncate_title),  url) }}
      {% else %}
        {{ h.link_to(h.truncate(title, truncate_title), url) }}
      {% endif %}
      {% if source_state.startswith('draft') %}
        <span class="label label-info">{{ _('Draft') }}</span>
      {% elif source_state.startswith('deleted') %}
        <span class="label label-important">{{ _('Deleted') }}</span>
      {% endif %}
    </h3>

    {% if source.notes %}
      <p>{{ source.notes }}</p>
    {% else %}
      <p class="empty">{{ _('There is no description for this harvest source') }}</p>
    {% endif %}

    <p class="muted">
      {% if source.status %}
        {{ _('Datasets') }}: {{ source.status.total_datasets }}
      {% endif %}
      {% if not within_organization and source.organization %}
      {# h.link_to already emits a complete <a> element, so no closing tag is needed here. #}
      &mdash; {{ _('Organization') }}: {{ h.link_to(source.organization.title or source.organization.name, h.url_for('organization.read', id=source.organization.name)) }}
      {% endif %}
    </p>

  </div>
</li>
56 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/snippets/source_list.html:
--------------------------------------------------------------------------------
 1 | {#
 2 | Displays a list of harvest sources.
 3 | 
 4 | sources        - A list of harvest sources to display.
 5 | list_class     - The class name for the list item.
 6 | item_class     - The class name to use on each item.
 7 | hide_resources - If true hides the resources (default: false).
 8 | banner         - If true displays a popular banner (default: false).
 9 | truncate       - The length to truncate the description to (default: 180)
10 | truncate_title - The length to truncate the title to (default: 80).
11 | 
12 | Example:
13 | 
14 |   {% snippet 'snippets/source_list.html', sources=sources %}
15 | 
16 | #}
17 | {% if sources %}{# Nothing is rendered when sources is empty or unset. #}
18 |   <ul class="{{ list_class or 'dataset-list unstyled list-unstyled' }}">
19 |     {% for source in sources %}{# Each item also receives within_organization from the caller's context. #}
20 |       {% snippet 'snippets/source_item.html', source=source, item_class=item_class, hide_resources=hide_resources, banner=banner, truncate=truncate, truncate_title=truncate_title, within_organization=within_organization %}
21 |     {% endfor %}
22 |   </ul>
23 | {% endif %}
24 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/about.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/read_base.html" %}
 2 | {# About page of a harvest source: title (or name), notes extract, then the package additional-info snippet. #}
 3 | {% block primary_content_inner %}
 4 |   <section class="module-content">
 5 |     <h1>{{ harvest_source.title or harvest_source.name }}</h1>
 6 |     {% if harvest_source.notes %}
 7 |       <p>{{ h.markdown_extract(harvest_source.notes)|urlize }}</p>
 8 |     {% else %}
 9 |       <p class="empty">{{ _('There is no description for this harvest source') }}</p>
10 |     {% endif %}
11 |   </section>
12 |   {% snippet "package/snippets/additional_info.html", pkg_dict=harvest_source %}
13 | {% endblock %}
14 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/admin.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/admin_base.html" %}
 2 | {# Admin dashboard body: details of the source's last harvest job plus a link to its full report, or a notice when no job exists. #}
 3 | {% block primary_content_inner %}
 4 |   <section class="module-content">
 5 |     <h1>{{ _('Last Harvest Job') }}</h1>
 6 |     {% if harvest_source.status and harvest_source.status.last_job %}
 7 |       {% snippet "snippets/job_details.html", job=harvest_source.status.last_job %}
 8 |       <div class="form-actions">
 9 |           <a href="{{ h.url_for('harvester.job_show_last', source=harvest_source.name) }}" class="btn pull-right btn-default">
10 |           <i class="fa fa-briefcase icon-briefcase"></i>
11 |           {{ _('View full job report')  }}
12 |         </a>
13 |       </div>
14 |     {% else %}
15 |       <p class="empty">{{ _('No jobs yet for this source') }}</p>
16 |     {% endif %}
17 |   </section>
18 | {% endblock %}
19 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/admin_base.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/read_base.html" %}
 2 | {# Admin layout for a harvest source: Reharvest / Stop / Clear / View action buttons and the Dashboard, Jobs and Edit tabs. #}
 3 | {% block breadcrumb_content_root_selected %}{% endblock %}
 4 | 
 5 | {% block breadcrumb_content %}
 6 |   {{ super() }}
 7 |   <li class="active"><a href="">{{ _('Admin') }}</a></li>
 8 | {% endblock %}
 9 | 
10 | {% block content_action %}
11 |   <div class="content_action btn-group">
12 |   {% if harvest_source.status and harvest_source.status.last_job and (harvest_source.status.last_job.status == 'New' or harvest_source.status.last_job.status == 'Running') %}
13 |     <a class="btn btn-default disabled" rel="tooltip" title="{{ _('There already is an unrun job for this source') }}"><i class="fa fa-lg fa-refresh icon-refresh icon-large"></i> {{ _('Reharvest') }}</a>
14 |   {% else %}
15 |     {% set locale = h.dump_json({'content': _('This will re-run the harvesting for this source. Any updates at the source will overwrite the local datasets. Sources with a large number of datasets may take a significant amount of time to finish harvesting. Please confirm you would like us to start reharvesting.')}) %}
16 |       <a href="{{ h.url_for('harvester.refresh', id=harvest_source.id) }}" class="btn btn-default" data-module="confirm-action" data-module-i18n="{{ locale }}"
17 |          title="{{ _('Start a new harvesting job for this harvest source now') }}">
18 |         <i class="fa fa-refresh icon-refresh"></i>
19 |         {{ _('Reharvest') }}
20 |       </a>
21 |   {% endif %}
22 |   {% if harvest_source.status and harvest_source.status.last_job and (harvest_source.status.last_job.status == 'Running') %}
23 |       <a href="{{ h.url_for('harvester.job_abort', source=harvest_source.name, id=harvest_source.status.last_job.id) }}" class="btn btn-default" title="{{ _('Stop this Job') }}">
24 |         <i class="fa fa-ban icon-ban-circle"></i>
25 |         {{ _('Stop') }}
26 |       </a>
27 |   {% endif %}
28 |     {% set locale = h.dump_json({'content': _('Warning: This will remove all datasets for this source, as well as all previous job reports. Are you sure you want to continue?')}) %}
29 |       <a href="{{ h.url_for('harvester.clear', id=harvest_source.id) }}" class="btn btn-default" data-module="confirm-action" data-module-i18n="{{ locale }}"
30 |          title="{{ _('Delete all harvest jobs and existing datasets from this source') }}">
31 |         {{ _('Clear') }}
32 |       </a>
33 |        <a href="{{ h.url_for('{0}.read'.format(c.dataset_type), id=harvest_source.id) }}" class="btn btn-default">
34 |         <i class="fa fa-eye icon-eye-open"></i>
35 |         {{ _('View harvest source') }}
36 |       </a>
37 |     </div>
38 | {% endblock %}
39 | 
40 | {% block page_header_tabs %}
41 |   {{ h.build_nav_icon('harvester.admin', _('Dashboard'), id=harvest_source.name, icon='dashboard') }}
42 |   {{ h.build_nav_icon('harvester.job_list', _('Jobs'), source=harvest_source.name, icon='reorder') }}
43 |   {{ h.build_nav_icon(c.dataset_type ~ '.edit', _('Edit'), id=harvest_source.name, icon='edit') }}
44 | {% endblock %}
45 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/base.html:
--------------------------------------------------------------------------------
 1 | {% extends "page.html" %}
 2 | {% set harvest_source = harvest_source or h.get_harvest_source() %}
 3 | {% if harvest_source %}
 4 |     {% set authorized_user = h.check_access('harvest_source_update', {'id':harvest_source.id }) %}
 5 | {% else %}
 6 |     {% set authorized_user = h.check_access('harvest_source_create') %}
 7 | {% endif %}
 8 | {# Base layout for harvest source pages: resolves harvest_source and authorized_user above, then sets the subtitle and breadcrumb; labels fall back to the name when there is no title. #}
 9 | {% block subtitle %}{{ harvest_source.title or harvest_source.name }}{% endblock %}
10 | 
11 | {% block breadcrumb_content_root_selected %} class="active"{% endblock %}
12 | 
13 | {% block breadcrumb_content %}
14 |   {% if harvest_source.organization %}
15 |     {% set org = harvest_source.organization %}
16 |     <li>{{ h.nav_link(_('Organizations'), named_route='organizations_index') }}</li>
17 |     <li>{{ h.nav_link((org.title or org.name)|truncate(10), named_route='organization_read', id=org.name) }}</li>
18 |     <li>{{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}</li>
19 |     <li{{ self.breadcrumb_content_root_selected() }}>{{ h.nav_link((harvest_source.title or harvest_source.name)|truncate(10), named_route='{0}_read'.format(c.dataset_type), id=harvest_source.name) }}</li>
20 |   {% else %}
21 |     <li>{{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}</li>
22 |     <li{{ self.breadcrumb_content_root_selected() }}>{{ h.nav_link((harvest_source.title or harvest_source.name)|truncate(30), named_route='{0}_read'.format(c.dataset_type), id=harvest_source.name) }}</li>
23 |   {% endif %}
24 | {% endblock %}
25 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/edit.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/admin_base.html" %}
 2 | {# Edit page for a harvest source: renders the snippet named by form_snippet with form_vars inside the admin layout. #}
 3 | {% block subtitle %}{{ _('Edit harvest source') }}{% endblock %}
 4 | 
 5 | {% block primary_content_inner %}
 6 | 
 7 |   <div class="module-content">
 8 |     {% block form %}
 9 |       {{- h.snippet(form_snippet, c=c, **form_vars) -}}
10 |     {% endblock %}
11 |   </div>
12 | {% endblock %}
13 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/job/list.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/admin_base.html" %}
 2 | {# Lists the harvest jobs of a source; finished jobs also show errored/added/updated/deleted/not modified counts read from job.stats. #}
 3 | 
 4 | {% block subtitle %}{{ _('Harvest Jobs')}} - {{ super() }}{% endblock %}
 5 | 
 6 | {% block primary_content_inner %}
 7 | <div class="module-content">
 8 | 
 9 |   <h1 class="results">{{ _('Harvest Jobs') }}</h1>
10 | 
11 |   {% if jobs|length == 0 %}
12 |     <p class="empty">{{ _('No jobs yet for this source') }}</p>
13 |   {% else %}
14 |     <ul class="dataset-list unstyled">
15 |       {% for job in jobs %}
16 |         <li class="dataset-item">
17 |           <div class="dataset-content">
18 |             <h3 class="dataset-heading">
19 |                 <a href="{{ h.url_for('harvester.job_show', source=harvest_source.name, id=job.id) }}">
20 |                 {{ _('Job: ') }} {{ job.id }}
21 |               </a>
22 |               {% if job.status != 'Finished' %}
23 |                 <span class="label">{{ job.status }}</span>
24 |               {% endif %}
25 |             </h3>
26 |             <p>
27 |               {{ _('Started:') }}
28 |               <span class="automatic-local-datetime" data-datetime="{{ h.render_datetime(job.gather_started, date_format='%Y-%m-%dT%H:%M:%S%z') }}">
29 |                 {{ h.render_datetime(job.gather_started, with_hours=True) or _('Not yet') }}
30 |               </span>
31 |               &mdash;
32 |               {{ _('Finished:') }}
33 |               <span class="automatic-local-datetime" data-datetime="{{ h.render_datetime(job.finished, date_format='%Y-%m-%dT%H:%M:%S%z') }}">
34 |                 {{ h.render_datetime(job.finished, with_hours=True) or _('Not yet') }}
35 |               </span>
36 |             </p>
37 |           </div>
38 |           {% if job.status == 'Finished' %}
39 |             <ul class="dataset-resources unstyled">
40 |               {% if 'errored' in job.stats and job.stats['errored'] > 0 %}
41 |                 <li>
42 |                   <span class="label label-important" data-diff="error">
43 |                     {{ job.stats['errored'] }} {{ _('errors') }}
44 |                   </span>
45 |                 </li>
46 |               {% endif %}
47 |               {% for action in ['added', 'updated', 'deleted', 'not modified'] %}
48 |                 <li>
49 |                   <span class="label" data-diff="{{ action }}" title="{{ _(action) }}">
50 |                     {% if action in job.stats and job.stats[action] > 0 %}
51 |                       {{ job.stats[action] }}
52 |                     {% else %}
53 |                       0
54 |                     {% endif %}
55 |                     {{ _(action) }}
56 |                   </span>
57 |                 </li>
58 |               {% endfor %}
59 |             </ul>
60 |           {% endif %}
61 |         </li>
62 |        {% endfor %}
63 |     </ul>
64 |   {% endif %}
65 | 
66 | </div>
67 | {% endblock %}
68 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/job/read.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/admin_base.html" %}
 2 | 
 3 | {% block subtitle %}{{ _('Job Report') }} - {{ super() }}{% endblock %}
 4 | 
 5 | {% block primary_content_inner %}
 6 | <div class="module-content">
 7 | {# Job report: job details, then (for finished jobs only) the error summary and the full per-job / per-document error report. #}
 8 |   <p class="pull-right">
 9 |     {{ h.nav_link(_('Back to job list'), named_route='harvester.job_list', source=harvest_source.name, class_='btn btn-default', icon='arrow-left')}}
10 |   </p>
11 | 
12 |   <h1>{{ _('Job Report') }}</h1>
13 |   {% snippet 'snippets/job_details.html', job=job %}
14 | 
15 |   {% if job.status == 'Finished' %}
16 | 
17 |     {% if job.object_error_summary|length == 0 and job.gather_error_summary|length == 0 %}
18 |       <h2>{{ _('Error Summary') }}</h2>
19 |       <p class="empty">{{ _('No errors for this job') }}</p>
20 |     {% else %}
21 |       <h2>
22 |         {{ _('Error Summary') }}
23 |         <small>{{ _('Only the 20 most frequent errors are shown') }}</small>
24 |       </h2>
25 |       {% if job.gather_error_summary|length > 0 %}
26 |         <h3>{{ _('Job Errors') }}</h3>
27 |         {% snippet 'snippets/job_error_summary.html', summary=job.gather_error_summary %}
28 |       {% endif %}
29 |       {% if job.object_error_summary|length > 0 %}
30 |         <h3>{{ _('Document Errors') }}</h3>
31 |         {% snippet 'snippets/job_error_summary.html', summary=job.object_error_summary %}
32 |       {% endif %}
33 |     {% endif %}
34 | 
35 |     {% if job_report.gather_errors|length > 0 or job_report.object_errors.keys()|length > 0 %}
36 |       <h2>
37 |         {{ _('Error Report') }}
38 |       </h2>
39 |       {% if job_report.gather_errors|length > 0 %}
40 |         <h3>{{ _('Job Errors') }}</h3>
41 |         <table class="table table-bordered table-hover harvest-error-list">
42 |           <tbody>
43 |             {% for error  in job_report.gather_errors %}
44 |             <tr>
45 |               <td>
46 |                   <div class="error">
47 |                     {{ error.message }}
48 |                   </div>
49 |               </td>
50 |             </tr>
51 |             {% endfor %}
52 |           </tbody>
53 |         </table>
54 |       {% endif %}
55 | 
56 |       {% if job_report.object_errors.keys()|length > 0 %}
57 |         <h3>{{ _('Document Errors') }}
58 |           <small>{{ job_report.object_errors.keys()|length}} {{ _('documents with errors') }}</small>
59 |         </h3>
60 |         <table class="table table-bordered table-hover harvest-error-list">
61 |           <tbody>
62 |             {% for harvest_object_id in job_report.object_errors.keys() %}
63 |             {% set object = job_report.object_errors[harvest_object_id] %}
64 |             <tr>
65 |               <td>
66 |                 <span class="btn-group pull-right">
67 |                   {% if 'original_url' in  object%}
68 |                     <a href="{{ object.original_url }}" class="btn btn-small">
69 |                       {{ _('Remote content') }}
70 |                     </a>
71 |                   {% endif %}
72 |                   <a href="{{ h.url_for('harvester.object_show', id=harvest_object_id) }}" class="btn btn-small">
73 |                     {{ _('Local content') }}
74 |                   </a>
75 | 
76 |                 </span>
77 |                 <h5>{{ object.guid }}</h5>
78 |                 {% for error in object.errors %}
79 |                   <div class="error">
80 |                     {{ error.message }}
81 |                     {% if error.line %}
82 |                       <span class="line">(line {{ error.line }})</span>
83 |                     {% endif %}
84 |                   </div>
85 |                 {% endfor %}
86 |               </td>
87 |             </tr>
88 |             {% endfor %}
89 |           </tbody>
90 |         </table>
91 |       {% endif %}
92 |     {% endif %}
93 |   {% endif %}
94 | </div>
95 | {% endblock %}
96 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/new.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/admin_base.html" %}
 2 | {# Page for creating a harvest source: the form snippet in the primary column and a short explanation of harvest sources in the sidebar. #}
 3 | {% block breadcrumb_content %}
 4 |   <li>{{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}</li>
 5 |   <li class="active">{{ h.nav_link(_('Create Harvest Source'), named_route='{0}_new'.format(c.dataset_type)) }}</li>
 6 | {% endblock %}
 7 | 
 8 | {% block actions_content %}
 9 | {% endblock %}
10 | 
11 | {% block subtitle %}{{ _('Create harvest source') }}{% endblock %}
12 | 
13 | {% block primary_content %}
14 |   <section class="module">
15 |     <div class="module-content">
16 |       {{- h.snippet(form_snippet, c=c, **form_vars) -}}
17 |     </div>
18 |   </section>
19 | {% endblock %}
20 | 
21 | {% block secondary_content %}
22 |   <section class="module module-narrow">
23 |     <h2 class="module-heading"><i class="fa fa-lg fa-info-circle icon-large icon-info-sign"></i> {{ _('Harvest sources') }}</h2>
24 |     <div class="module-content">
25 |       <p>
26 |         {% trans %}
27 |         Harvest sources allow importing remote metadata into this catalog.
28 |         Remote sources can be other catalogs such as other CKAN instances, CSW
29 |         servers or Web Accessible Folders (WAF) (depending on the actual
30 |         harvesters enabled for this instance).
31 |         {% endtrans %}
32 |       </p>
33 |     </div>
34 |   </section>
35 | {% endblock %}
36 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/new_source_form.html:
--------------------------------------------------------------------------------
  1 | {% import 'macros/form.html' as form %}
  2 | {# Create/edit form for a harvest source; reads data, errors and error_summary from the rendering context. #}
  3 | {% asset 'harvest-extra-field/main' %}
  4 | 
  5 | <form id="source-new" class="form-horizontal" method="post" >
  6 | 
  7 |   {% block errors %}{{ form.errors(error_summary) }}{% endblock %}
  8 | 
  9 |   {% call form.input('url', id='field-url', label=_('URL'), value=data.url, error=errors.url, classes=['control-full', 'control-large']) %}
 10 |       <span class="info-block">
 11 |         {{ _('This should include the http:// part of the URL') }}
 12 |       </span>
 13 |   {% endcall %}
 14 | 
 15 |   {{ h.csrf_input() if 'csrf_input' in h }}
 16 | 
 17 |   {{ form.input('title', id='field-title', label=_('Title'), placeholder=_('eg. A descriptive title'), value=data.title, error=errors.title, classes=['control-full'], attrs={'data-module': 'slug-preview-target'}) }}
 18 | 
 19 |   {% set prefix = 'harvest' %}
 20 |   {% set domain = h.url_for('{0}.read'.format(c.dataset_type), id='', qualified=true) %}
 21 |   {% set domain = domain|replace("http://", "")|replace("https://", "") %}
 22 |   {% set attrs = {'data-module': 'slug-preview-slug', 'data-module-prefix': domain, 'data-module-placeholder': '<harvest-source>'} %}
 23 | 
 24 |   {{ form.prepend('name', id='field-name', label=_('Name'), prepend=prefix, placeholder=_('eg. my-dataset'), value=data.name, error=errors.name, attrs=attrs) }}
 25 | 
 26 |   {{ form.markdown('notes', id='field-notes', label=_('Description'), value=data.notes, error=errors.notes) }}
 27 | 
 28 |   <div class="harvest-types form-group control-group">
 29 |     <label class="control-label">{{ _('Source type') }}</label>
 30 |     <div class="controls">
 31 |       {% for harvester in h.harvesters_info() %}
 32 |         {% set checked = False %}
 33 |         {# select first option if nothing in data #}
 34 |         {% if data.source_type == harvester['name'] or (not data.source_type and loop.first) %}
 35 |            {% set checked = True %}
 36 |         {% endif %}
 37 |         <label class="radio">
 38 |           <input type="radio" name="source_type" value="{{ harvester['name'] }}" {{ "checked " if checked }} data-module="harvest-type-change">
 39 |           {{ harvester['title'] }}
 40 |           <i class="fa fa-question-circle icon-question-sign muted" title="{{ harvester['description'] }}" data-bs-toggle="tooltip" data-toggle="tooltip"></i>
 41 |         </label>
 42 |       {% endfor %}
 43 |     </div>
 44 |   </div>
 45 | 
 46 |   {{ form.select('frequency', id='field-frequency', label=_('Update frequency'), options=h.harvest_frequencies(), selected=data.frequency, error=errors.frequency) }}
 47 | 
 48 |   {% block extra_config %}
 49 |   {{ form.textarea('config', id='field-config', label=_('Configuration'), value=data.config, error=errors.config) }}
 50 |   {% endblock extra_config %}
 51 | 
 52 |   {# if we have a default group then this wants remembering #}
 53 |   {% if data.group_id %}
 54 |     <input type="hidden" name="groups__0__id" value="{{ data.group_id }}" />
 55 |   {% endif %}
 56 | 
 57 |   {% set dataset_is_draft = data.get('state', 'draft').startswith('draft') or data.get('state', 'none') ==  'none' %}
 58 |   {% set dataset_has_organization = data.owner_org or data.group_id %}
 59 |   {% set organizations_available = h.organizations_available('create_dataset') %}
 60 |   {% set user_is_sysadmin = h.check_access('sysadmin') %}
 61 |   {% set show_organizations_selector = organizations_available and (user_is_sysadmin or dataset_is_draft) %}
 62 |   {# "No organization" is preselected only for an existing source (data.id set) that has no owner_org. #}
 63 |   {% if show_organizations_selector %}
 64 |     {% set existing_org = data.owner_org %}
 65 |     <div class="control-group form-group">
 66 |       <label for="field-organizations" class="control-label">{{ _('Organization') }}</label>
 67 |       <div class="controls">
 68 |         <select id="field-organizations" name="owner_org" data-module="autocomplete">
 69 |           <option value="" {% if not existing_org and data.id %} selected="selected" {% endif %}>{{ _('No organization') }}</option>
 70 |           {% for organization in organizations_available %}
 71 |             {# get out first org from users list only if there is not an existing org #}
 72 |             {% set selected_org = (existing_org and existing_org == organization.id) or (not existing_org and not data.id and organization.id == organizations_available[0].id) %}
 73 |             <option value="{{ organization.id }}" {% if selected_org %} selected="selected" {% endif %}>{{ organization.name }}</option>
 74 |           {% endfor %}
 75 |         </select>
 76 |       </div>
 77 |     </div>
 78 |   {% endif %}
 79 | 
 80 |   {% if data.get('id', None) and h.check_access('harvest_source_delete', {'id': data.id}) and data.get('state', 'none') == 'deleted' %}
 81 |     <div class="control-group form-group">
 82 |       <label for="field-state" class="control-label">{{ _('State') }}</label>
 83 |       <div class="controls">
 84 |         <select id="field-state" name="state">
 85 |           <option value="active" {% if data.get('state', 'none') == 'active' %} selected="selected" {% endif %}>{{ _('Active') }}</option>
 86 |           <option value="deleted" {% if data.get('state', 'none') == 'deleted' %} selected="selected" {% endif %}>{{ _('Deleted') }}</option>
 87 |         </select>
 88 |       </div>
 89 |     </div>
 90 |   {% endif %}
 91 | 
 92 |   <p class="form-actions">
 93 |     {% block delete_button %}
 94 |       {% if data.get('id', None) and h.check_access('harvest_source_delete', {'id': data.id}) and not data.get('state', 'none') == 'deleted' %}
 95 |         {% set locale_delete = h.dump_json({'content': _('This will flag the source as deleted but keep all its datasets and previous jobs. Are you sure you want to delete this harvest source?')}) %}
 96 |         {% set locale_clear = h.dump_json({'content': _('Warning: Apart from deleting this source, this command will remove all its datasets, as well as all previous job reports. Are you sure you want to continue?')}) %}
 97 |   <div class="dropdown btn-group">
 98 |     <a href="#" class="btn btn-danger dropdown-toggle" data-bs-toggle="dropdown" data-toggle="dropdown">
 99 |       {{ _('Delete') }}
100 |       <span class="caret"></span>
101 |     </a>
102 |     <ul class="dropdown-menu">
103 |       <li>
104 |         <a href="{% url_for 'harvest_delete', id=data.name %}" data-module="confirm-action" data-module-i18n="{{ locale_delete }}">
105 |           {{ _('Delete source') }}
106 |         </a>
107 |       </li>
108 |       <li>
109 |         <a href="{% url_for 'harvest_delete', id=data.name %}?clear=True" data-module="confirm-action" data-module-i18n="{{ locale_clear }}">
110 |           {{ _('Delete and clear source') }}
111 |         </a>
112 |       </li>
113 |     </ul>
114 |   </div>
115 |       {% endif %}
116 |     {% endblock %}
117 | 
118 |     <input id="save" name="save" value="Save" type="submit" class="btn btn-primary pull-right">
119 |   </p>
120 | 
121 | </form>
122 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/read.html:
--------------------------------------------------------------------------------
1 | {% extends "source/read_base.html" %}
2 | {# Datasets view of a harvest source: outputs whatever h.package_list_for_source returns for the source id. #}
3 | {% block primary_content_inner %}
4 |   <section class="module-content">
5 |     <h1 class="hide-heading">{{ _('Datasets') }}</h1>
6 |     {{ h.package_list_for_source(harvest_source.id)  }}
7 |   </section>
8 | {% endblock %}
9 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/read_base.html:
--------------------------------------------------------------------------------
 1 | {% extends "source/base.html" %}
 2 | {# Read layout for a harvest source: sidebar with title (or name), notes extract and dataset count; main column with the action area and tabs. #}
 3 | {% block secondary_content %}
 4 |   <div class="module context-info">
 5 |     <section class="module-content">
 6 |       <h1 class="heading">{{ harvest_source.title or harvest_source.name }}</h1>
 7 |       {% if harvest_source.notes %}
 8 |         <p>
 9 |           {{ h.markdown_extract(harvest_source.notes, 180) }}
10 |           {{ h.nav_link(_('read more'), named_route='harvester.about', id=harvest_source.name) }}
11 |         </p>
12 |       {% else %}
13 |         <p class="empty">{{ _('There is no description for this harvest source') }}</p>
14 |       {% endif %}
15 |       <div class="nums">
16 |         <dl>
17 |             <dt>{{ _('Datasets') }}</dt>
18 |             <dd>{{ h.package_count_for_source(harvest_source.id) }}</dd>
19 |         </dl>
20 |       </div>
21 |     </section>
22 |   </div>
23 | {% endblock %}
24 | 
25 | {% block primary_content %}
26 |   <article class="module">
27 |     {% block page_header %}
28 |       <header class="module-content page-header">
29 |         {% block content_action %}
30 |           <div class="content_action">
31 |             {% if h.check_access('harvest_source_update', {'id':harvest_source.id }) %}
32 |               {{ h.nav_link(_('Admin'), named_route='harvester.admin', id=harvest_source.name, class_='btn btn-primary', icon='wrench')}}
33 |             {% endif %}
34 |           </div>
35 |         {% endblock %}
36 |         <ul class="nav nav-tabs">
37 |           {% block page_header_tabs %}
38 |           {{ h.build_nav_icon(c.dataset_type ~ '.read', _('Datasets'), id=harvest_source.name, icon='sitemap') }}
39 |           {{ h.build_nav_icon('harvester.about', _('About'), id=harvest_source.name, icon='info-sign') }}
40 |           {% endblock %}
41 |         </ul>
42 |       </header>
43 |     {% endblock %}
44 |     {% block primary_content_inner %}{% endblock %}
45 |   </article>
46 | {% endblock %}
47 | 


--------------------------------------------------------------------------------
/ckanext/harvest/templates/source/search.html:
--------------------------------------------------------------------------------
 1 | {% extends "page.html" %}
 2 | {# Harvest source search page: add-source button, search form with sorting options, result list and pager; facet snippets in the sidebar. #}
 3 | {% block subtitle %}{{ _("Harvest sources") }}{% endblock %}
 4 | 
 5 | 
 6 | {% block breadcrumb_content %}
 7 |   <li class="active">{{ h.nav_link(_('Harvest Sources'), named_route='{0}_search'.format(c.dataset_type)) }}</li>
 8 | {% endblock %}
 9 | 
10 | 
11 | {% block primary_content %}
12 |     <section class="module">
13 |       <div class="module-content">
14 |         {% block page_primary_action %}
15 |           <div class="page_primary_action">
16 |             {{ h.snippet('snippets/add_source_button.html', dataset_type=c.dataset_type) }}
17 |           </div>
18 |         {% endblock %}
19 | 
20 |         {% set facets = {
21 |           'fields': c.fields_grouped,
22 |           'search': c.search_facets,
23 |           'titles': c.facet_titles,
24 |           'translated_fields': c.translated_fields,
25 |           'remove_field': c.remove_field }
26 |         %}
27 |         {% set sorting = [
28 |           (_('Relevance'), 'score desc, metadata_modified desc'),
29 |           (_('Name Ascending'), 'title_string asc'),
30 |           (_('Name Descending'), 'title_string desc'),
31 |           (_('Last Modified'), 'metadata_modified desc'),
32 |           (_('Popular'), 'views_recent desc') if g.tracking_enabled else (false, false) ]
33 |         %}
34 |         {% snippet 'snippets/search_form.html', type='harvest', query=c.q, sorting=sorting, sorting_selected=c.sort_by_selected, count=c.page.item_count, facets=facets, show_empty=request.args, error=c.query_error, placeholder=_("Search harvest sources...") %}
35 | 
36 |         {{ h.snippet('snippets/source_list.html', sources=c.page.items, show_organization=true) }}
37 | 
38 |       </div>
39 | 
40 |       {{ c.page.pager(q=c.q) }}
41 |     </section>
42 | 
43 |     {% endblock %}
44 | 
45 | 
46 | 
47 | {% block secondary_content %}
48 |   {% for facet in c.facet_titles %}
49 |       {{ h.snippet('snippets/facet_list.html', title=c.facet_titles[facet], name=facet, alternative_url=h.url_for('{0}.search'.format(c.dataset_type))) }}
50 |   {% endfor %}
51 | {% endblock %}
52 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/tests/__init__.py


--------------------------------------------------------------------------------
/ckanext/harvest/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | 
4 | @pytest.fixture
5 | def clean_db(reset_db, migrate_db_for):  # both arguments are fixtures injected by pytest
6 |     reset_db()  # NOTE(review): presumably CKAN's reset fixture - confirm what it clears
7 |     migrate_db_for("harvest")  # then run the migrations registered for "harvest"
8 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/factories.py:
--------------------------------------------------------------------------------
  1 | import factory
  2 | import ckanext.harvest.model as harvest_model
  3 | from ckantoolkit.tests.factories import _get_action_user_name
  4 | from ckan.plugins import toolkit
  5 | 
  6 | 
class HarvestSource(factory.Factory):
    """Factory producing a harvest source through the CKAN action API.

    ``_create`` returns the action's source dict when ``_return_type`` is
    ``'dict'``; otherwise it returns the ``harvest_model.HarvestSource``
    object looked up by the dict's id.
    """

    FACTORY_FOR = harvest_model.HarvestSource

    class Meta:
        model = harvest_model.HarvestSource

    # Selects what _create returns: 'dict' for the action result, any other
    # value for the model object.
    _return_type = 'dict'

    name = factory.Sequence(lambda n: 'test_source_{n}'.format(n=n))
    title = factory.Sequence(lambda n: 'test title {n}'.format(n=n))
    url = factory.Sequence(lambda n: 'http://{n}.test.com'.format(n=n))
    source_type = 'test'  # defined in test_queue.py
    # NOTE(review): at this point ``name`` is the factory.Sequence declaration
    # assigned above, not a generated string, so this formats the declaration
    # object itself rather than e.g. 'test_source_0' - confirm whether an id
    # derived from the generated name was intended.
    id = '{0}_id'.format(name).lower()

    @classmethod
    def _create(cls, target_class, *args, **kwargs):
        """Return the source for ``kwargs['url']``, creating it if not found.

        Positional arguments are rejected. ``harvest_source_show`` is tried
        first with the URL; on ``toolkit.ObjectNotFound`` the source is
        created with ``harvest_source_create`` using all keyword arguments.
        """
        if args:
            assert False, "Positional args aren't supported, use keyword args."
        context = {'user': _get_action_user_name(kwargs)}
        # If there is an existing source for this URL, and we can't create
        # another source with that URL, just return the original one.
        try:
            source_dict = toolkit.get_action('harvest_source_show')(
                context, dict(url=kwargs['url']))
        except toolkit.ObjectNotFound:
            source_dict = toolkit.get_action('harvest_source_create')(
                context, kwargs)
        if cls._return_type == 'dict':
            return source_dict
        else:
            return harvest_model.HarvestSource.get(source_dict['id'])
 39 | 
 40 | 
class HarvestSourceObj(HarvestSource):
    """HarvestSource factory variant with ``_return_type`` set to ``'obj'``."""
    _return_type = 'obj'
 43 | 
 44 | 
 45 | class HarvestJob(factory.Factory):
 46 | 
 47 |     FACTORY_FOR = harvest_model.HarvestJob
 48 | 
 49 |     class Meta:
 50 |         model = harvest_model.HarvestJob
 51 | 
 52 |     _return_type = 'dict'
 53 | 
 54 |     source = factory.SubFactory(HarvestSourceObj)
 55 | 
 56 |     @classmethod
 57 |     def _create(cls, target_class, *args, **kwargs):
 58 |         if args:
 59 |             assert False, "Positional args aren't supported, use keyword args."
 60 |         context = {'user': _get_action_user_name(kwargs)}
 61 |         if 'source_id' not in kwargs:
 62 |             kwargs['source_id'] = kwargs['source'].id
 63 |         if 'run' not in kwargs:
 64 |             kwargs['run'] = False
 65 |         job_dict = toolkit.get_action('harvest_job_create')(
 66 |             context, kwargs)
 67 |         if cls._return_type == 'dict':
 68 |             return job_dict
 69 |         else:
 70 |             return harvest_model.HarvestJob.get(job_dict['id'])
 71 | 
 72 | 
class HarvestJobObj(HarvestJob):
    """HarvestJob factory variant with ``_return_type`` set to ``'obj'``."""
    _return_type = 'obj'
 75 | 
 76 | 
 77 | class HarvestObject(factory.Factory):
 78 | 
 79 |     FACTORY_FOR = harvest_model.HarvestObject
 80 | 
 81 |     class Meta:
 82 |         model = harvest_model.HarvestObject
 83 | 
 84 |     _return_type = 'dict'
 85 | 
 86 |     # source = factory.SubFactory(HarvestSourceObj)
 87 |     job = factory.SubFactory(HarvestJobObj)
 88 | 
 89 |     @classmethod
 90 |     def _create(cls, target_class, *args, **kwargs):
 91 |         if args:
 92 |             assert False, "Positional args aren't supported, use keyword args."
 93 |         context = {'user': _get_action_user_name(kwargs)}
 94 |         if 'job_id' not in kwargs:
 95 |             kwargs['job_id'] = kwargs['job'].id
 96 |             kwargs['source_id'] = kwargs['job'].source.id
 97 |         # Remove 'job' to avoid it getting added as a HarvestObjectExtra
 98 |         if 'job' in kwargs:
 99 |             kwargs.pop('job')
100 |         job_dict = toolkit.get_action('harvest_object_create')(
101 |             context, kwargs)
102 |         if cls._return_type == 'dict':
103 |             return job_dict
104 |         else:
105 |             return harvest_model.HarvestObject.get(job_dict['id'])
106 | 
107 | 
class HarvestObjectObj(HarvestObject):
    """HarvestObject factory variant with ``_return_type`` set to ``'obj'``."""
    _return_type = 'obj'
110 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/fixtures.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from ckanext.harvest import queue
4 | 
5 | 
@pytest.fixture
def clean_queues():
    """Fixture that calls ``queue.purge_queues()`` before the test body runs."""
    queue.purge_queues()
9 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/harvesters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/ckanext/harvest/tests/harvesters/__init__.py


--------------------------------------------------------------------------------
/ckanext/harvest/tests/harvesters/test_base.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | import pytest
  4 | try:
  5 |     from unittest.mock import patch
  6 | except ImportError:
  7 |     from mock import patch
  8 | 
  9 | 
 10 | from ckanext.harvest.harvesters.base import HarvesterBase, munge_tag
 11 | from ckantoolkit.tests import factories
 12 | 
 13 | _ensure_name_is_unique = HarvesterBase._ensure_name_is_unique
 14 | 
 15 | 
@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index')
class TestGenNewName(object):
    """Tests for ``HarvesterBase._gen_new_name``."""

    def test_basic(self):
        """A simple title is lower-cased."""
        assert HarvesterBase._gen_new_name('Trees') == 'trees'

    def test_munge(self):
        """Spaces and punctuation in the title are munged into a slug."""
        assert HarvesterBase._gen_new_name('Trees and branches - survey.') == 'trees-and-branches-survey'

    @patch.dict('ckanext.harvest.harvesters.base.config',
                {'ckanext.harvest.some_other_config': 'value'})
    def test_without_config(self):
        '''Tests if the number suffix is used when no config is set.'''
        # An existing dataset with the same name forces a suffix.
        factories.Dataset(name='trees')
        assert HarvesterBase._gen_new_name('Trees') == 'trees1'

    @patch.dict('ckanext.harvest.harvesters.base.config',
                {'ckanext.harvest.default_dataset_name_append': 'number-sequence'})
    def test_number_config(self):
        """With 'number-sequence' configured, a numeric suffix is appended."""
        factories.Dataset(name='trees')
        assert HarvesterBase._gen_new_name('Trees') == 'trees1'

    @patch.dict('ckanext.harvest.harvesters.base.config',
                {'ckanext.harvest.default_dataset_name_append': 'random-hex'})
    def test_random_config(self):
        """With 'random-hex' configured, five hex characters are appended."""
        factories.Dataset(name='trees')
        new_name = HarvesterBase._gen_new_name('Trees')

        # re.match only anchors at the start of the string.
        assert re.match(r'trees[\da-f]{5}', new_name)

    @patch.dict('ckanext.harvest.harvesters.base.config',
                {'ckanext.harvest.default_dataset_name_append': 'random-hex'})
    def test_config_override(self):
        '''Tests if a parameter has precedence over a config value.'''
        factories.Dataset(name='trees')
        assert HarvesterBase._gen_new_name('Trees', append_type='number-sequence') == 'trees1'
 52 | 
 53 | 
@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index')
class TestEnsureNameIsUnique(object):
    """Tests for ``HarvesterBase._ensure_name_is_unique``.

    The function is called through the module-level alias
    ``_ensure_name_is_unique``.
    """

    def test_no_existing_datasets(self):
        """A name not used by any dataset is returned unchanged."""
        factories.Dataset(name='unrelated')
        assert _ensure_name_is_unique('trees') == 'trees'

    def test_existing_dataset(self):
        """A taken name gets the suffix '1'."""
        factories.Dataset(name='trees')
        assert _ensure_name_is_unique('trees') == 'trees1'

    def test_two_existing_datasets(self):
        """When the name and its '1' variant are taken, '2' is used."""
        factories.Dataset(name='trees')
        factories.Dataset(name='trees1')
        assert _ensure_name_is_unique('trees') == 'trees2'

    def test_no_existing_datasets_and_long_name(self):
        """A 101-character name is truncated to 100 characters."""
        assert _ensure_name_is_unique('x' * 101) == 'x' * 100

    def test_existing_dataset_and_long_name(self):
        """The suffix replaces the last character when the name is at the limit."""
        # because PACKAGE_NAME_MAX_LENGTH = 100
        factories.Dataset(name='x' * 100)
        assert _ensure_name_is_unique('x' * 101) == 'x' * 99 + '1'

    def test_update_dataset_with_new_name(self):
        """An update may move to a different name that is free."""
        factories.Dataset(name='trees1')
        assert _ensure_name_is_unique('tree', existing_name='trees1') == 'tree'

    def test_update_dataset_but_with_same_name(self):
        # This can happen if you remove a trailing space from the title: the
        # harvester sees that the title changed and thinks it needs a new
        # name, but clearly it can reuse its existing one.
        factories.Dataset(name='trees')
        factories.Dataset(name='trees1')
        assert _ensure_name_is_unique('trees', existing_name='trees') == 'trees'

    def test_update_dataset_to_available_shorter_name(self):
        # This can be handy when reharvesting: if you got duplicates, managed
        # to purge one set, and then make a minor title change, you can now
        # lose the appended number. Users don't like unnecessary numbers.
        factories.Dataset(name='trees1')
        assert _ensure_name_is_unique('trees', existing_name='trees1') == 'trees'

    def test_update_dataset_but_doesnt_change_to_other_number(self):
        # There's no point changing one number for another, though.
        factories.Dataset(name='trees')
        factories.Dataset(name='trees2')
        assert _ensure_name_is_unique('trees', existing_name='trees2') == 'trees2'

    def test_update_dataset_with_new_name_with_numbers(self):
        """Renaming to a taken, unrelated name still gets a numeric suffix."""
        factories.Dataset(name='trees')
        factories.Dataset(name='trees2')
        factories.Dataset(name='frogs')
        assert _ensure_name_is_unique('frogs', existing_name='trees2') == 'frogs1'

    def test_existing_dataset_appending_hex(self):
        """With append_type='random-hex', five hex characters are appended."""
        factories.Dataset(name='trees')
        name = _ensure_name_is_unique('trees', append_type='random-hex')
        # e.g. 'trees0b53f'
        assert re.match(r'trees[\da-f]{5}', name)
114 | 
115 | 
116 | # taken from ckan/tests/lib/test_munge.py
117 | class TestMungeTag:
118 | 
119 |     # (original, expected)
120 |     munge_list = [
121 |         ('unchanged', 'unchanged'),
122 |         # ('s', 's_'),  # too short
123 |         ('some spaces  here', 'some-spaces--here'),
124 |         ('random:other%characters&_.here', 'randomothercharactershere'),
125 |         ('river-water-dashes', 'river-water-dashes'),
126 |     ]
127 | 
128 |     def test_munge_tag(self):
129 |         '''Munge a list of tags gives expected results.'''
130 |         for org, exp in self.munge_list:
131 |             munge = munge_tag(org)
132 |             assert munge == exp
133 | 
134 |     def test_munge_tag_multiple_pass(self):
135 |         '''Munge a list of tags muliple times gives expected results.'''
136 |         for org, exp in self.munge_list:
137 |             first_munge = munge_tag(org)
138 |             assert first_munge == exp
139 |             second_munge = munge_tag(first_munge)
140 |             assert second_munge == exp
141 | 
142 |     def test_clean_tags_package_show(self):
143 |         instance = HarvesterBase()
144 |         tags_as_dict = [{u'vocabulary_id': None,
145 |                          u'state': u'active',
146 |                          u'display_name': name,
147 |                          u'id': u'073080c8-fef2-4743-9c9e-6216019f8b3d',
148 |                          u'name': name} for name, exp in self.munge_list]
149 | 
150 |         clean_tags = HarvesterBase._clean_tags(instance, tags_as_dict)
151 | 
152 |         idx = 0
153 |         for _, exp in self.munge_list:
154 |             tag = clean_tags[idx]
155 |             assert tag['name'] == exp
156 |             idx += 1
157 | 
158 |     def test_clean_tags_rest(self):
159 |         instance = HarvesterBase()
160 |         tags_as_str = [name for name, exp in self.munge_list]
161 | 
162 |         clean_tags = HarvesterBase._clean_tags(instance, tags_as_str)
163 | 
164 |         assert len(clean_tags) == len(tags_as_str)
165 | 
166 |         for _, exp in self.munge_list:
167 |             assert exp in clean_tags
168 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/harvesters/test_ckanharvester.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | import copy
  3 | 
  4 | import json
  5 | try:
  6 |     from unittest.mock import patch, MagicMock, Mock
  7 | except ImportError:
  8 |     from mock import patch, MagicMock, Mock
  9 | import pytest
 10 | from requests.exceptions import HTTPError, RequestException
 11 | 
 12 | from ckantoolkit.tests.helpers import call_action
 13 | from ckantoolkit.tests.factories import Organization, Group
 14 | from ckan import model
 15 | from ckan.plugins import toolkit
 16 | 
 17 | from ckanext.harvest.harvesters.ckanharvester import ContentFetchError
 18 | from ckanext.harvest.tests.factories import (HarvestSourceObj, HarvestJobObj,
 19 |                                              HarvestObjectObj)
 20 | from ckanext.harvest.tests.lib import run_harvest
 21 | import ckanext.harvest.model as harvest_model
 22 | from ckanext.harvest.harvesters.base import HarvesterBase
 23 | from ckanext.harvest.harvesters.ckanharvester import CKANHarvester
 24 | 
 25 | from . import mock_ckan
 26 | 
 27 | # Start CKAN-alike server we can test harvesting against it
 28 | mock_ckan.serve()
 29 | 
 30 | 
 31 | def was_last_job_considered_error_free():
 32 |     last_job = model.Session.query(harvest_model.HarvestJob) \
 33 |                     .order_by(harvest_model.HarvestJob.created.desc()) \
 34 |                     .first()
 35 |     job = MagicMock()
 36 |     job.source = last_job.source
 37 |     job.id = ''
 38 |     return bool(HarvesterBase.last_error_free_job(job))
 39 | 
 40 | 
 41 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index')
 42 | class TestCkanHarvester(object):
 43 | 
 44 |     def test_gather_normal(self):
 45 |         source = HarvestSourceObj(url='http://localhost:%s/' % mock_ckan.PORT)
 46 |         job = HarvestJobObj(source=source)
 47 | 
 48 |         harvester = CKANHarvester()
 49 |         obj_ids = harvester.gather_stage(job)
 50 | 
 51 |         assert job.gather_errors == []
 52 |         assert isinstance(obj_ids, list)
 53 |         assert len(obj_ids) == len(mock_ckan.DATASETS)
 54 |         harvest_object = harvest_model.HarvestObject.get(obj_ids[0])
 55 |         assert harvest_object.guid == mock_ckan.DATASETS[0]['id']
 56 |         assert json.loads(harvest_object.content) == mock_ckan.DATASETS[0]
 57 | 
 58 |     def test_fetch_normal(self):
 59 |         source = HarvestSourceObj(url='http://localhost:%s/' % mock_ckan.PORT)
 60 |         job = HarvestJobObj(source=source)
 61 |         harvest_object = HarvestObjectObj(
 62 |             guid=mock_ckan.DATASETS[0]['id'],
 63 |             job=job,
 64 |             content=json.dumps(mock_ckan.DATASETS[0]))
 65 | 
 66 |         harvester = CKANHarvester()
 67 |         result = harvester.fetch_stage(harvest_object)
 68 | 
 69 |         assert harvest_object.errors == []
 70 |         assert result is True
 71 | 
 72 |     def test_import_normal(self):
 73 |         org = Organization()
 74 |         harvest_object = HarvestObjectObj(
 75 |             guid=mock_ckan.DATASETS[0]['id'],
 76 |             content=json.dumps(mock_ckan.DATASETS[0]),
 77 |             job__source__owner_org=org['id'])
 78 | 
 79 |         harvester = CKANHarvester()
 80 |         result = harvester.import_stage(harvest_object)
 81 | 
 82 |         assert harvest_object.errors == []
 83 |         assert result is True
 84 |         assert harvest_object.package_id
 85 |         dataset = model.Package.get(harvest_object.package_id)
 86 |         assert dataset.name == mock_ckan.DATASETS[0]['name']
 87 | 
 88 |     def test_harvest(self):
 89 |         results_by_guid = run_harvest(
 90 |             url='http://localhost:%s/' % mock_ckan.PORT,
 91 |             harvester=CKANHarvester())
 92 | 
 93 |         result = results_by_guid['dataset1-id']
 94 |         assert result['state'] == 'COMPLETE'
 95 |         assert result['report_status'] == 'added'
 96 |         assert result['dataset']['name'] == mock_ckan.DATASETS[0]['name']
 97 |         assert result['errors'] == []
 98 | 
 99 |         result = results_by_guid[mock_ckan.DATASETS[1]['id']]
100 |         assert result['state'] == 'COMPLETE'
101 |         assert result['report_status'] == 'added'
102 |         assert result['dataset']['name'] == mock_ckan.DATASETS[1]['name']
103 |         assert result['errors'] == []
104 |         assert was_last_job_considered_error_free()
105 | 
106 |     def test_harvest_twice(self):
107 |         run_harvest(
108 |             url='http://localhost:%s/' % mock_ckan.PORT,
109 |             harvester=CKANHarvester())
110 | 
111 |         # change the modified date
112 |         datasets = copy.deepcopy(mock_ckan.DATASETS)
113 |         datasets[1]['metadata_modified'] = '2050-05-09T22:00:01.486366'
114 |         with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS',
115 |                    datasets):
116 |             results_by_guid = run_harvest(
117 |                 url='http://localhost:%s/' % mock_ckan.PORT,
118 |                 harvester=CKANHarvester())
119 | 
120 |         # updated the dataset which has revisions
121 |         result = results_by_guid[mock_ckan.DATASETS[1]['id']]
122 |         assert result['state'] == 'COMPLETE'
123 |         assert result['report_status'] == 'updated'
124 |         assert result['dataset']['name'] == mock_ckan.DATASETS[1]['name']
125 |         assert result['errors'] == []
126 | 
127 |         # the other dataset is unchanged and not harvested
128 |         assert mock_ckan.DATASETS[0]['id'] not in result
129 |         assert was_last_job_considered_error_free()
130 | 
131 |     def test_exclude_organizations(self):
132 |         config = {'organizations_filter_exclude': ['org1']}
133 |         results_by_guid = run_harvest(
134 |             url='http://localhost:%s' % mock_ckan.PORT,
135 |             harvester=CKANHarvester(),
136 |             config=json.dumps(config))
137 |         assert 'dataset1-id' not in results_by_guid
138 |         assert mock_ckan.DATASETS[1]['id'] in results_by_guid
139 | 
140 |     def test_include_organizations(self):
141 |         config = {'organizations_filter_include': ['org1']}
142 |         results_by_guid = run_harvest(
143 |             url='http://localhost:%s' % mock_ckan.PORT,
144 |             harvester=CKANHarvester(),
145 |             config=json.dumps(config))
146 |         assert 'dataset1-id' in results_by_guid
147 |         assert mock_ckan.DATASETS[1]['id'] not in results_by_guid
148 | 
149 |     def test_exclude_groups(self):
150 |         config = {'groups_filter_exclude': ['group1']}
151 |         results_by_guid = run_harvest(
152 |             url='http://localhost:%s' % mock_ckan.PORT,
153 |             harvester=CKANHarvester(),
154 |             config=json.dumps(config))
155 |         assert 'dataset1-id' not in results_by_guid
156 |         assert mock_ckan.DATASETS[1]['id'] in results_by_guid
157 | 
158 |     def test_include_groups(self):
159 |         config = {'groups_filter_include': ['group1']}
160 |         results_by_guid = run_harvest(
161 |             url='http://localhost:%s' % mock_ckan.PORT,
162 |             harvester=CKANHarvester(),
163 |             config=json.dumps(config))
164 |         assert 'dataset1-id' in results_by_guid
165 |         assert mock_ckan.DATASETS[1]['id'] not in results_by_guid
166 | 
167 |     def test_remote_groups_create(self):
168 |         config = {'remote_groups': 'create'}
169 |         results_by_guid = run_harvest(
170 |             url='http://localhost:%s' % mock_ckan.PORT,
171 |             harvester=CKANHarvester(),
172 |             config=json.dumps(config))
173 |         assert 'dataset1-id' in results_by_guid
174 |         # Check that the remote group was created locally
175 |         call_action('group_show', {}, id=mock_ckan.GROUPS[0]['id'])
176 | 
177 |     def test_harvest_info_in_package_show(self):
178 |         results_by_guid = run_harvest(
179 |             url='http://localhost:%s' % mock_ckan.PORT,
180 |             harvester=CKANHarvester())
181 |         assert 'dataset1-id' in results_by_guid
182 | 
183 |         # Check that the dataset extras has the harvest_object_id, harvest_source_id, and harvest_source_title
184 |         dataset = call_action('package_show', {"for_view": True}, id=mock_ckan.DATASETS[0]['id'])
185 |         extras_dict = dict((e['key'], e['value']) for e in dataset['extras'])
186 |         assert 'harvest_object_id' in extras_dict
187 |         assert 'harvest_source_id' in extras_dict
188 |         assert 'harvest_source_title' in extras_dict
189 | 
190 |     def test_remote_groups_only_local(self):
191 |         # Create an existing group
192 |         Group(id='10037fa4-e683-4a67-892a-efba815e24ad', name='group1')
193 | 
194 |         config = {'remote_groups': 'only_local'}
195 |         results_by_guid = run_harvest(
196 |             url='http://localhost:%s' % mock_ckan.PORT,
197 |             harvester=CKANHarvester(),
198 |             config=json.dumps(config))
199 |         assert 'dataset1-id' in results_by_guid
200 | 
201 |         # Check that the dataset was added to the existing local group
202 |         dataset = call_action('package_show', {}, id=mock_ckan.DATASETS[0]['id'])
203 |         assert dataset['groups'][0]['id'] == mock_ckan.DATASETS[0]['groups'][0]['id']
204 | 
205 |         # Check that the other remote group was not created locally
206 |         with pytest.raises(toolkit.ObjectNotFound):
207 |             call_action('group_show', {}, id='remote-group')
208 | 
209 |     def test_harvest_not_modified(self):
210 |         run_harvest(
211 |             url='http://localhost:%s/' % mock_ckan.PORT,
212 |             harvester=CKANHarvester())
213 | 
214 |         results_by_guid = run_harvest(
215 |             url='http://localhost:%s/' % mock_ckan.PORT,
216 |             harvester=CKANHarvester())
217 | 
218 |         # The metadata_modified was the same for this dataset so the import
219 |         # would have returned 'unchanged'
220 |         result = results_by_guid[mock_ckan.DATASETS[1]['id']]
221 |         assert result['state'] == 'COMPLETE'
222 |         assert result['report_status'] == 'not modified'
223 |         assert 'dataset' not in result
224 |         assert result['errors'] == []
225 |         assert was_last_job_considered_error_free()
226 | 
227 |     def test_harvest_whilst_datasets_added(self):
228 |         results_by_guid = run_harvest(
229 |             url='http://localhost:%s/datasets_added' % mock_ckan.PORT,
230 |             harvester=CKANHarvester())
231 | 
232 |         assert sorted(results_by_guid.keys()) == [mock_ckan.DATASETS[1]['id'], mock_ckan.DATASETS[0]['id']]
233 | 
234 |     def test_harvest_site_down(self):
235 |         results_by_guid = run_harvest(
236 |             url='http://localhost:%s/site_down' % mock_ckan.PORT,
237 |             harvester=CKANHarvester())
238 |         assert not results_by_guid
239 |         assert not was_last_job_considered_error_free()
240 | 
241 |     def test_default_tags(self):
242 |         config = {'default_tags': [{'name': 'geo'}]}
243 |         results_by_guid = run_harvest(
244 |             url='http://localhost:%s' % mock_ckan.PORT,
245 |             harvester=CKANHarvester(),
246 |             config=json.dumps(config))
247 |         tags = results_by_guid['dataset1-id']['dataset']['tags']
248 |         tag_names = [tag['name'] for tag in tags]
249 |         assert 'geo' in tag_names
250 | 
251 |     def test_default_tags_invalid(self):
252 |         config = {'default_tags': ['geo']}  # should be list of dicts
253 |         with pytest.raises(toolkit.ValidationError) as harvest_context:
254 |             run_harvest(
255 |                 url='http://localhost:%s' % mock_ckan.PORT,
256 |                 harvester=CKANHarvester(),
257 |                 config=json.dumps(config))
258 |         assert 'default_tags must be a list of dictionaries' in str(harvest_context.value)
259 | 
260 |     def test_default_groups(self):
261 |         Group(name='group1')
262 |         Group(name='group2')
263 |         Group(name='group3')
264 | 
265 |         config = {'default_groups': ['group2', 'group3'],
266 |                   'remote_groups': 'only_local'}
267 |         tmp_c = toolkit.c
268 |         try:
269 |             # c.user is used by the validation (annoying),
270 |             # however patch doesn't work because it's a weird
271 |             # StackedObjectProxy, so we swap it manually
272 |             toolkit.c = MagicMock(user='')
273 |             results_by_guid = run_harvest(
274 |                 url='http://localhost:%s' % mock_ckan.PORT,
275 |                 harvester=CKANHarvester(),
276 |                 config=json.dumps(config))
277 |         finally:
278 |             toolkit.c = tmp_c
279 |         assert results_by_guid['dataset1-id']['errors'] == []
280 |         groups = results_by_guid['dataset1-id']['dataset']['groups']
281 |         group_names = set(group['name'] for group in groups)
282 |         # group1 comes from the harvested dataset
283 |         # group2 & 3 come from the default_groups
284 |         assert group_names, set(('group1', 'group2' == 'group3'))
285 | 
286 |     def test_default_groups_invalid(self):
287 |         Group(name='group2')
288 | 
289 |         # should be list of strings
290 |         config = {'default_groups': [{'name': 'group2'}]}
291 |         with pytest.raises(toolkit.ValidationError) as harvest_context:
292 |             run_harvest(
293 |                 url='http://localhost:%s' % mock_ckan.PORT,
294 |                 harvester=CKANHarvester(),
295 |                 config=json.dumps(config))
296 |         assert 'default_groups must be a list of group names/ids' in str(harvest_context.value)
297 | 
298 |     def test_default_extras(self):
299 |         config = {
300 |             'default_extras': {
301 |                 'encoding': 'utf8',
302 |                 'harvest_url': '{harvest_source_url}/dataset/{dataset_id}'
303 |             }
304 |         }
305 |         results_by_guid = run_harvest(
306 |             url='http://localhost:%s' % mock_ckan.PORT,
307 |             harvester=CKANHarvester(),
308 |             config=json.dumps(config))
309 |         assert results_by_guid['dataset1-id']['errors'] == []
310 |         extras = results_by_guid['dataset1-id']['dataset']['extras']
311 |         extras_dict = dict((e['key'], e['value']) for e in extras)
312 |         assert extras_dict['encoding'] == 'utf8'
313 |         assert extras_dict['harvest_url'] == 'http://localhost:8998/dataset/dataset1-id'
314 | 
315 |     def test_default_extras_invalid(self):
316 |         config = {
317 |             'default_extras': 'utf8',  # value should be a dict
318 |         }
319 |         with pytest.raises(toolkit.ValidationError) as harvest_context:
320 |             run_harvest(
321 |                 url='http://localhost:%s' % mock_ckan.PORT,
322 |                 harvester=CKANHarvester(),
323 |                 config=json.dumps(config))
324 |         assert 'default_extras must be a dictionary' in str(harvest_context.value)
325 | 
326 |     @patch('ckanext.harvest.harvesters.ckanharvester.CKANHarvester.config')
327 |     @patch('ckanext.harvest.harvesters.ckanharvester.requests.get', side_effect=RequestException('Test.value'))
328 |     def test_get_content_handles_request_exception(
329 |         self, mock_requests_get, mock_config
330 |     ):
331 |         mock_config.return_value = {}
332 | 
333 |         harvester = CKANHarvester()
334 | 
335 |         with pytest.raises(ContentFetchError) as context:
336 |             harvester._get_content("http://test.example.gov.uk")
337 | 
338 |         assert str(context.value) == 'Request error: Test.value'
339 | 
340 |     class MockHTTPError(HTTPError):
341 |         def __init__(self):
342 |             self.response = Mock()
343 |             self.response.status_code = 404
344 |             self.request = Mock()
345 |             self.request.url = "http://test.example.gov.uk"
346 | 
347 |     @patch('ckanext.harvest.harvesters.ckanharvester.CKANHarvester.config')
348 |     @patch('ckanext.harvest.harvesters.ckanharvester.requests.get', side_effect=MockHTTPError())
349 |     def test_get_content_handles_http_error(
350 |         self, mock_requests_get, mock_config
351 |     ):
352 |         mock_config.return_value = {}
353 | 
354 |         harvester = CKANHarvester()
355 | 
356 |         with pytest.raises(ContentFetchError) as context:
357 |             harvester._get_content("http://test.example.gov.uk")
358 | 
359 |         assert str(context.value) == 'HTTP error: 404 http://test.example.gov.uk'
360 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/lib.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from ckanext.harvest.tests.factories import HarvestSourceObj, HarvestJobObj
 4 | import ckanext.harvest.model as harvest_model
 5 | from ckanext.harvest import queue
 6 | from ckan.plugins import toolkit
 7 | 
 8 | log = logging.getLogger(__name__)
 9 | 
10 | 
def run_harvest(url, harvester, config=''):
    '''Run a complete harvest for ``url`` and return its results.

    Intended for testing a harvester; a source and a job are created with
    the factories and handed to ``run_harvest_job``, so no queues are used.
    '''
    source_type = harvester.info()['name']

    # The "user" first registers a harvest source...
    harvest_source = HarvestSourceObj(
        url=url, config=config, source_type=source_type)

    # ...and then triggers a harvest, i.e. creates a harvest job. run=False
    # keeps the job off the gather queue.
    harvest_job = HarvestJobObj(source=harvest_source, run=False)

    return run_harvest_job(harvest_job, harvester)
25 | 
26 | 
def run_harvest_job(job, harvester):
    '''Run the gather, fetch and import stages for ``job`` without queues.

    Returns a dict keyed by harvest object guid. Each value holds 'obj_id',
    'state', 'report_status' and 'errors', plus 'dataset' (the package_show
    result) when the object completed and has a package_id.
    '''
    # 'harvest_job_create' would normally call
    # 'harvest_send_job_to_gather_queue', which "runs" the job in two steps.
    # Step 1: mark the job as Running.
    job.status = 'Running'
    job.save()
    # Step 2: the gather queue consumer (queue.gather_callback) would pick
    # the harvester and call gather_stage; here it is called directly.
    gathered = queue.gather_stage(harvester, job)
    # Anything but a list means gather had nothing to do or errored; carry
    # on with no objects so that the job still gets closed properly.
    obj_ids = gathered if isinstance(gathered, list) else []

    # Normally the ids go onto the fetch queue, whose consumer
    # (queue.fetch_callback) calls queue.fetch_and_import_stages.
    results_by_guid = {}
    package_show = None
    for obj_id in obj_ids:
        harvest_object = harvest_model.HarvestObject.get(obj_id)
        guid = harvest_object.guid

        # force reimport of datasets
        if hasattr(job, 'force_import'):
            if guid not in job.force_import:
                log.info('Skipping: %s', guid)
                continue
            harvest_object.force_import = True

        outcome = {'obj_id': obj_id}
        results_by_guid[guid] = outcome

        queue.fetch_and_import_stages(harvester, harvest_object)
        outcome['state'] = harvest_object.state
        outcome['report_status'] = harvest_object.report_status
        if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
            package_show = toolkit.get_action('package_show')
            outcome['dataset'] = package_show(
                {'ignore_auth': True},
                dict(id=harvest_object.package_id))
        outcome['errors'] = harvest_object.errors

    # 'harvest_jobs_run' moves the job status on to 'finished'
    toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})

    return results_by_guid
73 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/test_blueprint.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from ckantoolkit import url_for
  4 | from ckantoolkit.tests import factories
  5 | from ckanext.harvest.tests import factories as harvest_factories
  6 | 
  7 | 
  8 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index')
  9 | class TestBlueprint():
 10 | 
 11 |     def test_index_page_is_rendered(self, app):
 12 | 
 13 |         source1 = harvest_factories.HarvestSource()
 14 |         source2 = harvest_factories.HarvestSource()
 15 | 
 16 |         response = app.get(u'/harvest')
 17 | 
 18 |         assert source1['title'] in response.body
 19 |         assert source2['title'] in response.body
 20 | 
 21 |     def test_new_form_is_rendered(self, app):
 22 | 
 23 |         url = url_for('harvest.new')
 24 |         sysadmin = factories.Sysadmin()
 25 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
 26 | 
 27 |         response = app.get(url, extra_environ=env)
 28 | 
 29 |         assert '<form id="source-new"' in response.body
 30 | 
 31 |     def test_edit_form_is_rendered(self, app):
 32 | 
 33 |         source = harvest_factories.HarvestSource()
 34 | 
 35 |         url = url_for('harvest.edit', id=source['id'])
 36 |         sysadmin = factories.Sysadmin()
 37 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
 38 | 
 39 |         response = app.get(url, extra_environ=env)
 40 | 
 41 |         assert '<form id="source-new"' in response.body
 42 | 
 43 |     def test_source_page_rendered(self, app):
 44 | 
 45 |         source = harvest_factories.HarvestSource()
 46 | 
 47 |         url = url_for('harvest.read', id=source['name'])
 48 |         sysadmin = factories.Sysadmin()
 49 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
 50 | 
 51 |         response = app.get(url, extra_environ=env)
 52 | 
 53 |         assert source['name'] in response.body
 54 | 
 55 |     def test_admin_page_rendered(self, app):
 56 | 
 57 |         source_obj = harvest_factories.HarvestSourceObj()
 58 |         job = harvest_factories.HarvestJob(source=source_obj)
 59 | 
 60 |         sysadmin = factories.Sysadmin()
 61 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
 62 | 
 63 |         url = url_for('harvester.admin', id=source_obj.id)
 64 | 
 65 |         response = app.get(url, extra_environ=env)
 66 | 
 67 |         assert source_obj.title in response.body
 68 | 
 69 |         assert job['id'] in response.body
 70 | 
 71 |     def test_about_page_rendered(self, app):
 72 | 
 73 |         source = harvest_factories.HarvestSource()
 74 | 
 75 |         url = url_for('harvester.about', id=source['name'])
 76 |         sysadmin = factories.Sysadmin()
 77 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
 78 | 
 79 |         response = app.get(url, extra_environ=env)
 80 | 
 81 |         assert source['name'] in response.body
 82 | 
 83 |     def test_job_page_rendered(self, app):
 84 | 
 85 |         job = harvest_factories.HarvestJob()
 86 | 
 87 |         sysadmin = factories.Sysadmin()
 88 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
 89 | 
 90 |         url = url_for('harvester.job_list', source=job['source_id'])
 91 | 
 92 |         response = app.get(url, extra_environ=env)
 93 | 
 94 |         assert job['id'] in response.body
 95 | 
 96 |     def test_job_show_last_page_rendered(self, app):
 97 | 
 98 |         job = harvest_factories.HarvestJob()
 99 | 
100 |         sysadmin = factories.Sysadmin()
101 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
102 | 
103 |         url = url_for('harvester.job_show_last', source=job['source_id'])
104 | 
105 |         response = app.get(url, extra_environ=env)
106 | 
107 |         assert job['id'] in response.body
108 | 
109 |     def test_job_show_page_rendered(self, app):
110 | 
111 |         job = harvest_factories.HarvestJob()
112 | 
113 |         url = url_for(
114 |             'harvester.job_show', source=job['source_id'], id=job['id'])
115 |         sysadmin = factories.Sysadmin()
116 |         env = {"REMOTE_USER": sysadmin['name'].encode('ascii')}
117 | 
118 |         response = app.get(url, extra_environ=env)
119 | 
120 |         assert job['id'] in response.body
121 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/test_queue2.py:
--------------------------------------------------------------------------------
  1 | '''Tests elements of queue.py, but doesn't use the queue subsystem
  2 | (redis/rabbitmq)
  3 | '''
  4 | import json
  5 | 
  6 | import pytest
  7 | 
  8 | from ckan import model
  9 | from ckan import plugins as p
 10 | from ckan.plugins import toolkit
 11 | 
 12 | from ckanext.harvest.tests.factories import (HarvestObjectObj)
 13 | from ckanext.harvest.interfaces import IHarvester
 14 | import ckanext.harvest.model as harvest_model
 15 | from ckanext.harvest.tests.lib import run_harvest
 16 | 
 17 | 
 18 | class MockHarvester(p.SingletonPlugin):
 19 |     p.implements(IHarvester)
 20 | 
 21 |     @classmethod
 22 |     def _set_test_params(cls, guid, **test_params):
 23 |         cls._guid = guid
 24 |         cls._test_params = test_params
 25 | 
 26 |     def info(self):
 27 |         return {'name': 'test2', 'title': 'test', 'description': 'test'}
 28 | 
 29 |     def gather_stage(self, harvest_job):
 30 |         obj = HarvestObjectObj(guid=self._guid, job=harvest_job)
 31 |         return [obj.id]
 32 | 
 33 |     def fetch_stage(self, harvest_object):
 34 |         if self._test_params.get('fetch_object_unchanged'):
 35 |             return 'unchanged'
 36 |         harvest_object.content = json.dumps({'name': harvest_object.guid})
 37 |         harvest_object.save()
 38 |         return True
 39 | 
 40 |     def import_stage(self, harvest_object):
 41 |         user = toolkit.get_action('get_site_user')(
 42 |             {'model': model, 'ignore_auth': True}, {}
 43 |         )['name']
 44 | 
 45 |         package = json.loads(harvest_object.content)
 46 |         name = package['name']
 47 | 
 48 |         package_object = model.Package.get(name)
 49 |         if package_object:
 50 |             logic_function = 'package_update'
 51 |         else:
 52 |             logic_function = 'package_create'
 53 | 
 54 |         package_dict = toolkit.get_action(logic_function)(
 55 |             {'model': model, 'session': model.Session,
 56 |              'user': user},
 57 |             json.loads(harvest_object.content)
 58 |         )
 59 | 
 60 |         if self._test_params.get('object_error'):
 61 |             return False
 62 | 
 63 |         # successful, so move 'current' to this object
 64 |         previous_object = model.Session.query(harvest_model.HarvestObject) \
 65 |                                .filter_by(guid=harvest_object.guid) \
 66 |                                .filter_by(current=True) \
 67 |                                .first()
 68 |         if previous_object:
 69 |             previous_object.current = False
 70 |             previous_object.save()
 71 |         harvest_object.package_id = package_dict['id']
 72 |         harvest_object.current = True
 73 | 
 74 |         if self._test_params.get('delete'):
 75 |             # 'current=False' is the key step in getting report_status to be
 76 |             # set as 'deleted'
 77 |             harvest_object.current = False
 78 |             package_object.save()
 79 | 
 80 |         harvest_object.save()
 81 | 
 82 |         if self._test_params.get('import_object_unchanged'):
 83 |             return 'unchanged'
 84 |         return True
 85 | 
 86 | 
 87 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_queues')
 88 | @pytest.mark.ckan_config('ckan.plugins', 'harvest test_harvester2')
 89 | class TestEndStates(object):
 90 | 
 91 |     def test_create_dataset(self):
 92 |         guid = 'obj-create'
 93 |         MockHarvester._set_test_params(guid=guid)
 94 | 
 95 |         results_by_guid = run_harvest(
 96 |             url='http://some-url.com',
 97 |             harvester=MockHarvester())
 98 | 
 99 |         result = results_by_guid[guid]
100 |         assert result['state'] == 'COMPLETE'
101 |         assert result['report_status'] == 'added'
102 |         assert result['errors'] == []
103 | 
104 |     def test_update_dataset(self):
105 |         guid = 'obj-update'
106 |         MockHarvester._set_test_params(guid=guid)
107 | 
108 |         # create the original harvest_object and dataset
109 |         run_harvest(
110 |             url='http://some-url.com',
111 |             harvester=MockHarvester())
112 |         # update it
113 |         results_by_guid = run_harvest(
114 |             url='http://some-url.com',
115 |             harvester=MockHarvester())
116 | 
117 |         result = results_by_guid[guid]
118 |         assert result['state'] == 'COMPLETE'
119 |         assert result['report_status'] == 'updated'
120 |         assert result['errors'] == []
121 | 
122 |     def test_delete_dataset(self):
123 |         guid = 'obj-delete'
124 |         MockHarvester._set_test_params(guid=guid)
125 |         # create the original harvest_object and dataset
126 |         run_harvest(
127 |             url='http://some-url.com',
128 |             harvester=MockHarvester())
129 |         MockHarvester._set_test_params(guid=guid, delete=True)
130 | 
131 |         # delete it
132 |         results_by_guid = run_harvest(
133 |             url='http://some-url.com',
134 |             harvester=MockHarvester())
135 | 
136 |         result = results_by_guid[guid]
137 |         assert result['state'] == 'COMPLETE'
138 |         assert result['report_status'] == 'deleted'
139 |         assert result['errors'] == []
140 | 
141 |     def test_obj_error(self):
142 |         guid = 'obj-error'
143 |         MockHarvester._set_test_params(guid=guid, object_error=True)
144 | 
145 |         results_by_guid = run_harvest(
146 |             url='http://some-url.com',
147 |             harvester=MockHarvester())
148 | 
149 |         result = results_by_guid[guid]
150 |         assert result['state'] == 'ERROR'
151 |         assert result['report_status'] == 'errored'
152 |         assert result['errors'] == []
153 | 
154 |     def test_fetch_unchanged(self):
155 |         guid = 'obj-error'
156 |         MockHarvester._set_test_params(guid=guid, fetch_object_unchanged=True)
157 | 
158 |         results_by_guid = run_harvest(
159 |             url='http://some-url.com',
160 |             harvester=MockHarvester())
161 | 
162 |         result = results_by_guid[guid]
163 |         assert result['state'] == 'COMPLETE'
164 |         assert result['report_status'] == 'not modified'
165 |         assert result['errors'] == []
166 | 
167 |     def test_import_unchanged(self):
168 |         guid = 'obj-error'
169 |         MockHarvester._set_test_params(guid=guid, import_object_unchanged=True)
170 | 
171 |         results_by_guid = run_harvest(
172 |             url='http://some-url.com',
173 |             harvester=MockHarvester())
174 | 
175 |         result = results_by_guid[guid]
176 |         assert result['state'] == 'COMPLETE'
177 |         assert result['report_status'] == 'not modified'
178 |         assert result['errors'] == []
179 | 


--------------------------------------------------------------------------------
/ckanext/harvest/tests/test_timeouts.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime, timedelta
  2 | import pytest
  3 | from ckan.tests import factories as ckan_factories
  4 | from ckan import model
  5 | from ckan.lib.base import config
  6 | from ckan.plugins.toolkit import get_action
  7 | from ckanext.harvest.tests import factories as harvest_factories
  8 | from ckanext.harvest.logic import HarvestJobExists
  9 | 
 10 | 
 11 | @pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_queues')
 12 | @pytest.mark.ckan_config('ckan.plugins', 'harvest test_action_harvester')
 13 | class TestModelFunctions:
 14 |     dataset_counter = 0
 15 | 
 16 |     def test_timeout_jobs(self):
 17 |         """ Create harvest source, job and objects
 18 |             Validate we read the last object fished time
 19 |             Validate we raise timeout in harvest_jobs_run_action
 20 |             """
 21 |         source, job = self.get_source()
 22 | 
 23 |         self.add_object(job=job, source=source, state='COMPLETE', minutes_ago=10)
 24 |         ob2 = self.add_object(job=job, source=source, state='COMPLETE', minutes_ago=5)
 25 |         self.add_object(job=job, source=source, state='COMPLETE', minutes_ago=15)
 26 | 
 27 |         assert job.get_last_finished_object() == ob2
 28 |         assert job.get_last_action_time() == ob2.import_finished
 29 | 
 30 |         gather_errors = self.run(timeout=3, source=source, job=job)
 31 |         assert len(gather_errors) == 1
 32 |         assert job.status == 'Finished'
 33 |         gather_error = gather_errors[0]
 34 |         assert 'timeout' in gather_error.message
 35 | 
 36 |     def test_no_timeout_jobs(self):
 37 |         """ Test a job that don't raise timeout """
 38 |         source, job = self.get_source()
 39 | 
 40 |         self.add_object(job=job, source=source, state='COMPLETE', minutes_ago=10)
 41 |         ob2 = self.add_object(job=job, source=source, state='COMPLETE', minutes_ago=5)
 42 |         self.add_object(job=job, source=source, state='COMPLETE', minutes_ago=15)
 43 | 
 44 |         assert job.get_last_finished_object() == ob2
 45 |         assert job.get_last_action_time() == ob2.import_finished
 46 | 
 47 |         gather_errors = self.run(timeout=7, source=source, job=job)
 48 |         assert len(gather_errors) == 0
 49 |         assert job.status == 'Finished'
 50 | 
 51 |     def test_no_objects_job(self):
 52 |         """ Test a job that don't raise timeout """
 53 |         _, job = self.get_source()
 54 | 
 55 |         job.gather_finished = datetime.utcnow()
 56 |         job.save()
 57 | 
 58 |         assert job.get_last_finished_object() is None
 59 |         assert job.get_last_action_time() == job.gather_finished
 60 | 
 61 |     def test_no_gathered_job(self):
 62 |         """ Test a job that don't raise timeout """
 63 |         _, job = self.get_source()
 64 | 
 65 |         job.gather_finished = None
 66 |         job.save()
 67 | 
 68 |         assert job.get_last_finished_object() is None
 69 |         assert job.get_last_action_time() == job.created
 70 | 
 71 |     def test_gather_get_last_action_time(self):
 72 |         """ Test get_last_action_time at gather stage """
 73 |         source, job = self.get_source()
 74 | 
 75 |         self.add_object(job=job, source=source, state='WAITING')
 76 |         self.add_object(job=job, source=source, state='WAITING')
 77 |         ob3 = self.add_object(job=job, source=source, state='WAITING')
 78 | 
 79 |         assert job.get_last_gathered_object() == ob3
 80 |         assert job.get_last_action_time() == ob3.gathered
 81 | 
 82 |     def run(self, timeout, source, job):
 83 |         """ Run the havester_job_run and return the errors """
 84 | 
 85 |         # check timeout
 86 |         context = {'model': model, 'session': model.Session,
 87 |                    'ignore_auth': True, 'user': ''}
 88 | 
 89 |         data_dict = {
 90 |             'guid': 'guid',
 91 |             'content': 'content',
 92 |             'job_id': job.id,
 93 |             'source_id': source.id
 94 |         }
 95 | 
 96 |         # prepare the job to run
 97 |         job.gather_finished = datetime.utcnow()
 98 |         job.save()
 99 | 
100 |         # run (we expect a timeout)
101 |         config['ckan.harvest.timeout'] = timeout
102 |         harvest_jobs_run_action = get_action('harvest_jobs_run')
103 |         harvest_jobs_run_action(context, data_dict)
104 | 
105 |         return job.get_gather_errors()
106 | 
107 |     def get_source(self):
108 | 
109 |         SOURCE_DICT = {
110 |             "url": "http://test.timeout.com",
111 |             "name": "test-source-timeout",
112 |             "title": "Test source timeout",
113 |             "notes": "Notes source timeout",
114 |             "source_type": "test-for-action",
115 |             "frequency": "MANUAL"
116 |         }
117 |         source = harvest_factories.HarvestSourceObj(**SOURCE_DICT)
118 |         try:
119 |             job = harvest_factories.HarvestJobObj(source=source)
120 |         except HarvestJobExists:  # not sure why
121 |             job = source.get_jobs()[0]
122 | 
123 |         job.status = 'Running'
124 |         job.save()
125 | 
126 |         jobs = source.get_jobs(status='Running')
127 |         assert job in jobs
128 | 
129 |         return source, job
130 | 
131 |     def add_object(self, job, source, state, minutes_ago=0):
132 |         now = datetime.utcnow()
133 |         self.dataset_counter += 1
134 |         name = 'dataset-{}-{}'.format(state.lower(), self.dataset_counter)
135 |         dataset = ckan_factories.Dataset(name=name)
136 |         obj = harvest_factories.HarvestObjectObj(
137 |             job=job,
138 |             source=source,
139 |             package_id=dataset['id'],
140 |             guid=dataset['id'],
141 |             content='{}',
142 |             # always is WAITING state=state,
143 |             )
144 | 
145 |         obj.state = state
146 |         if minutes_ago > 0:
147 |             obj.import_finished = now - timedelta(minutes=minutes_ago)
148 |         obj.save()
149 |         return obj
150 | 


--------------------------------------------------------------------------------
/ckanext/harvest/views.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import ckantoolkit as tk
  4 | from flask import Blueprint, make_response
  5 | 
  6 | import ckanext.harvest.utils as utils
  7 | 
  8 | # IDatasetForm provides a "harvest" blueprint for the package type harvest.
  9 | # We name the extension blueprint "harvester" to avoid clashing of names.
 10 | harvester = Blueprint("harvester", __name__)
 11 | 
 12 | 
 13 | @harvester.before_request
 14 | def before_request():
 15 |     tk.c.dataset_type = utils.DATASET_TYPE_NAME
 16 | 
 17 | 
 18 | def delete(id):
 19 |     return utils.delete_view(id)
 20 | 
 21 | 
 22 | def refresh(id):
 23 |     return utils.refresh_view(id)
 24 | 
 25 | 
 26 | def admin(id):
 27 |     return utils.admin_view(id)
 28 | 
 29 | 
 30 | def about(id):
 31 |     return utils.about_view(id)
 32 | 
 33 | 
 34 | def clear(id):
 35 |     return utils.clear_view(id)
 36 | 
 37 | 
 38 | def job_list(source):
 39 |     return utils.job_list_view(source)
 40 | 
 41 | 
 42 | def job_show_last(source):
 43 |     return utils.job_show_last_view(source)
 44 | 
 45 | 
 46 | def job_show(source, id):
 47 |     return utils.job_show_view(id)
 48 | 
 49 | 
 50 | def job_abort(source, id):
 51 |     return utils.job_abort_view(source, id)
 52 | 
 53 | 
 54 | def object_show(id, ref_type):
 55 |     (response, content) = utils.object_show_view(id, ref_type, make_response())
 56 |     response.set_data(content)
 57 |     return response
 58 | 
 59 | 
 60 | harvester.add_url_rule(
 61 |     "/" + utils.DATASET_TYPE_NAME + "/delete/<id>",
 62 |     view_func=delete,
 63 | )
 64 | harvester.add_url_rule("/" + utils.DATASET_TYPE_NAME + "/refresh/<id>",
 65 |                        view_func=refresh,
 66 |                        methods=(u'POST', u'GET'))
 67 | harvester.add_url_rule(
 68 |     "/" + utils.DATASET_TYPE_NAME + "/admin/<id>",
 69 |     view_func=admin,
 70 | )
 71 | harvester.add_url_rule(
 72 |     "/" + utils.DATASET_TYPE_NAME + "/about/<id>",
 73 |     view_func=about,
 74 | )
 75 | harvester.add_url_rule("/" + utils.DATASET_TYPE_NAME + "/clear/<id>",
 76 |                        view_func=clear,
 77 |                        methods=(u'POST', u'GET'))
 78 | harvester.add_url_rule(
 79 |     "/" + utils.DATASET_TYPE_NAME + "/<source>/job",
 80 |     view_func=job_list,
 81 | )
 82 | harvester.add_url_rule(
 83 |     "/" + utils.DATASET_TYPE_NAME + "/<source>/job/last",
 84 |     view_func=job_show_last,
 85 | )
 86 | 
 87 | harvester.add_url_rule(
 88 |     "/" + utils.DATASET_TYPE_NAME + "/<source>/job/<id>",
 89 |     view_func=job_show,
 90 | )
 91 | harvester.add_url_rule(
 92 |     "/" + utils.DATASET_TYPE_NAME + "/<source>/job/<id>/abort",
 93 |     view_func=job_abort,
 94 | )
 95 | harvester.add_url_rule(
 96 |     "/" + utils.DATASET_TYPE_NAME + "/object/<id>",
 97 |     view_func=object_show,
 98 |     defaults={"ref_type": "object"},
 99 | )
100 | harvester.add_url_rule(
101 |     "/dataset/harvest_object/<id>",
102 |     view_func=object_show,
103 |     defaults={"ref_type": "dataset"},
104 | )
105 | 
106 | 
def get_blueprints():
    """Return the blueprints defined by this module: a list holding ``harvester``."""
    blueprints = [harvester]
    return blueprints
109 | 


--------------------------------------------------------------------------------
/config/supervisor/ckan_harvesting.conf:
--------------------------------------------------------------------------------
 1 | ; ===============================
 2 | ; ckan harvester example
 3 | ; ===============================
 4 | 
 5 | ; symlink or copy this file to /etc/supervisor/conf.d
 6 | ; change the path/to/virtualenv below to the virtualenv ckan is in.
 7 | 
 8 | [program:ckan_gather_consumer]
 9 | 
10 | ; Full Path to executable, should be path to virtual environment,
11 | ; Full path to config file too.
12 | 
13 | command=/path/to/pyenv/bin/paster --plugin=ckanext-harvest harvester gather_consumer --config=/path/to/config/std.ini
14 | 
15 | ; user that owns virtual environment.
16 | user=ckan
17 | 
18 | numprocs=1
19 | stdout_logfile=/var/log/ckan/std/gather_consumer.log
20 | stderr_logfile=/var/log/ckan/std/gather_consumer.log
21 | autostart=true
22 | autorestart=true
23 | startsecs=10
24 | 
25 | [program:ckan_fetch_consumer]
26 | 
27 | ; Full Path to executable, should be path to virtual environment,
28 | ; Full path to config file too.
29 | 
30 | command=/path/to/pyenv/bin/paster --plugin=ckanext-harvest harvester fetch_consumer --config=/path/to/config/std.ini
31 | 
32 | ; user that owns virtual environment.
33 | user=ckan
34 | 
35 | numprocs=1
36 | stdout_logfile=/var/log/ckan/std/fetch_consumer.log
37 | stderr_logfile=/var/log/ckan/std/fetch_consumer.log
38 | autostart=true
39 | autorestart=true
40 | startsecs=10
41 | 


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | pytest_plugins = [
4 |     u'ckanext.harvest.tests.fixtures',
5 | ]
6 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest-ckan
2 | pytest-cov
3 | factory-boy>=2
4 | mock
5 | 


--------------------------------------------------------------------------------
/docs/admin-tab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-harvest/b74cba23b647f0aefab1db406784dd8bb11f8c7d/docs/admin-tab.png


--------------------------------------------------------------------------------
/pip-requirements.txt:
--------------------------------------------------------------------------------
1 | requirements.txt


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "ckanext-harvest"
 3 | version = "1.6.1"
 4 | description = "Harvesting interface plugin for CKAN, plus harvester for other CKAN sites"
 5 | authors = [
 6 |     {name = "Adrià Mercader", email = "amercadero@gmail.com"}
 7 | ]
 8 | maintainers = [
 9 |     {name = "CKAN Tech Team and contributors", email = "tech-team@ckan.org"},
10 |     {name = "Seitenbau Govdata"},
11 | ]
12 | license = {text = "AGPL"}
13 | classifiers = [
14 |     "Intended Audience :: Developers",
15 |     "Development Status :: 5 - Production/Stable",
16 |     "License :: OSI Approved :: GNU Affero General Public License v3",
17 |     "Programming Language :: Python :: 3.9",
18 |     "Programming Language :: Python :: 3.10",
19 |     "Programming Language :: Python :: 3.11",
20 |     "Programming Language :: Python :: 3.12"
21 | ]
22 | keywords = [
23 |     "ckan",
24 |     "ckanext",
25 |     "harvesting",
26 |     "federation",
27 | ]
28 | dependencies = []
29 | 
30 | [project.urls]
31 | Homepage = "http://github.com/ckan/ckanext-harvest"
32 | Repository = "https://github.com/ckan/ckanext-harvest"
33 | Issues = "https://github.com/ckan/ckanext-harvest/issues"
34 | Changelog = "https://github.com/ckan/ckanext-harvest/blob/master/CHANGELOG.rst"
35 | 
36 | [build-system]
37 | requires = ["setuptools"]
38 | build-backend = "setuptools.build_meta"
39 | 
40 | [project.entry-points."ckan.plugins"]
41 | harvest = "ckanext.harvest.plugin:Harvest"
42 | ckan_harvester = "ckanext.harvest.harvesters:CKANHarvester"
43 | 
44 | # Test plugins
45 | test_harvester = "ckanext.harvest.tests.test_queue:MockHarvester"
46 | test_harvester2 = "ckanext.harvest.tests.test_queue2:MockHarvester"
47 | test_action_harvester = "ckanext.harvest.tests.test_action:MockHarvesterForActionTests"
48 | 
49 | 
50 | [project.entry-points."babel.extractors"]
51 | ckan = "ckan.lib.extract:extract_ckan"
52 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ckantoolkit>=0.0.7
2 | pika>=1.1.0,<1.3.0
3 | redis
4 | requests>=2.11.1
5 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [options]
 2 | packages = find:
 3 | namespace_packages = ckanext
 4 | install_requires =
 5 | include_package_data = True
 6 | 
 7 | [extract_messages]
 8 | keywords = translate isPlural
 9 | add_comments = TRANSLATORS:
10 | output_file = i18n/ckanext-harvest.pot
11 | width = 80
12 | 
13 | [init_catalog]
14 | domain = ckanext-harvest
15 | input_file = i18n/ckanext-harvest.pot
16 | output_dir = i18n
17 | 
18 | [update_catalog]
19 | domain = ckanext-harvest
20 | input_file = i18n/ckanext-harvest.pot
21 | output_dir = i18n
22 | previous = true
23 | 
24 | [compile_catalog]
25 | domain = ckanext-harvest
26 | directory = i18n
27 | statistics = true
28 | 
29 | [flake8]
30 | max-line-length = 127
31 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     message_extractors={
 5 |         'ckanext': [
 6 |             ('**.py', 'python', None),
 7 |             ('**.js', 'javascript', None),
 8 |             ('**/templates/**.html', 'ckan', None),
 9 |         ],
10 |     }
11 | )
12 | 


--------------------------------------------------------------------------------
/test.ini:
--------------------------------------------------------------------------------
 1 | [DEFAULT]
 2 | debug = false
 3 | # Uncomment and replace with the address which should receive any error reports
 4 | #email_to = you@yourdomain.com
 5 | smtp_server = localhost
 6 | error_email_from = paste@localhost
 7 | 
 8 | [server:main]
 9 | use = egg:Paste#http
10 | host = 0.0.0.0
11 | port = 5000
12 | 
13 | 
14 | [app:main]
15 | use = config:../ckan/test-core.ini
16 | # Here we hard-code the database and a flag to make default tests
17 | # run fast.
18 | ckan.plugins = harvest ckan_harvester test_harvester test_harvester2 test_action_harvester
19 | ckan.harvest.mq.type = redis
20 | ckan.legacy_templates = false
21 | # NB: other test configuration should go in test-core.ini, which is
22 | #     what the postgres tests use.
23 | 
24 | 
25 | # Logging configuration
26 | [loggers]
27 | keys = root, ckan, sqlalchemy
28 | 
29 | [handlers]
30 | keys = console, dblog
31 | 
32 | [formatters]
33 | keys = generic, dblog
34 | 
35 | [logger_root]
36 | level = WARN
37 | handlers = console
38 | 
39 | [logger_ckan]
40 | qualname = ckan
41 | handlers = 
42 | level = INFO
43 | 
44 | [logger_ckan_harvester]
45 | qualname = ckanext.harvest
46 | handlers = dblog
47 | level = DEBUG
48 | 
49 | [logger_sqlalchemy]
50 | handlers =
51 | qualname = sqlalchemy.engine
52 | level = WARN  
53 | 
54 | [handler_console]
55 | class = StreamHandler
56 | args = (sys.stdout,)
57 | level = NOTSET
58 | formatter = generic
59 | 
60 | [handler_dblog]
61 | class = ckanext.harvest.log.DBLogHandler
62 | args = ()
63 | level = DEBUG
64 | formatter = dblog
65 | 
66 | [formatter_dblog]
67 | format = %(message)s
68 | 
69 | [formatter_generic]
70 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s
71 | 


--------------------------------------------------------------------------------