{% endblock %} 6 | 7 | {% block main_content %} 8 |
9 |
10 | {% block form %} 11 |

{{ _('Are you sure you want to delete the DataStore and Data Dictionary?') }}

12 |

13 |

14 | {{ h.csrf_input() if 'csrf_input' in h }} 15 | 16 | 17 |
18 |

19 | {% endblock %} 20 |
21 |
22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /ckanext/xloader/webassets/css/xloader.css: -------------------------------------------------------------------------------- 1 | .loader-badge { 2 | margin-left: 10px; 3 | background: #555; 4 | color: #fff; 5 | border-radius: 3px; 6 | display: inline-block; 7 | font-size: 14px; 8 | vertical-align: middle; 9 | font-weight: 400; 10 | line-height: 1.2; 11 | } 12 | 13 | a.loader-badge { 14 | text-decoration: none; 15 | } 16 | 17 | .loader-badge:hover, 18 | .loader-badge:focus { 19 | color: #fff; 20 | } 21 | 22 | .prefix, 23 | .status { 24 | display: inline-block; 25 | padding: 2px 6px; 26 | } 27 | 28 | .loader-badge .status { 29 | border-top-right-radius: 3px; 30 | border-bottom-right-radius: 3px; 31 | } 32 | 33 | .loader-badge .status.active { 34 | background: #97C50F; 35 | } 36 | 37 | .loader-badge .status.complete { 38 | background: #1081C2; 39 | } 40 | 41 | .loader-badge .status.error { 42 | background: #D9634D; 43 | } 44 | 45 | .loader-badge .status.inactive { 46 | background: #F27E3F; 47 | } 48 | 49 | .loader-badge .status.pending { 50 | background: #9B9B9B; 51 | } 52 | 53 | .loader-badge .status.running { 54 | background: #D8B124; 55 | } 56 | 57 | .loader-badge .status.unknown { 58 | background: #9D9D9D; 59 | } 60 | 61 | -------------------------------------------------------------------------------- /ckanext/xloader/schema.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import ckan.plugins as p 4 | import ckanext.datastore.logic.schema as dsschema 5 | 6 | get_validator = p.toolkit.get_validator 7 | 8 | not_missing = get_validator('not_missing') 9 | not_empty = get_validator('not_empty') 10 | resource_id_exists = get_validator('resource_id_exists') 11 | package_id_exists = get_validator('package_id_exists') 12 | ignore_missing = get_validator('ignore_missing') 13 | empty = 
get_validator('empty') 14 | boolean_validator = get_validator('boolean_validator') 15 | int_validator = get_validator('int_validator') 16 | OneOf = get_validator('OneOf') 17 | ignore_not_sysadmin = get_validator('ignore_not_sysadmin') 18 | unicode_safe = get_validator('unicode_safe') 19 | 20 | 21 | def xloader_submit_schema(): 22 | schema = { 23 | 'resource_id': [not_missing, not_empty, unicode_safe], 24 | 'id': [ignore_missing], 25 | 'set_url_type': [ignore_missing, boolean_validator], 26 | 'ignore_hash': [ignore_missing, boolean_validator], 27 | 'sync': [ignore_missing, boolean_validator, ignore_not_sysadmin], 28 | '__junk': [empty], 29 | '__before': [dsschema.rename('id', 'resource_id')] 30 | } 31 | return schema 32 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/ckan_setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from ckan.tests.pytest_ckan.ckan_setup import * # noqa 3 | except ImportError: 4 | import pkg_resources 5 | from paste.deploy import loadapp 6 | import sys 7 | import os 8 | 9 | import pylons 10 | from pylons.i18n.translation import _get_translator 11 | 12 | def pytest_addoption(parser): 13 | """Allow using custom config file during tests. 14 | """ 15 | parser.addoption(u"--ckan-ini", action=u"store") 16 | 17 | def pytest_sessionstart(session): 18 | """Initialize CKAN environment. 
19 | """ 20 | global pylonsapp 21 | path = os.getcwd() 22 | sys.path.insert(0, path) 23 | pkg_resources.working_set.add_entry(path) 24 | pylonsapp = loadapp( 25 | "config:" + session.config.option.ckan_ini, relative_to=path, 26 | ) 27 | 28 | # Initialize a translator for tests that utilize i18n 29 | translator = _get_translator(pylons.config.get("lang")) 30 | pylons.translator._push_object(translator) 31 | 32 | class FakeResponse: 33 | headers = {} # because render wants to delete Pragma 34 | 35 | pylons.response._push_object(FakeResponse) 36 | -------------------------------------------------------------------------------- /test.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debug = false 3 | smtp_server = localhost 4 | error_email_from = paste@localhost 5 | 6 | [server:main] 7 | use = egg:Paste#http 8 | host = 0.0.0.0 9 | port = 5000 10 | 11 | [app:main] 12 | use = config:../ckan/test-core.ini 13 | 14 | # solr_url = http://127.0.0.1:8983/solr 15 | 16 | # Insert any custom config settings to be used when running your extension's 17 | # tests here. 
18 | ckan.plugins = xloader datastore 19 | ckanext.xloader.jobs_db.uri = sqlite:////tmp/jobs.db 20 | 21 | # Logging configuration 22 | [loggers] 23 | keys = root, ckan, ckanext_xloader, sqlalchemy 24 | 25 | [handlers] 26 | keys = console 27 | 28 | [formatters] 29 | keys = generic 30 | 31 | [logger_root] 32 | level = WARN 33 | handlers = console 34 | 35 | [logger_ckan] 36 | qualname = ckan 37 | handlers = console 38 | level = INFO 39 | 40 | [logger_ckanext_xloader] 41 | qualname = ckanext.xloader 42 | handlers = console 43 | level = WARN 44 | 45 | [logger_sqlalchemy] 46 | handlers = 47 | qualname = sqlalchemy.engine 48 | level = WARN 49 | 50 | [handler_console] 51 | class = StreamHandler 52 | args = (sys.stdout,) 53 | level = NOTSET 54 | formatter = generic 55 | 56 | [formatter_generic] 57 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s 58 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/samples/boston_311_sample.csv: -------------------------------------------------------------------------------- 1 | CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source 2 | 101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App 3 | 101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, 
https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App 4 | 101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App 5 | -------------------------------------------------------------------------------- /ckanext/xloader/job_exceptions.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from six import text_type as str 4 | 5 | 6 | class DataTooBigError(Exception): 7 | pass 8 | 9 | 10 | class JobError(Exception): 11 | pass 12 | 13 | 14 | class FileCouldNotBeLoadedError(Exception): 15 | pass 16 | 17 | 18 | class HTTPError(JobError): 19 | """Exception that's raised if a job fails due to an HTTP problem.""" 20 | 21 | def __init__(self, message, status_code, request_url, response): 22 | """Initialise a new HTTPError. 23 | 24 | :param message: A human-readable error message 25 | :type message: string 26 | 27 | :param status_code: The status code of the errored HTTP response, 28 | e.g. 
500 29 | :type status_code: int 30 | 31 | :param request_url: The URL that was requested 32 | :type request_url: string 33 | 34 | :param response: The body of the errored HTTP response as unicode 35 | (if you have a requests.Response object then response.text will 36 | give you this) 37 | :type response: unicode 38 | 39 | """ 40 | super(HTTPError, self).__init__(message) 41 | self.message = message 42 | self.status_code = status_code 43 | self.request_url = request_url 44 | self.response = response 45 | 46 | def __str__(self): 47 | return str('{} status={} url={} response={}'.format( 48 | self.message, self.status_code, self.request_url, self.response) 49 | .encode('ascii', 'replace')) 50 | 51 | 52 | class LoaderError(JobError): 53 | '''Exception that's raised if a load fails''' 54 | pass 55 | 56 | 57 | class XLoaderTimeoutError(JobError): 58 | """Custom timeout exception that can be retried""" 59 | pass 60 | -------------------------------------------------------------------------------- /ckanext/xloader/interfaces.py: -------------------------------------------------------------------------------- 1 | from ckan.plugins.interfaces import Interface 2 | 3 | 4 | class IXloader(Interface): 5 | """ 6 | The IXloader interface allows plugin authors to receive notifications 7 | before and after a resource is submitted to the xloader service, as 8 | well as determining whether a resource should be submitted in can_upload 9 | 10 | The before_submit function, when implemented 11 | """ 12 | 13 | def can_upload(self, resource_id): 14 | """ This call when implemented can be used to stop the processing of 15 | the xloader submit function. This method will not be called if 16 | the resource format does not match those defined in the 17 | ckanext.xloader.formats config option or the default formats. 
18 | 19 | If this function returns False then processing will be aborted, 20 | whilst returning True will submit the resource to the xloader 21 | service 22 | 23 | Note that before reaching this hook there is a prior check on the 24 | resource format, which depends on the value of 25 | the :ref:`ckanext.xloader.formats` configuration option (and requires 26 | the resource to have a format defined). 27 | 28 | :param resource_id: The ID of the resource that is to be 29 | pushed to the xloader service. 30 | 31 | Returns ``True`` if the job should be submitted and ``False`` if 32 | the job should be aborted 33 | 34 | :rtype: bool 35 | """ 36 | return True 37 | 38 | def after_upload(self, context, resource_dict, dataset_dict): 39 | """ After a resource has been successfully upload to the datastore 40 | this method will be called with the resource dictionary and the 41 | package dictionary for this resource. 42 | 43 | :param context: The context within which the upload happened 44 | :param resource_dict: The dict represenstaion of the resource that was 45 | successfully uploaded to the datastore 46 | :param dataset_dict: The dict represenstation of the dataset containing 47 | the resource that was uploaded 48 | """ 49 | pass 50 | -------------------------------------------------------------------------------- /ckanext/xloader/views.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint 2 | 3 | from ckan.plugins.toolkit import _, h, g, render, request, abort, NotAuthorized, get_action, ObjectNotFound 4 | 5 | import ckanext.xloader.utils as utils 6 | 7 | 8 | xloader = Blueprint("xloader", __name__) 9 | 10 | 11 | def get_blueprints(): 12 | return [xloader] 13 | 14 | 15 | @xloader.route("/dataset//resource_data/", methods=("GET", "POST")) 16 | def resource_data(id, resource_id): 17 | rows = request.args.get('rows') 18 | if rows: 19 | try: 20 | rows = int(rows) 21 | if rows < 0: 22 | rows = None 23 | except 
ValueError: 24 | rows = None 25 | return utils.resource_data(id, resource_id, rows) 26 | 27 | 28 | @xloader.route("/dataset//delete-datastore/", methods=("GET", "POST")) 29 | def delete_datastore_table(id, resource_id): 30 | if u'cancel' in request.form: 31 | return h.redirect_to(u'xloader.resource_data', id=id, resource_id=resource_id) 32 | 33 | context = {"user": g.user} 34 | 35 | try: 36 | res_dict = get_action('resource_show')(context, {"id": resource_id}) 37 | if res_dict.get('package_id') != id: 38 | raise ObjectNotFound 39 | except ObjectNotFound: 40 | return abort(404, _(u'Resource not found')) 41 | 42 | if request.method == 'POST': 43 | try: 44 | get_action('datastore_delete')(context, { 45 | "resource_id": resource_id, 46 | "force": True}) 47 | except NotAuthorized: 48 | return abort(403, _(u'Unauthorized to delete resource %s') % resource_id) 49 | 50 | h.flash_notice(_(u'DataStore and Data Dictionary deleted for resource %s') % resource_id) 51 | 52 | return h.redirect_to( 53 | 'xloader.resource_data', 54 | id=id, 55 | resource_id=resource_id 56 | ) 57 | else: 58 | g.resource_id = resource_id 59 | g.package_id = id 60 | 61 | extra_vars = { 62 | u"resource_id": resource_id, 63 | u"package_id": id 64 | } 65 | return render(u'xloader/confirm_datastore_delete.html', extra_vars) 66 | -------------------------------------------------------------------------------- /ckanext/xloader/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import click 5 | from ckanext.xloader.command import XloaderCmd 6 | 7 | # Click commands for CKAN 2.9 and above 8 | 9 | 10 | @click.group(short_help='Perform XLoader related actions') 11 | def xloader(): 12 | """xloader commands 13 | """ 14 | pass 15 | 16 | 17 | @xloader.command() 18 | def status(): 19 | """Shows status of jobs 20 | """ 21 | cmd = XloaderCmd() 22 | cmd.print_status() 23 | 24 | 25 | @xloader.command() 26 | 
@click.argument(u'dataset-spec') 27 | @click.option('-y', is_flag=True, default=False, help='Always answer yes to questions') 28 | @click.option('--dry-run', is_flag=True, default=False, help='Don\'t actually submit any resources') 29 | @click.option('--queue', help='Queue name for asynchronous processing, unused if executing immediately') 30 | @click.option('--sync', is_flag=True, default=False, 31 | help='Execute immediately instead of enqueueing for asynchronous processing') 32 | def submit(dataset_spec, y, dry_run, queue, sync): 33 | """ 34 | xloader submit [options] 35 | """ 36 | cmd = XloaderCmd(dry_run) 37 | 38 | if dataset_spec == 'all': 39 | cmd._setup_xloader_logger() 40 | cmd._submit_all(sync=sync, queue=queue) 41 | elif dataset_spec == 'all-existing': 42 | _confirm_or_abort(y, dry_run) 43 | cmd._setup_xloader_logger() 44 | cmd._submit_all_existing(sync=sync, queue=queue) 45 | else: 46 | pkg_name_or_id = dataset_spec 47 | cmd._setup_xloader_logger() 48 | cmd._submit_package(pkg_name_or_id, sync=sync, queue=queue) 49 | 50 | if cmd.error_occured: 51 | print('Finished but saw errors - see above for details') 52 | sys.exit(1) 53 | 54 | 55 | def get_commands(): 56 | return [xloader] 57 | 58 | 59 | def _confirm_or_abort(yes, dry_run): 60 | if yes or dry_run: 61 | return 62 | question = ( 63 | "Data in any datastore resource that isn't in their source files " 64 | "(e.g. data added using the datastore API) will be permanently " 65 | "lost. Are you sure you want to proceed?" 
66 | ) 67 | if not click.confirm(question): 68 | print("Aborting...") 69 | sys.exit(0) 70 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/samples/polling_locations.kml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | OGRGeoJSON 24 | 25 | 26 | 1 27 | 781862.000004 28 | 2958580.000015 29 | 1 30 | 1 31 | SAMUEL ADAMS ELEMENTARY SCHOOL 32 | 165 WEBSTER STREET 33 | VOTE IN AUDITORIUM 34 | VOTERS ENTER FROM LEFT OF MAIN ENTRANCE 35 | 36 | 37 | -71.0348903104939,42.365563004886 38 | 39 | 40 | 41 | 2 42 | 782174.071396 43 | 2959815.54504 44 | 1 45 | 2 46 | DONALD MCKAY SCHOOL 47 | 122 COTTAGE STREET 48 | VOTE IN CAFETERIA 49 | VOTER ENTRANCE MCKAY PLACE 50 | 51 | 52 | -71.0337105829271,42.368948675863 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /ckanext/xloader/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | from decimal import Decimal, InvalidOperation 4 | import re 5 | import six 6 | 7 | from ckan.plugins.toolkit import asbool 8 | from dateutil.parser import isoparser, parser, ParserError 9 | 10 | from ckan.plugins.toolkit import config 11 | 12 | CSV_SAMPLE_LINES = 1000 13 | DATE_REGEX = re.compile(r'''^\d{1,4}[-/.\s]\S+[-/.\s]\S+''') 14 | 15 | 16 | class TypeConverter: 17 | """ Post-process table cells to convert strings into numbers and timestamps 18 | as desired. 19 | """ 20 | 21 | def __init__(self, types=None, fields=None): 22 | self.types = types 23 | self.fields = fields 24 | 25 | def convert_types(self, extended_rows): 26 | """ Try converting cells to numbers or timestamps if applicable. 27 | If a list of types was supplied, use that. 
28 | If not, then try converting each column to numeric first, 29 | then to a timestamp. If both fail, just keep it as a string. 30 | """ 31 | for row_number, headers, row in extended_rows: 32 | for cell_index, cell_value in enumerate(row): 33 | if cell_value is None: 34 | row[cell_index] = '' 35 | if self.fields: 36 | # only strip white space if strip_extra_white is True 37 | if self.fields[cell_index].get('info', {}).get('strip_extra_white', True) and isinstance(cell_value, six.text_type): 38 | cell_value = cell_value.strip() 39 | row[cell_index] = cell_value.strip() 40 | if not cell_value: 41 | # load_csv parody: empty of string type should be None 42 | if self.types and self.types[cell_index] == six.text_type: 43 | cell_value = None 44 | row[cell_index] = None 45 | continue 46 | cell_type = self.types[cell_index] if self.types else None 47 | if cell_type in [Decimal, None]: 48 | converted_value = to_number(cell_value) 49 | # Can't do a simple truthiness check, 50 | # because 0 is a valid numeric result. 
51 | if converted_value is not None: 52 | row[cell_index] = converted_value 53 | continue 54 | if cell_type in [datetime.datetime, None]: 55 | converted_value = to_timestamp(cell_value) 56 | if converted_value: 57 | row[cell_index] = converted_value 58 | yield (row_number, headers, row) 59 | 60 | 61 | def to_number(value): 62 | if not isinstance(value, six.string_types): 63 | return None 64 | try: 65 | return Decimal(value) 66 | except InvalidOperation: 67 | return None 68 | 69 | 70 | def to_timestamp(value): 71 | if not isinstance(value, six.string_types) or not DATE_REGEX.search(value): 72 | return None 73 | try: 74 | i = isoparser() 75 | return i.isoparse(value) 76 | except ValueError: 77 | try: 78 | p = parser() 79 | yearfirst = asbool(config.get('ckanext.xloader.parse_dates_yearfirst', False)) 80 | dayfirst = asbool(config.get('ckanext.xloader.parse_dates_dayfirst', False)) 81 | return p.parse(value, yearfirst=yearfirst, dayfirst=dayfirst) 82 | except ParserError: 83 | return None 84 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Tests 3 | on: 4 | push: 5 | pull_request: 6 | branches: 7 | - master 8 | workflow_call: 9 | 10 | jobs: 11 | lint: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.10' 18 | - name: Install requirements 19 | run: pip install flake8 pycodestyle 20 | - name: Check syntax 21 | run: flake8 . 
--count --select=E901,E999,F821,F822,F823 --show-source --statistics --extend-exclude ckan 22 | 23 | test: 24 | needs: lint 25 | strategy: 26 | matrix: 27 | include: #ckan-image see https://github.com/ckan/ckan-docker-base, ckan-version controls other image tags 28 | - ckan-version: "2.11" 29 | ckan-image: "2.11-py3.10" 30 | experimental: false 31 | - ckan-version: "2.10" 32 | ckan-image: "2.10-py3.10" 33 | experimental: false 34 | - ckan-version: "master" 35 | ckan-image: "master" 36 | experimental: true # master is unstable, good to know if we are compatible or not 37 | fail-fast: false 38 | 39 | name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} CKAN ${{ matrix.ckan-version }} 40 | runs-on: ubuntu-latest 41 | container: 42 | image: ckan/ckan-dev:${{ matrix.ckan-image }} 43 | options: --user root 44 | services: 45 | solr: 46 | image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9 47 | postgres: 48 | image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} 49 | env: 50 | POSTGRES_USER: postgres 51 | POSTGRES_PASSWORD: postgres 52 | POSTGRES_DB: postgres 53 | ports: 54 | - 5432:5432 55 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 56 | redis: 57 | image: redis:7 58 | env: 59 | CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test 60 | CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test 61 | CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test 62 | CKAN_SOLR_URL: http://solr:8983/solr/ckan 63 | CKAN_REDIS_URL: redis://redis:6379/1 64 | 65 | steps: 66 | - uses: actions/checkout@v4 67 | continue-on-error: ${{ matrix.experimental }} 68 | 69 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Install requirements 70 | continue-on-error: ${{ matrix.experimental }} 71 | run: | 72 | pip install -r requirements.txt 73 | pip install -r dev-requirements.txt 74 | pip install -e . 
75 | pip install -U requests[security] 76 | # Replace default path to CKAN core config file with the one on the container 77 | sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini 78 | 79 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Setup extension 80 | continue-on-error: ${{ matrix.experimental }} 81 | run: | 82 | ckan -c test.ini db init 83 | ckan -c test.ini user add ckan_admin email=ckan_admin@localhost password="AbCdEf12345!@#%" 84 | ckan -c test.ini sysadmin add ckan_admin 85 | ckan config-tool test.ini "ckanext.xloader.api_token=$(ckan -c test.ini user token add ckan_admin xloader | tail -n 1 | tr -d '\t')" 86 | ckan -c test.ini user list 87 | 88 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Run tests 89 | continue-on-error: ${{ matrix.experimental }} 90 | run: pytest --ckan-ini=test.ini --cov=ckanext.xloader --disable-warnings ckanext/xloader/tests --junit-xml=/tmp/artifacts/junit/results.xml 91 | 92 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Test Summary 93 | uses: test-summary/action@v2 94 | continue-on-error: ${{ matrix.experimental }} 95 | with: 96 | paths: "/tmp/artifacts/junit/*.xml" 97 | if: always() -------------------------------------------------------------------------------- /ckanext/xloader/tests/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pytest 4 | 5 | from decimal import Decimal 6 | from datetime import datetime 7 | 8 | from tabulator import Stream 9 | from ckanext.xloader.parser import TypeConverter 10 | 11 | csv_filepath = os.path.abspath( 12 | os.path.join(os.path.dirname(__file__), "samples", "date_formats.csv") 13 | ) 14 | 15 | 16 | class TestParser(object): 17 | def test_simple(self): 18 | with Stream(csv_filepath, format='csv', 19 | post_parse=[TypeConverter().convert_types]) as stream: 20 | assert stream.sample == [ 21 | [ 22 | 
'date', 23 | 'temperature', 24 | 'place' 25 | ], 26 | [ 27 | datetime(2011, 1, 2, 0, 0), 28 | Decimal('-1'), 29 | 'Galway' 30 | ], 31 | [ 32 | datetime(2011, 1, 3, 0, 0), 33 | Decimal('0.5'), 34 | 'Galway' 35 | ], 36 | [ 37 | datetime(2011, 1, 2, 0, 0), 38 | Decimal('5'), 39 | 'Berkeley' 40 | ], 41 | [ 42 | datetime(2003, 11, 1, 0, 0), 43 | Decimal('6'), 44 | 'Berkeley' 45 | ], 46 | ] 47 | 48 | @pytest.mark.ckan_config("ckanext.xloader.parse_dates_dayfirst", True) 49 | def test_dayfirst(self): 50 | print('test_dayfirst') 51 | with Stream(csv_filepath, format='csv', 52 | post_parse=[TypeConverter().convert_types]) as stream: 53 | assert stream.sample == [ 54 | [ 55 | 'date', 56 | 'temperature', 57 | 'place' 58 | ], 59 | [ 60 | datetime(2011, 1, 2, 0, 0), 61 | Decimal('-1'), 62 | 'Galway' 63 | ], 64 | [ 65 | datetime(2011, 3, 1, 0, 0), 66 | Decimal('0.5'), 67 | 'Galway' 68 | ], 69 | [ 70 | datetime(2011, 2, 1, 0, 0), 71 | Decimal('5'), 72 | 'Berkeley' 73 | ], 74 | [ 75 | datetime(2003, 1, 11, 0, 0), 76 | Decimal('6'), 77 | 'Berkeley' 78 | ], 79 | ] 80 | 81 | @pytest.mark.ckan_config("ckanext.xloader.parse_dates_yearfirst", True) 82 | def test_yearfirst(self): 83 | print('test_yearfirst') 84 | with Stream(csv_filepath, format='csv', 85 | post_parse=[TypeConverter().convert_types]) as stream: 86 | assert stream.sample == [ 87 | [ 88 | 'date', 89 | 'temperature', 90 | 'place' 91 | ], 92 | [ 93 | datetime(2011, 1, 2, 0, 0), 94 | Decimal('-1'), 95 | 'Galway' 96 | ], 97 | [ 98 | datetime(2011, 1, 3, 0, 0), 99 | Decimal('0.5'), 100 | 'Galway' 101 | ], 102 | [ 103 | datetime(2011, 1, 2, 0, 0), 104 | Decimal('5'), 105 | 'Berkeley' 106 | ], 107 | [ 108 | datetime(2011, 1, 3, 0, 0), 109 | Decimal('6'), 110 | 'Berkeley' 111 | ], 112 | ] 113 | 114 | @pytest.mark.ckan_config("ckanext.xloader.parse_dates_dayfirst", True) 115 | @pytest.mark.ckan_config("ckanext.xloader.parse_dates_yearfirst", True) 116 | def test_yearfirst_dayfirst(self): 117 | with Stream(csv_filepath, 
format='csv', 118 | post_parse=[TypeConverter().convert_types]) as stream: 119 | assert stream.sample == [ 120 | [ 121 | 'date', 122 | 'temperature', 123 | 'place' 124 | ], 125 | [ 126 | datetime(2011, 1, 2, 0, 0), 127 | Decimal('-1'), 128 | 'Galway' 129 | ], 130 | [ 131 | datetime(2011, 3, 1, 0, 0), 132 | Decimal('0.5'), 133 | 'Galway' 134 | ], 135 | [ 136 | datetime(2011, 2, 1, 0, 0), 137 | Decimal('5'), 138 | 'Berkeley' 139 | ], 140 | [ 141 | datetime(2011, 3, 1, 0, 0), 142 | Decimal('6'), 143 | 'Berkeley' 144 | ], 145 | ] 146 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Publish to pypi 3 | on: 4 | push: 5 | #On versioned releases 6 | tags: 7 | - '*.*.*' 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | inputs: 11 | force: 12 | type: choice 13 | description: Retry Publish Version 14 | options: 15 | - No 16 | - Yes 17 | environment: 18 | description: 'Deployment environment' 19 | required: true 20 | default: 'pypi' 21 | type: choice 22 | options: 23 | - pypi 24 | - testpypi 25 | dryRun: 26 | description: 'Dry Run deployment (set to false to deploy)' 27 | required: true 28 | type: boolean 29 | default: true 30 | 31 | 32 | 33 | jobs: 34 | 35 | validateVersion: 36 | runs-on: ubuntu-latest 37 | if: github.repository == 'ckan/ckanext-xloader' 38 | steps: 39 | - uses: actions/checkout@v4 40 | 41 | - uses: actions/setup-python@v5 42 | with: 43 | python-version: '3.10' 44 | 45 | - name: Validate tag version 46 | if: ${{ startsWith(github.ref, 'refs/tags') }} 47 | run: | 48 | TAG_VALUE=${GITHUB_REF/refs\/tags\//} 49 | PYTHON_VERSION=$(grep -E '\bversion\s?=\s?"[^"]+"' pyproject.toml | awk -F '"' '{print $2}') 50 | echo "Tag version is [$TAG_VALUE], Python version is [$PYTHON_VERSION]" 51 | if [ "$TAG_VALUE" != "$PYTHON_VERSION" ]; then 52 | echo "Version 
mismatch; tag version is [$TAG_VALUE] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY 53 | exit 1 54 | fi 55 | 56 | test: 57 | needs: validateVersion 58 | name: Test 59 | uses: ./.github/workflows/test.yml # Call the reusable workflow 60 | 61 | publishSkipped: 62 | if: github.repository != 'ckan/ckanext-xloader' 63 | runs-on: ubuntu-latest 64 | steps: 65 | - run: | 66 | echo "## Skipping PyPI publish on downstream repository" >> $GITHUB_STEP_SUMMARY 67 | 68 | publish: 69 | needs: test 70 | permissions: 71 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 72 | name: Publish Package 73 | runs-on: ubuntu-latest 74 | environment: 75 | name: ${{ github.event.inputs.environment || 'pypi' }} 76 | url: ${{ steps.version.outputs.url }} 77 | concurrency: 78 | group: ${{ github.event.inputs.environment }}-deployment 79 | cancel-in-progress: false 80 | env: 81 | ENVIRONMENT: ${{ github.event.inputs.environment || 'pypi' }} 82 | steps: 83 | - name: Get Git Tag and set url from environment 84 | id: version 85 | run: | 86 | #!/bin/bash 87 | 88 | TAG_VALUE=${GITHUB_REF/refs\/tags\//} 89 | echo "version=${TAG_VALUE}" >> $GITHUB_OUTPUT 90 | 91 | # Extract the repository name (minus the owner/org) 92 | reponame=$(basename $GITHUB_REPOSITORY) 93 | echo "reponame=${reponame}" >> $GITHUB_OUTPUT 94 | 95 | if [ "$env.ENVIRONMENT" == "testpypi" ]; then 96 | url="https://test.pypi.org/project/$reponame/$TAG_VALUE/" 97 | echo "environment=${env.ENVIRONMENT}" >> $GITHUB_OUTPUT 98 | else 99 | url="https://pypi.org/project/$reponame/$TAG_VALUE/" 100 | echo "environment=pypi" >> $GITHUB_OUTPUT 101 | fi 102 | 103 | echo "url=${url}" >> $GITHUB_OUTPUT 104 | 105 | - name: Checkout repository 106 | uses: actions/checkout@v4 107 | 108 | - name: Build package ${{ steps.version.outputs.reponame }} @ ${{ steps.version.outputs.version }} 109 | run: | 110 | pip install build 111 | pip install twine 112 | python -m build 113 | - name: Publish package 
distributions to PyPI 114 | if: ${{ startsWith(github.ref, 'refs/tags') && steps.version.outputs.environment == 'pypi' && github.event.inputs.dryRun != 'true' }} 115 | uses: pypa/gh-action-pypi-publish@release/v1 116 | # with: 117 | # skip-existing: true 118 | # verbose: true 119 | # print-hash: true 120 | - name: Test Publish package distributions to PyPI 121 | if: ${{ startsWith(github.ref, 'refs/tags') && steps.version.outputs.environment == 'testpypi' && github.event.inputs.dryRun == 'true' }} 122 | uses: pypa/gh-action-pypi-publish@release/v1 123 | with: 124 | repository-url: https://test.pypi.org/legacy/ 125 | # skip-existing: true 126 | # verbose: true 127 | # print-hash: true 128 | - name: Summary output 129 | if: ${{ startsWith(github.ref, 'refs/tags') && github.event.inputs.dryRun != 'true' }} 130 | run: 131 | echo "Published ${{ steps.version.outputs.repo_name }} @ ${{ steps.version.outputs.version }} to ${{ steps.version.outputs.url }}" >> $GITHUB_STEP_SUMMARY 132 | 133 | - name: (TEST RUN) Test Publish package distributions to PyPI 134 | if: ${{ github.event.inputs.dryRun == 'true' }} 135 | run: 136 | echo "Dry run deployment, did not publish" >> $GITHUB_STEP_SUMMARY 137 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ "setuptools",] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "ckanext-xloader" 7 | version = "2.2.0" 8 | description = "Express Loader - quickly load data into CKAN DataStore" 9 | classifiers = [ "Development Status :: 5 - Production/Stable", 10 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 11 | "Programming Language :: Python :: 3.7", 12 | "Programming Language :: Python :: 3.8", 13 | "Programming Language :: Python :: 3.9", 14 | "Programming Language :: Python :: 3.10",] 15 | keywords = [ "CKAN", 
"extension", "datastore",] 16 | dependencies = [ "typing_extensions",] 17 | authors = [ 18 | {name = "ThrawnCA", email = "carl.antuar@smartservice.qld.gov.au"}, 19 | {name = "Jesse Vickery (JVickery-TBS)", email = "jesse.vickery@tbs-sct.gc.ca"}, 20 | {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"}, 21 | {name = "David Read (davidread)"}, 22 | {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"}, 23 | {name = "Patricio Del Boca (pdelboca)"}, 24 | {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"}, 25 | # {name = "", email = ""}, 26 | ] 27 | maintainers = [ 28 | {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"}, 29 | {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"}, 30 | {name = "Ian Ward (wardi)"}, 31 | {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"}, 32 | ] 33 | 34 | [project.readme] 35 | file = "README.md" 36 | content-type = "text/markdown" 37 | 38 | [project.license] 39 | text = "AGPL" 40 | 41 | [project.urls] 42 | Homepage = "https://github.com/ckan/ckanext-xloader" 43 | 44 | [project.optional-dependencies] 45 | test = [ "pytest-factoryboy",] 46 | 47 | [project.entry-points."ckan.plugins"] 48 | xloader = "ckanext.xloader.plugin:xloaderPlugin" 49 | 50 | [project.entry-points."babel.extractors"] 51 | ckan = "ckan.lib.extract:extract_ckan" 52 | 53 | [tool.setuptools.packages] 54 | find = {} 55 | 56 | [tool.black] 57 | line-length = 79 58 | preview = true 59 | 60 | [tool.isort] 61 | known_ckan = "ckan" 62 | known_ckanext = "ckanext" 63 | known_self = "ckanext.xloader" 64 | sections = "FUTURE,STDLIB,FIRSTPARTY,THIRDPARTY,CKAN,CKANEXT,SELF,LOCALFOLDER" 65 | 66 | [tool.pytest.ini_options] 67 | addopts = "--ckan-ini test.ini" 68 | filterwarnings = [ 69 | "ignore::sqlalchemy.exc.SADeprecationWarning", 70 | "ignore::sqlalchemy.exc.SAWarning", 71 | "ignore::DeprecationWarning", 72 | ] 73 | 74 | [tool.pyright] 75 | pythonVersion = "3.7" 76 
| include = ["ckanext"] 77 | exclude = [ 78 | "**/test*", 79 | "**/migration", 80 | ] 81 | strict = [] 82 | 83 | strictParameterNoneValue = true # type must be Optional if default value is None 84 | 85 | # Check the meaning of rules here 86 | # https://github.com/microsoft/pyright/blob/main/docs/configuration.md 87 | reportFunctionMemberAccess = true # non-standard member accesses for functions 88 | reportMissingImports = true 89 | reportMissingModuleSource = true 90 | reportMissingTypeStubs = false 91 | reportImportCycles = true 92 | reportUnusedImport = true 93 | reportUnusedClass = true 94 | reportUnusedFunction = true 95 | reportUnusedVariable = true 96 | reportDuplicateImport = true 97 | reportOptionalSubscript = true 98 | reportOptionalMemberAccess = true 99 | reportOptionalCall = true 100 | reportOptionalIterable = true 101 | reportOptionalContextManager = true 102 | reportOptionalOperand = true 103 | reportTypedDictNotRequiredAccess = false # We are using Context in a way that conflicts with this check 104 | reportConstantRedefinition = true 105 | reportIncompatibleMethodOverride = true 106 | reportIncompatibleVariableOverride = true 107 | reportOverlappingOverload = true 108 | reportUntypedFunctionDecorator = false 109 | reportUnknownParameterType = true 110 | reportUnknownArgumentType = false 111 | reportUnknownLambdaType = false 112 | reportUnknownMemberType = false 113 | reportMissingTypeArgument = true 114 | reportInvalidTypeVarUse = true 115 | reportCallInDefaultInitializer = true 116 | reportUnknownVariableType = true 117 | reportUntypedBaseClass = true 118 | reportUnnecessaryIsInstance = true 119 | reportUnnecessaryCast = true 120 | reportUnnecessaryComparison = true 121 | reportAssertAlwaysTrue = true 122 | reportSelfClsParameterName = true 123 | reportUnusedCallResult = false # allow function calls for side-effect only (like logic.check_acces) 124 | useLibraryCodeForTypes = true 125 | reportGeneralTypeIssues = true 126 | reportPropertyTypeMismatch 
= true 127 | reportWildcardImportFromLibrary = true 128 | reportUntypedClassDecorator = false # authenticator relies on repoze.who class-decorator 129 | reportUntypedNamedTuple = true 130 | reportPrivateUsage = true 131 | reportPrivateImportUsage = true 132 | reportInconsistentConstructor = true 133 | reportMissingSuperCall = false 134 | reportUninitializedInstanceVariable = true 135 | reportInvalidStringEscapeSequence = true 136 | reportMissingParameterType = true 137 | reportImplicitStringConcatenation = false 138 | reportUndefinedVariable = true 139 | reportUnboundVariable = true 140 | reportInvalidStubStatement = true 141 | reportIncompleteStub = true 142 | reportUnsupportedDunderAll = true 143 | reportUnusedCoroutine = true 144 | reportUnnecessaryTypeIgnoreComment = true 145 | reportMatchNotExhaustive = true -------------------------------------------------------------------------------- /ckanext/xloader/tests/test_action.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ckan.plugins import toolkit 3 | try: 4 | from unittest import mock 5 | except ImportError: 6 | import mock 7 | 8 | from ckan.plugins.toolkit import NotAuthorized 9 | from ckan.tests import helpers, factories 10 | 11 | from ckanext.xloader.utils import get_xloader_user_apitoken 12 | 13 | 14 | @pytest.mark.usefixtures("clean_db", "with_plugins") 15 | @pytest.mark.ckan_config("ckan.plugins", "datastore xloader") 16 | class TestAction(object): 17 | 18 | def test_submit(self): 19 | # checks that xloader_submit enqueues the resource (to be xloadered) 20 | user = factories.User() 21 | # normally creating a resource causes xloader_submit to be called, 22 | # but we avoid that by setting an invalid format 23 | res = factories.Resource(user=user, format="aaa") 24 | # mock the enqueue 25 | with mock.patch( 26 | "ckanext.xloader.action.enqueue_job", 27 | return_value=mock.MagicMock(id=123), 28 | ) as enqueue_mock: 29 | helpers.call_action( 30 | 
"xloader_submit", 31 | context=dict(user=user["name"]), 32 | resource_id=res["id"], 33 | ) 34 | assert 1 == enqueue_mock.call_count 35 | 36 | def test_submit_to_custom_queue_without_auth(self): 37 | # check that xloader_submit doesn't allow regular users to change queues 38 | user = factories.User() 39 | with pytest.raises(NotAuthorized): 40 | helpers.call_auth( 41 | "xloader_submit", 42 | context=dict(user=user["name"], model=None), 43 | queue='foo', 44 | ) 45 | 46 | def test_submit_to_custom_queue_as_sysadmin(self): 47 | # check that xloader_submit allows sysadmins to change queues 48 | user = factories.Sysadmin() 49 | assert helpers.call_auth( 50 | "xloader_submit", 51 | context=dict(user=user["name"], model=None), 52 | queue='foo', 53 | ) is True 54 | 55 | def test_duplicated_submits(self): 56 | def submit(res, user): 57 | return helpers.call_action( 58 | "xloader_submit", 59 | context=dict(user=user["name"]), 60 | resource_id=res["id"], 61 | ) 62 | 63 | user = factories.User() 64 | 65 | with mock.patch( 66 | "ckanext.xloader.action.enqueue_job", 67 | return_value=mock.MagicMock(id=123), 68 | ) as enqueue_mock: 69 | enqueue_mock.reset_mock() 70 | # creating the resource causes it to be queued 71 | res = factories.Resource(user=user, format="csv") 72 | assert 1 == enqueue_mock.call_count 73 | # a second request to queue it will be stopped, because of the 74 | # existing task for this resource - shown by task_status_show 75 | submit(res, user) 76 | assert 1 == enqueue_mock.call_count 77 | 78 | def test_xloader_hook(self): 79 | # Check the task_status is stored correctly after a xloader job. 
80 | user = factories.User() 81 | res = factories.Resource(user=user, format="csv") 82 | task_status = helpers.call_action( 83 | "task_status_update", 84 | context={}, 85 | entity_id=res["id"], 86 | entity_type="resource", 87 | task_type="xloader", 88 | key="xloader", 89 | value="{}", 90 | error="{}", 91 | state="pending", 92 | ) 93 | 94 | helpers.call_action( 95 | "xloader_hook", 96 | context=dict(user=user["name"]), 97 | metadata={"resource_id": res["id"]}, 98 | status="complete", 99 | ) 100 | 101 | task_status = helpers.call_action( 102 | "task_status_show", 103 | context={}, 104 | entity_id=res["id"], 105 | task_type="xloader", 106 | key="xloader", 107 | ) 108 | assert task_status["state"] == "complete" 109 | 110 | def test_status(self): 111 | 112 | # Trigger an xloader job 113 | res = factories.Resource(format="CSV") 114 | 115 | status = helpers.call_action( 116 | "xloader_status", 117 | resource_id=res["id"], 118 | ) 119 | 120 | assert status["status"] == "pending" 121 | 122 | def test_xloader_user_api_token_from_config(self): 123 | sysadmin = factories.SysadminWithToken() 124 | apikey = sysadmin["token"] 125 | with mock.patch.dict(toolkit.config, {'ckanext.xloader.api_token': apikey}): 126 | api_token = get_xloader_user_apitoken() 127 | assert api_token == apikey 128 | 129 | @pytest.mark.ckan_config("ckanext.xloader.api_token", "NOT_SET") 130 | def test_xloader_user_api_token_from_config_should_throw_exceptio_when_not_set(self): 131 | 132 | hasNotThrownException = True 133 | try: 134 | get_xloader_user_apitoken() 135 | except Exception: 136 | hasNotThrownException = False 137 | 138 | assert not hasNotThrownException 139 | 140 | @pytest.mark.ckan_config("ckanext.xloader.api_token", "random-api-token") 141 | def test_xloader_user_api_token(self): 142 | api_token = get_xloader_user_apitoken() 143 | 144 | assert api_token == "random-api-token" 145 | -------------------------------------------------------------------------------- 
/ckanext/xloader/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import patch 3 | from ckan.plugins import toolkit 4 | from ckanext.xloader import utils 5 | 6 | 7 | def test_private_modify_url_no_change(): 8 | url = "https://ckan.example.com/dataset" 9 | assert utils._modify_url(url, "https://ckan.example.com") == url 10 | 11 | 12 | @pytest.mark.parametrize("result_url, ckan_url, expected", [ 13 | ("https://example.com/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), 14 | ("https://example.com/resource/123", "http://127.0.0.1:3001", "http://127.0.0.1:3001/resource/123"), 15 | ("https://example.com/resource/123", "http://127.0.0.1:3001/pathnotadded", "http://127.0.0.1:3001/resource/123"), 16 | ("https://ckan.example.org/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"), 17 | ("http://old-ckan.com/resource/456", "http://new-ckan.com", "http://new-ckan.com/resource/456"), 18 | ("https://sub.example.com/path", "https://ckan.example.com", "https://ckan.example.com/path"), 19 | ("ftp://fileserver.com/file", "https://ckan.example.com", "ftp://fileserver.com/file"), # should never happen 20 | ("https://ckan.example.org/resource/789", "https://xloader.example.org", "https://xloader.example.org/resource/789"), 21 | ("https://ckan.example.org/dataset/data", "https://xloader.example.org", "https://xloader.example.org/dataset/data"), 22 | ("https://ckan.example.org/resource/123?foo=bar", "https://xloader.example.org", "https://xloader.example.org/resource/123?foo=bar"), 23 | ("https://ckan.example.org/dataset/456#section", "https://xloader.example.org", "https://xloader.example.org/dataset/456#section"), 24 | ("https://ckan.example.org/resource/123?param=value&other=123", "https://xloader.example.org", "https://xloader.example.org/resource/123?param=value&other=123"), 25 | 
("https://ckan.example.org/resource/partial#fragment", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"), 26 | ("https://ckan.example.org/path/to/data?key=value#section", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"), 27 | ("", "", ""), 28 | ("", "http://127.0.0.1:5000", ""), 29 | (None, None, None), 30 | (None, "http://127.0.0.1:5000", None), 31 | ]) 32 | def test_private_modify_url(result_url, ckan_url, expected): 33 | assert utils._modify_url(result_url, ckan_url) == expected 34 | 35 | 36 | @pytest.mark.parametrize("input_url, ckan_site_url, xloader_site_url, is_altered, expected", [ 37 | ("https://ckan.example.org/resource/789", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/789"), 38 | ("https://ckan.example.org/resource/789", "https://ckan.example.org", "http://127.0.0.1:3012", True, "http://127.0.0.1:3012/resource/789"), 39 | ("https://ckan.example.org/dataset/data", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/dataset/data"), 40 | ("https://ckan.example.org/resource/123?foo=bar", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/123?foo=bar"), 41 | ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/dataset/456#section"), 42 | ("https://other-site.com/resource/999", "https://ckan.example.org", "https://xloader.example.org", False, ""), 43 | ("https://ckan.example.org/resource/123?param=value&other=123", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/123?param=value&other=123"), 44 | ("https://ckan.example.org/resource/partial#fragment", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/partial#fragment"), 45 | 
("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/path/to/data?key=value#section"), 46 | ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "http://localhost:3000", True, "http://localhost:3000/path/to/data?key=value#section"), 47 | ("https://ckan.example.org/blackListedPathToS3HostOrigin?key=value#section", "https://ckan.example.org", "https://xloader.example.org", False, ""), 48 | ("ftp://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", False, ""), 49 | ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "", False, ""), 50 | ("", "http://127.0.0.1:5000", None, False, ""), 51 | ("", "http://127.0.0.1:5000", "", False, ""), 52 | (None, "http://127.0.0.1:5000", None, False, ""), 53 | (None, "http://127.0.0.1:5000", "", False, ""), 54 | ]) 55 | def test_modify_input_url(input_url, ckan_site_url, xloader_site_url, is_altered, expected): 56 | with patch.dict(toolkit.config, 57 | {"ckan.site_url": ckan_site_url, 58 | "ckanext.xloader.site_url": xloader_site_url, 59 | "ckanext.xloader.site_url_ignore_path_regex": "(/blackListedPathToS3HostOrigin|/anotherpath)"}): 60 | response = utils.modify_input_url(input_url) 61 | if is_altered: 62 | assert response == expected 63 | else: 64 | assert response == input_url 65 | 66 | 67 | def test_modify_input_url_no_xloader_site(): 68 | url = "https://ckan.example.org/dataset" 69 | with patch.dict(toolkit.config, {"ckan.site_url": "https://ckan.example.org", "ckanext.xloader.site_url": None}): 70 | assert utils.modify_input_url(url) == url 71 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/fixtures.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sqlalchemy import orm 3 | import os 4 | 5 | from 
ckanext.datastore.tests import helpers as datastore_helpers 6 | from ckanext.xloader.loader import get_write_engine 7 | 8 | __location__ = os.path.realpath( 9 | os.path.join(os.getcwd(), os.path.dirname(__file__)) 10 | ) 11 | 12 | try: 13 | from ckan.tests.pytest_ckan.fixtures import * # noqa 14 | except ImportError: 15 | import pytest 16 | 17 | from ckan.tests import helpers as test_helpers 18 | import ckan.plugins 19 | import ckan.lib.search as search 20 | 21 | from ckan.common import config 22 | 23 | @pytest.fixture 24 | def ckan_config(request, monkeypatch): 25 | """Allows to override the configuration object used by tests 26 | 27 | Takes into account config patches introduced by the ``ckan_config`` 28 | mark. 29 | 30 | If you just want to set one or more configuration options for the 31 | scope of a test (or a test class), use the ``ckan_config`` mark:: 32 | 33 | @pytest.mark.ckan_config('ckan.auth.create_unowned_dataset', True) 34 | def test_auth_create_unowned_dataset(): 35 | 36 | # ... 37 | 38 | To use the custom config inside a test, apply the 39 | ``ckan_config`` mark to it and inject the ``ckan_config`` fixture: 40 | 41 | .. literalinclude:: /../ckan/tests/pytest_ckan/test_fixtures.py 42 | :start-after: # START-CONFIG-OVERRIDE 43 | :end-before: # END-CONFIG-OVERRIDE 44 | 45 | If the change only needs to be applied locally, use the 46 | ``monkeypatch`` fixture 47 | 48 | .. literalinclude:: /../ckan/tests/test_common.py 49 | :start-after: # START-CONFIG-OVERRIDE 50 | :end-before: # END-CONFIG-OVERRIDE 51 | 52 | """ 53 | _original = config.copy() 54 | for mark in request.node.iter_markers(u"ckan_config"): 55 | monkeypatch.setitem(config, *mark.args) 56 | yield config 57 | config.clear() 58 | config.update(_original) 59 | 60 | @pytest.fixture 61 | def make_app(ckan_config): 62 | """Factory for client app instances. 63 | 64 | Unless you need to create app instances lazily for some reason, 65 | use the ``app`` fixture instead. 
66 | """ 67 | return test_helpers._get_test_app 68 | 69 | @pytest.fixture 70 | def app(make_app): 71 | """Returns a client app instance to use in functional tests 72 | 73 | To use it, just add the ``app`` parameter to your test function signature:: 74 | 75 | def test_dataset_search(self, app): 76 | 77 | url = h.url_for('dataset.search') 78 | 79 | response = app.get(url) 80 | 81 | 82 | """ 83 | return make_app() 84 | 85 | @pytest.fixture(scope=u"session") 86 | def reset_db(): 87 | """Callable for resetting the database to the initial state. 88 | 89 | If possible use the ``clean_db`` fixture instead. 90 | 91 | """ 92 | return test_helpers.reset_db 93 | 94 | @pytest.fixture(scope=u"session") 95 | def reset_index(): 96 | """Callable for cleaning search index. 97 | 98 | If possible use the ``clean_index`` fixture instead. 99 | """ 100 | return search.clear_all 101 | 102 | @pytest.fixture 103 | def clean_db(reset_db): 104 | """Resets the database to the initial state. 105 | 106 | This can be used either for all tests in a class:: 107 | 108 | @pytest.mark.usefixtures("clean_db") 109 | class TestExample(object): 110 | 111 | def test_example(self): 112 | 113 | or for a single test:: 114 | 115 | class TestExample(object): 116 | 117 | @pytest.mark.usefixtures("clean_db") 118 | def test_example(self): 119 | 120 | """ 121 | reset_db() 122 | 123 | @pytest.fixture 124 | def clean_index(reset_index): 125 | """Clear search index before starting the test. 126 | """ 127 | reset_index() 128 | 129 | @pytest.fixture 130 | def with_plugins(ckan_config): 131 | """Load all plugins specified by the ``ckan.plugins`` config option 132 | at the beginning of the test. When the test ends (even it fails), it will 133 | unload all the plugins in the reverse order. 134 | 135 | .. 
literalinclude:: /../ckan/tests/test_factories.py 136 | :start-after: # START-CONFIG-OVERRIDE 137 | :end-before: # END-CONFIG-OVERRIDE 138 | 139 | """ 140 | plugins = ckan_config["ckan.plugins"].split() 141 | for plugin in plugins: 142 | if not ckan.plugins.plugin_loaded(plugin): 143 | ckan.plugins.load(plugin) 144 | yield 145 | for plugin in reversed(plugins): 146 | if ckan.plugins.plugin_loaded(plugin): 147 | ckan.plugins.unload(plugin) 148 | 149 | @pytest.fixture 150 | def test_request_context(app): 151 | """Provide function for creating Flask request context. 152 | """ 153 | return app.flask_app.test_request_context 154 | 155 | @pytest.fixture 156 | def with_request_context(test_request_context): 157 | """Execute test inside requests context 158 | """ 159 | with test_request_context(): 160 | yield 161 | 162 | 163 | def reset_datastore_db(): 164 | engine = get_write_engine() 165 | Session = orm.scoped_session(orm.sessionmaker(bind=engine)) 166 | datastore_helpers.clear_db(Session) 167 | 168 | 169 | @pytest.fixture() 170 | def full_reset(reset_db): 171 | reset_db() 172 | reset_datastore_db() 173 | -------------------------------------------------------------------------------- /ckanext/xloader/templates/xloader/resource_data.html: -------------------------------------------------------------------------------- 1 | {% extends "package/resource_edit_base.html" %} 2 | 3 | {% block subtitle %}{{ h.dataset_display_name(pkg) }} - {{ h.resource_display_name(res) }}{% endblock %} 4 | 5 | {% block primary_content_inner %} 6 | 7 | {% set show_table = true %} 8 | 9 | {% block upload_ds_button %} 10 | {% set action = h.url_for('xloader.resource_data', id=pkg.name, resource_id=res.id) %} 11 |
12 | {{ h.csrf_input() if 'csrf_input' in h }} 13 | 16 |
17 | {% endblock %} 18 | 19 |
20 | 21 | {% block delete_ds_button %} 22 | {% if res.datastore_active %} 23 | {% set delete_action = h.url_for('xloader.delete_datastore_table', id=pkg.id, resource_id=res.id) %} 24 |
25 | {{ h.csrf_input() if 'csrf_input' in h }} 26 | {% block delete_datastore_button_text %} {{ _('Delete from DataStore') }}{% endblock %} 33 |
34 | {% endif %} 35 | {% endblock %} 36 | 37 | {% if status.error and status.error.message %} 38 | {% set show_table = false %} 39 |
40 | {{ _('Upload error:') }} {{ status.error.message }} 41 |
42 | {% elif status.task_info and status.task_info.error %} 43 |
44 | {% if status.task_info.error is mapping %} 45 | {{ _('Error:') }} {{ status.task_info.error.message }} 46 | {% for error_key, error_value in status.task_info.error.items() %} 47 | {% if error_key != "message" and error_value %} 48 |
49 | {{ error_key }}: 50 | {{ error_value }} 51 | {% endif %} 52 | {% endfor %} 53 | {% elif status.task_info.error is iterable %} 54 | {{ _('Error traceback:') }} 55 |
{{ ''.join(status.task_info.error) }}
56 | {% endif %} 57 |
58 | {% endif %} 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | {% if status.status %} 72 | 73 | {% else %} 74 | 75 | {% endif %} 76 | 77 |
{{ _('Status') }}{{ h.xloader_status_description(status) }}
{{ _('Last updated') }}{{ h.time_ago_from_timestamp(status.last_updated) }}{{ _('Never') }}
78 | 79 | {% if status.status and status.task_info and show_table %} 80 |

{{ _('Upload Log') }}

81 |
    82 | {% set items = status.task_info.logs %} 83 | {% set rows = rows or 50 %} 84 | {% set skipped_rows = (items | length) - (rows * 2) %} 85 | {% if skipped_rows > 1 %} 86 |
  • 87 | 88 |

    89 | {{ skipped_rows }} out of {{ items | length }} logs will be hidden. 90 |
    91 | 92 | Show more   Show all 93 | 94 |

    95 |
  • 96 | {% endif %} 97 | {% for item in items %} 98 | {# Truncate very long loops, showing just the start and end #} 99 | {% if loop.index <= rows or loop.revindex <= rows 100 | or (loop.index == rows + 1 and loop.revindex == rows + 1) %} 101 | {% set icon = 'ok' if item.level == 'INFO' else 'exclamation' %} 102 | {% set class = ' failure' if icon == 'exclamation' else ' success' %} 103 | {% set popover_content = 'test' %} 104 |
  • 105 | 106 |

    107 | {% for line in item.message.strip().split('\n') %} 108 | {{ line | urlize }}
    109 | {% endfor %} 110 | 111 | {{ h.time_ago_from_timestamp(item.timestamp) }} 112 | {{ _('Details') }} 113 | 114 |

    115 |
  • 116 | {% elif loop.index == rows + 1 %} 117 |
  • 118 | 119 |

    120 | Skipping {{ skipped_rows }} logs... 121 |
    122 | 123 | Show more   Show all 124 | 125 |

    126 |
  • 127 | {% endif %} 128 | {% endfor %} 129 |
  • 130 | 131 |

    {{ _('End of log') }}

    132 |
  • 133 |
134 | {% endif %} 135 | 136 | {% endblock %} 137 | -------------------------------------------------------------------------------- /ckanext/xloader/helpers.py: -------------------------------------------------------------------------------- 1 | import ckan.plugins.toolkit as toolkit 2 | from ckanext.xloader.utils import XLoaderFormats 3 | from markupsafe import Markup 4 | from html import escape as html_escape 5 | 6 | 7 | def xloader_status(resource_id): 8 | try: 9 | return toolkit.get_action('xloader_status')( 10 | {}, {'resource_id': resource_id}) 11 | except toolkit.ObjectNotFound: 12 | return { 13 | 'status': 'unknown' 14 | } 15 | 16 | 17 | def xloader_status_description(status): 18 | _ = toolkit._ 19 | 20 | if status.get('status'): 21 | captions = { 22 | 'complete': _('Complete'), 23 | 'pending': _('Pending'), 24 | 'submitting': _('Submitting'), 25 | 'error': _('Error'), 26 | } 27 | 28 | return captions.get(status['status'], status['status'].capitalize()) 29 | else: 30 | return _('Not Uploaded Yet') 31 | 32 | 33 | def is_resource_supported_by_xloader(res_dict, check_access=True): 34 | is_supported_format = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format')) 35 | is_datastore_active = res_dict.get('datastore_active', False) 36 | user_has_access = not check_access or toolkit.h.check_access( 37 | 'package_update', {'id': res_dict.get('package_id')}) 38 | url_type = res_dict.get('url_type') 39 | if url_type: 40 | try: 41 | is_supported_url_type = url_type not in toolkit.h.datastore_rw_resource_url_types() 42 | except AttributeError: 43 | is_supported_url_type = (url_type in ['upload', 'None']) 44 | else: 45 | is_supported_url_type = True 46 | return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type 47 | 48 | 49 | def xloader_badge(resource): 50 | # type: (dict) -> str 51 | """ 52 | Displays a custom badge for the status of Xloader and DataStore for the specified resource. 
53 | """ 54 | if not toolkit.asbool(toolkit.config.get('ckanext.xloader.show_badges', True)): 55 | return '' 56 | 57 | if not XLoaderFormats.is_it_an_xloader_format(resource.get('format')): 58 | # we only want to show badges for supported xloader formats 59 | return '' 60 | 61 | is_datastore_active = resource.get('datastore_active', False) 62 | 63 | try: 64 | xloader_job = toolkit.get_action("xloader_status")({'ignore_auth': True}, 65 | {"resource_id": resource.get('id')}) 66 | except toolkit.ObjectNotFound: 67 | xloader_job = {} 68 | 69 | if xloader_job.get('status') == 'complete': 70 | # the xloader task is complete, show datastore active or inactive. 71 | # xloader will delete the datastore table at the beggining of the job run. 72 | # so this will only be true if the job is fully finished. 73 | status = 'active' if is_datastore_active else 'inactive' 74 | elif xloader_job.get('status') in ['submitting', 'pending', 'running', 'running_but_viewable', 'error']: 75 | # the job is running or pending or errored 76 | # show the xloader status 77 | status = xloader_job.get('status') 78 | if status == 'running_but_viewable': 79 | # treat running_but_viewable the same as running 80 | status = 'running' 81 | elif status == 'submitting': 82 | # treat submitting the same as pending 83 | status = 'pending' 84 | else: 85 | # we do not know what the status is 86 | status = 'unknown' 87 | 88 | status_translations = { 89 | # Default messages 90 | 'pending': toolkit._('Pending'), 91 | 'running': toolkit._('Running'), 92 | 'error': toolkit._('Error'), 93 | # Debug messages 94 | 'complete': toolkit._('Complete'), 95 | 'active': toolkit._('Active'), 96 | 'inactive': toolkit._('Inactive'), 97 | 'unknown': toolkit._('Unknown'), 98 | } 99 | 100 | status_descriptions = { 101 | # Default messages 102 | 'pending': toolkit._('Data awaiting load to DataStore'), 103 | 'running': toolkit._('Loading data into DataStore'), 104 | 'error': toolkit._('Failed to load data into DataStore'), 105 | # 
Debug messages 106 | 'complete': toolkit._('Data loaded into DataStore'), 107 | 'active': toolkit._('Data available in DataStore'), 108 | 'inactive': toolkit._('Resource not active in DataStore'), 109 | 'unknown': toolkit._('DataStore status unknown'), 110 | } 111 | basic_statuses = ['pending', 'running', 'error'] 112 | 113 | if status not in basic_statuses and not toolkit.asbool(toolkit.config.get('ckanext.xloader.debug_badges', False)): 114 | return '' 115 | 116 | last_updated = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \ 117 | if xloader_job.get('last_updated') else toolkit._('Last Updated Not Available') 118 | 119 | try: 120 | toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')}) 121 | pusher_url = toolkit.h.url_for('xloader.resource_data', 122 | id=resource.get('package_id'), 123 | resource_id=resource.get('id')) 124 | 125 | return Markup(u''' 126 | 127 | {prefix} 128 | {status_display} 129 | '''.format( 130 | pusher_url=pusher_url, 131 | prefix=toolkit._('datastore'), 132 | status=status, 133 | status_display=html_escape(status_translations[status], quote=True), 134 | status_description=html_escape(status_descriptions[status], quote=True), 135 | title=html_escape(last_updated, quote=True))) 136 | except toolkit.NotAuthorized: 137 | return Markup(u''' 138 | 139 | {prefix} 140 | {status_display} 141 | 142 | '''.format( 143 | prefix=toolkit._('datastore'), 144 | status=status, 145 | status_display=html_escape(status_translations[status], quote=True), 146 | status_description=html_escape(status_descriptions[status], quote=True), 147 | title=html_escape(last_updated, quote=True))) 148 | -------------------------------------------------------------------------------- /ckanext/xloader/command.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import logging 5 | import ckan.plugins.toolkit as tk 6 | 7 | from 
ckanext.xloader.jobs import xloader_data_into_datastore_ 8 | from ckanext.xloader.utils import XLoaderFormats, get_xloader_user_apitoken 9 | 10 | 11 | class XloaderCmd: 12 | def __init__(self, dry_run=False): 13 | self.dry_run = dry_run 14 | self.error_occured = False 15 | 16 | def _setup_xloader_logger(self): 17 | # whilst the development.ini's loggers are setup now, because this is 18 | # cli, let's ensure we xloader debug messages are printed for the user 19 | logger = logging.getLogger('ckanext.xloader') 20 | handler = logging.StreamHandler() 21 | formatter = logging.Formatter( 22 | ' %(name)-12s %(levelname)-5s %(message)s') 23 | handler.setFormatter(formatter) 24 | logger.addHandler(handler) 25 | logger.setLevel(logging.DEBUG) 26 | logger.propagate = False # in case the config 27 | 28 | def _submit_all_existing(self, sync=False, queue=None): 29 | from ckanext.datastore.backend \ 30 | import get_all_resources_ids_in_datastore 31 | resource_ids = get_all_resources_ids_in_datastore() 32 | print('Processing %d resources' % len(resource_ids)) 33 | user = tk.get_action('get_site_user')( 34 | {'ignore_auth': True}, {}) 35 | for resource_id in resource_ids: 36 | try: 37 | resource_dict = tk.get_action('resource_show')( 38 | {'ignore_auth': True}, {'id': resource_id}) 39 | except tk.ObjectNotFound: 40 | print(' Skipping resource {} found in datastore but not in ' 41 | 'metadata'.format(resource_id)) 42 | continue 43 | self._submit_resource(resource_dict, user, indent=2, sync=sync, queue=queue) 44 | 45 | def _submit_all(self, sync=False, queue=None): 46 | # submit every package 47 | # for each package in the package list, 48 | # submit each resource w/ _submit_package 49 | package_list = tk.get_action('package_search')( 50 | {'ignore_auth': True}, {'include_private': True, 'rows': 1000}) 51 | package_list = [pkg['id'] for pkg in package_list['results']] 52 | print('Processing %d datasets' % len(package_list)) 53 | user = tk.get_action('get_site_user')( 54 | 
{'ignore_auth': True}, {}) 55 | for p_id in package_list: 56 | self._submit_package(p_id, user, indent=2, sync=sync, queue=queue) 57 | 58 | def _submit_package(self, pkg_id, user=None, indent=0, sync=False, queue=None): 59 | indentation = ' ' * indent 60 | if not user: 61 | user = tk.get_action('get_site_user')( 62 | {'ignore_auth': True}, {}) 63 | 64 | try: 65 | pkg = tk.get_action('package_show')( 66 | {'ignore_auth': True}, 67 | {'id': pkg_id.strip()}) 68 | except Exception as e: 69 | print(e) 70 | print(indentation + 'Dataset "{}" was not found'.format(pkg_id)) 71 | sys.exit(1) 72 | 73 | print(indentation + 'Processing dataset {} with {} resources'.format( 74 | pkg['name'], len(pkg['resources']))) 75 | for resource in pkg['resources']: 76 | try: 77 | resource['package_name'] = pkg['name'] # for debug output 78 | self._submit_resource(resource, user, indent=indent + 2, sync=sync, queue=queue) 79 | except Exception as e: 80 | self.error_occured = True 81 | print(str(e)) 82 | print(indentation + 'ERROR submitting resource "{}" '.format( 83 | resource['id'])) 84 | continue 85 | 86 | def _submit_resource(self, resource, user, indent=0, sync=False, queue=None): 87 | '''resource: resource dictionary 88 | ''' 89 | indentation = ' ' * indent 90 | 91 | if not XLoaderFormats.is_it_an_xloader_format(resource['format']): 92 | print(indentation 93 | + 'Skipping resource {r[id]} because format "{r[format]}" is ' 94 | 'not configured to be xloadered'.format(r=resource)) 95 | return 96 | if resource['url_type'] in ('datapusher', 'xloader'): 97 | print(indentation 98 | + 'Skipping resource {r[id]} because url_type "{r[url_type]}" ' 99 | 'means resource.url points to the datastore ' 100 | 'already, so loading would be circular.'.format( 101 | r=resource)) 102 | return 103 | dataset_ref = resource.get('package_name', resource['package_id']) 104 | print('{indent}{sync_style} /dataset/{dataset}/resource/{r[id]}\n' 105 | '{indent} url={r[url]}\n' 106 | '{indent} format={r[format]}' 
107 | .format(sync_style='Processing' if sync else 'Submitting', 108 | dataset=dataset_ref, r=resource, indent=indentation)) 109 | if self.dry_run: 110 | print(indentation + '(not submitted - dry-run)') 111 | return 112 | data_dict = { 113 | 'resource_id': resource['id'], 114 | 'ignore_hash': True, 115 | } 116 | if sync: 117 | data_dict['ckan_url'] = tk.config.get('ckan.site_url') 118 | input_dict = { 119 | 'metadata': data_dict, 120 | 'api_key': get_xloader_user_apitoken() 121 | } 122 | logger = logging.getLogger('ckanext.xloader.cli') 123 | xloader_data_into_datastore_(input_dict, None, logger) 124 | else: 125 | if queue: 126 | data_dict['queue'] = queue 127 | success = tk.get_action('xloader_submit')({'user': user['name']}, data_dict) 128 | if success: 129 | print(indentation + '...ok') 130 | else: 131 | print(indentation + 'ERROR submitting resource') 132 | self.error_occured = True 133 | 134 | def print_status(self): 135 | import ckan.lib.jobs as rq_jobs 136 | jobs = rq_jobs.get_queue().jobs 137 | if not jobs: 138 | print('No jobs currently queued') 139 | for job in jobs: 140 | job_params = eval(job.description.replace( 141 | 'ckanext.xloader.jobs.xloader_data_into_datastore', '')) 142 | job_metadata = job_params['metadata'] 143 | print('{id} Enqueued={enqueued:%Y-%m-%d %H:%M} res_id={res_id} ' 144 | 'url={url}'.format( 145 | id=job._id, 146 | enqueued=job.enqueued_at, 147 | res_id=job_metadata['resource_id'], 148 | url=job_metadata['original_url'], 149 | )) 150 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/test_plugin.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | import pytest 5 | try: 6 | from unittest import mock 7 | except ImportError: 8 | import mock 9 | from six import text_type as str 10 | 11 | from ckan.tests import helpers, factories 12 | from ckan.logic import _actions 13 | from ckanext.xloader.plugin 
import _should_remove_unsupported_resource_from_datastore 14 | 15 | 16 | @pytest.mark.usefixtures("clean_db", "with_plugins") 17 | @pytest.mark.ckan_config("ckan.plugins", "datastore xloader") 18 | class TestNotify(object): 19 | def test_submit_on_resource_create(self, monkeypatch): 20 | func = mock.Mock() 21 | monkeypatch.setitem(_actions, "xloader_submit", func) 22 | 23 | dataset = factories.Dataset() 24 | 25 | assert not func.called 26 | 27 | helpers.call_action( 28 | "resource_create", 29 | {}, 30 | package_id=dataset["id"], 31 | url="http://example.com/file.csv", 32 | format="CSV", 33 | ) 34 | 35 | assert func.called 36 | 37 | def test_submit_when_url_changes(self, monkeypatch): 38 | func = mock.Mock() 39 | monkeypatch.setitem(_actions, "xloader_submit", func) 40 | 41 | dataset = factories.Dataset() 42 | 43 | resource = helpers.call_action( 44 | "resource_create", 45 | {}, 46 | package_id=dataset["id"], 47 | url="http://example.com/file.pdf", 48 | ) 49 | 50 | assert not func.called # because of the format being PDF 51 | 52 | helpers.call_action( 53 | "resource_update", 54 | {}, 55 | id=resource["id"], 56 | package_id=dataset["id"], 57 | url="http://example.com/file.csv", 58 | format="CSV", 59 | ) 60 | 61 | assert func.called 62 | 63 | @pytest.mark.ckan_config("ckanext.xloader.validation.requires_successful_report", True) 64 | def test_require_validation(self, monkeypatch): 65 | func = mock.Mock() 66 | monkeypatch.setitem(_actions, "xloader_submit", func) 67 | 68 | mock_resource_validation_show = mock.Mock() 69 | monkeypatch.setitem(_actions, "resource_validation_show", mock_resource_validation_show) 70 | 71 | dataset = factories.Dataset() 72 | 73 | resource = helpers.call_action( 74 | "resource_create", 75 | {}, 76 | package_id=dataset["id"], 77 | url="http://example.com/file.csv", 78 | format="CSV", 79 | validation_status='failure', 80 | ) 81 | 82 | # TODO: test IPipeValidation 83 | assert not func.called # because of the validation_status not being `success` 
84 | func.called = None # reset 85 | 86 | helpers.call_action( 87 | "resource_update", 88 | {}, 89 | id=resource["id"], 90 | package_id=dataset["id"], 91 | url="http://example.com/file2.csv", 92 | format="CSV", 93 | validation_status='success', 94 | ) 95 | 96 | # TODO: test IPipeValidation 97 | assert not func.called # because of the validation_status is `success` 98 | 99 | @pytest.mark.ckan_config("ckanext.xloader.validation.requires_successful_report", True) 100 | @pytest.mark.ckan_config("ckanext.xloader.validation.enforce_schema", False) 101 | def test_enforce_validation_schema(self, monkeypatch): 102 | func = mock.Mock() 103 | monkeypatch.setitem(_actions, "xloader_submit", func) 104 | 105 | mock_resource_validation_show = mock.Mock() 106 | monkeypatch.setitem(_actions, "resource_validation_show", mock_resource_validation_show) 107 | 108 | dataset = factories.Dataset() 109 | 110 | resource = helpers.call_action( 111 | "resource_create", 112 | {}, 113 | package_id=dataset["id"], 114 | url="http://example.com/file.csv", 115 | schema='', 116 | validation_status='', 117 | ) 118 | 119 | # TODO: test IPipeValidation 120 | assert not func.called # because of the schema being empty 121 | func.called = None # reset 122 | 123 | helpers.call_action( 124 | "resource_update", 125 | {}, 126 | id=resource["id"], 127 | package_id=dataset["id"], 128 | url="http://example.com/file2.csv", 129 | schema='https://example.com/schema.json', 130 | validation_status='failure', 131 | ) 132 | 133 | # TODO: test IPipeValidation 134 | assert not func.called # because of the validation_status not being `success` and there is a schema 135 | func.called = None # reset 136 | 137 | helpers.call_action( 138 | "resource_update", 139 | {}, 140 | package_id=dataset["id"], 141 | id=resource["id"], 142 | url="http://example.com/file3.csv", 143 | schema='https://example.com/schema.json', 144 | validation_status='success', 145 | ) 146 | 147 | # TODO: test IPipeValidation 148 | assert not func.called # 
because of the validation_status is `success` and there is a schema 149 | 150 | @pytest.mark.parametrize("toolkit_config_value, mock_xloader_formats, url_type, datastore_active, expected_result", [ 151 | # Test1: Should pass as it is an upload with an active datastore entry but an unsupported format 152 | (True, False, 'upload', True, True), 153 | # Test2: Should fail as it is a supported XLoader format. 154 | (True, True, 'upload', True, False), 155 | # Test3: Should fail as the config option is turned off. 156 | (False, False, 'upload', True, False), 157 | # Test4: Should fail as the url_type is not supported. 158 | (True, False, 'custom_type', True, False), 159 | # Test5: Should fail as datastore is inactive. 160 | (True, False, 'upload', False, False), 161 | # Test6: Should pass as it is a recognised resource type with an active datastore entry but an unsupported format 162 | (True, False, '', True, True), 163 | # Test7: Should pass as it is a recognised resource type with an active datastore entry but an unsupported format 164 | (True, False, None, True, True), 165 | ]) 166 | def test_should_remove_unsupported_resource_from_datastore( 167 | self, toolkit_config_value, mock_xloader_formats, url_type, datastore_active, expected_result): 168 | 169 | # Setup mock data 170 | res_dict = { 171 | 'format': 'some_format', 172 | 'url_type': url_type, 173 | 'datastore_active': datastore_active, 174 | 'extras': {'datastore_active': datastore_active} 175 | } 176 | 177 | # Assert the result based on the logic paths covered 178 | with helpers.changed_config('ckanext.xloader.clean_datastore_tables', toolkit_config_value): 179 | with mock.patch('ckanext.xloader.utils.XLoaderFormats.is_it_an_xloader_format') as mock_is_xloader_format: 180 | mock_is_xloader_format.return_value = mock_xloader_formats 181 | assert _should_remove_unsupported_resource_from_datastore(res_dict) == expected_result 182 | 183 | def _pending_task(self, resource_id): 184 | return { 185 | "entity_id": 
resource_id, 186 | "entity_type": "resource", 187 | "task_type": "xloader", 188 | "last_updated": str(datetime.datetime.utcnow()), 189 | "state": "pending", 190 | "key": "xloader", 191 | "value": "{}", 192 | "error": "{}", 193 | } 194 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/test_chunks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pytest 4 | import tempfile 5 | import logging 6 | from typing import Callable, List, Tuple, Any 7 | from unittest.mock import patch, MagicMock 8 | import csv 9 | import sqlalchemy.orm as orm 10 | 11 | from ckan.tests import factories 12 | from ckanext.xloader import loader 13 | from ckanext.xloader.loader import get_write_engine 14 | from ckanext.xloader.tests.test_loader import TestLoadBase, get_sample_filepath 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | @pytest.fixture() 20 | def Session(): 21 | engine = get_write_engine() 22 | Session = orm.scoped_session(orm.sessionmaker(bind=engine)) 23 | yield Session 24 | Session.close() 25 | 26 | 27 | @pytest.mark.usefixtures("full_reset", "with_plugins") 28 | @pytest.mark.ckan_config("ckan.plugins", "datastore xloader") 29 | class TestChunkedLoading(TestLoadBase): 30 | 31 | def _create_mock_split_copy(self, chunk_size: int) -> Callable: 32 | """Create a mock function for split_copy_by_size with specified chunk size""" 33 | original_split_copy = loader.split_copy_by_size 34 | 35 | def mock_split_copy(input_file: Any, engine: Any, logger: Any, resource_id: str, headers: List[str], delimiter: str = ',', max_size: int = 1024**3) -> Any: 36 | return original_split_copy(input_file, engine, logger, resource_id, headers, delimiter, chunk_size) 37 | 38 | return mock_split_copy 39 | 40 | def _create_mock_copy_file(self, copy_calls_list: List[Tuple]) -> Callable: 41 | """Create a mock function for copy_file that tracks calls""" 42 | 
original_copy_file = loader.copy_file 43 | 44 | def mock_copy_file(*args: Any, **kwargs: Any) -> Any: 45 | copy_calls_list.append(args) 46 | return original_copy_file(*args, **kwargs) 47 | 48 | return mock_copy_file 49 | 50 | def _generate_large_csv(self, filepath: str, num_rows: int = 100000, row_size_kb: int = 1) -> Tuple[str, List[str], int]: 51 | """Generate a large CSV file for testing chunked processing""" 52 | headers = ['id', 'name', 'description', 'data'] 53 | 54 | # Create data that will make each row approximately row_size_kb KB 55 | padding_size = (row_size_kb * 1024) - 50 # Account for other columns 56 | padding_data = 'x' * max(1, padding_size) 57 | 58 | with open(filepath, 'w', newline='', encoding='utf-8') as csvfile: 59 | writer = csv.writer(csvfile) 60 | writer.writerow(headers) 61 | 62 | for i in range(num_rows): 63 | writer.writerow([ 64 | i + 1, 65 | f'Name_{i + 1}', 66 | f'Description for row {i + 1}', 67 | padding_data 68 | ]) 69 | 70 | return filepath, headers, num_rows 71 | 72 | def test_chunked_processing_large_file(self, Session: Any) -> None: 73 | """Test that large files are processed in chunks and data integrity is maintained""" 74 | 75 | # Create a temporary large CSV file (~15MB to trigger chunking) 76 | with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp_file: 77 | temp_filepath = temp_file.name 78 | 79 | try: 80 | # Generate file with ~15MB (15000 rows * ~1KB each) 81 | csv_filepath, expected_headers, expected_rows = self._generate_large_csv( 82 | temp_filepath, num_rows=15000, row_size_kb=1 83 | ) 84 | 85 | # Verify file size is large enough to trigger chunking 86 | file_size = os.path.getsize(csv_filepath) 87 | assert file_size > 10 * 1024 * 1024, f"File size {file_size} should be > 10MB" 88 | 89 | resource = factories.Resource() 90 | resource_id = resource['id'] 91 | 92 | # Set up mocks with 10MB chunk size 93 | copy_calls = [] 94 | mock_split_copy = self._create_mock_split_copy(10 * 1024 * 1024) 95 | 
mock_copy_file = self._create_mock_copy_file(copy_calls) 96 | 97 | with patch('ckanext.xloader.loader.split_copy_by_size', side_effect=mock_split_copy): 98 | with patch('ckanext.xloader.loader.copy_file', side_effect=mock_copy_file): 99 | # Load the CSV with chunked processing 100 | fields = loader.load_csv( 101 | csv_filepath, 102 | resource_id=resource_id, 103 | mimetype="text/csv", 104 | logger=logger, 105 | ) 106 | 107 | # Verify chunking occurred (should have multiple copy calls) 108 | assert len(copy_calls) > 1, "Expected multiple chunks but file was not chunked" 109 | 110 | # Verify data integrity - check that all rows were loaded 111 | records = self._get_records(Session, resource_id) 112 | assert len(records) == expected_rows, f"Expected {expected_rows} records, got {len(records)}" 113 | 114 | # Verify column structure 115 | column_names = self._get_column_names(Session, resource_id) 116 | expected_columns = ['_id', '_full_text'] + expected_headers 117 | assert column_names == expected_columns 118 | 119 | # Verify first and last records to ensure data integrity 120 | # Sort records by the 'id' column (index 1) to ensure consistent ordering 121 | sorted_records = sorted(records, key=lambda x: int(x[1])) 122 | first_record = sorted_records[0] 123 | last_record = sorted_records[-1] 124 | 125 | # Check first record (excluding _id and _full_text columns) 126 | # The _get_records method excludes _full_text by default, so indices are: 127 | # 0: _id, 1: id, 2: name, 3: description, 4: data 128 | 129 | assert first_record[1] == '1' # id column (index 1 after _id) 130 | assert first_record[2] == 'Name_1' # name column (index 2) 131 | 132 | # Check last record 133 | assert last_record[1] == str(expected_rows) # id column 134 | assert last_record[2] == f'Name_{expected_rows}' # name column 135 | 136 | finally: 137 | # Clean up temporary file 138 | if os.path.exists(temp_filepath): 139 | os.unlink(temp_filepath) 140 | 141 | def test_small_file_no_chunking(self, 
Session: Any) -> None: 142 | """Test that small files are not chunked when chunk size is larger than file""" 143 | 144 | # Use existing small sample file 145 | csv_filepath = get_sample_filepath("simple.csv") 146 | resource = factories.Resource() 147 | resource_id = resource['id'] 148 | 149 | # Set up mocks with large chunk size to prevent chunking 150 | copy_calls = [] 151 | mock_split_copy = self._create_mock_split_copy(10 * 1024 * 1024) # 10MB 152 | mock_copy_file = self._create_mock_copy_file(copy_calls) 153 | 154 | with patch('ckanext.xloader.loader.split_copy_by_size', side_effect=mock_split_copy): 155 | with patch('ckanext.xloader.loader.copy_file', side_effect=mock_copy_file): 156 | fields = loader.load_csv( 157 | csv_filepath, 158 | resource_id=resource_id, 159 | mimetype="text/csv", 160 | logger=logger, 161 | ) 162 | 163 | # Small file should only have one copy call (no chunking) 164 | assert len(copy_calls) == 1, f"Small file should not be chunked, got {len(copy_calls)} copy calls" 165 | 166 | # Verify data loaded correctly 167 | records = self._get_records(Session, resource_id) 168 | assert len(records) == 6 # Known number of records in simple.csv 169 | 170 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/samples/brazilian_sample.csv: -------------------------------------------------------------------------------- 1 | 
NU_ANO_CENSO,CO_MUNICIPIO,MUNIC,SIGLA,CO_UF,SCHOOLS_NU,SCHOOLS_FED_NU,SCHOOLS_ESTADUAL_NU,SCHOOLS_MUN_NU,SCHOOLS_PRIV_NU,SCHOOLS_FED_STUD,SCHOOLS_ESTADUAL_STUD,SCHOOLS_MUN_STUD,SCHOOLS_PRIV_STUD,SCHOOLS_URBAN_NU,SCHOOLS_RURAL_NU,SCHOOLS_URBAN_STUD,SCHOOLS_RURAL_STUD,SCHOOLS_NIVFUND_1_NU,SCHOOLS_NIVFUND_2_NU,SCHOOLS_EIGHTYEARS_NU,SCHOOLS_NINEYEARS_NU,SCHOOLS_EIGHTYEARS_STUD,SCHOOLS_NINEYEARS_STUD,MATFUND_NU,MATFUND_I_NU,MATFUND_T_NU,SCHOOLS_INTERNET_AVG,SCHOOLS_WATER_PUBLIC_AVG,SCHOOLS_WATER_AVG,SCHOOLS_ELECTR_PUB_AVG,SCHOOLS_SEWAGE_PUB_AVG,SCHOOLS_SEWAGE_AVG,PROFFUNDTOT_NU,PROFFUNDINC_PC,PROFFUNDCOMP_PC,PROFMED_PC,PROFSUP_PC,CLASSSIZE,CLASSSIZE_I,CLASSSIZE_T,STUDTEACH,RATE_APROV,RATE_APROV_I,RATE_APROV_T,RATE_FAILURE,RATE_FAILURE_I,RATE_FAILURE_T,RATE_ABANDON,RATE_ABANDON_I,RATE_ABANDON_T,RATE_TRANSFER,RATE_TRANSFER_I,RATE_TRANSFER_T,RATE_OVERAGE,RATE_OVERAGE_I,RATE_OVERAGE_T,PROVA_MEAN_PORT_I,PROVA_MEAN_PORT_T,PROVA_MEAN_MAT_I,PROVA_MEAN_MAT_T,CLASSSIZE_PUB,STUDTEACH_PUB,RATE_APROV_PUB,RATE_APROV_I_PUB,RATE_APROV_T_PUB,RATE_FAILURE_PUB,RATE_FAILURE_I_PUB,RATE_FAILURE_T_PUB,RATE_ABANDON_PUB,RATE_ABANDON_I_PUB,RATE_ABANDON_T_PUB,RATE_TRANSFER_PUB,RATE_TRANSFER_I_PUB,RATE_TRANSFER_T_PUB,RATE_OVERAGE_PUB,RATE_OVERAGE_I_PUB,RATE_OVERAGE_T_PUB,PROVA_MEAN_PORT_I_PUB,PROVA_MEAN_PORT_T_PUB,PROVA_MEAN_MAT_I_PUB,PROFFUNDTOT_NU_PUB,PROVA_MEAN_MAT_T_PUB,EDUCTEACH_PUB,EDUCTEACH_FEDERAL,EDUCTEACH_STATE,EDUCTEACH_MUN,PROVA_MEAN_PORT_I_STATE,PROVA_MEAN_PORT_T_STATE,PROVA_MEAN_MAT_I_STATE,PROVA_MEAN_MAT_T_STATE,PROVA_MEAN_PORT_I_MUN,PROVA_MEAN_PORT_T_MUN,PROVA_MEAN_MAT_I_MUN,PROVA_MEAN_MAT_T_MUN 2 | 01/01/1996 12:00:00 AM,1100015,ALTA FLORESTA 
D'OESTE,RO,,128,0,8,119,1,0,3613,3051,130,7,121,3716,3078,127,7,,,,,6794,5036,1758,,,,,,,337,0.26112759,0.17210683,0.43323442,0.13353115,24.833692447908199,,,22.704964,67.080006197818605,65.144188573097907,74.672390253375497,16.7913561569619,19.4894563570641,8.649237411458509,7.60165422117368,11.1540090366186,17.263407056738099,8.5269823,9.2213373,5.3085136,52.472769803217503,,,,,,,25.0011414302354,22.830887000000001,66.8150490097632,64.893674212235595,74.288246611754104,17.0725384713319,19.8404105332814,8.856561911292371,7.74275834336647,11.357671741889,17.9410577459881,8.3696527,8.9979973,5.0570836,53.286314230720798,,,,,,122988,,10.155015000000001,14.826086999999999,11.671533,9.072917,,,,,,,, 3 | 01/01/1997 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,94,0,8,85,1,0,2839,2614,148,6,88,2940,2661,94,7,,,,,5601,3570,2031,,0,1,0.063829787,0,0.93617022,287,0,0,0,0,25.0833657500872,,,21.250907999999999,71.110977629352107,70.1419150990167,75.763059126544903,15.245370982682999,16.496493591540201,8.98147212940713,6.94622497346647,15.7921332337152,12.6757637455453,6.6974254,8.1282864,4.9043164,,,,,,,,25.262045033236401,21.381359,70.672316388471998,69.521445456705493,75.431639575393902,15.5843352556537,16.889994178079299,9.30720180061441,7.13628622250936,16.242641275077901,13.369555548186099,6.6070609,8.2059631,4.3434491,,,,,,,108150,,,14.555555,,,,,,,,,, 4 | 01/01/1998 12:00:00 AM,1100015,ALTA FLORESTA 
D'OESTE,RO,,99,0,11,86,2,0,3021,2976,230,6,93,3035,3192,99,11,,,,,6227,3909,2318,,0,1,0.070707068,0,0.969697,297,0.13131313,0.23905724,0.48148149,0.14814815,25.1785270610666,,,23.833083999999999,70.647780398333097,70.898689527007704,74.367528940283705,13.4900411515016,13.638060818557699,9.181863672836959,7.76335222032492,18.221439042002501,13.523162982904299,8.0988264,9.7743425,5.6417899,45.848722840075197,122.78195020025601,94.471234723740395,,,,,25.4629641919806,24.20215,70.573802067316294,70.824099405698007,73.998407111957505,13.5963380925552,13.773877099133999,9.24169826321147,7.85336854029679,18.705919883281801,13.848521521423599,7.976491,9.7092905,5.6522574,47.140236294227797,127.024045994386,96.805895791551293,,,,90085,,10.395683,11,10.990741,10.017647,,,,,,,, 5 | 01/01/1999 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,97,0,11,84,2,0,3116,2621,197,6,91,3120,2814,97,12,,,,,5934,3872,2062,,0.020618556,0.96907216,0.12371134,0,0.88659793,362,0.082872927,0.14640884,0.50552487,0.26519337,24.567049566647501,,,18.29768,69.648254925924505,71.519813426042504,74.226500780418604,15.3810794145337,14.053312169428599,11.945412913064001,5.31873054336247,12.260287808610601,9.913036606242409,9.651934600000001,11.237819999999999,6.5629535,43.090663904769201,41.287557072538199,50.809843587814697,,,,,24.866690846463602,18.443901,69.401824973613202,71.312204366447503,74.226500780418604,15.686910383830501,14.3316018624195,11.945412913064001,5.36299549793561,12.448538350522799,9.913036606242409,9.5482683,11.167047999999999,6.5629535,43.873104357440901,41.985104684362597,51.777890266455998,,,,125964,,11.404692000000001,13,12.019608,10.904254999999999,,,,,,,, 6 | 01/01/2000 12:00:00 AM,1100015,ALTA FLORESTA 
D'OESTE,RO,,96,0,11,83,2,0,3006,2832,216,6,90,3000,3054,96,12,,,,,6054,3848,2206,0,0.020833334,0.98958331,0.13541667,0.010416667,1,354,0.048022598,0.14124294,0.5367232,0.27401131,23.448200576487299,,,19.703617000000001,,,,,,,,,,,,,39.478031148207897,0.304947477971829,0.466826025025085,,,,,23.7720972043514,19.964548000000001,,,,,,,,,,,,,40.253511576997198,0.309378559617233,0.473848650082831,,,,112075,,11.498488999999999,12.428572000000001,11.962121,11.190955000000001,,,,,,,, 7 | 01/01/2001 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,94,0,11,82,1,0,3179,3030,149,6,88,3226,3132,94,11,,,,,6358,3941,2417,0,0.031914894,0.92553192,0.14893617,0.010638298,0.91489363,355,0.03943662,0.11549295,0.61971831,0.22535211,23.934857271571801,,,20.544874,,,,,,,,,,,,,38.4869456756186,26.335080147989501,49.929375244514603,,,,,23.998429857027599,20.527096,,,,,,,,,,,,,38.975680552467402,26.532201566371999,49.929375244514603,,,,122647,,11.505747,11,11.798450000000001,11.333333,,,,,,,, 8 | 01/01/2002 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,87,0,11,76,0,0,3102,3112,0,5,82,3087,3127,87,17,,,,,6214,3510,2704,0,0.011494253,0.98850572,0.14942528,0,0.91954023,340,0.020588236,0.0029411765,0.7088235,0.26764706,24.9124169093372,,,20.526834000000001,,,,,,,,,,,,,37.206307733484799,26.134087850013799,45.999435780294696,,,,,24.9124169093372,20.526834000000001,,,,,,,,,,,,,37.206307733484799,26.134087850013799,45.999435780294696,,,,116976,,12,9,12.273438000000001,11.834906,,,,,,,, 9 | 01/01/2003 12:00:00 AM,1100015,ALTA FLORESTA 
D'OESTE,RO,,90,0,14,75,1,0,2990,3117,38,7,83,2987,3158,90,17,,,,,6145,3377,2768,0.2,0.022222223,1,0.21111111,0,0.85555553,330,0.036363635,0.0030303029,0.71515149,0.24545455,24.4788346665384,,,20.743715000000002,73.564145784103204,66.042258510608505,53.202522584121603,10.2361585853628,10.611000623557601,5.53406858121018,5.46809248299976,2.42417679682655,6.2575464876816,10.731604000000001,20.922564000000001,35.005862999999998,34.808787714164303,24.204907314713299,42.741019322483503,,,,,24.5720384846698,20.793973999999999,73.564145784103204,66.042258510608505,53.202522584121603,10.2361585853628,10.611000623557601,5.53406858121018,5.46809248299976,2.42417679682655,6.2575464876816,10.731604000000001,20.922564000000001,35.005862999999998,35.009006141230302,24.339144498095099,42.741019322483503,,,,105342,,11.871560000000001,13.777778,12.165355,11.685,,,,,,,, 10 | 01/01/2004 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,80,0,13,66,1,0,2879,2994,37,7,73,2935,2975,80,17,,,,,5910,3174,2736,0.16666667,0.025,1,0.30000001,0,0.89999998,333,0.015015015,0.027027028,0.37537536,0.58258259,24.812663731357102,,,20.061893000000001,70.6676315980514,72.835550853161195,67.109379689216198,10.910058940768,11.702339327436199,10.5794508698459,7.32243167260778,3.38640385403245,11.0340198138212,11.099878,12.075706,11.277148,32.588832626084503,23.330968231678799,39.2893599428049,,,,,24.852433620350901,20.110583999999999,70.587837445437501,72.769414628767805,67.109379689216198,10.9337923766684,11.731064143262801,10.5794508698459,7.36856311682479,3.4077382559734,11.0340198138212,11.109807,12.091783,11.277148,32.794142826521302,23.4779537287964,39.2893599428049,,,,102497,,13.212121,14.6,13.775,12.890476,,,,,,,, 11 | 01/01/2005 12:00:00 AM,1100015,ALTA FLORESTA 
D'OESTE,RO,,81,0,14,66,1,0,2804,2843,37,8,73,3022,2662,81,17,81,0,5684,0,5684,3046,2638,0.049382716,0.037037037,1,0.33333334,0,0.91358024,334,0.01497006,0.0089820363,0.42814371,0.54790419,24.101050315651499,,,20.160371999999999,69.997465446274404,71.240851300523303,69.625995047378595,12.323755200959599,14.7737897338986,8.58658350425592,6.54434281992422,3.01945352250705,10.509938178431399,11.134436000000001,10.965906,11.277483,30.735397747920299,23.638617608347499,35.446829331059703,,,,,24.1781541832109,20.211655,69.922556526071503,71.174219688094297,69.625995047378595,12.3641886198936,14.830533206398499,8.58658350425592,6.58790897399712,3.03955423865184,10.509938178431399,11.125344999999999,10.955693,11.277483,30.901363696633101,23.758084377821099,35.446829331059703,,,,94217,,13.114803,11,13.990826,12.684685,,,,,,,, 12 | -------------------------------------------------------------------------------- /ckanext/xloader/config_declaration.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | groups: 3 | - annotation: ckanext-xloader settings 4 | options: 5 | - key: ckanext.xloader.site_url 6 | example: http://ckan-dev:5000 7 | default: 8 | description: | 9 | Provide an alternate site URL for the xloader_submit action. 10 | This is useful, for example, when the site is running within a docker network. 11 | Note: This setting will not alter path. i.e ckan.root_path 12 | required: false 13 | - key: ckanext.xloader.site_url_ignore_path_regex 14 | example: "(/PathToS3HostOriginIWantToGoDirectTo|/anotherPath)" 15 | default: 16 | description: | 17 | Provide the ability to ignore paths which can't be mapped to alternative site URL for resource access. 18 | This is useful, for example, when the site is running within a docker network and the cdn front door has 19 | Blob storage mapped to another path on the same domain. 
20 | required: false 21 | - key: ckanext.xloader.jobs_db.uri 22 | default: sqlite:////tmp/xloader_jobs.db 23 | description: | 24 | The connection string for the jobs database used by XLoader. The 25 | default of an sqlite file is fine for development. For production use a 26 | Postgresql database. 27 | validators: not_missing 28 | required: true 29 | - key: ckanext.xloader.api_token 30 | example: eyJ0eXAiOiJKV1QiLCJh.eyJqdGkiOiJ0M2VNUFlQWFg0VU.8QgV8em4RA 31 | description: | 32 | Uses a specific API token for the xloader_submit action instead of the 33 | apikey of the site_user. 34 | default: 'NOT_SET' 35 | required: true 36 | - key: ckanext.xloader.formats 37 | example: csv application/csv xls application/vnd.ms-excel 38 | description: | 39 | The formats that are accepted. If the value of the resource.format is 40 | anything else then it won't be 'xloadered' to DataStore (and will therefore 41 | only be available to users in the form of the original download/link). 42 | Case insensitive. Defaults are listed in plugin.py. 43 | required: false 44 | - key: ckanext.xloader.max_content_length 45 | default: 1_000_000_000 46 | example: 100000 47 | description: | 48 | The maximum file size that XLoader will attempt to load. 49 | type: int 50 | required: false 51 | - key: ckanext.xloader.use_type_guessing 52 | default: False 53 | example: False 54 | description: | 55 | By default, xloader will first try to add tabular data to the DataStore 56 | with a direct PostgreSQL COPY. This is relatively fast, but does not 57 | guess column types. If this fails, xloader falls back to a method more 58 | like DataPusher's behaviour. This has the advantage that the column types 59 | are guessed. However it is more error prone and far slower. 60 | To always skip the direct PostgreSQL COPY and use type guessing, set 61 | this option to True. 
62 | type: bool 63 | required: false 64 | legacy_key: ckanext.xloader.just_load_with_messytables 65 | - key: ckanext.xloader.strict_type_guessing 66 | default: True 67 | example: False 68 | description: | 69 | Use with ckanext.xloader.use_type_guessing to set strict true or false 70 | for type guessing. If set to False, the types will always fallback to string type. 71 | 72 | Strict means that a type will not be guessed if parsing fails for a single cell in the column. 73 | type: bool 74 | - key: ckanext.xloader.max_type_guessing_length 75 | default: 0 76 | example: 100000 77 | description: | 78 | The maximum file size that will be passed to Tabulator if the 79 | use_type_guessing flag is enabled. Larger files will use COPY even if 80 | the flag is set. Defaults to 1/10 of the maximum content length. 81 | type: int 82 | required: false 83 | - key: ckanext.xloader.parse_dates_dayfirst 84 | default: False 85 | example: False 86 | description: | 87 | Whether ambiguous dates should be parsed day first. Defaults to False. 88 | If set to True, dates like '01.02.2022' will be parsed as day = 01, 89 | month = 02. 90 | NB: isoformat dates like '2022-01-02' will be parsed as YYYY-MM-DD, and 91 | this option will not override that. 92 | See https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse 93 | for more details. 94 | type: bool 95 | required: false 96 | - key: ckanext.xloader.parse_dates_yearfirst 97 | default: False 98 | example: False 99 | description: | 100 | Whether ambiguous dates should be parsed year first. Defaults to False. 101 | If set to True, dates like '01.02.03' will be parsed as year = 2001, 102 | month = 02, day = 03. See https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse 103 | for more details. 104 | type: bool 105 | required: false 106 | - key: ckanext.xloader.job_timeout 107 | default: 3600 108 | example: 3600 109 | description: | 110 | The maximum time for the loading of a resource before it is aborted. 
111 | Give an amount in seconds. Default is 60 minutes 112 | type: int 113 | required: false 114 | - key: ckanext.xloader.ignore_hash 115 | default: False 116 | example: False 117 | description: | 118 | Ignore the file hash when submitting to the DataStore, if set to True 119 | resources are always submitted (if their format matches), if set to 120 | False (default), resources are only submitted if their hash has changed. 121 | type: bool 122 | required: false 123 | - key: ckanext.xloader.max_excerpt_lines 124 | default: 0 125 | example: 100 126 | description: | 127 | When loading a file that is bigger than `max_content_length`, xloader can 128 | still try and load some of the file, which is useful to display a 129 | preview. Set this option to the desired number of lines/rows that it 130 | loads in this case. 131 | If the file-type is supported (CSV, TSV) an excerpt with the number of 132 | `max_excerpt_lines` lines will be submitted while the `max_content_length` 133 | is not exceeded. 134 | If set to 0 (default) files that exceed the `max_content_length` will 135 | not be loaded into the datastore. 136 | type: int 137 | required: false 138 | - key: ckanext.xloader.ssl_verify 139 | default: True 140 | example: True 141 | description: | 142 | Requests verifies SSL certificates for HTTPS requests. Setting verify to 143 | False should only be enabled during local development or testing. Default 144 | to True. 145 | type: bool 146 | required: false 147 | - key: ckanext.xloader.validation.requires_successful_report 148 | default: False 149 | example: True 150 | description: | 151 | Resources are required to pass Validation from the ckanext-validation 152 | plugin to be able to get XLoadered. 153 | type: bool 154 | required: false 155 | - key: ckanext.xloader.validation.enforce_schema 156 | default: True 157 | example: False 158 | description: | 159 | Resources are expected to have a Validation Schema, or use the default ones if not. 
160 | 161 | If this option is set to `False`, Resources that do not have 162 | a Validation Schema will be treated like they do not require Validation. 163 | 164 | See https://github.com/frictionlessdata/ckanext-validation?tab=readme-ov-file#data-schema 165 | for more details. 166 | - key: ckanext.xloader.clean_datastore_tables 167 | default: False 168 | example: True 169 | description: | 170 | Enqueue jobs to remove Datastore tables from Resources that have a format 171 | that is not in ckanext.xloader.formats after a Resource is updated. 172 | type: bool 173 | required: false 174 | - key: ckanext.xloader.show_badges 175 | default: True 176 | example: False 177 | description: | 178 | Controls whether or not the status badges display in the front end. 179 | type: bool 180 | required: false 181 | - key: ckanext.xloader.debug_badges 182 | default: False 183 | example: True 184 | description: | 185 | Controls whether or not the status badges display all of the statuses. By default, 186 | the badges will display "pending", "running", and "error". With debug_badges enabled, 187 | they will also display "complete", "active", "inactive", and "unknown". 188 | type: bool 189 | required: false 190 | - key: ckanext.xloader.search_update_chunks 191 | default: 100000 192 | example: 1000 193 | description: | 194 | The number of rows to process in each batch when populating the full-text 195 | search index. Chunked processing prevents database timeouts and memory 196 | exhaustion when indexing very large datasets (4GB+ files with millions of rows). 197 | Smaller values reduce memory usage but increase processing time. Larger values 198 | improve performance but may cause timeouts on very large tables. 199 | type: int 200 | required: false 201 | - key: ckanext.xloader.max_retries 202 | default: 1 203 | example: 3 204 | description: | 205 | Maximum number of retry attempts for failed jobs due to temporary errors 206 | like database deadlocks or network timeouts. 
Set to 0 to disable retries. 207 | type: int 208 | required: false 209 | - key: ckanext.xloader.copy_chunk_size 210 | default: 1073741824 211 | example: 536870912 212 | description: | 213 | Maximum size in bytes for each chunk when processing files. 214 | Files are split into chunks to prevent memory exhaustion and 215 | system freezing. Default is 1GB (1073741824 bytes). Smaller values 216 | use less memory but create more chunks. 217 | type: int 218 | required: false 219 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | CHANGELOG 2 | ========= 3 | See: https://github.com/ckan/ckanext-xloader/releases if this file has drifted. 4 | 5 | 2.0.1 2025-03-04 6 | ================ 7 | 8 | ## Fix 9 | 10 | * #244 Static webassets not included in package 11 | * #245 support apitoken_header_name in 2.11.x. 12 | * #241 loading R/W datasource resources via api (not hardcoded) 13 | 14 | 2.0.0 2024-12-10 15 | ================ 16 | 17 | ## Major 18 | Dropped CKAN 2.9.x and Python2. 19 | 20 | 21 | ## Feat: 22 | * Adds Strip White Space fields to the Data Dictionary (defaults to `True` for each field). 23 | This will strip surrounding white space from data values prior to inserting them into the database. 24 | * Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report` 25 | controls whether a resource requires a successful validation report to be XLoadered. 26 | By default, a resource would also require a Validation Schema, which can be turned off with 27 | `ckanext.xloader.validation.enforce_schema`. 28 | * Frontend Status Badges by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/224 29 | 30 | 31 | ## Fix: 32 | * Properly handle REDIS queue timeouts to close/delete any temporary files.
* fix: partial fix on DB deadlock by adding timeouts on DDL events
* Enhancement/Bugfix: Downstream qld-gov-au fixes by @duttonw in https://github.com/ckan/ckanext-xloader/pull/232
* In plugin.py, there is a fix of resource format key error by @Nisha1293 in https://github.com/ckan/ckanext-xloader/pull/209
Contributors: 138 | 139 | * @muhammed-ajmal made their first contribution in https://github.com/ckan/ckanext-xloader/pull/179 140 | 141 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/0.12.2...1.0.0 142 | 143 | 144 | 145 | 146 | 0.9.0 2021-10-01 147 | ================ 148 | 149 | Fixes: 150 | 151 | * Fix SQLAlchemy session exception on CKAN 2.9 #140 152 | * Fix xloader status timestamps #141 153 | * Fix to correctly report exceptions in stdout #141 154 | 155 | 156 | 0.8.1 2021-08-30 157 | ================ 158 | 159 | Features: 160 | 161 | * Add ssl_verify option to callback_xloader_hook #136 162 | 163 | Fixes: 164 | 165 | * Fix bytes / str concat #138 166 | * Stream request needs to be explicited closed #139 167 | 168 | 169 | 0.8.0 2021-06-11 170 | ================ 171 | 172 | Features: 173 | * Click CLI for CKAN >= 2.9 #128 174 | 175 | Fixes: 176 | * Submit private datasets when using the `submit all` command #121 177 | * Send user context to the resource patch function #134 178 | * Add documentation for ssl_verify #135 179 | 180 | 181 | 0.7.0 2020-11-23 182 | ================ 183 | 184 | Features: 185 | * Python 3 support #113 186 | * CKAN 2.9 support #113 187 | 188 | Fixes: 189 | * Update resource hash after load to datastore #116 190 | 191 | 192 | 0.6.1 2020-05-03 193 | ================ 194 | 195 | Features: 196 | * Add 'just_load_with_messytables' option #96 197 | 198 | Fixes: 199 | * When getting the resource from CKAN, it now copes with the edge case that CKAN hasn't quite added the resource yet - now it successfully retries #94 200 | 201 | 202 | 0.6.0 2020-04-27 203 | ================ 204 | 205 | Release withdrawn 206 | 207 | 208 | 0.5.0 2019-12-04 209 | ================ 210 | 211 | Features: 212 | * migrate_types CLI command added for freezing/migrating data dictionaries created with datapusher #85 213 | 214 | Fixes: 215 | * DataStore tab missing from resource manage page, due to templates missing from PyPI package #74 216 | 217 | 218 
* Retries are now done when downloading CSV #39
263 | 264 | 265 | v0.2.0 2017-11-10 266 | ================= 267 | 268 | * Renamed ckanext-xloader 269 | * Added to PyPI 270 | * The user is given access to the data earlier in the job - the column indexing now occurs afterwards, since this is only an optimization for queries and takes much longer than the load itself 271 | * Fixed exception during error-handling for files too long and of non-accepted schemes 272 | 273 | 274 | v0.1 2017-11-03 275 | =============== 276 | 277 | * Initial code, named ckanext-shift. 278 | -------------------------------------------------------------------------------- /ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv: -------------------------------------------------------------------------------- 1 | Category,Category name,Priority,Initiative name,Investment objectives,Primary digital priority,Initiative stage,Actual start date,Approved end date,Date data current at,Percentage complete,Overall status,Project commencement allocation,Approved expenditure,Actual cost to date,Scope change event,Cost re-evaluation event,Delivery delay event,Project journey and reasons for variance,Learn more (URL) 2 | DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Silly Walks project - Stage 2,"Lorum ipsum.",Collaboration,Delivery,01/07/1970,30/06/1971,31/03/1971,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 1971 3 | - Overall 'green' (on track) status 4 | - Revised user journey following results of Silly Walk UX/UI testing 5 | - Transition to support progressing with documentation and walk-through of the solution. 6 | - Ongoing high levels of silly walk usage reflecting the success of search engine marketing. Silly walk focused campaign to further increase awareness and usage is being finalised. 7 | 8 | As at 28 February 1971 9 | - Overall 'green' (on track) status 10 | - Results of Silly Walk UX/UI testing is guiding development of the revised user journey. 
11 | - Silly Walk transition to BAU support continuing with workshops, showcases and handover documentation. 12 | - Silly Walk usage is increasing 13 | 14 | As at 31 January 1971 15 | - Continued amber status [closely monitored] with risks under management 16 | - Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from Silly Walk 17 | - Good progress on development of revised Silly Walk user journey. 18 | 19 | As at 31 December 1970 20 | Status AMBER [Closely monitored] 21 | - Search Engine Marketing commenced 19 December 1970 and already showing increased users and proportion of customers benefitting from Silly Walk 22 | - External assurance review completed and reported 'green' rating for confidence of delivery. 23 | 24 | As at 30 November 1970 25 | - Continued amber status pending risk management 26 | - Marketing to commence to increase awareness of platform 27 | - Good progress on development of revised user journey 28 | 29 | As at 31 October 1970 30 | Status AMBER [Closely monitored] 31 | - Silly Walk Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support. 32 | - Communications and engagement are in progress. 33 | - The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 1971. As at 30 September 1970 34 | Status AMBER [Closely monitored] 35 | Project journey events: 36 | - A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress. 37 | - Silly Walk industries expanded to include all industries. 38 | - Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion to encompass all industries. 
39 | 40 | As at 31 August 1970 41 | Status GREEN [On track] 42 | The project is reporting green overall. Ongoing resourcing risk will continue to be monitored and managed for the life of the project, due to a tight labour market. 43 | Project journey events: 44 | - A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress. 45 | - Further analysis of June/July 1970 marketing campaign has offered recommendations for consideration, to improve target audience awareness and Silly Walk uptake. 46 | - Silly Walk industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised. 47 | - Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion with three additional industries. 48 | 49 | As at 31 July 1970 50 | Status AMBER [Closely monitored] 51 | The project is continuing to report amber overall mainly due to ongoing resourcing challenges. 52 | Project journey events: 53 | - A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness, is progressing. 54 | - Analysis of a major marketing campaign conducted in June/July 1970 showed a significant step-up in number of Silly Walk users. 55 | - The target of 95% of Circus population coverage was met in June 1970 with 100% of Circus population now covered on Silly Walk. 56 | - Agency engagement for extension industries has commenced. 57 | 58 | As at 1 July 1970 59 | Silly Walk commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries. 60 | 61 | As at June 1970 62 | Stage 2 of the project is commencing and will build up the solution delivered in Silly Walk Stage 1. Customer journey will be revised in line with outcome of customer testing. 
The increased coverage target of at least 95% of the Circus population was met in June 1970, with all local governments included on Silly Walk. Benefits realisation through marketing and promotion of Silly Walk.",https://example.com 63 | DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Flying Circus Modernisation and Transformation Program - Tranche 1,"The Flying Circus Modernisation and Transformation (FCMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of Flying Circus outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/1969,31/08/1971,28/02/1971,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 1971 64 | - Tranche 1 FCMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 1971. 65 | - Customer Engagement and Contract Establishment projects continue to progress focusing on delivery activities for new CRM and Portal enhancements. 66 | - FCMT Tranche 2 Business Case tracking for completion April 1971. 67 | 68 | As at 31 January 1971 69 | - FCMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 1971. 70 | - Customer Engagement and Contract Establishment Projects progressing well with delivery activities for new CRM and Portal enhancements. 71 | 72 | As at 31 December 1970 73 | Status GREEN 74 | - FCMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 1971. 75 | - Customer Engagement and Contract Establishment projects completed partner onboarding and delivery activities underway. 76 | - Planning in progress for Tranche 2, focusing on remaining legacy systems for planned commencement at completion of Tranch 1. 
77 | 78 | As at 30 November 1970 79 | Status GREEN 80 | - Tranche 1 delivery date extended to 31 August 1971 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM. 81 | - All projects maintaining momentum and progressing to revised schedule within budget. 82 | 83 | As at 31 October 1970 84 | Status GREEN 85 | -New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 1971. 86 | -SkillsCRM (CEP Project) delivery partner on-boarded and formal delivery stage commenced. 87 | -Contract Establishment and Variation (CEV PRoject) continuing delivery partner select with a view to commencing prior to end of December 1970. 88 | 89 | As at 30 September 1970 Status GREEN. 90 | The FCMT 'Partner Portal' solution was successfully launched on the 17 August 1970. The decommissioning of the outdated legacy application, 'WalkConnect', has completed. Work is now increasing on the next Flying Circus systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project). 91 | Project Journey Events: 92 | - Partner Portal. After the successful launch of Partner Portal and decommissioning of WalkConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed. 93 | - Data, Infrastructure and Reporting. 94 | New 'Data Lake' infrastructure built. Data ingestion processes being trialled. QTS report requirement gathering underway which will showcase new capability once completed. Compliance tool SMCM successfully launched September 30. 95 | -Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 1970. 
Ramp up of activities continuing with business demonstrations of CRM proof of concept. 96 | -Contract Establishment and Variation (CEV). 97 | Requirements gathering completed. Delivery partner selection process commenced. 'As is' process documentation underway. 98 | 99 | As at 31 August 1970 100 | Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for Flying Circus related organisations occurred 17 August 1970. 101 | 102 | Current Projects underway: 103 | - Partner Portal. Go-live occurred on track 17 August 1970. All registered Flying Circus organisations now able to use the portal to access key applications and send information to DDSSHHESW via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway. 104 | - Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) continuing and requirement gathering of first report planned to use new capabilites commenced. 105 | - Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 1970. 106 | - Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 1970. 107 | 108 | As at 31 July 1970 109 | Status GREEN 110 | 111 | Project journey events: 112 | Implementation of next changes to FCMT applications remain on track for August 1970 with full launch of new secure Partner Portal Digital Channel for Flying Circus related organisations. 113 | FCMT Program scope adjusted to include additional at risk system decommission activties during this financial year. Approved expenditure updated to align with revised scope. 114 | 115 | Current Projects underway 116 | - Partner Portal. Opened for registrations 4 July 1970. Majority of Flying Circus related organisation now registered. Full access (go-live) on track to commence 17 August 1970. 
Legacy system to be disabled and decommissioned September 1970. 117 | - Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) underway with population and work on first report to commence in September. 118 | - Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 1970. 119 | - Contract Engagement and Variation. Requirements gathering and new process design activities in progress. 120 | 121 | 15 May 1970 Update 122 | Status GREEN 123 | 124 | Implementation of next changes to Flying Circus applications on track for August 1970 with introduction of new secure 'Silly Portal' Digital Channel for Flying Circus related organisations. 125 | 126 | Projects Completed 127 | -Database consolidation - key databases transitioned to supported versions and platforms. Completed November 1969. 128 | -System to System Integration platform. Completed 9 May 1970. 129 | 130 | Current projects underway 131 | -Partner Portal secure digital channel, in final testing. Pilot successfully complete and on track for release in August 1970. 132 | Projects in startup 133 | -Data, Infrastructure and Reporting, planning underway. 134 | -Customer Services Hub (CRM), planning underway. 135 | -Contract Engagement and Variation, planning underway. 136 | -Planning continues for Tranche 2.",https://example.com 137 | -------------------------------------------------------------------------------- /ckanext/xloader/plugin.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import logging 4 | 5 | from ckan import plugins 6 | from ckan.plugins import toolkit 7 | 8 | from ckan.model.domain_object import DomainObjectOperation 9 | from ckan.model.resource import Resource 10 | 11 | from . 
from . import action, auth, helpers as xloader_helpers, utils
from ckanext.xloader.utils import XLoaderFormats

try:
    from ckanext.validation.interfaces import IPipeValidation
    HAS_IPIPE_VALIDATION = True
except ImportError:
    HAS_IPIPE_VALIDATION = False

config_declarations = toolkit.blanket.config_declarations

if toolkit.check_ckan_version(min_version='2.11'):
    # IDataDictionaryForm only exists from CKAN 2.11 onwards
    from ckanext.datastore.interfaces import IDataDictionaryForm
    has_idata_dictionary_form = True
else:
    has_idata_dictionary_form = False

log = logging.getLogger(__name__)


@config_declarations
class xloaderPlugin(plugins.SingletonPlugin):
    """CKAN plugin that loads tabular resource files into the DataStore."""

    plugins.implements(plugins.IConfigurer)
    plugins.implements(plugins.IConfigurable)
    plugins.implements(plugins.IDomainObjectModification)
    plugins.implements(plugins.IActions)
    plugins.implements(plugins.IAuthFunctions)
    plugins.implements(plugins.ITemplateHelpers)
    plugins.implements(plugins.IResourceController, inherit=True)
    plugins.implements(plugins.IClick)
    plugins.implements(plugins.IBlueprint)
    if has_idata_dictionary_form:
        plugins.implements(IDataDictionaryForm, inherit=True)
    if HAS_IPIPE_VALIDATION:
        plugins.implements(IPipeValidation)

    # IClick

    def get_commands(self):
        """Return the xloader CLI commands (imported lazily to avoid cycles)."""
        from ckanext.xloader.cli import get_commands

        return get_commands()

    # IBlueprint

    def get_blueprint(self):
        """Return the Flask blueprints served by this extension."""
        from ckanext.xloader.views import get_blueprints

        return get_blueprints()

    # IConfigurer

    def update_config(self, config):
        """Register this extension's templates and webassets with CKAN."""
        toolkit.add_template_directory(config, 'templates')
        toolkit.add_resource(u'webassets', 'ckanext-xloader')

    # IConfigurable

    def configure(self, config_):
        """Cache the ignore_hash setting on the plugin instance.

        Bug fix: the previous membership test
        ``config_.get(...) in ["True", "TRUE", "1", True, 1]`` silently
        treated other truthy spellings ("true", "yes", "on") as False.
        ``toolkit.asbool`` accepts all the conventional boolean spellings.
        """
        self.ignore_hash = toolkit.asbool(
            config_.get("ckanext.xloader.ignore_hash", False))
IPipeValidation 74 | 75 | def receive_validation_report(self, validation_report): 76 | if utils.requires_successful_validation_report(): 77 | res_dict = toolkit.get_action('resource_show')({'ignore_auth': True}, 78 | {'id': validation_report.get('resource_id')}) 79 | if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True)) 80 | or res_dict.get('schema', None)) and validation_report.get('status') != 'success': 81 | # A schema is present, or required to be present 82 | return 83 | # if validation is running in async mode, it is running from the redis workers. 84 | # thus we need to do sync=True to have Xloader put the job at the front of the queue. 85 | sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True)) 86 | self._submit_to_xloader(res_dict, sync=sync) 87 | 88 | # IDomainObjectModification 89 | 90 | def notify(self, entity, operation): 91 | # type: (Package|Resource, DomainObjectOperation) -> None 92 | """ 93 | Runs before_commit to database for Packages and Resources. 94 | We only want to check for changed Resources for this. 95 | We want to check if values have changed, namely the url and the format. 96 | See: ckan/model/modification.py.DomainObjectModificationExtension 97 | """ 98 | if operation != DomainObjectOperation.changed \ 99 | or not isinstance(entity, Resource): 100 | return 101 | 102 | context = { 103 | "ignore_auth": True, 104 | } 105 | resource_dict = toolkit.get_action("resource_show")( 106 | context, 107 | { 108 | "id": entity.id, 109 | }, 110 | ) 111 | 112 | if _should_remove_unsupported_resource_from_datastore(resource_dict): 113 | toolkit.enqueue_job(fn=_remove_unsupported_resource_from_datastore, args=[entity.id]) 114 | 115 | if utils.requires_successful_validation_report(): 116 | # If the resource requires validation, stop here if validation 117 | # has not been performed or did not succeed. 
The Validation 118 | # extension will call resource_patch and this method should 119 | # be called again. However, url_changed will not be in the entity 120 | # once Validation does the patch. 121 | log.debug("Deferring xloading resource %s because the " 122 | "resource did not pass validation yet.", resource_dict.get('id')) 123 | return 124 | elif not getattr(entity, 'url_changed', False): 125 | # do not submit to xloader if the url has not changed. 126 | return 127 | 128 | self._submit_to_xloader(resource_dict) 129 | 130 | # IResourceController 131 | 132 | def after_resource_create(self, context, resource_dict): 133 | if utils.requires_successful_validation_report(): 134 | log.debug("Deferring xloading resource %s because the " 135 | "resource did not pass validation yet.", resource_dict.get('id')) 136 | return 137 | 138 | self._submit_to_xloader(resource_dict) 139 | 140 | def before_resource_show(self, resource_dict): 141 | resource_dict[ 142 | "datastore_contains_all_records_of_source_file" 143 | ] = toolkit.asbool( 144 | resource_dict.get("datastore_contains_all_records_of_source_file") 145 | ) 146 | 147 | def after_resource_update(self, context, resource_dict): 148 | """ Check whether the datastore is out of sync with the 149 | 'datastore_active' flag. 
This can occur due to race conditions 150 | like https://github.com/ckan/ckan/issues/4663 151 | """ 152 | datastore_active = resource_dict.get('datastore_active', False) 153 | try: 154 | context = {'ignore_auth': True} 155 | if toolkit.get_action('datastore_info')( 156 | context=context, data_dict={'id': resource_dict['id']}): 157 | datastore_exists = True 158 | else: 159 | datastore_exists = False 160 | except toolkit.ObjectNotFound: 161 | datastore_exists = False 162 | 163 | if datastore_active != datastore_exists: 164 | # flag is out of sync with datastore; update it 165 | utils.set_resource_metadata( 166 | {'resource_id': resource_dict['id'], 167 | 'datastore_active': datastore_exists}) 168 | 169 | if not toolkit.check_ckan_version("2.10"): 170 | 171 | def after_create(self, context, resource_dict): 172 | self.after_resource_create(context, resource_dict) 173 | 174 | def before_show(self, resource_dict): 175 | self.before_resource_show(resource_dict) 176 | 177 | def after_update(self, context, resource_dict): 178 | self.after_resource_update(context, resource_dict) 179 | 180 | def _submit_to_xloader(self, resource_dict, sync=False): 181 | context = {"ignore_auth": True, "defer_commit": True} 182 | resource_format = resource_dict.get("format") 183 | if not XLoaderFormats.is_it_an_xloader_format(resource_format): 184 | log.debug( 185 | f"Skipping xloading resource {resource_dict['id']} because " 186 | f'format "{resource_format}" is not configured to be ' 187 | "xloadered" 188 | ) 189 | return 190 | if resource_dict["url_type"] in ("datapusher", "xloader"): 191 | log.debug( 192 | "Skipping xloading resource {id} because " 193 | 'url_type "{url_type}" means resource.url ' 194 | "points to the datastore already, so loading " 195 | "would be circular.".format(**resource_dict) 196 | ) 197 | return 198 | 199 | try: 200 | if sync: 201 | log.debug( 202 | "xloadering resource %s in sync mode", resource_dict["id"] 203 | ) 204 | else: 205 | log.debug( 206 | "Submitting 
resource %s to be xloadered", resource_dict["id"] 207 | ) 208 | toolkit.get_action("xloader_submit")( 209 | context, 210 | { 211 | "resource_id": resource_dict["id"], 212 | "ignore_hash": self.ignore_hash, 213 | "sync": sync, 214 | }, 215 | ) 216 | except toolkit.ValidationError as e: 217 | # If xloader is offline, we want to catch error instead 218 | # of raising otherwise resource save will fail with 500 219 | log.critical(e) 220 | pass 221 | 222 | # IActions 223 | 224 | def get_actions(self): 225 | return { 226 | "xloader_submit": action.xloader_submit, 227 | "xloader_hook": action.xloader_hook, 228 | "xloader_status": action.xloader_status, 229 | } 230 | 231 | # IAuthFunctions 232 | 233 | def get_auth_functions(self): 234 | return { 235 | "xloader_submit": auth.xloader_submit, 236 | "xloader_status": auth.xloader_status, 237 | } 238 | 239 | # ITemplateHelpers 240 | 241 | def get_helpers(self): 242 | return { 243 | "xloader_status": xloader_helpers.xloader_status, 244 | "xloader_status_description": xloader_helpers.xloader_status_description, 245 | "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader, 246 | "xloader_badge": xloader_helpers.xloader_badge, 247 | } 248 | 249 | # IDataDictionaryForm 250 | 251 | def update_datastore_create_schema(self, schema): 252 | default = toolkit.get_validator('default') 253 | boolean_validator = toolkit.get_validator('boolean_validator') 254 | to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data') 255 | schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')] 256 | return schema 257 | 258 | def update_datastore_info_field(self, field, plugin_data): 259 | # expose all our non-secret plugin data in the field 260 | field.update(plugin_data.get('xloader', {})) 261 | # CKAN version parody 262 | if '_info' in plugin_data: 263 | field.update({'info': plugin_data['_info']}) 264 | return field 265 | 266 | 267 | def 
_should_remove_unsupported_resource_from_datastore(res_dict): 268 | if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)): 269 | return False 270 | return (not XLoaderFormats.is_it_an_xloader_format(res_dict.get('format', u'')) 271 | and (res_dict.get('url_type') == 'upload' 272 | or not res_dict.get('url_type')) 273 | and (toolkit.asbool(res_dict.get('datastore_active', False)) 274 | or toolkit.asbool(res_dict.get('extras', {}).get('datastore_active', False)))) 275 | 276 | 277 | def _remove_unsupported_resource_from_datastore(resource_id): 278 | """ 279 | Callback to remove unsupported datastore tables. 280 | Controlled by config value: ckanext.xloader.clean_datastore_tables. 281 | Double check the resource format. Only supported Xloader formats should have datastore tables. 282 | If the resource format is not supported, we should delete the datastore tables. 283 | """ 284 | context = {"ignore_auth": True} 285 | try: 286 | res = toolkit.get_action('resource_show')(context, {"id": resource_id}) 287 | except toolkit.ObjectNotFound: 288 | log.error('Resource %s does not exist.', resource_id) 289 | return 290 | 291 | if _should_remove_unsupported_resource_from_datastore(res): 292 | log.info('Unsupported resource format "%s". 
import pytest
import io
import os

from datetime import datetime

from requests import Response

from ckan.cli.cli import ckan
from ckan.plugins import toolkit
from ckan.tests import helpers, factories

from unittest import mock

from ckanext.xloader import jobs


_TEST_FILE_CONTENT = "x, y\n1,2\n2,4\n3,6\n4,8\n5,10"
_TEST_LARGE_FILE_CONTENT = "\n1,2\n2,4\n3,6\n4,8\n5,10"


def _mock_response(body, headers):
    """Build a requests.Response whose raw stream serves *body* (a str).

    Extracted because all three get_*_response mocks repeated this
    construction verbatim.
    """
    resp = Response()
    resp.raw = io.BytesIO(body.encode())
    resp.headers = headers
    return resp


def get_response(download_url, headers):
    """Mock jobs.get_response(): serve the small test CSV."""
    return _mock_response(_TEST_FILE_CONTENT, headers)


def get_large_response(download_url, headers):
    """Mock jobs.get_response(): small body but a huge advertised size."""
    # NOTE(review): requests normally stores header values as strings; the
    # int content-length here relies on jobs coercing it — confirm before
    # changing.
    return _mock_response(_TEST_FILE_CONTENT, {'content-length': 2000000000})


def get_large_data_response(download_url, headers):
    """Mock jobs.get_response(): a genuinely large CSV body (~2.5M rows)."""
    return _mock_response(
        _TEST_FILE_CONTENT + (_TEST_LARGE_FILE_CONTENT * 500000), headers)


def _get_temp_files(dir='/tmp'):
    """Return paths of the regular files directly inside *dir*.

    The parameter keeps its original name (although it shadows the
    builtin) so existing keyword callers keep working.
    """
    return [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
@pytest.fixture
def apikey():
    """API token of a fresh sysadmin user."""
    sysadmin = factories.SysadminWithToken()
    return sysadmin["token"]


@pytest.fixture
def data(create_with_upload, apikey):
    """A ready-to-enqueue xloader job dict for a freshly uploaded CSV."""
    dataset = factories.Dataset()
    resource = create_with_upload(
        _TEST_FILE_CONTENT,
        "multiplication_2.csv",
        url="http://data",
        package_id=dataset["id"]
    )
    callback_url = toolkit.url_for(
        "api.action", ver=3, logic_function="xloader_hook", qualified=True
    )
    return {
        'api_key': apikey,
        'job_type': 'xloader_to_datastore',
        'result_url': callback_url,
        'metadata': {
            'ignore_hash': True,
            'ckan_url': toolkit.config.get('ckan.site_url'),
            'resource_id': resource["id"],
            'set_url_type': False,
            # NOTE(review): datetime.utcnow() is deprecated since Python
            # 3.12; kept because the job metadata appears to expect a naive
            # UTC timestamp — confirm before switching to now(timezone.utc).
            'task_created': datetime.utcnow().isoformat(),
            'original_url': resource["url"],
        }
    }


@pytest.mark.usefixtures("clean_db", "with_plugins")
@pytest.mark.ckan_config("ckanext.xloader.job_timeout", 2)
@pytest.mark.ckan_config("ckanext.xloader.copy_chunk_size", 5120)
@pytest.mark.ckan_config("ckan.jobs.timeout", 2)
class TestXLoaderJobs(helpers.FunctionalRQTestBase):
    """End-to-end tests that run xloader jobs through a real RQ worker."""

    def _run_worker(self, cli):
        """Run the RQ worker in burst mode and return its captured output.

        Extracted because every test repeated this invocation verbatim.
        """
        return cli.invoke(ckan, ["jobs", "worker", "--burst"]).output

    def test_xloader_data_into_datastore(self, cli, data):
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout
            assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout
            assert "Copying to database..." in stdout
            assert "Creating search index..." in stdout
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    # Set the ckanext.xloader.site_url in the config
    @pytest.mark.ckan_config("ckanext.xloader.site_url", 'http://xloader-site-url')
    def test_download_resource_data_with_ckanext_xloader_site_url(self, cli, data):
        data['metadata']['original_url'] = 'http://xloader-site-url/resource.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url')
    def test_download_resource_data_with_ckan_site_url(self, cli, data):
        data['metadata']['original_url'] = 'http://ckan-site-url/resource.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url')
    def test_download_resource_data_with_different_original_url(self, cli, data):
        data['metadata']['original_url'] = 'http://external-site-url/resource.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.xloader.site_url", 'http://xloader-site-url')
    def test_callback_xloader_hook_with_ckanext_xloader_site_url(self, cli, data):
        data['result_url'] = 'http://xloader-site-url/api/3/action/xloader_hook'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url')
    def test_callback_xloader_hook_with_ckan_site_url(self, cli, data):
        data['result_url'] = 'http://ckan-site-url/api/3/action/xloader_hook'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    def test_xloader_ignore_hash(self, cli, data):
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Express Load completed" in stdout

        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Copying to database..." in stdout
            assert "Express Load completed" in stdout

        data["metadata"]["ignore_hash"] = False
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = self._run_worker(cli)
            assert "Ignoring resource - the file hash hasn't changed" in stdout

    def test_data_too_big_error_if_content_length_bigger_than_config(self, cli, data):
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_large_response):
            stdout = self._run_worker(cli)
            assert "Data too large to load into Datastore:" in stdout

    def test_data_max_excerpt_lines_config(self, cli, data):
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_large_response):
            with mock.patch("ckanext.xloader.jobs.MAX_EXCERPT_LINES", 1):
                stdout = self._run_worker(cli)
                assert "Loading excerpt of ~1 lines to DataStore." in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"] is False

    def test_data_with_rq_job_timeout(self, cli, data):
        file_suffix = 'multiplication_2.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=2))
        with mock.patch("ckanext.xloader.jobs.get_response", get_large_data_response):
            stdout = self._run_worker(cli)
            assert "Job timed out after" in stdout
        for f in _get_temp_files():
            # make sure that the tmp file has been closed/deleted in job timeout exception handling
            assert file_suffix not in f

    @pytest.mark.parametrize("error_type,should_retry", [
        # Retryable errors from RETRYABLE_ERRORS
        ("DeadlockDetected", True),
        ("LockNotAvailable", True),
        ("ObjectInUse", True),
        ("XLoaderTimeoutError", True),
        # Retryable HTTP errors (status codes from is_retryable_error)
        ("HTTPError_408", True),
        ("HTTPError_429", True),
        ("HTTPError_500", True),
        ("HTTPError_502", True),
        ("HTTPError_503", True),
        ("HTTPError_504", True),
        ("HTTPError_507", True),
        ("HTTPError_522", True),
        ("HTTPError_524", True),
        # Non-retryable HTTP errors
        ("HTTPError_400", False),
        ("HTTPError_404", False),
        ("HTTPError_403", False),
        # Other non-retryable errors (not in RETRYABLE_ERRORS)
        ("ValueError", False),
        ("TypeError", False),
    ])
    def test_retry_behavior(self, cli, data, error_type, should_retry):
        """Test retry behavior for different error types."""

        def create_mock_error(error_type):
            # Map the parametrized name to a concrete exception instance.
            if error_type == "DeadlockDetected":
                from psycopg2 import errors
                return errors.DeadlockDetected()
            elif error_type == "LockNotAvailable":
                from psycopg2 import errors
                return errors.LockNotAvailable()
            elif error_type == "ObjectInUse":
                from psycopg2 import errors
                return errors.ObjectInUse()
            elif error_type == "XLoaderTimeoutError":
                return jobs.XLoaderTimeoutError('Connection timed out after 30s')
            elif error_type.startswith("HTTPError_"):
                status_code = int(error_type.split("_")[1])
                return jobs.HTTPError("HTTP Error", status_code=status_code, request_url="test", response=None)
            elif error_type == "ValueError":
                return ValueError("Test error")
            elif error_type == "TypeError":
                return TypeError("Test error")

        # Closure state instead of the previous hand-rolled function
        # attribute counter.
        call_state = {'count': 0}

        def mock_download_with_error(*args, **kwargs):
            call_state['count'] += 1
            if call_state['count'] == 1:
                # First call - raise the test error
                raise create_mock_error(error_type)
            elif should_retry:
                # Second call - return successful response only if retryable
                import tempfile
                tmp_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv')
                tmp_file.write(_TEST_FILE_CONTENT)
                tmp_file.flush()
                return (tmp_file, 'd44fa65eda3675e11710682fdb5f1648')
            else:
                # Non-retryable errors should not get a second chance
                raise create_mock_error(error_type)

        self.enqueue(jobs.xloader_data_into_datastore, [data])

        with mock.patch("ckanext.xloader.jobs._download_resource_data", mock_download_with_error):
            stdout = self._run_worker(cli)

        if should_retry:
            # Check that retry was attempted
            assert "Job failed due to temporary error" in stdout
            assert "retrying" in stdout
            assert "Express Load completed" in stdout
            # Verify resource was successfully loaded after retry
            resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
            assert resource["datastore_contains_all_records_of_source_file"]
        else:
            # Check that job failed without retry - should have error messages
            assert "xloader error:" in stdout or "error" in stdout.lower()
            assert "Express Load completed" not in stdout


@pytest.mark.usefixtures("clean_db")
class TestSetResourceMetadata(object):
    """Unit tests for jobs.set_resource_metadata."""

    def test_simple(self):
        resource = factories.Resource()

        jobs.set_resource_metadata(
            {
                "datastore_contains_all_records_of_source_file": True,
                "datastore_active": True,
                "ckan_url": "http://www.ckan.org/",
                "resource_id": resource["id"],
            }
        )

        resource = helpers.call_action("resource_show", id=resource["id"])
        assert resource["datastore_contains_all_records_of_source_file"]
        assert resource["datastore_active"]
        assert resource["ckan_url"] == "http://www.ckan.org/"


# ---- next file in dump: ckanext/xloader/utils.py (header preserved) ----
# encoding: utf-8

from collections import defaultdict
from decimal import Decimal
import json
import datetime
import logging
import re
# NOTE(review): six is only needed for Python 2; on py3 text_type is str
# and binary_type is bytes.
from six import text_type as str, binary_type
from urllib.parse import urlunparse, urlparse

from ckan import model
from ckan.lib import search
import ckan.plugins.toolkit as tk

from .job_exceptions import JobError

log = logging.getLogger(__name__)
# resource.formats accepted by ckanext-xloader. Must be lowercase here.
DEFAULT_FORMATS = [
    "csv",
    "application/csv",
    "xls",
    "xlsx",
    "tsv",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "ods",
    "application/vnd.oasis.opendocument.spreadsheet",
]


class XLoaderFormats(object):
    """Registry of resource formats that xloader will load into the DataStore."""

    # Cached, lowercased list of accepted formats; primed on first use.
    formats = None

    @classmethod
    def is_it_an_xloader_format(cls, format_):
        """Return True if *format_* (case-insensitive) should be xloadered.

        Bug fix: the guard read ``cls.formats`` but the cache was written
        to ``cls._formats``, so the cache was never primed (the config
        string was re-parsed on every call) and the membership test
        consulted an attribute the class never declared.  Both now use
        ``cls.formats``.
        """
        if cls.formats is None:
            configured = tk.config.get("ckanext.xloader.formats")
            if configured is not None:
                # use config value. preserves empty list as well.
                cls.formats = configured.lower().split()
            else:
                cls.formats = DEFAULT_FORMATS
        if not format_:
            return False
        return format_.lower() in cls.formats


def requires_successful_validation_report():
    """True if xloader must wait for a successful validation report."""
    return tk.asbool(tk.config.get('ckanext.xloader.validation.requires_successful_report', False))


def awaiting_validation(res_dict):
    """
    Checks the existence of a logic action from the ckanext-validation
    plugin, thus supporting any extending of the Validation Plugin class.

    Checks ckanext.xloader.validation.requires_successful_report config
    option value.

    Checks ckanext.xloader.validation.enforce_schema config
    option value. Then checks the Resource's validation_status.
    """
    if not requires_successful_validation_report():
        # validation.requires_successful_report is turned off, return right away
        return False

    try:
        # check for one of the main actions from ckanext-validation
        # in the case that users extend the Validation plugin class
        # and rename the plugin entry-point.
        tk.get_action('resource_validation_show')
        is_validation_plugin_loaded = True
    except KeyError:
        is_validation_plugin_loaded = False

    if not is_validation_plugin_loaded:
        # the validation plugin is not loaded but required, log a warning
        log.warning('ckanext.xloader.validation.requires_successful_report '
                    'requires the ckanext-validation plugin to be activated.')
        return False

    if (tk.asbool(tk.config.get('ckanext.xloader.validation.enforce_schema', True))
            or res_dict.get('schema', None)) and res_dict.get('validation_status', None) != 'success':

        # either validation.enforce_schema is turned on or it is off and there is a schema,
        # we then explicitly check for the `validation_status` report to be `success``
        return True

    # at this point, we can assume that the Resource is not waiting for Validation.
    # or that the Resource does not have a Validation Schema and we are not enforcing schemas.
    return False


def resource_data(id, resource_id, rows=None):
    """View helper: GET renders the DataStore upload page for a resource;
    POST re-submits the resource to xloader (blocked while the resource is
    still awaiting validation)."""

    if tk.request.method == "POST":

        context = {
            "ignore_auth": True,
        }
        resource_dict = tk.get_action("resource_show")(
            context,
            {
                "id": resource_id,
            },
        )

        if awaiting_validation(resource_dict):
            tk.h.flash_error(tk._("Cannot upload resource %s to the DataStore "
                                  "because the resource did not pass validation yet.") % resource_id)
            return tk.redirect_to(
                "xloader.resource_data", id=id, resource_id=resource_id
            )

        try:
            tk.get_action("xloader_submit")(
                None,
                {
                    "resource_id": resource_id,
                    "ignore_hash": True,  # user clicked the reload button
                },
            )
        except tk.ValidationError:
            pass

        return tk.redirect_to(
            "xloader.resource_data", id=id, resource_id=resource_id
        )

    try:
        pkg_dict = tk.get_action("package_show")(None, {"id": id})
        resource = tk.get_action("resource_show")(None, {"id": resource_id})
    except (tk.ObjectNotFound, tk.NotAuthorized):
        return tk.abort(404, tk._("Resource not found"))

    try:
        xloader_status = tk.get_action("xloader_status")(
            None, {"resource_id": resource_id}
        )
    except tk.ObjectNotFound:
        xloader_status = {}
    except tk.NotAuthorized:
        return tk.abort(403, tk._("Not authorized to see this page"))

    extra_vars = {
        "status": xloader_status,
        "resource": resource,
        "pkg_dict": pkg_dict,
    }
    if rows:
        extra_vars["rows"] = rows
    return tk.render(
        "xloader/resource_data.html",
        extra_vars=extra_vars,
    )


def get_xloader_user_apitoken():
    """ Returns the API Token for authentication.

    xloader actions require an authenticated user to perform the actions. This
    method returns the api_token set in the config file and defaults to the
    site_user.
    """
    api_token = tk.config.get('ckanext.xloader.api_token')
    if api_token and api_token != 'NOT_SET':
        return api_token
    raise tk.ValidationError({u'ckanext.xloader.api_token': u'NOT_SET, please provide valid api token'})
177 | 178 | Args: 179 | input_url (str): The original URL to potentially modify 180 | base_url (str): The base URL to compare/replace against 181 | Returns: 182 | str: The modified URL with replaced scheme and netloc 183 | """ 184 | parsed_input_url = urlparse(input_url) 185 | parsed_base_url = urlparse(base_url) 186 | # Do not modify non-HTTP(S) URLs (e.g., ftp://) 187 | if parsed_input_url.scheme not in ("http", "https"): 188 | return input_url 189 | # replace scheme: "http/https" and netloc:"//:@:/" 190 | new_url = urlunparse( 191 | (parsed_base_url.scheme, 192 | parsed_base_url.netloc, 193 | parsed_input_url.path, 194 | parsed_input_url.params, 195 | parsed_input_url.query, 196 | parsed_input_url.fragment)) 197 | return new_url 198 | 199 | 200 | def modify_input_url(input_url: str) -> str: 201 | """Returns a potentially modified CKAN URL. 202 | 203 | This function takes a possible CKAN URL and potentially modifies its base URL while preserving the path, 204 | query parameters, and fragments. The modification occurs only if three conditions are met: 205 | 1. The base URL of the input matches the configured CKAN site URL (ckan.site_url). 206 | 2. A `ckanext.xloader.site_url` is configured in the settings. 207 | 3. A `ckanext.xloader.site_url_ignore_path_regex` if configured in the settings and does not match. 
208 | 209 | Args: 210 | input_url (str): The original CKAN URL to potentially modify 211 | Returns: 212 | str: Either the modified URL with new base URL from xloader_site_url, 213 | or the original URL if conditions aren't met 214 | """ 215 | 216 | xloader_site_url = tk.config.get('ckanext.xloader.site_url') 217 | if not xloader_site_url: 218 | return input_url 219 | 220 | parsed_input_url = urlparse(input_url) 221 | input_base_url = f"{parsed_input_url.scheme}://{parsed_input_url.netloc}" 222 | parsed_ckan_site_url = urlparse(tk.config.get('ckan.site_url')) 223 | ckan_base_url = f"{parsed_ckan_site_url.scheme}://{parsed_ckan_site_url.netloc}" 224 | 225 | xloader_ignore_regex = tk.config.get('ckanext.xloader.site_url_ignore_path_regex') 226 | 227 | # Don't alter non-matching base URLs. 228 | if input_base_url != ckan_base_url: 229 | return input_url 230 | # And not any URLs on the ignore regex 231 | elif xloader_ignore_regex and re.search(xloader_ignore_regex, input_url): 232 | return input_url 233 | 234 | return _modify_url(input_url, xloader_site_url) 235 | 236 | 237 | def set_resource_metadata(update_dict): 238 | ''' 239 | Set appropriate datastore_active flag on CKAN resource. 240 | 241 | Called after creation or deletion of DataStore table. 242 | ''' 243 | # We're modifying the resource extra directly here to avoid a 244 | # race condition, see issue #3245 for details and plan for a 245 | # better fix 246 | 247 | q = model.Session.query(model.Resource). \ 248 | with_for_update(of=model.Resource). 
\ 249 | filter(model.Resource.id == update_dict['resource_id']) 250 | resource = q.one() 251 | 252 | # update extras in database for record 253 | extras = resource.extras 254 | extras.update(update_dict) 255 | q.update({'extras': extras}, synchronize_session=False) 256 | 257 | model.Session.commit() 258 | 259 | # get package with updated resource from solr 260 | # find changed resource, patch it and reindex package 261 | psi = search.PackageSearchIndex() 262 | solr_query = search.PackageSearchQuery() 263 | q = { 264 | 'q': 'id:"{0}"'.format(resource.package_id), 265 | 'fl': 'data_dict', 266 | 'wt': 'json', 267 | 'fq': 'site_id:"%s"' % tk.config.get('ckan.site_id'), 268 | 'rows': 1 269 | } 270 | for record in solr_query.run(q)['results']: 271 | solr_data_dict = json.loads(record['data_dict']) 272 | for resource in solr_data_dict['resources']: 273 | if resource['id'] == update_dict['resource_id']: 274 | resource.update(update_dict) 275 | psi.index_package(solr_data_dict) 276 | break 277 | 278 | 279 | def column_count_modal(rows): 280 | """ Return the modal value of columns in the row_set's 281 | sample. This can be assumed to be the number of columns 282 | of the table. 283 | 284 | Copied from messytables. 285 | """ 286 | counts = defaultdict(int) 287 | for row in rows: 288 | length = len([c for c in row if c != '']) 289 | if length > 1: 290 | counts[length] += 1 291 | if not len(counts): 292 | return 0 293 | return max(list(counts.items()), key=lambda k_v: k_v[1])[0] 294 | 295 | 296 | def headers_guess(rows, tolerance=1): 297 | """ Guess the offset and names of the headers of the row set. 298 | This will attempt to locate the first row within ``tolerance`` 299 | of the mode of the number of rows in the row set sample. 300 | 301 | The return value is a tuple of the offset of the header row 302 | and the names of the columns. 303 | 304 | Copied from messytables. 
305 | """ 306 | rows = list(rows) 307 | modal = column_count_modal(rows) 308 | for i, row in enumerate(rows): 309 | length = len([c for c in row if c != '']) 310 | if length >= modal - tolerance: 311 | # TODO: use type guessing to check that this row has 312 | # strings and does not conform to the type schema of 313 | # the table. 314 | return i, row 315 | return 0, [] 316 | 317 | 318 | TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal] 319 | 320 | 321 | def type_guess(rows, types=TYPES, strict=False): 322 | """ The type guesser aggregates the number of successful 323 | conversions of each column to each type, weights them by a 324 | fixed type priority and select the most probable type for 325 | each column based on that figure. It returns a list of 326 | ``CellType``. Empty cells are ignored. 327 | 328 | Strict means that a type will not be guessed 329 | if parsing fails for a single cell in the column.""" 330 | guesses = [] 331 | if strict: 332 | at_least_one_value = [] 333 | for ri, row in enumerate(rows): 334 | diff = len(row) - len(guesses) 335 | for _i in range(diff): 336 | typesdict = {} 337 | for type in types: 338 | typesdict[type] = 0 339 | guesses.append(typesdict) 340 | at_least_one_value.append(False) 341 | for ci, cell in enumerate(row): 342 | if not cell: 343 | continue 344 | for type in list(guesses[ci].keys()): 345 | if not isinstance(cell, type): 346 | guesses[ci].pop(type) 347 | at_least_one_value[ci] = True if guesses[ci] else False 348 | # no need to set guessing weights before this 349 | # because we only accept a type if it never fails 350 | for i, guess in enumerate(guesses): 351 | for type in guess: 352 | guesses[i][type] = 1 353 | # in case there were no values at all in the column, 354 | # we just set the guessed type to string 355 | for i, v in enumerate(at_least_one_value): 356 | if not v: 357 | guesses[i] = {str: 1} 358 | else: 359 | for i, row in enumerate(rows): 360 | diff = len(row) - len(guesses) 361 | for _i 
in range(diff): 362 | guesses.append(defaultdict(int)) 363 | for i, cell in enumerate(row): 364 | # add string guess so that we have at least one guess 365 | guesses[i][str] = guesses[i].get(str, 1) 366 | if not cell: 367 | continue 368 | for type in types: 369 | if isinstance(cell, type): 370 | guesses[i][type] += 1 371 | _columns = [] 372 | _columns = [] 373 | for guess in guesses: 374 | # this first creates an array of tuples because we want the types to be 375 | # sorted. Even though it is not specified, python chooses the first 376 | # element in case of a tie 377 | # See: http://stackoverflow.com/a/6783101/214950 378 | guesses_tuples = [(t, guess[t]) for t in types if t in guess] 379 | if not guesses_tuples: 380 | raise JobError('Failed to guess types') 381 | _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) 382 | return _columns 383 | 384 | 385 | def datastore_resource_exists(resource_id): 386 | context = {'model': model, 'ignore_auth': True} 387 | try: 388 | response = tk.get_action('datastore_search')(context, dict( 389 | id=resource_id, limit=0)) 390 | except tk.ObjectNotFound: 391 | return False 392 | return response or {'fields': []} 393 | -------------------------------------------------------------------------------- /ckanext/xloader/action.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from __future__ import absolute_import 4 | import datetime 5 | import json 6 | import logging 7 | 8 | import ckan.lib.jobs as rq_jobs 9 | import ckan.lib.navl.dictization_functions 10 | from ckan.logic import side_effect_free 11 | import ckan.plugins as p 12 | from dateutil.parser import parse as parse_date 13 | from dateutil.parser import isoparse as parse_iso_date 14 | 15 | import ckanext.xloader.schema 16 | 17 | from . 
def xloader_submit(context, data_dict):
    ''' Submit a job to be Express Loaded. The Express Loader / 'xloader' is a
    service that imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump ` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the xloader will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when ckanext-xloader is not configured.

    :rtype: bool
    '''
    p.toolkit.check_access('xloader_submit', context, data_dict)
    api_key = utils.get_xloader_user_apitoken()

    # 'queue' is a control parameter, not part of the schema: pop it
    # before validation.
    custom_queue = data_dict.pop('queue', rq_jobs.DEFAULT_QUEUE_NAME)
    schema = context.get('schema', ckanext.xloader.schema.xloader_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    # FIX: removed a second, redundant check_access('xloader_submit') call
    # here - access was already verified above, before validation.

    # If sync is set to True, the xloader callback will be executed right
    # away, instead of a job being enqueued. It will also delete any existing
    # jobs for the given resource. This is only controlled by sysadmins or
    # the system.
    sync = data_dict.pop('sync', False)

    res_id = data_dict['resource_id']
    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except p.toolkit.ObjectNotFound:
        return False

    # Any IXloader plugin may veto the upload for this resource.
    for plugin in p.PluginImplementations(xloader_interfaces.IXloader):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    # Check if this resource is already in the process of being xloadered
    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'xloader',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'xloader',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'xloader',
            'key': 'xloader'
        })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckanext.xloader.assume_task_stale_after', 3600)))
        assume_task_stillborn_after = \
            datetime.timedelta(seconds=int(
                config.get('ckanext.xloader.assume_task_stillborn_after', 5)))
        if existing_task.get('state') == 'pending':
            import re  # here because it takes a moment to load
            queued_res_ids = [
                re.search(r"'resource_id': u?'([^']+)'",
                          job.description).groups()[0]
                for job in get_queue().get_jobs()
                if 'xloader_to_datastore' in str(job)  # filter out test_job etc
            ]
            updated = parse_iso_date(existing_task['last_updated'])
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if (res_id not in queued_res_ids
                    and time_since_last_updated > assume_task_stillborn_after):
                # it's not on the queue (and if it had just been started then
                # it's taken too long to update the task_status from pending -
                # the first thing it should do in the xloader job).
                # Let it be restarted.
                log.info('A pending task was found %r, but its not found in '
                         'the queue %r and is %s hours old',
                         existing_task['id'], queued_res_ids,
                         time_since_last_updated)
            elif time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than its still in progress. Let it be restarted.
                # FIX: message used to say "only %s hours old", contradicting
                # this stale branch.
                log.info('A pending task was found %r, but it is %s hours'
                         ' old', existing_task['id'], time_since_last_updated)
            else:
                log.info('A pending task was found %s for this resource, so '
                         'skipping this duplicate task', existing_task['id'])
                return False

        # Reuse the existing task record rather than creating a new one.
        task['id'] = existing_task['id']
    except p.toolkit.ObjectNotFound:
        pass

    model = context['model']

    # Record the 'submitting' state before the job is enqueued.
    p.toolkit.get_action('task_status_update')(
        {'session': model.meta.create_local_session(), 'ignore_auth': True},
        task
    )

    # The background job reports its status back via this API endpoint.
    callback_url = p.toolkit.url_for(
        "api.action",
        ver=3,
        logic_function="xloader_hook",
        qualified=True
    )
    data = {
        'api_key': api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': callback_url,
        'metadata': {
            'ignore_hash': data_dict.get('ignore_hash', False),
            'ckan_url': config['ckan.site_url'],
            'resource_id': res_id,
            'set_url_type': data_dict.get('set_url_type', False),
            'task_created': task['last_updated'],
            'original_url': resource_dict.get('url'),
        }
    }
    if custom_queue != rq_jobs.DEFAULT_QUEUE_NAME:
        # Don't automatically retry if it's a custom run
        data['metadata']['tries'] = jobs.MAX_RETRIES

    # Expand timeout for resources that have to be type-guessed
    timeout = config.get(
        'ckanext.xloader.job_timeout',
        '3600' if utils.datastore_resource_exists(res_id) else '10800')
    log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)

    try:
        job = enqueue_job(
            jobs.xloader_data_into_datastore, [data], queue=custom_queue,
            title="xloader_submit: package: {} resource: {}".format(resource_dict.get('package_id'), res_id),
            rq_kwargs=dict(timeout=timeout, at_front=sync)
        )
    except Exception:
        if sync:
            log.exception('Unable to xloader res_id=%s', res_id)
        else:
            log.exception('Unable to enqueue xloader res_id=%s', res_id)
        return False
    log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id)
    value = json.dumps({'job_id': job.id})

    if sync:
        log.debug('Pushed xloader sync mode job=%s res_id=%s to front of queue', job.id, res_id)

    # Flip the task to 'pending' now the job is on the queue.
    task['value'] = value
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.utcnow())

    p.toolkit.get_action('task_status_update')(
        {'session': model.meta.create_local_session(), 'ignore_auth': True},
        task
    )

    return True
'ckanext.xloader.job_timeout', 170 | '3600' if utils.datastore_resource_exists(res_id) else '10800') 171 | log.debug("Timeout for XLoading resource %s is %s", res_id, timeout) 172 | 173 | try: 174 | job = enqueue_job( 175 | jobs.xloader_data_into_datastore, [data], queue=custom_queue, 176 | title="xloader_submit: package: {} resource: {}".format(resource_dict.get('package_id'), res_id), 177 | rq_kwargs=dict(timeout=timeout, at_front=sync) 178 | ) 179 | except Exception: 180 | if sync: 181 | log.exception('Unable to xloader res_id=%s', res_id) 182 | else: 183 | log.exception('Unable to enqueue xloader res_id=%s', res_id) 184 | return False 185 | log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id) 186 | value = json.dumps({'job_id': job.id}) 187 | 188 | if sync: 189 | log.debug('Pushed xloader sync mode job=%s res_id=%s to front of queue', job.id, res_id) 190 | 191 | task['value'] = value 192 | task['state'] = 'pending' 193 | task['last_updated'] = str(datetime.datetime.utcnow()) 194 | 195 | p.toolkit.get_action('task_status_update')( 196 | {'session': model.meta.create_local_session(), 'ignore_auth': True}, 197 | task 198 | ) 199 | 200 | return True 201 | 202 | 203 | def _enqueue(fn, args=None, kwargs=None, title=None, queue='default', 204 | timeout=180): 205 | '''Same as latest ckan.lib.jobs.enqueue - earlier CKAN versions dont have 206 | the timeout param 207 | 208 | This function can be removed when dropping support for 2.7 209 | ''' 210 | if args is None: 211 | args = [] 212 | if kwargs is None: 213 | kwargs = {} 214 | job = get_queue(queue).enqueue_call(func=fn, args=args, kwargs=kwargs, 215 | timeout=timeout) 216 | job.meta[u'title'] = title 217 | job.save() 218 | msg = u'Added background job {}'.format(job.id) 219 | if title: 220 | msg = u'{} ("{}")'.format(msg, title) 221 | msg = u'{} to queue "{}"'.format(msg, queue) 222 | log.info(msg) 223 | return job 224 | 225 | 226 | def xloader_hook(context, data_dict): 227 | ''' Update xloader task. 
This action is typically called by ckanext-xloader 228 | whenever the status of a job changes. 229 | 230 | :param metadata: metadata provided when submitting job. key-value pairs. 231 | Must have resource_id property. 232 | :type metadata: dict 233 | :param status: status of the job from the xloader service. Allowed values: 234 | pending, running, running_but_viewable, complete, error 235 | (which must all be valid values for task_status too) 236 | :type status: string 237 | :param error: Error raised during job execution 238 | :type error: string 239 | 240 | NB here are other params which are in the equivalent object in 241 | ckan-service-provider (from job_status): 242 | :param sent_data: Input data for job 243 | :type sent_data: json encodable data 244 | :param job_id: An identifier for the job 245 | :type job_id: string 246 | :param result_url: Callback url 247 | :type result_url: url string 248 | :param data: Results from job. 249 | :type data: json encodable data 250 | :param requested_timestamp: Time the job started 251 | :type requested_timestamp: timestamp 252 | :param finished_timestamp: Time the job finished 253 | :type finished_timestamp: timestamp 254 | 255 | ''' 256 | 257 | metadata, status = _get_or_bust(data_dict, ['metadata', 'status']) 258 | 259 | res_id = _get_or_bust(metadata, 'resource_id') 260 | 261 | # Pass metadata, not data_dict, as it contains the resource id needed 262 | # on the auth checks 263 | p.toolkit.check_access('xloader_submit', context, metadata) 264 | 265 | task = p.toolkit.get_action('task_status_show')(context, { 266 | 'entity_id': res_id, 267 | 'task_type': 'xloader', 268 | 'key': 'xloader' 269 | }) 270 | 271 | task['state'] = status 272 | task['last_updated'] = str(datetime.datetime.utcnow()) 273 | task['error'] = data_dict.get('error') 274 | 275 | resubmit = False 276 | 277 | if status in ('complete', 'running_but_viewable'): 278 | # Create default views for resource if necessary (only the ones that 279 | # require data to 
@side_effect_free
def xloader_status(context, data_dict):
    ''' Get the status of a ckanext-xloader job for a certain resource.

    :param resource_id: The resource id of the resource that you want the
        status for.
    :type resource_id: string
    '''

    p.toolkit.check_access('xloader_status', context, data_dict)

    # Accept 'id' as an alias for 'resource_id'.
    if 'id' in data_dict:
        data_dict['resource_id'] = data_dict['id']
    res_id = _get_or_bust(data_dict, 'resource_id')

    task = p.toolkit.get_action('task_status_show')(context, {
        'entity_id': res_id,
        'task_type': 'xloader',
        'key': 'xloader'
    })

    value = json.loads(task['value'])
    job_id = value.get('job_id')
    url = None  # 'job_url' is never populated; kept for API compatibility
    job_detail = None

    if job_id:
        # get logs from the xloader db
        db.init(config)
        job_detail = db.get_job(job_id)

        if job_detail and job_detail.get('logs'):
            # FIX: loop variable was named `log`, shadowing the module-level
            # logger within this function; renamed to avoid clobbering it.
            for log_entry in job_detail['logs']:
                if 'timestamp' in log_entry and isinstance(
                        log_entry['timestamp'], datetime.datetime):
                    # Make timestamps JSON-serialisable for the API response.
                    log_entry['timestamp'] = log_entry['timestamp'].isoformat()
    try:
        error = json.loads(task['error'])
    except ValueError:
        # this happens occasionally, such as when the job times out
        error = task['error']
    return {
        'status': task['state'],
        'job_id': job_id,
        'job_url': url,
        'last_updated': task['last_updated'],
        'task_info': job_detail,
        'error': error,
    }