├── ckanext
├── xloader
│ ├── __init__.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── samples
│ │ │ ├── no_entries.csv
│ │ │ ├── simple.xls
│ │ │ ├── go-realtime.xlsx
│ │ │ ├── simple-large.xls
│ │ │ ├── umlaut_and_extra_comma.csv
│ │ │ ├── non_utf8_sample.csv
│ │ │ ├── date_formats.csv
│ │ │ ├── polling_locations.shapefile.zip
│ │ │ ├── simple.csv
│ │ │ ├── column_names.csv
│ │ │ ├── simple-with-extra-column.csv
│ │ │ ├── sample_with_empty_lines.csv
│ │ │ ├── sample_with_blanks.csv
│ │ │ ├── sample_with_quoted_commas.csv
│ │ │ ├── non_timestamp_sample.csv
│ │ │ ├── mixed_numeric_string_sample.csv
│ │ │ ├── sample_with_extra_blank_cells.csv
│ │ │ ├── german_sample.csv
│ │ │ ├── polling_locations.geojson
│ │ │ ├── boston_311_sample.csv
│ │ │ ├── polling_locations.kml
│ │ │ ├── brazilian_sample.csv
│ │ │ └── sample_with_mixed_quotes.csv
│ │ ├── ckan_setup.py
│ │ ├── test_parser.py
│ │ ├── test_action.py
│ │ ├── test_utils.py
│ │ ├── fixtures.py
│ │ ├── test_plugin.py
│ │ ├── test_chunks.py
│ │ └── test_jobs.py
│ ├── templates
│ │ ├── .gitignore
│ │ ├── package
│ │ │ ├── snippets
│ │ │ │ ├── resource_info.html
│ │ │ │ ├── resources.html
│ │ │ │ └── resource_item.html
│ │ │ ├── resource_edit_base.html
│ │ │ └── resource_read.html
│ │ ├── datastore
│ │ │ └── snippets
│ │ │ │ └── dictionary_form.html
│ │ └── xloader
│ │ │ ├── confirm_datastore_delete.html
│ │ │ └── resource_data.html
│ ├── webassets
│ │ ├── webassets.yml
│ │ └── css
│ │ │ └── xloader.css
│ ├── auth.py
│ ├── schema.py
│ ├── job_exceptions.py
│ ├── interfaces.py
│ ├── views.py
│ ├── cli.py
│ ├── parser.py
│ ├── helpers.py
│ ├── command.py
│ ├── config_declaration.yaml
│ ├── plugin.py
│ ├── utils.py
│ └── action.py
└── __init__.py
├── .coveragerc
├── conftest.py
├── dev-requirements.txt
├── MANIFEST.in
├── .flake8
├── requirements.txt
├── .github
├── dependabot.yml
└── workflows
│ ├── test.yml
│ └── publish.yml
├── setup.py
├── setup.cfg
├── .gitignore
├── test.ini
├── pyproject.toml
└── CHANGELOG
/ckanext/xloader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/.gitignore:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | omit =
3 | */site-packages/*
4 | */python?.?/*
5 | ckan/*
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/no_entries.csv:
--------------------------------------------------------------------------------
1 | OBJECTID,HEX25_ID,COM_NAME,SCI_NAME,CWHR_ID,SEASON,PCT_AREA
2 |
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Register this extension's pytest fixtures so they are available to every
# test module collected from the repository root.
pytest_plugins = [
    u'ckanext.xloader.tests.fixtures',
]
6 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/simple.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-xloader/HEAD/ckanext/xloader/tests/samples/simple.xls
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/go-realtime.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-xloader/HEAD/ckanext/xloader/tests/samples/go-realtime.xlsx
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/simple-large.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-xloader/HEAD/ckanext/xloader/tests/samples/simple-large.xls
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/umlaut_and_extra_comma.csv:
--------------------------------------------------------------------------------
1 | name,withumlautstrings
2 | sophie,sophié
3 | bernard,bernård
4 | paul,paül, another field
5 |
--------------------------------------------------------------------------------
/ckanext/xloader/webassets/webassets.yml:
--------------------------------------------------------------------------------
1 | main-css:
2 | output: ckanext-xloader/%(version)s_xloader.css
3 | contents:
4 | - css/xloader.css
5 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/non_utf8_sample.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-xloader/HEAD/ckanext/xloader/tests/samples/non_utf8_sample.csv
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/date_formats.csv:
--------------------------------------------------------------------------------
1 | date,temperature,place
2 | 2011-01-02,-1,Galway
3 | 01-03-2011,0.5,Galway
4 | 2011.01.02,5,Berkeley
5 | 11-01-03,6,Berkeley
6 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/polling_locations.shapefile.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ckan/ckanext-xloader/HEAD/ckanext/xloader/tests/samples/polling_locations.shapefile.zip
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/simple.csv:
--------------------------------------------------------------------------------
1 | date,temperature,place
2 | 2011-01-01,1,Galway
3 | 2011-01-02,-1,Galway
4 | 2011-01-03,0,Galway
5 | 2011-01-01,6,Berkeley
6 | ,,Berkeley
7 | 2011-01-03,5,
8 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/column_names.csv:
--------------------------------------------------------------------------------
1 | d@t$e,t^e&m*pe!r(a)t?u:r%%e,p\l/a[c{e%
2 | 2011-01-01,1,Galway
3 | 2011-01-02,-1,Galway
4 | 2011-01-03,0,Galway
5 | 2011-01-01,6,Berkeley
6 | ,,Berkeley
7 | 2011-01-03,5,
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/simple-with-extra-column.csv:
--------------------------------------------------------------------------------
1 | date,temperature,place,foo
2 | 2011-01-01,1,Galway,1
3 | 2011-01-02,-1,Galway,2
4 | 2011-01-03,0,Galway,3
5 | 2011-01-01,6,Berkeley,4
6 | ,,Berkeley,5
7 | 2011-01-03,5,,6
8 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/package/snippets/resource_info.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 |
3 | {% block resource_info %}
4 | {{ super() }}
5 | {{ h.xloader_badge(res) }}
6 | {% asset 'ckanext-xloader/main-css' %}
7 | {% endblock %}
8 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/sample_with_empty_lines.csv:
--------------------------------------------------------------------------------
1 | date,temperature,place
2 | 2011-01-01,1,Galway
3 | 2011-01-02,-1,Galway
4 | 2011-01-03,0,Galway
5 | 2011-01-01,6,Berkeley
6 |
7 | ,,Berkeley
8 | 2011-01-03,5,
9 |
10 |
11 |
--------------------------------------------------------------------------------
/ckanext/__init__.py:
--------------------------------------------------------------------------------
# this is a namespace package
try:
    # Preferred: setuptools-style namespace declaration so multiple
    # ckanext-* distributions can share the ``ckanext`` package.
    # NOTE(review): pkg_resources is deprecated in recent setuptools —
    # the pkgutil fallback below covers environments without it.
    import pkg_resources
    pkg_resources.declare_namespace(__name__)
except ImportError:
    # Fallback: stdlib mechanism that extends __path__ with any other
    # ``ckanext`` directories found on sys.path.
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)
8 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/sample_with_blanks.csv:
--------------------------------------------------------------------------------
1 | Funding agency,Program title,Opening date,Service ID
2 | DTIS,Visitor First Experiences Fund,23/03/2023,63039
3 | DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040
4 | ,,,63041
5 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/sample_with_quoted_commas.csv:
--------------------------------------------------------------------------------
1 | Funding agency,Program title,Opening date,Service ID
2 | DTIS,"Department of Employment, Small Business and Training",23/03/2023,63039
3 | DTIS,"Foo, baz, meh",22/03/2023,63040
4 | ,,,63041
5 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/non_timestamp_sample.csv:
--------------------------------------------------------------------------------
1 | Title,Postal postcode,Latitude,Longitude,Mon am,Mon pm,Last updated
2 | Adavale,4474,-25.9092582,144.5975769,8:00,16:00,19/07/2018
3 | Aramac,4726,-22.971298,145.241481,9:00-13:00,14:00-16:45,17/07/2018
4 | Barcaldine,4725,-23.55327901,145.289156,9:00-12:30,13:30-16:30,20/07/2018
5 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | responses==0.10.9
2 | mock==2.0.0
3 | flake8
4 | pytest-ckan
5 | pytest-cov
6 | requests>=2.32.0 # not directly required, pinned by Snyk to avoid a vulnerability
7 | urllib3>=2.2.2 # not directly required, pinned by Snyk to avoid a vulnerability
8 | zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
9 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/package/resource_edit_base.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 |
3 | {% block inner_primary_nav %}
4 | {{ super() }}
5 | {% if h.is_resource_supported_by_xloader(res) %}
6 | {{ h.build_nav_icon('xloader.resource_data', _('DataStore'), id=pkg.name, resource_id=res.id, icon='cloud-upload') }}
7 | {% endif %}
8 | {% endblock %}
9 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *requirements*.txt
2 | include CHANGELOG
3 | include LICENSE
4 | include README.md
5 | include ckanext/xloader/config_declaration.yaml
6 | recursive-include ckanext/xloader/templates *.html
7 | recursive-include ckanext/xloader/webassets *.css
8 | recursive-include ckanext/xloader/webassets *.yml
9 | recursive-include ckanext/xloader/webassets *.js
--------------------------------------------------------------------------------
/ckanext/xloader/templates/package/snippets/resources.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 |
3 | {% block resources_list_edit_dropdown_inner %}
4 | {{ super() }}
5 | {% if h.is_resource_supported_by_xloader(resource) %}
6 |
{% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=resource.id, class_='dropdown-item', icon='cloud-upload' %}
7 | {% endif %}
8 | {% endblock %}
9 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # @see https://flake8.pycqa.org/en/latest/user/configuration.html?highlight=.flake8
3 |
4 | exclude =
5 | ckan
6 | scripts
7 | .git
8 |
9 | # Extended output format.
10 | format = pylint
11 |
12 | # Show the source of errors.
13 | show_source = True
14 |
15 | max-complexity = 27
16 | max-line-length=127
17 |
18 | # List ignore rules one per line.
19 | ignore =
20 | C901
21 | E501
22 | W503
23 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv:
--------------------------------------------------------------------------------
1 | Funding agency,Program title,Maximum (indicative) grant amount,Some mixed integers
2 | DTIS,Accessible Tourism Infrastructure Grants,Five hundred thousand dollars,1
3 | DTIS,Boosting Accessible Tourism Experiences Grants,5000,-1
4 | DTIS,Some Other Grants,5 hundred thousand,0
5 | DTIS,Some Other Grants,$5000,6
6 | DTIS,Some Other Grants,$5 hundred thousand,
7 | DTIS,Some Other Grants,"$5,000.00","5"
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ckantoolkit>=0.0.4
2 | requests>=2.31.0
3 | six>=1.12.0
4 | tabulator==1.53.5
5 | Unidecode==1.0.22
6 | python-dateutil>=2.8.2
7 | certifi>=2023.7.22 # not directly required, pinned by Snyk to avoid a vulnerability
8 | chardet==5.2.0
9 | idna>=3.7 # not directly required, pinned by Snyk to avoid a vulnerability
10 | urllib3>=1.26.19 # not directly required, pinned by Snyk to avoid a vulnerability
11 | zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
12 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | registries:
3 | python-index-pypi-org:
4 | type: python-index
5 | url: https://pypi.org/
6 | replaces-base: true
7 | username: "${{secrets.PYTHON_INDEX_PYPI_ORG_USERNAME}}"
8 | password: "${{secrets.PYTHON_INDEX_PYPI_ORG_PASSWORD}}"
9 |
10 | updates:
11 | - package-ecosystem: pip
12 | directory: "/"
13 | schedule:
14 | interval: daily
15 | time: "19:00"
16 | open-pull-requests-limit: 10
17 | registries:
18 | - python-index-pypi-org
19 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from setuptools import setup
3 |
4 |
5 | setup(
6 | # If you are changing from the default layout of your extension, you may
7 | # have to change the message extractors, you can read more about babel
8 | # message extraction at
9 | # http://babel.pocoo.org/docs/messages/#extraction-method-mapping-and-configuration
10 | message_extractors={
11 | 'ckanext': [
12 | ('**.py', 'python', None),
13 | ('**.js', 'javascript', None),
14 | ('**/templates/**.html', 'ckan', None),
15 | ],
16 | }
17 | )
18 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [extract_messages]
2 | keywords = translate isPlural
3 | add_comments = TRANSLATORS:
4 | output_file = ckanext/xloader/i18n/ckanext-xloader.pot
5 | width = 80
6 |
7 | [init_catalog]
8 | domain = ckanext-xloader
9 | input_file = ckanext/xloader/i18n/ckanext-xloader.pot
10 | output_dir = ckanext/xloader/i18n
11 |
12 | [update_catalog]
13 | domain = ckanext-xloader
14 | input_file = ckanext/xloader/i18n/ckanext-xloader.pot
15 | output_dir = ckanext/xloader/i18n
16 | previous = true
17 |
18 | [compile_catalog]
19 | domain = ckanext-xloader
20 | directory = ckanext/xloader/i18n
21 | statistics = true
22 |
--------------------------------------------------------------------------------
/ckanext/xloader/auth.py:
--------------------------------------------------------------------------------
1 | from ckan import authz
2 | from ckan.lib import jobs as rq_jobs
3 |
4 | import ckanext.datastore.logic.auth as auth
5 |
6 |
def xloader_submit(context, data_dict):
    """Authorisation function for the ``xloader_submit`` action.

    Submissions targeting the default queue follow the normal datastore
    authorisation rules; only users allowed to update site configuration
    (i.e. sysadmins) may direct a job to a custom processing queue.
    """
    queue = data_dict.get('queue')
    if not queue or queue == rq_jobs.DEFAULT_QUEUE_NAME:
        return auth.datastore_auth(context, data_dict)
    # A non-default queue was requested: require sysadmin-level permission.
    return authz.is_authorized('config_option_update', context, data_dict)
13 |
14 |
def xloader_status(context, data_dict):
    """Authorisation function for ``xloader_status``: anyone who may use
    the datastore for this resource may also see its xloader status."""
    return auth.datastore_auth(context, data_dict)
17 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/sample_with_extra_blank_cells.csv:
--------------------------------------------------------------------------------
1 | Agency (Dept or Stat Body),Agency address,Contract description/name,Award contract date,Contract value,Supplier name,Supplier address,Variation to contract (Yes/No),Specific confidentiality provision used,Procurement method,Reason for Limited tender,Form of contract,Number of offers sought,Evaluation criteria and weightings,Deliverables,Contract milestones,Contract performance management,,,,,,,,,,,,,,,
2 | State-wide Operations,"111 Easy St, Duckburg, 40000",con_12345-Social services,01/01/1970,"$123,456",LexCorp,123 Example St ELEMENT CITY 4444,No,No,Selective,,,,,,,,,,,,,,,,,,,,,,
3 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/datastore/snippets/dictionary_form.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 | {% import 'macros/form.html' as form %}
3 |
4 | {% block additional_fields %}
5 | {{ super() }}
6 | {% if h.check_ckan_version(min_version='2.11') %}
7 | {% set field_prefix = 'fields__' %}
8 | {% else %}
9 | {% set field_prefix = 'info__' %}
10 | {% endif %}
11 | {% set selected_value = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white', true)) %}
12 | {{ form.select(field_prefix ~ position ~ '__strip_extra_white',
13 | label=_('Strip Extra Leading and Trailing White Space'), options=[
14 | {'text': _('Yes'), 'value': true},
15 | {'text': _('No'), 'value': false},
16 | ], selected=selected_value) }}
17 | {% endblock %}
18 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/german_sample.csv:
--------------------------------------------------------------------------------
1 | "Stadtname";"Schüler_Total_2010/2011";"Schüler_Total_2000/2001";"Schüler_Total_1990/1991";"Schüler_Vorschule_2010/2011";"Schüler_Obligatorische Primar- und Sekundarstufe I_2010/2011";"Schüler_Sekundarstufe II, Übergangsausbildung Sek I. - Sek. II_2010/2011";"Schüler_Maturitätsschulen_2010/2011";"Schüler_Berufsausbildung_2010/2011";"Schüler_andere allgemeinbildende Schulen_2010/2011"
2 | "Zürich";68260;65444;62646;6503;28800;1173;6891;24221;672
3 | "Genève";33418;33065;29647;3283;16159;934;8150;4580;312
4 | "Bern";31667;30728;33151;1880;9176;1031;3467;15717;396
5 | "Basel";26103;28458;28327;2664;12369;1099;3041;6641;289
6 | "Lausanne";29499;29683;30141;2600;12122;748;4933;8904;192
7 | "Winterthur";22927;19823;17510;2044;9653;425;1488;9307;10
8 | "Schweiz / Suisse";1257204;1270832;1147746;148573;757335;16508;96606;234463;3719
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ropeproject
2 | node_modules
3 | bower_components
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | env/
15 | build/
16 | develop-eggs/
17 | dist/
18 | sdist/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 |
23 | # PyInstaller
24 | # Usually these files are written by a python script from a template
25 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
26 | *.manifest
27 | *.spec
28 |
29 | # Installer logs
30 | pip-log.txt
31 | pip-delete-this-directory.txt
32 |
33 | # Unit test / coverage reports
34 | htmlcov/
35 | .tox/
36 | .coverage
37 | .cache
38 | nosetests.xml
39 | coverage.xml
40 |
41 | # Sphinx documentation
42 | docs/_build/
43 |
44 | # editors
45 | .vscode
46 |
47 | ckan
48 | .rdp
49 | .noseids
50 | subdir/
51 | dump.rdb
52 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/polling_locations.geojson:
--------------------------------------------------------------------------------
1 | {"type":"FeatureCollection","features":[{"type":"Feature","properties":{"OBJECTID":1,"X":781862.000004,"Y":2958580.000015,"PollLocID":null,"Ward":"1","Precinct":"1","Location":"SAMUEL ADAMS ELEMENTARY SCHOOL","Address":"165 WEBSTER STREET","VoterEntra":"VOTE IN AUDITORIUM","HPEntrance":"VOTERS ENTER FROM LEFT OF MAIN ENTRANCE","Comment":" ","sp":null,"rs":null,"ch":null,"vt":null,"ht":null,"cv":null},"geometry":{"type":"Point","coordinates":[-71.03489031049394,42.36556300488601]}},{"type":"Feature","properties":{"OBJECTID":2,"X":782174.071396,"Y":2959815.54504,"PollLocID":null,"Ward":"1","Precinct":"2","Location":"DONALD MCKAY SCHOOL","Address":"122 COTTAGE STREET","VoterEntra":"VOTE IN CAFETERIA","HPEntrance":"VOTER ENTRANCE MCKAY PLACE","Comment":" ","sp":null,"rs":null,"ch":null,"vt":null,"ht":null,"cv":null},"geometry":{"type":"Point","coordinates":[-71.03371058292713,42.36894867586296]}}]}
--------------------------------------------------------------------------------
/ckanext/xloader/templates/package/snippets/resource_item.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 |
3 | {% block resource_item_explore_inner %}
4 | {{ super() }}
5 | {% if h.is_resource_supported_by_xloader(res) %}
6 | {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='dropdown-item', icon='cloud-upload' %}
7 | {% endif %}
8 | {% endblock %}
9 |
10 | {% block resource_item_explore_links %}
11 | {% if h.check_ckan_version(max_version='2.10') and h.is_resource_supported_by_xloader(res) %}
12 | {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='dropdown-item', icon='cloud-upload' %}
13 | {% endif %}
14 | {{ super() }}
15 | {% endblock %}
16 |
17 | {% block resource_item_title %}
18 | {{ super() }}
19 | {{ h.xloader_badge(res) }}
20 | {% asset 'ckanext-xloader/main-css' %}
21 | {% endblock %}
22 |
23 |
24 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/package/resource_read.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 |
3 |
4 | {% block resource_read_url %}
5 | {% set badge = h.xloader_badge(res) %}
6 | {% if badge %}
7 | {{ badge }}
8 | {% asset 'ckanext-xloader/main-css' %}
9 | {% endif %}
10 | {{ super() }}
11 | {% endblock %}
12 |
13 | {% block action_manage %}
14 | {{ super() }}
15 | {% if h.is_resource_supported_by_xloader(res) %}
16 | {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='btn btn-light', icon='cloud-upload' %}
17 | {% endif %}
18 | {% endblock %}
19 |
20 | {% block resource_actions_inner %}
21 | {% if h.check_ckan_version(max_version='2.10') and h.is_resource_supported_by_xloader(res) %}
22 | {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='btn btn-light', icon='cloud-upload' %}
23 | {% endif %}
24 | {{ super() }}
25 | {% endblock %}
26 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/xloader/confirm_datastore_delete.html:
--------------------------------------------------------------------------------
1 | {% extends "page.html" %}
2 |
3 | {% block subtitle %}{{ _("Confirm Delete") }}{% endblock %}
4 |
5 | {% block maintag %}{% endblock %}
6 |
7 | {% block main_content %}
8 |
9 |
10 | {% block form %}
11 |
{{ _('Are you sure you want to delete the DataStore and Data Dictionary?') }}
12 |
13 |
18 |
19 | {% endblock %}
20 |
21 |
22 | {% endblock %}
23 |
--------------------------------------------------------------------------------
/ckanext/xloader/webassets/css/xloader.css:
--------------------------------------------------------------------------------
1 | .loader-badge {
2 | margin-left: 10px;
3 | background: #555;
4 | color: #fff;
5 | border-radius: 3px;
6 | display: inline-block;
7 | font-size: 14px;
8 | vertical-align: middle;
9 | font-weight: 400;
10 | line-height: 1.2;
11 | }
12 |
13 | a.loader-badge {
14 | text-decoration: none;
15 | }
16 |
17 | .loader-badge:hover,
18 | .loader-badge:focus {
19 | color: #fff;
20 | }
21 |
22 | .prefix,
23 | .status {
24 | display: inline-block;
25 | padding: 2px 6px;
26 | }
27 |
28 | .loader-badge .status {
29 | border-top-right-radius: 3px;
30 | border-bottom-right-radius: 3px;
31 | }
32 |
33 | .loader-badge .status.active {
34 | background: #97C50F;
35 | }
36 |
37 | .loader-badge .status.complete {
38 | background: #1081C2;
39 | }
40 |
41 | .loader-badge .status.error {
42 | background: #D9634D;
43 | }
44 |
45 | .loader-badge .status.inactive {
46 | background: #F27E3F;
47 | }
48 |
49 | .loader-badge .status.pending {
50 | background: #9B9B9B;
51 | }
52 |
53 | .loader-badge .status.running {
54 | background: #D8B124;
55 | }
56 |
57 | .loader-badge .status.unknown {
58 | background: #9D9D9D;
59 | }
60 |
61 |
--------------------------------------------------------------------------------
/ckanext/xloader/schema.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | import ckan.plugins as p
4 | import ckanext.datastore.logic.schema as dsschema
5 |
# Shortcut for looking up validators registered with CKAN's plugins toolkit.
get_validator = p.toolkit.get_validator

# Core CKAN validators reused by the schema below.
# NOTE(review): several of these (resource_id_exists, package_id_exists,
# int_validator, OneOf) are not referenced by xloader_submit_schema in this
# module — confirm whether they are imported from here elsewhere or are
# leftovers.
not_missing = get_validator('not_missing')
not_empty = get_validator('not_empty')
resource_id_exists = get_validator('resource_id_exists')
package_id_exists = get_validator('package_id_exists')
ignore_missing = get_validator('ignore_missing')
empty = get_validator('empty')
boolean_validator = get_validator('boolean_validator')
int_validator = get_validator('int_validator')
OneOf = get_validator('OneOf')
ignore_not_sysadmin = get_validator('ignore_not_sysadmin')
unicode_safe = get_validator('unicode_safe')
19 |
20 |
def xloader_submit_schema():
    """Return the navl validation schema for the ``xloader_submit`` action.

    ``id`` is accepted as an alias for ``resource_id`` (renamed by the
    ``__before`` hook) and any unexpected keys are rejected via ``__junk``.
    """
    return {
        'resource_id': [not_missing, not_empty, unicode_safe],
        'id': [ignore_missing],
        'set_url_type': [ignore_missing, boolean_validator],
        'ignore_hash': [ignore_missing, boolean_validator],
        'sync': [ignore_missing, boolean_validator, ignore_not_sysadmin],
        '__junk': [empty],
        '__before': [dsschema.rename('id', 'resource_id')],
    }
32 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/ckan_setup.py:
--------------------------------------------------------------------------------
try:
    # Modern CKAN (2.9+) ships its own pytest setup helpers; re-export
    # them so pytest picks up this module's hooks from there.
    from ckan.tests.pytest_ckan.ckan_setup import *  # noqa
except ImportError:
    # Legacy fallback for old Pylons-based CKAN versions without
    # ckan.tests.pytest_ckan.
    import pkg_resources
    from paste.deploy import loadapp
    import sys
    import os

    import pylons
    from pylons.i18n.translation import _get_translator

    def pytest_addoption(parser):
        """Allow using custom config file during tests.
        """
        parser.addoption(u"--ckan-ini", action=u"store")

    def pytest_sessionstart(session):
        """Initialize CKAN environment.
        """
        # The loaded Pylons WSGI app is stored in a module-level global so
        # other test helpers can reach it.
        global pylonsapp
        path = os.getcwd()
        # Make the checkout importable both via sys.path and pkg_resources.
        sys.path.insert(0, path)
        pkg_resources.working_set.add_entry(path)
        pylonsapp = loadapp(
            "config:" + session.config.option.ckan_ini, relative_to=path,
        )

        # Initialize a translator for tests that utilize i18n
        translator = _get_translator(pylons.config.get("lang"))
        pylons.translator._push_object(translator)

        class FakeResponse:
            headers = {}  # because render wants to delete Pragma

        pylons.response._push_object(FakeResponse)
36 |
--------------------------------------------------------------------------------
/test.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | debug = false
3 | smtp_server = localhost
4 | error_email_from = paste@localhost
5 |
6 | [server:main]
7 | use = egg:Paste#http
8 | host = 0.0.0.0
9 | port = 5000
10 |
11 | [app:main]
12 | use = config:../ckan/test-core.ini
13 |
14 | # solr_url = http://127.0.0.1:8983/solr
15 |
16 | # Insert any custom config settings to be used when running your extension's
17 | # tests here.
18 | ckan.plugins = xloader datastore
19 | ckanext.xloader.jobs_db.uri = sqlite:////tmp/jobs.db
20 |
21 | # Logging configuration
22 | [loggers]
23 | keys = root, ckan, ckanext_xloader, sqlalchemy
24 |
25 | [handlers]
26 | keys = console
27 |
28 | [formatters]
29 | keys = generic
30 |
31 | [logger_root]
32 | level = WARN
33 | handlers = console
34 |
35 | [logger_ckan]
36 | qualname = ckan
37 | handlers = console
38 | level = INFO
39 |
40 | [logger_ckanext_xloader]
41 | qualname = ckanext.xloader
42 | handlers = console
43 | level = WARN
44 |
45 | [logger_sqlalchemy]
46 | handlers =
47 | qualname = sqlalchemy.engine
48 | level = WARN
49 |
50 | [handler_console]
51 | class = StreamHandler
52 | args = (sys.stdout,)
53 | level = NOTSET
54 | formatter = generic
55 |
56 | [formatter_generic]
57 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s
58 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/boston_311_sample.csv:
--------------------------------------------------------------------------------
1 | CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
2 | 101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
3 | 101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
4 | 101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
5 |
--------------------------------------------------------------------------------
/ckanext/xloader/job_exceptions.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | from six import text_type as str
4 |
5 |
6 | class DataTooBigError(Exception):
7 | pass
8 |
9 |
class JobError(Exception):
    """Base class for errors raised while running an xloader job."""
    pass


class FileCouldNotBeLoadedError(Exception):
    """Raised when a source file could not be loaded at all."""
    pass


class HTTPError(JobError):
    """Exception that's raised if a job fails due to an HTTP problem."""

    def __init__(self, message, status_code, request_url, response):
        """Initialise a new HTTPError.

        :param message: A human-readable error message
        :type message: string

        :param status_code: The status code of the errored HTTP response,
            e.g. 500
        :type status_code: int

        :param request_url: The URL that was requested
        :type request_url: string

        :param response: The body of the errored HTTP response as unicode
            (if you have a requests.Response object then response.text will
            give you this)
        :type response: unicode

        """
        super(HTTPError, self).__init__(message)
        self.message = message
        self.status_code = status_code
        self.request_url = request_url
        self.response = response

    def __str__(self):
        # Keep the original intent of replacing non-ASCII characters so the
        # message is always safe to log, but return a real str.  The previous
        # implementation returned str(bytes), which under Python 3 produced a
        # "b'...'" repr instead of the message itself.
        summary = '{} status={} url={} response={}'.format(
            self.message, self.status_code, self.request_url, self.response)
        return summary.encode('ascii', 'replace').decode('ascii')
50 |
51 |
class LoaderError(JobError):
    '''Raised when loading the data into the DataStore fails.'''
    pass
55 |
56 |
class XLoaderTimeoutError(JobError):
    """Custom timeout exception that can be retried.

    Kept as a JobError subclass so existing handlers that catch JobError
    still see timeouts.
    """
    pass
60 |
--------------------------------------------------------------------------------
/ckanext/xloader/interfaces.py:
--------------------------------------------------------------------------------
1 | from ckan.plugins.interfaces import Interface
2 |
3 |
class IXloader(Interface):
    """
    The IXloader interface allows plugin authors to receive notifications
    before and after a resource is submitted to the xloader service, as
    well as determining whether a resource should be submitted at all via
    can_upload.
    """

    def can_upload(self, resource_id):
        """ This call when implemented can be used to stop the processing of
        the xloader submit function. This method will not be called if
        the resource format does not match those defined in the
        ckanext.xloader.formats config option or the default formats.

        If this function returns False then processing will be aborted,
        whilst returning True will submit the resource to the xloader
        service.

        Note that before reaching this hook there is a prior check on the
        resource format, which depends on the value of
        the :ref:`ckanext.xloader.formats` configuration option (and requires
        the resource to have a format defined).

        :param resource_id: The ID of the resource that is to be
            pushed to the xloader service.

        Returns ``True`` if the job should be submitted and ``False`` if
        the job should be aborted.

        :rtype: bool
        """
        return True

    def after_upload(self, context, resource_dict, dataset_dict):
        """ After a resource has been successfully uploaded to the datastore
        this method will be called with the resource dictionary and the
        package dictionary for this resource.

        :param context: The context within which the upload happened
        :param resource_dict: The dict representation of the resource that was
            successfully uploaded to the datastore
        :param dataset_dict: The dict representation of the dataset containing
            the resource that was uploaded
        """
        pass
50 |
--------------------------------------------------------------------------------
/ckanext/xloader/views.py:
--------------------------------------------------------------------------------
1 | from flask import Blueprint
2 |
3 | from ckan.plugins.toolkit import _, h, g, render, request, abort, NotAuthorized, get_action, ObjectNotFound
4 |
5 | import ckanext.xloader.utils as utils
6 |
7 |
# Flask blueprint holding the xloader UI routes (resource data status page
# and the DataStore delete confirmation page).
xloader = Blueprint("xloader", __name__)


def get_blueprints():
    # Presumably consumed by the plugin's IBlueprint hook — confirm in plugin.py.
    return [xloader]
13 |
14 |
@xloader.route("/dataset/<id>/resource_data/<resource_id>", methods=("GET", "POST"))
def resource_data(id, resource_id):
    """Render (GET) or re-submit (POST) the xloader status page for a resource.

    The route string needs the ``<id>`` and ``<resource_id>`` variable rules —
    the view signature requires both to be supplied from the URL.

    :param id: package (dataset) id or name taken from the URL
    :param resource_id: resource id taken from the URL
    """
    # Optional ?rows=N limits how many rows are shown; anything non-numeric
    # or negative is silently ignored.
    rows = request.args.get('rows')
    if rows:
        try:
            rows = int(rows)
            if rows < 0:
                rows = None
        except ValueError:
            rows = None
    return utils.resource_data(id, resource_id, rows)
26 |
27 |
@xloader.route("/dataset/<id>/delete-datastore/<resource_id>", methods=("GET", "POST"))
def delete_datastore_table(id, resource_id):
    """Confirm (GET) and perform (POST) deletion of a resource's DataStore table.

    The route string needs the ``<id>`` and ``<resource_id>`` variable rules —
    the view signature requires both to be supplied from the URL.

    :param id: package (dataset) id taken from the URL
    :param resource_id: resource id taken from the URL
    """
    if u'cancel' in request.form:
        return h.redirect_to(u'xloader.resource_data', id=id, resource_id=resource_id)

    context = {"user": g.user}

    try:
        res_dict = get_action('resource_show')(context, {"id": resource_id})
        # Reject resource ids that belong to a different dataset than the URL claims.
        if res_dict.get('package_id') != id:
            raise ObjectNotFound
    except ObjectNotFound:
        return abort(404, _(u'Resource not found'))

    if request.method == 'POST':
        try:
            get_action('datastore_delete')(context, {
                "resource_id": resource_id,
                "force": True})
        except NotAuthorized:
            return abort(403, _(u'Unauthorized to delete resource %s') % resource_id)

        h.flash_notice(_(u'DataStore and Data Dictionary deleted for resource %s') % resource_id)

        return h.redirect_to(
            'xloader.resource_data',
            id=id,
            resource_id=resource_id
        )
    else:
        # GET: stash ids on g for the template and render the confirmation page.
        g.resource_id = resource_id
        g.package_id = id

    extra_vars = {
        u"resource_id": resource_id,
        u"package_id": id
    }
    return render(u'xloader/confirm_datastore_delete.html', extra_vars)
66 |
--------------------------------------------------------------------------------
/ckanext/xloader/cli.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | import click
5 | from ckanext.xloader.command import XloaderCmd
6 |
7 | # Click commands for CKAN 2.9 and above
8 |
9 |
# Root command group: the `status` and `submit` subcommands hang off this.
@click.group(short_help='Perform XLoader related actions')
def xloader():
    """xloader commands
    """
    pass
15 |
16 |
@xloader.command()
def status():
    """Shows status of jobs
    """
    # Delegate straight to the command helper; no CLI options to process.
    XloaderCmd().print_status()
23 |
24 |
@xloader.command()
@click.argument(u'dataset-spec')
@click.option('-y', is_flag=True, default=False, help='Always answer yes to questions')
@click.option('--dry-run', is_flag=True, default=False, help='Don\'t actually submit any resources')
@click.option('--queue', help='Queue name for asynchronous processing, unused if executing immediately')
@click.option('--sync', is_flag=True, default=False,
              help='Execute immediately instead of enqueueing for asynchronous processing')
def submit(dataset_spec, y, dry_run, queue, sync):
    """
    xloader submit [options]
    """
    cmd = XloaderCmd(dry_run)

    # 'all' and 'all-existing' are magic dataset specs; anything else is
    # treated as a single package name or id.  Only 'all-existing' prompts
    # for confirmation, since it can overwrite datastore-only data.
    if dataset_spec == 'all':
        cmd._setup_xloader_logger()
        cmd._submit_all(sync=sync, queue=queue)
    elif dataset_spec == 'all-existing':
        _confirm_or_abort(y, dry_run)
        cmd._setup_xloader_logger()
        cmd._submit_all_existing(sync=sync, queue=queue)
    else:
        cmd._setup_xloader_logger()
        cmd._submit_package(dataset_spec, sync=sync, queue=queue)

    if cmd.error_occured:
        print('Finished but saw errors - see above for details')
        sys.exit(1)
53 |
54 |
def get_commands():
    # Presumably consumed by the plugin's IClick hook — confirm in plugin.py.
    return [xloader]
57 |
58 |
59 | def _confirm_or_abort(yes, dry_run):
60 | if yes or dry_run:
61 | return
62 | question = (
63 | "Data in any datastore resource that isn't in their source files "
64 | "(e.g. data added using the datastore API) will be permanently "
65 | "lost. Are you sure you want to proceed?"
66 | )
67 | if not click.confirm(question):
68 | print("Aborting...")
69 | sys.exit(0)
70 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/polling_locations.kml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | OGRGeoJSON
24 |
25 |
26 | 1
27 | 781862.000004
28 | 2958580.000015
29 | 1
30 | 1
31 | SAMUEL ADAMS ELEMENTARY SCHOOL
32 | 165 WEBSTER STREET
33 | VOTE IN AUDITORIUM
34 | VOTERS ENTER FROM LEFT OF MAIN ENTRANCE
35 |
36 |
37 | -71.0348903104939,42.365563004886
38 |
39 |
40 |
41 | 2
42 | 782174.071396
43 | 2959815.54504
44 | 1
45 | 2
46 | DONALD MCKAY SCHOOL
47 | 122 COTTAGE STREET
48 | VOTE IN CAFETERIA
49 | VOTER ENTRANCE MCKAY PLACE
50 |
51 |
52 | -71.0337105829271,42.368948675863
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ckanext/xloader/parser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | from decimal import Decimal, InvalidOperation
4 | import re
5 | import six
6 |
7 | from ckan.plugins.toolkit import asbool
8 | from dateutil.parser import isoparser, parser, ParserError
9 |
10 | from ckan.plugins.toolkit import config
11 |
# Number of rows sampled when guessing CSV column types — consumed elsewhere
# in this extension (not in this module); confirm usage in the loader.
CSV_SAMPLE_LINES = 1000
# Cheap pre-filter for "looks like a date": 1-4 leading digits followed by two
# more groups separated by -, /, . or whitespace (e.g. 2011-01-02, 1/2/11).
# Saves running the full dateutil parser on obviously non-date strings.
DATE_REGEX = re.compile(r'''^\d{1,4}[-/.\s]\S+[-/.\s]\S+''')
14 |
15 |
class TypeConverter:
    """ Post-process table cells to convert strings into numbers and timestamps
    as desired.
    """

    def __init__(self, types=None, fields=None):
        # types: optional per-column list of target types (Decimal,
        #   datetime.datetime or six.text_type); when None every cell is probed.
        # fields: optional per-column list of field dicts whose
        #   info.strip_extra_white flag controls whitespace stripping.
        self.types = types
        self.fields = fields

    def convert_types(self, extended_rows):
        """ Try converting cells to numbers or timestamps if applicable.
        If a list of types was supplied, use that.
        If not, then try converting each column to numeric first,
        then to a timestamp. If both fail, just keep it as a string.
        """
        for row_number, headers, row in extended_rows:
            for cell_index, cell_value in enumerate(row):
                if cell_value is None:
                    row[cell_index] = ''
                if self.fields:
                    # only strip white space if strip_extra_white is True
                    if self.fields[cell_index].get('info', {}).get('strip_extra_white', True) and isinstance(cell_value, six.text_type):
                        cell_value = cell_value.strip()
                        # cell_value is already stripped; the original called
                        # .strip() a second time here for no effect.
                        row[cell_index] = cell_value
                if not cell_value:
                    # load_csv parody: empty of string type should be None
                    if self.types and self.types[cell_index] == six.text_type:
                        cell_value = None
                        row[cell_index] = None
                    continue
                cell_type = self.types[cell_index] if self.types else None
                if cell_type in [Decimal, None]:
                    converted_value = to_number(cell_value)
                    # Can't do a simple truthiness check,
                    # because 0 is a valid numeric result.
                    if converted_value is not None:
                        row[cell_index] = converted_value
                        continue
                if cell_type in [datetime.datetime, None]:
                    converted_value = to_timestamp(cell_value)
                    if converted_value:
                        row[cell_index] = converted_value
            yield (row_number, headers, row)
59 |
60 |
def to_number(value):
    """Return *value* as a Decimal, or None when it is not numeric text.

    Non-string input (None, already-converted values, etc.) yields None.
    """
    if isinstance(value, six.string_types):
        try:
            return Decimal(value)
        except InvalidOperation:
            return None
    return None
68 |
69 |
def to_timestamp(value):
    """Parse *value* into a datetime, or return None if it doesn't look like one.

    Strict ISO-8601 is tried first; failing that, dateutil's general parser is
    used, honouring the ``ckanext.xloader.parse_dates_yearfirst`` and
    ``ckanext.xloader.parse_dates_dayfirst`` config options for ambiguous dates.
    Non-strings and values failing the cheap DATE_REGEX pre-filter return None.
    """
    if not isinstance(value, six.string_types) or not DATE_REGEX.search(value):
        return None
    try:
        i = isoparser()
        return i.isoparse(value)
    except ValueError:
        try:
            p = parser()
            yearfirst = asbool(config.get('ckanext.xloader.parse_dates_yearfirst', False))
            dayfirst = asbool(config.get('ckanext.xloader.parse_dates_dayfirst', False))
            return p.parse(value, yearfirst=yearfirst, dayfirst=dayfirst)
        except (ParserError, OverflowError):
            # dateutil documents OverflowError (e.g. absurdly large numeric
            # strings) in addition to ParserError; treat both as "not a date".
            return None
83 | return None
84 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Tests
3 | on:
4 | push:
5 | pull_request:
6 | branches:
7 | - master
8 | workflow_call:
9 |
10 | jobs:
11 | lint:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 | - uses: actions/setup-python@v5
16 | with:
17 | python-version: '3.10'
18 | - name: Install requirements
19 | run: pip install flake8 pycodestyle
20 | - name: Check syntax
21 | run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --extend-exclude ckan
22 |
23 | test:
24 | needs: lint
25 | strategy:
26 | matrix:
27 | include: #ckan-image see https://github.com/ckan/ckan-docker-base, ckan-version controls other image tags
28 | - ckan-version: "2.11"
29 | ckan-image: "2.11-py3.10"
30 | experimental: false
31 | - ckan-version: "2.10"
32 | ckan-image: "2.10-py3.10"
33 | experimental: false
34 | - ckan-version: "master"
35 | ckan-image: "master"
36 | experimental: true # master is unstable, good to know if we are compatible or not
37 | fail-fast: false
38 |
39 | name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} CKAN ${{ matrix.ckan-version }}
40 | runs-on: ubuntu-latest
41 | container:
42 | image: ckan/ckan-dev:${{ matrix.ckan-image }}
43 | options: --user root
44 | services:
45 | solr:
46 | image: ckan/ckan-solr:${{ matrix.ckan-version }}-solr9
47 | postgres:
48 | image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }}
49 | env:
50 | POSTGRES_USER: postgres
51 | POSTGRES_PASSWORD: postgres
52 | POSTGRES_DB: postgres
53 | ports:
54 | - 5432:5432
55 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
56 | redis:
57 | image: redis:7
58 | env:
59 | CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test
60 | CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test
61 | CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test
62 | CKAN_SOLR_URL: http://solr:8983/solr/ckan
63 | CKAN_REDIS_URL: redis://redis:6379/1
64 |
65 | steps:
66 | - uses: actions/checkout@v4
67 | continue-on-error: ${{ matrix.experimental }}
68 |
69 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Install requirements
70 | continue-on-error: ${{ matrix.experimental }}
71 | run: |
72 | pip install -r requirements.txt
73 | pip install -r dev-requirements.txt
74 | pip install -e .
75 | pip install -U requests[security]
76 | # Replace default path to CKAN core config file with the one on the container
77 | sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
78 |
79 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Setup extension
80 | continue-on-error: ${{ matrix.experimental }}
81 | run: |
82 | ckan -c test.ini db init
83 | ckan -c test.ini user add ckan_admin email=ckan_admin@localhost password="AbCdEf12345!@#%"
84 | ckan -c test.ini sysadmin add ckan_admin
85 | ckan config-tool test.ini "ckanext.xloader.api_token=$(ckan -c test.ini user token add ckan_admin xloader | tail -n 1 | tr -d '\t')"
86 | ckan -c test.ini user list
87 |
88 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Run tests
89 | continue-on-error: ${{ matrix.experimental }}
90 | run: pytest --ckan-ini=test.ini --cov=ckanext.xloader --disable-warnings ckanext/xloader/tests --junit-xml=/tmp/artifacts/junit/results.xml
91 |
92 | - name: ${{ matrix.experimental && '**Fail_Ignored** ' || '' }} Test Summary
93 | uses: test-summary/action@v2
94 | continue-on-error: ${{ matrix.experimental }}
95 | with:
96 | paths: "/tmp/artifacts/junit/*.xml"
97 | if: always()
--------------------------------------------------------------------------------
/ckanext/xloader/tests/test_parser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import pytest
4 |
5 | from decimal import Decimal
6 | from datetime import datetime
7 |
8 | from tabulator import Stream
9 | from ckanext.xloader.parser import TypeConverter
10 |
# Absolute path to the sample CSV exercised by every test in this module.
csv_filepath = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "samples", "date_formats.csv")
)
14 |
15 |
class TestParser(object):
    """Tests for TypeConverter date parsing under the yearfirst/dayfirst
    config options (leftover debug print() calls removed)."""

    def test_simple(self):
        """Default config: ambiguous dates parse month-first."""
        with Stream(csv_filepath, format='csv',
                    post_parse=[TypeConverter().convert_types]) as stream:
            assert stream.sample == [
                [
                    'date',
                    'temperature',
                    'place'
                ],
                [
                    datetime(2011, 1, 2, 0, 0),
                    Decimal('-1'),
                    'Galway'
                ],
                [
                    datetime(2011, 1, 3, 0, 0),
                    Decimal('0.5'),
                    'Galway'
                ],
                [
                    datetime(2011, 1, 2, 0, 0),
                    Decimal('5'),
                    'Berkeley'
                ],
                [
                    datetime(2003, 11, 1, 0, 0),
                    Decimal('6'),
                    'Berkeley'
                ],
            ]

    @pytest.mark.ckan_config("ckanext.xloader.parse_dates_dayfirst", True)
    def test_dayfirst(self):
        """With dayfirst enabled, ambiguous dates parse day-first."""
        with Stream(csv_filepath, format='csv',
                    post_parse=[TypeConverter().convert_types]) as stream:
            assert stream.sample == [
                [
                    'date',
                    'temperature',
                    'place'
                ],
                [
                    datetime(2011, 1, 2, 0, 0),
                    Decimal('-1'),
                    'Galway'
                ],
                [
                    datetime(2011, 3, 1, 0, 0),
                    Decimal('0.5'),
                    'Galway'
                ],
                [
                    datetime(2011, 2, 1, 0, 0),
                    Decimal('5'),
                    'Berkeley'
                ],
                [
                    datetime(2003, 1, 11, 0, 0),
                    Decimal('6'),
                    'Berkeley'
                ],
            ]

    @pytest.mark.ckan_config("ckanext.xloader.parse_dates_yearfirst", True)
    def test_yearfirst(self):
        """With yearfirst enabled, ambiguous dates parse year-first."""
        with Stream(csv_filepath, format='csv',
                    post_parse=[TypeConverter().convert_types]) as stream:
            assert stream.sample == [
                [
                    'date',
                    'temperature',
                    'place'
                ],
                [
                    datetime(2011, 1, 2, 0, 0),
                    Decimal('-1'),
                    'Galway'
                ],
                [
                    datetime(2011, 1, 3, 0, 0),
                    Decimal('0.5'),
                    'Galway'
                ],
                [
                    datetime(2011, 1, 2, 0, 0),
                    Decimal('5'),
                    'Berkeley'
                ],
                [
                    datetime(2011, 1, 3, 0, 0),
                    Decimal('6'),
                    'Berkeley'
                ],
            ]

    @pytest.mark.ckan_config("ckanext.xloader.parse_dates_dayfirst", True)
    @pytest.mark.ckan_config("ckanext.xloader.parse_dates_yearfirst", True)
    def test_yearfirst_dayfirst(self):
        """Both options enabled: year first, then day before month."""
        with Stream(csv_filepath, format='csv',
                    post_parse=[TypeConverter().convert_types]) as stream:
            assert stream.sample == [
                [
                    'date',
                    'temperature',
                    'place'
                ],
                [
                    datetime(2011, 1, 2, 0, 0),
                    Decimal('-1'),
                    'Galway'
                ],
                [
                    datetime(2011, 3, 1, 0, 0),
                    Decimal('0.5'),
                    'Galway'
                ],
                [
                    datetime(2011, 2, 1, 0, 0),
                    Decimal('5'),
                    'Berkeley'
                ],
                [
                    datetime(2011, 3, 1, 0, 0),
                    Decimal('6'),
                    'Berkeley'
                ],
            ]
146 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Publish to pypi
3 | on:
4 | push:
5 | #On versioned releases
6 | tags:
7 | - '*.*.*'
8 | # Allows you to run this workflow manually from the Actions tab
9 | workflow_dispatch:
10 | inputs:
11 | force:
12 | type: choice
13 | description: Retry Publish Version
14 | options:
- 'No'
- 'Yes'
17 | environment:
18 | description: 'Deployment environment'
19 | required: true
20 | default: 'pypi'
21 | type: choice
22 | options:
23 | - pypi
24 | - testpypi
25 | dryRun:
26 | description: 'Dry Run deployment (set to false to deploy)'
27 | required: true
28 | type: boolean
29 | default: true
30 |
31 |
32 |
33 | jobs:
34 |
35 | validateVersion:
36 | runs-on: ubuntu-latest
37 | if: github.repository == 'ckan/ckanext-xloader'
38 | steps:
39 | - uses: actions/checkout@v4
40 |
41 | - uses: actions/setup-python@v5
42 | with:
43 | python-version: '3.10'
44 |
45 | - name: Validate tag version
46 | if: ${{ startsWith(github.ref, 'refs/tags') }}
47 | run: |
48 | TAG_VALUE=${GITHUB_REF/refs\/tags\//}
49 | PYTHON_VERSION=$(grep -E '\bversion\s?=\s?"[^"]+"' pyproject.toml | awk -F '"' '{print $2}')
50 | echo "Tag version is [$TAG_VALUE], Python version is [$PYTHON_VERSION]"
51 | if [ "$TAG_VALUE" != "$PYTHON_VERSION" ]; then
52 | echo "Version mismatch; tag version is [$TAG_VALUE] but Python version is [$PYTHON_VERSION]" >> $GITHUB_STEP_SUMMARY
53 | exit 1
54 | fi
55 |
56 | test:
57 | needs: validateVersion
58 | name: Test
59 | uses: ./.github/workflows/test.yml # Call the reusable workflow
60 |
61 | publishSkipped:
62 | if: github.repository != 'ckan/ckanext-xloader'
63 | runs-on: ubuntu-latest
64 | steps:
65 | - run: |
66 | echo "## Skipping PyPI publish on downstream repository" >> $GITHUB_STEP_SUMMARY
67 |
68 | publish:
69 | needs: test
70 | permissions:
71 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
72 | name: Publish Package
73 | runs-on: ubuntu-latest
74 | environment:
75 | name: ${{ github.event.inputs.environment || 'pypi' }}
76 | url: ${{ steps.version.outputs.url }}
77 | concurrency:
78 | group: ${{ github.event.inputs.environment }}-deployment
79 | cancel-in-progress: false
80 | env:
81 | ENVIRONMENT: ${{ github.event.inputs.environment || 'pypi' }}
82 | steps:
83 | - name: Get Git Tag and set url from environment
84 | id: version
85 | run: |
86 | #!/bin/bash
87 |
88 | TAG_VALUE=${GITHUB_REF/refs\/tags\//}
89 | echo "version=${TAG_VALUE}" >> $GITHUB_OUTPUT
90 |
91 | # Extract the repository name (minus the owner/org)
92 | reponame=$(basename $GITHUB_REPOSITORY)
93 | echo "reponame=${reponame}" >> $GITHUB_OUTPUT
94 |
if [ "$ENVIRONMENT" == "testpypi" ]; then
url="https://test.pypi.org/project/$reponame/$TAG_VALUE/"
echo "environment=${ENVIRONMENT}" >> $GITHUB_OUTPUT
98 | else
99 | url="https://pypi.org/project/$reponame/$TAG_VALUE/"
100 | echo "environment=pypi" >> $GITHUB_OUTPUT
101 | fi
102 |
103 | echo "url=${url}" >> $GITHUB_OUTPUT
104 |
105 | - name: Checkout repository
106 | uses: actions/checkout@v4
107 |
108 | - name: Build package ${{ steps.version.outputs.reponame }} @ ${{ steps.version.outputs.version }}
109 | run: |
110 | pip install build
111 | pip install twine
112 | python -m build
113 | - name: Publish package distributions to PyPI
114 | if: ${{ startsWith(github.ref, 'refs/tags') && steps.version.outputs.environment == 'pypi' && github.event.inputs.dryRun != 'true' }}
115 | uses: pypa/gh-action-pypi-publish@release/v1
116 | # with:
117 | # skip-existing: true
118 | # verbose: true
119 | # print-hash: true
120 | - name: Test Publish package distributions to PyPI
121 | if: ${{ startsWith(github.ref, 'refs/tags') && steps.version.outputs.environment == 'testpypi' && github.event.inputs.dryRun == 'true' }}
122 | uses: pypa/gh-action-pypi-publish@release/v1
123 | with:
124 | repository-url: https://test.pypi.org/legacy/
125 | # skip-existing: true
126 | # verbose: true
127 | # print-hash: true
128 | - name: Summary output
129 | if: ${{ startsWith(github.ref, 'refs/tags') && github.event.inputs.dryRun != 'true' }}
130 | run:
echo "Published ${{ steps.version.outputs.reponame }} @ ${{ steps.version.outputs.version }} to ${{ steps.version.outputs.url }}" >> $GITHUB_STEP_SUMMARY
132 |
133 | - name: (TEST RUN) Test Publish package distributions to PyPI
134 | if: ${{ github.event.inputs.dryRun == 'true' }}
135 | run:
136 | echo "Dry run deployment, did not publish" >> $GITHUB_STEP_SUMMARY
137 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [ "setuptools",]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "ckanext-xloader"
7 | version = "2.2.0"
8 | description = "Express Loader - quickly load data into CKAN DataStore"
9 | classifiers = [ "Development Status :: 5 - Production/Stable",
10 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
11 | "Programming Language :: Python :: 3.7",
12 | "Programming Language :: Python :: 3.8",
13 | "Programming Language :: Python :: 3.9",
14 | "Programming Language :: Python :: 3.10",]
15 | keywords = [ "CKAN", "extension", "datastore",]
16 | dependencies = [ "typing_extensions",]
17 | authors = [
18 | {name = "ThrawnCA", email = "carl.antuar@smartservice.qld.gov.au"},
19 | {name = "Jesse Vickery (JVickery-TBS)", email = "jesse.vickery@tbs-sct.gc.ca"},
20 | {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"},
21 | {name = "David Read (davidread)"},
22 | {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"},
23 | {name = "Patricio Del Boca (pdelboca)"},
24 | {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"},
25 | # {name = "", email = ""},
26 | ]
27 | maintainers = [
28 | {name = "Adrià Mercader (amercader)", email = "amercadero@gmail.com"},
29 | {name = "William Dutton (duttonw)", email = "william.dutton@qld.gov.au"},
30 | {name = "Ian Ward (wardi)"},
31 | {name = "Brett Jones (kowh-ai)", email = "datashades@linkdigital.com.au"},
32 | ]
33 |
34 | [project.readme]
35 | file = "README.md"
36 | content-type = "text/markdown"
37 |
38 | [project.license]
39 | text = "AGPL"
40 |
41 | [project.urls]
42 | Homepage = "https://github.com/ckan/ckanext-xloader"
43 |
44 | [project.optional-dependencies]
45 | test = [ "pytest-factoryboy",]
46 |
47 | [project.entry-points."ckan.plugins"]
48 | xloader = "ckanext.xloader.plugin:xloaderPlugin"
49 |
50 | [project.entry-points."babel.extractors"]
51 | ckan = "ckan.lib.extract:extract_ckan"
52 |
53 | [tool.setuptools.packages]
54 | find = {}
55 |
56 | [tool.black]
57 | line-length = 79
58 | preview = true
59 |
60 | [tool.isort]
61 | known_ckan = "ckan"
62 | known_ckanext = "ckanext"
63 | known_self = "ckanext.xloader"
64 | sections = "FUTURE,STDLIB,FIRSTPARTY,THIRDPARTY,CKAN,CKANEXT,SELF,LOCALFOLDER"
65 |
66 | [tool.pytest.ini_options]
67 | addopts = "--ckan-ini test.ini"
68 | filterwarnings = [
69 | "ignore::sqlalchemy.exc.SADeprecationWarning",
70 | "ignore::sqlalchemy.exc.SAWarning",
71 | "ignore::DeprecationWarning",
72 | ]
73 |
74 | [tool.pyright]
75 | pythonVersion = "3.7"
76 | include = ["ckanext"]
77 | exclude = [
78 | "**/test*",
79 | "**/migration",
80 | ]
81 | strict = []
82 |
83 | strictParameterNoneValue = true # type must be Optional if default value is None
84 |
85 | # Check the meaning of rules here
86 | # https://github.com/microsoft/pyright/blob/main/docs/configuration.md
87 | reportFunctionMemberAccess = true # non-standard member accesses for functions
88 | reportMissingImports = true
89 | reportMissingModuleSource = true
90 | reportMissingTypeStubs = false
91 | reportImportCycles = true
92 | reportUnusedImport = true
93 | reportUnusedClass = true
94 | reportUnusedFunction = true
95 | reportUnusedVariable = true
96 | reportDuplicateImport = true
97 | reportOptionalSubscript = true
98 | reportOptionalMemberAccess = true
99 | reportOptionalCall = true
100 | reportOptionalIterable = true
101 | reportOptionalContextManager = true
102 | reportOptionalOperand = true
103 | reportTypedDictNotRequiredAccess = false # We are using Context in a way that conflicts with this check
104 | reportConstantRedefinition = true
105 | reportIncompatibleMethodOverride = true
106 | reportIncompatibleVariableOverride = true
107 | reportOverlappingOverload = true
108 | reportUntypedFunctionDecorator = false
109 | reportUnknownParameterType = true
110 | reportUnknownArgumentType = false
111 | reportUnknownLambdaType = false
112 | reportUnknownMemberType = false
113 | reportMissingTypeArgument = true
114 | reportInvalidTypeVarUse = true
115 | reportCallInDefaultInitializer = true
116 | reportUnknownVariableType = true
117 | reportUntypedBaseClass = true
118 | reportUnnecessaryIsInstance = true
119 | reportUnnecessaryCast = true
120 | reportUnnecessaryComparison = true
121 | reportAssertAlwaysTrue = true
122 | reportSelfClsParameterName = true
reportUnusedCallResult = false # allow function calls for side-effect only (like logic.check_access)
124 | useLibraryCodeForTypes = true
125 | reportGeneralTypeIssues = true
126 | reportPropertyTypeMismatch = true
127 | reportWildcardImportFromLibrary = true
128 | reportUntypedClassDecorator = false # authenticator relies on repoze.who class-decorator
129 | reportUntypedNamedTuple = true
130 | reportPrivateUsage = true
131 | reportPrivateImportUsage = true
132 | reportInconsistentConstructor = true
133 | reportMissingSuperCall = false
134 | reportUninitializedInstanceVariable = true
135 | reportInvalidStringEscapeSequence = true
136 | reportMissingParameterType = true
137 | reportImplicitStringConcatenation = false
138 | reportUndefinedVariable = true
139 | reportUnboundVariable = true
140 | reportInvalidStubStatement = true
141 | reportIncompleteStub = true
142 | reportUnsupportedDunderAll = true
143 | reportUnusedCoroutine = true
144 | reportUnnecessaryTypeIgnoreComment = true
145 | reportMatchNotExhaustive = true
--------------------------------------------------------------------------------
/ckanext/xloader/tests/test_action.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ckan.plugins import toolkit
3 | try:
4 | from unittest import mock
5 | except ImportError:
6 | import mock
7 |
8 | from ckan.plugins.toolkit import NotAuthorized
9 | from ckan.tests import helpers, factories
10 |
11 | from ckanext.xloader.utils import get_xloader_user_apitoken
12 |
13 |
@pytest.mark.usefixtures("clean_db", "with_plugins")
@pytest.mark.ckan_config("ckan.plugins", "datastore xloader")
class TestAction(object):
    """Tests for the xloader action functions (submit / hook / status)."""

    def test_submit(self):
        # checks that xloader_submit enqueues the resource (to be xloadered)
        user = factories.User()
        # normally creating a resource causes xloader_submit to be called,
        # but we avoid that by setting an invalid format
        res = factories.Resource(user=user, format="aaa")
        # mock the enqueue
        with mock.patch(
            "ckanext.xloader.action.enqueue_job",
            return_value=mock.MagicMock(id=123),
        ) as enqueue_mock:
            helpers.call_action(
                "xloader_submit",
                context=dict(user=user["name"]),
                resource_id=res["id"],
            )
            assert 1 == enqueue_mock.call_count

    def test_submit_to_custom_queue_without_auth(self):
        # check that xloader_submit doesn't allow regular users to change queues
        user = factories.User()
        with pytest.raises(NotAuthorized):
            helpers.call_auth(
                "xloader_submit",
                context=dict(user=user["name"], model=None),
                queue='foo',
            )

    def test_submit_to_custom_queue_as_sysadmin(self):
        # check that xloader_submit allows sysadmins to change queues
        user = factories.Sysadmin()
        assert helpers.call_auth(
            "xloader_submit",
            context=dict(user=user["name"], model=None),
            queue='foo',
        ) is True

    def test_duplicated_submits(self):
        def submit(res, user):
            return helpers.call_action(
                "xloader_submit",
                context=dict(user=user["name"]),
                resource_id=res["id"],
            )

        user = factories.User()

        with mock.patch(
            "ckanext.xloader.action.enqueue_job",
            return_value=mock.MagicMock(id=123),
        ) as enqueue_mock:
            enqueue_mock.reset_mock()
            # creating the resource causes it to be queued
            res = factories.Resource(user=user, format="csv")
            assert 1 == enqueue_mock.call_count
            # a second request to queue it will be stopped, because of the
            # existing task for this resource - shown by task_status_show
            submit(res, user)
            assert 1 == enqueue_mock.call_count

    def test_xloader_hook(self):
        # Check the task_status is stored correctly after a xloader job.
        user = factories.User()
        res = factories.Resource(user=user, format="csv")
        task_status = helpers.call_action(
            "task_status_update",
            context={},
            entity_id=res["id"],
            entity_type="resource",
            task_type="xloader",
            key="xloader",
            value="{}",
            error="{}",
            state="pending",
        )

        helpers.call_action(
            "xloader_hook",
            context=dict(user=user["name"]),
            metadata={"resource_id": res["id"]},
            status="complete",
        )

        task_status = helpers.call_action(
            "task_status_show",
            context={},
            entity_id=res["id"],
            task_type="xloader",
            key="xloader",
        )
        assert task_status["state"] == "complete"

    def test_status(self):

        # Trigger an xloader job
        res = factories.Resource(format="CSV")

        status = helpers.call_action(
            "xloader_status",
            resource_id=res["id"],
        )

        assert status["status"] == "pending"

    def test_xloader_user_api_token_from_config(self):
        sysadmin = factories.SysadminWithToken()
        apikey = sysadmin["token"]
        with mock.patch.dict(toolkit.config, {'ckanext.xloader.api_token': apikey}):
            api_token = get_xloader_user_apitoken()
            assert api_token == apikey

    @pytest.mark.ckan_config("ckanext.xloader.api_token", "NOT_SET")
    def test_xloader_user_api_token_from_config_should_throw_exception_when_not_set(self):
        # Use pytest.raises instead of the previous hand-rolled
        # try/except flag; the sentinel "NOT_SET" value must make the
        # helper raise rather than return an unusable token.
        with pytest.raises(Exception):
            get_xloader_user_apitoken()

    @pytest.mark.ckan_config("ckanext.xloader.api_token", "random-api-token")
    def test_xloader_user_api_token(self):
        api_token = get_xloader_user_apitoken()

        assert api_token == "random-api-token"
145 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from unittest.mock import patch
3 | from ckan.plugins import toolkit
4 | from ckanext.xloader import utils
5 |
6 |
def test_private_modify_url_no_change():
    """A URL already on the target CKAN host is returned untouched."""
    original = "https://ckan.example.com/dataset"
    rewritten = utils._modify_url(original, "https://ckan.example.com")
    assert rewritten == original
10 |
11 |
@pytest.mark.parametrize("result_url, ckan_url, expected", [
    # Scheme/host/port are taken from ckan_url; the original path, query
    # string and fragment are kept.  Any path on ckan_url itself is dropped.
    ("https://example.com/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"),
    ("https://example.com/resource/123", "http://127.0.0.1:3001", "http://127.0.0.1:3001/resource/123"),
    ("https://example.com/resource/123", "http://127.0.0.1:3001/pathnotadded", "http://127.0.0.1:3001/resource/123"),
    ("https://ckan.example.org/resource/123", "https://ckan.example.org", "https://ckan.example.org/resource/123"),
    ("http://old-ckan.com/resource/456", "http://new-ckan.com", "http://new-ckan.com/resource/456"),
    ("https://sub.example.com/path", "https://ckan.example.com", "https://ckan.example.com/path"),
    # Non-http(s) schemes pass through unchanged.
    ("ftp://fileserver.com/file", "https://ckan.example.com", "ftp://fileserver.com/file"), # should never happen
    ("https://ckan.example.org/resource/789", "https://xloader.example.org", "https://xloader.example.org/resource/789"),
    ("https://ckan.example.org/dataset/data", "https://xloader.example.org", "https://xloader.example.org/dataset/data"),
    # Query strings and fragments survive the host swap.
    ("https://ckan.example.org/resource/123?foo=bar", "https://xloader.example.org", "https://xloader.example.org/resource/123?foo=bar"),
    ("https://ckan.example.org/dataset/456#section", "https://xloader.example.org", "https://xloader.example.org/dataset/456#section"),
    ("https://ckan.example.org/resource/123?param=value&other=123", "https://xloader.example.org", "https://xloader.example.org/resource/123?param=value&other=123"),
    ("https://ckan.example.org/resource/partial#fragment", "https://xloader.example.org", "https://xloader.example.org/resource/partial#fragment"),
    ("https://ckan.example.org/path/to/data?key=value#section", "https://xloader.example.org", "https://xloader.example.org/path/to/data?key=value#section"),
    # Empty / None inputs are returned as-is.
    ("", "", ""),
    ("", "http://127.0.0.1:5000", ""),
    (None, None, None),
    (None, "http://127.0.0.1:5000", None),
])
def test_private_modify_url(result_url, ckan_url, expected):
    """utils._modify_url() swaps only the network location of a URL."""
    assert utils._modify_url(result_url, ckan_url) == expected
34 |
35 |
@pytest.mark.parametrize("input_url, ckan_site_url, xloader_site_url, is_altered, expected", [
    # Rewritten cases: URL is on ckan.site_url and an xloader site_url is set.
    ("https://ckan.example.org/resource/789", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/789"),
    ("https://ckan.example.org/resource/789", "https://ckan.example.org", "http://127.0.0.1:3012", True, "http://127.0.0.1:3012/resource/789"),
    ("https://ckan.example.org/dataset/data", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/dataset/data"),
    ("https://ckan.example.org/resource/123?foo=bar", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/123?foo=bar"),
    ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/dataset/456#section"),
    # Not rewritten: URL is on a different host than ckan.site_url.
    ("https://other-site.com/resource/999", "https://ckan.example.org", "https://xloader.example.org", False, ""),
    ("https://ckan.example.org/resource/123?param=value&other=123", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/123?param=value&other=123"),
    ("https://ckan.example.org/resource/partial#fragment", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/resource/partial#fragment"),
    ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "https://xloader.example.org", True, "https://xloader.example.org/path/to/data?key=value#section"),
    ("https://ckan.example.org/path/to/data?key=value#section", "https://ckan.example.org", "http://localhost:3000", True, "http://localhost:3000/path/to/data?key=value#section"),
    # Not rewritten: path matches site_url_ignore_path_regex.
    ("https://ckan.example.org/blackListedPathToS3HostOrigin?key=value#section", "https://ckan.example.org", "https://xloader.example.org", False, ""),
    # Not rewritten: non-http(s) scheme.
    ("ftp://ckan.example.org/dataset/456#section", "https://ckan.example.org", "https://xloader.example.org", False, ""),
    # Not rewritten: no xloader site_url configured, or blank/None input.
    ("https://ckan.example.org/dataset/456#section", "https://ckan.example.org", "", False, ""),
    ("", "http://127.0.0.1:5000", None, False, ""),
    ("", "http://127.0.0.1:5000", "", False, ""),
    (None, "http://127.0.0.1:5000", None, False, ""),
    (None, "http://127.0.0.1:5000", "", False, ""),
])
def test_modify_input_url(input_url, ckan_site_url, xloader_site_url, is_altered, expected):
    """modify_input_url() rewrites ckan.site_url URLs to the xloader site URL.

    The ``expected`` column is only meaningful when ``is_altered`` is True;
    unaltered cases must return the input exactly as given.
    """
    with patch.dict(toolkit.config,
                    {"ckan.site_url": ckan_site_url,
                     "ckanext.xloader.site_url": xloader_site_url,
                     "ckanext.xloader.site_url_ignore_path_regex": "(/blackListedPathToS3HostOrigin|/anotherpath)"}):
        response = utils.modify_input_url(input_url)
        if is_altered:
            assert response == expected
        else:
            assert response == input_url
65 |
66 |
def test_modify_input_url_no_xloader_site():
    """Without ckanext.xloader.site_url configured, the URL is unchanged."""
    original = "https://ckan.example.org/dataset"
    overrides = {"ckan.site_url": "https://ckan.example.org",
                 "ckanext.xloader.site_url": None}
    with patch.dict(toolkit.config, overrides):
        assert utils.modify_input_url(original) == original
71 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/fixtures.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import orm
3 | import os
4 |
5 | from ckanext.datastore.tests import helpers as datastore_helpers
6 | from ckanext.xloader.loader import get_write_engine
7 |
# Absolute path of this tests directory; used to locate the sample files
# shipped alongside the tests.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__))
)
11 |
# Prefer the shared pytest fixtures shipped with modern CKAN.  Older CKAN
# releases do not provide ckan.tests.pytest_ckan, so fall back to local
# copies of the same fixtures (taken from CKAN core) with identical names
# and semantics.
try:
    from ckan.tests.pytest_ckan.fixtures import *  # noqa
except ImportError:
    import pytest

    from ckan.tests import helpers as test_helpers
    import ckan.plugins
    import ckan.lib.search as search

    from ckan.common import config

    @pytest.fixture
    def ckan_config(request, monkeypatch):
        """Allows to override the configuration object used by tests

        Takes into account config patches introduced by the ``ckan_config``
        mark.

        If you just want to set one or more configuration options for the
        scope of a test (or a test class), use the ``ckan_config`` mark::

            @pytest.mark.ckan_config('ckan.auth.create_unowned_dataset', True)
            def test_auth_create_unowned_dataset():

                # ...

        To use the custom config inside a test, apply the
        ``ckan_config`` mark to it and inject the ``ckan_config`` fixture:

        .. literalinclude:: /../ckan/tests/pytest_ckan/test_fixtures.py
           :start-after: # START-CONFIG-OVERRIDE
           :end-before: # END-CONFIG-OVERRIDE

        If the change only needs to be applied locally, use the
        ``monkeypatch`` fixture

        .. literalinclude:: /../ckan/tests/test_common.py
           :start-after: # START-CONFIG-OVERRIDE
           :end-before: # END-CONFIG-OVERRIDE

        """
        # Snapshot the config so it can be fully restored after the test.
        _original = config.copy()
        for mark in request.node.iter_markers(u"ckan_config"):
            monkeypatch.setitem(config, *mark.args)
        yield config
        config.clear()
        config.update(_original)

    @pytest.fixture
    def make_app(ckan_config):
        """Factory for client app instances.

        Unless you need to create app instances lazily for some reason,
        use the ``app`` fixture instead.
        """
        return test_helpers._get_test_app

    @pytest.fixture
    def app(make_app):
        """Returns a client app instance to use in functional tests

        To use it, just add the ``app`` parameter to your test function signature::

            def test_dataset_search(self, app):

                url = h.url_for('dataset.search')

                response = app.get(url)


        """
        return make_app()

    @pytest.fixture(scope=u"session")
    def reset_db():
        """Callable for resetting the database to the initial state.

        If possible use the ``clean_db`` fixture instead.

        """
        return test_helpers.reset_db

    @pytest.fixture(scope=u"session")
    def reset_index():
        """Callable for cleaning search index.

        If possible use the ``clean_index`` fixture instead.
        """
        return search.clear_all

    @pytest.fixture
    def clean_db(reset_db):
        """Resets the database to the initial state.

        This can be used either for all tests in a class::

            @pytest.mark.usefixtures("clean_db")
            class TestExample(object):

                def test_example(self):

        or for a single test::

            class TestExample(object):

                @pytest.mark.usefixtures("clean_db")
                def test_example(self):

        """
        reset_db()

    @pytest.fixture
    def clean_index(reset_index):
        """Clear search index before starting the test.
        """
        reset_index()

    @pytest.fixture
    def with_plugins(ckan_config):
        """Load all plugins specified by the ``ckan.plugins`` config option
        at the beginning of the test. When the test ends (even it fails), it will
        unload all the plugins in the reverse order.

        .. literalinclude:: /../ckan/tests/test_factories.py
           :start-after: # START-CONFIG-OVERRIDE
           :end-before: # END-CONFIG-OVERRIDE

        """
        plugins = ckan_config["ckan.plugins"].split()
        for plugin in plugins:
            if not ckan.plugins.plugin_loaded(plugin):
                ckan.plugins.load(plugin)
        yield
        # Unload in reverse order so dependent plugins come down first.
        for plugin in reversed(plugins):
            if ckan.plugins.plugin_loaded(plugin):
                ckan.plugins.unload(plugin)

    @pytest.fixture
    def test_request_context(app):
        """Provide function for creating Flask request context.
        """
        return app.flask_app.test_request_context

    @pytest.fixture
    def with_request_context(test_request_context):
        """Execute test inside requests context
        """
        with test_request_context():
            yield
161 |
162 |
def reset_datastore_db():
    """Wipe all tables from the datastore database."""
    session_factory = orm.sessionmaker(bind=get_write_engine())
    datastore_helpers.clear_db(orm.scoped_session(session_factory))
167 |
168 |
@pytest.fixture()
def full_reset(reset_db):
    # Reset the main CKAN database first, then the datastore database.
    reset_db()
    reset_datastore_db()
173 |
--------------------------------------------------------------------------------
/ckanext/xloader/templates/xloader/resource_data.html:
--------------------------------------------------------------------------------
1 | {% extends "package/resource_edit_base.html" %}
2 |
3 | {% block subtitle %}{{ h.dataset_display_name(pkg) }} - {{ h.resource_display_name(res) }}{% endblock %}
4 |
5 | {% block primary_content_inner %}
6 |
7 | {% set show_table = true %}
8 |
9 | {% block upload_ds_button %}
10 | {% set action = h.url_for('xloader.resource_data', id=pkg.name, resource_id=res.id) %}
11 |
17 | {% endblock %}
18 |
19 |
20 |
21 | {% block delete_ds_button %}
22 | {% if res.datastore_active %}
23 | {% set delete_action = h.url_for('xloader.delete_datastore_table', id=pkg.id, resource_id=res.id) %}
24 |
34 | {% endif %}
35 | {% endblock %}
36 |
37 | {% if status.error and status.error.message %}
38 | {% set show_table = false %}
39 |
40 | {{ _('Upload error:') }} {{ status.error.message }}
41 |
42 | {% elif status.task_info and status.task_info.error %}
43 |
44 | {% if status.task_info.error is mapping %}
45 |
{{ _('Error:') }} {{ status.task_info.error.message }}
46 | {% for error_key, error_value in status.task_info.error.items() %}
47 | {% if error_key != "message" and error_value %}
48 |
49 |
{{ error_key }}:
50 | {{ error_value }}
51 | {% endif %}
52 | {% endfor %}
53 | {% elif status.task_info.error is iterable %}
54 |
{{ _('Error traceback:') }}
55 |
{{ ''.join(status.task_info.error) }}
56 | {% endif %}
57 |
58 | {% endif %}
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 | | {{ _('Status') }} |
67 | {{ h.xloader_status_description(status) }} |
68 |
69 |
70 | | {{ _('Last updated') }} |
71 | {% if status.status %}
72 | {{ h.time_ago_from_timestamp(status.last_updated) }} |
73 | {% else %}
74 | {{ _('Never') }} |
75 | {% endif %}
76 |
77 |
78 |
79 | {% if status.status and status.task_info and show_table %}
80 | {{ _('Upload Log') }}
81 |
82 | {% set items = status.task_info.logs %}
83 | {% set rows = rows or 50 %}
84 | {% set skipped_rows = (items | length) - (rows * 2) %}
85 | {% if skipped_rows > 1 %}
86 | -
87 |
88 |
89 | {{ skipped_rows }} out of {{ items | length }} logs will be hidden.
90 |
91 |
92 | Show more Show all
93 |
94 |
95 |
96 | {% endif %}
97 | {% for item in items %}
98 | {# Truncate very long loops, showing just the start and end #}
99 | {% if loop.index <= rows or loop.revindex <= rows
100 | or (loop.index == rows + 1 and loop.revindex == rows + 1) %}
101 | {% set icon = 'ok' if item.level == 'INFO' else 'exclamation' %}
102 | {% set class = ' failure' if icon == 'exclamation' else ' success' %}
103 | {% set popover_content = 'test' %}
104 | -
105 |
106 |
107 | {% for line in item.message.strip().split('\n') %}
108 | {{ line | urlize }}
109 | {% endfor %}
110 |
111 | {{ h.time_ago_from_timestamp(item.timestamp) }}
112 | {{ _('Details') }}
113 |
114 |
115 |
116 | {% elif loop.index == rows + 1 %}
117 | -
118 |
119 |
120 | Skipping {{ skipped_rows }} logs...
121 |
122 |
123 | Show more Show all
124 |
125 |
126 |
127 | {% endif %}
128 | {% endfor %}
129 | -
130 |
131 |
{{ _('End of log') }}
132 |
133 |
134 | {% endif %}
135 |
136 | {% endblock %}
137 |
--------------------------------------------------------------------------------
/ckanext/xloader/helpers.py:
--------------------------------------------------------------------------------
1 | import ckan.plugins.toolkit as toolkit
2 | from ckanext.xloader.utils import XLoaderFormats
3 | from markupsafe import Markup
4 | from html import escape as html_escape
5 |
6 |
def xloader_status(resource_id):
    """Return the xloader status dict for a resource.

    Falls back to ``{'status': 'unknown'}`` when no status record exists.
    """
    action = toolkit.get_action('xloader_status')
    try:
        return action({}, {'resource_id': resource_id})
    except toolkit.ObjectNotFound:
        return {'status': 'unknown'}
15 |
16 |
def xloader_status_description(status):
    """Return a human-readable, translated caption for a status dict."""
    _ = toolkit._

    state = status.get('status')
    if not state:
        return _('Not Uploaded Yet')

    captions = {
        'complete': _('Complete'),
        'pending': _('Pending'),
        'submitting': _('Submitting'),
        'error': _('Error'),
    }
    # Unknown states fall back to a capitalised form of the raw value.
    return captions.get(state, state.capitalize())
31 |
32 |
def is_resource_supported_by_xloader(res_dict, check_access=True):
    """Decide whether the given resource dict can be handled by xloader.

    A resource qualifies when its format is xloader-supported (or it already
    has an active datastore table), the user may update the package (unless
    ``check_access`` is False), and its url_type is writable by uploads.
    """
    format_ok = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format'))
    datastore_active = res_dict.get('datastore_active', False)

    if check_access:
        access_ok = toolkit.h.check_access(
            'package_update', {'id': res_dict.get('package_id')})
    else:
        access_ok = True

    url_type = res_dict.get('url_type')
    if not url_type:
        url_type_ok = True
    else:
        try:
            url_type_ok = url_type not in toolkit.h.datastore_rw_resource_url_types()
        except AttributeError:
            # Helper missing on this CKAN/datastore version; fall back to
            # the hard-coded allow-list.
            url_type_ok = (url_type in ['upload', 'None'])

    return (format_ok or datastore_active) and access_ok and url_type_ok
47 |
48 |
49 | def xloader_badge(resource):
50 | # type: (dict) -> str
51 | """
52 | Displays a custom badge for the status of Xloader and DataStore for the specified resource.
53 | """
54 | if not toolkit.asbool(toolkit.config.get('ckanext.xloader.show_badges', True)):
55 | return ''
56 |
57 | if not XLoaderFormats.is_it_an_xloader_format(resource.get('format')):
58 | # we only want to show badges for supported xloader formats
59 | return ''
60 |
61 | is_datastore_active = resource.get('datastore_active', False)
62 |
63 | try:
64 | xloader_job = toolkit.get_action("xloader_status")({'ignore_auth': True},
65 | {"resource_id": resource.get('id')})
66 | except toolkit.ObjectNotFound:
67 | xloader_job = {}
68 |
69 | if xloader_job.get('status') == 'complete':
70 | # the xloader task is complete, show datastore active or inactive.
71 | # xloader will delete the datastore table at the beggining of the job run.
72 | # so this will only be true if the job is fully finished.
73 | status = 'active' if is_datastore_active else 'inactive'
74 | elif xloader_job.get('status') in ['submitting', 'pending', 'running', 'running_but_viewable', 'error']:
75 | # the job is running or pending or errored
76 | # show the xloader status
77 | status = xloader_job.get('status')
78 | if status == 'running_but_viewable':
79 | # treat running_but_viewable the same as running
80 | status = 'running'
81 | elif status == 'submitting':
82 | # treat submitting the same as pending
83 | status = 'pending'
84 | else:
85 | # we do not know what the status is
86 | status = 'unknown'
87 |
88 | status_translations = {
89 | # Default messages
90 | 'pending': toolkit._('Pending'),
91 | 'running': toolkit._('Running'),
92 | 'error': toolkit._('Error'),
93 | # Debug messages
94 | 'complete': toolkit._('Complete'),
95 | 'active': toolkit._('Active'),
96 | 'inactive': toolkit._('Inactive'),
97 | 'unknown': toolkit._('Unknown'),
98 | }
99 |
100 | status_descriptions = {
101 | # Default messages
102 | 'pending': toolkit._('Data awaiting load to DataStore'),
103 | 'running': toolkit._('Loading data into DataStore'),
104 | 'error': toolkit._('Failed to load data into DataStore'),
105 | # Debug messages
106 | 'complete': toolkit._('Data loaded into DataStore'),
107 | 'active': toolkit._('Data available in DataStore'),
108 | 'inactive': toolkit._('Resource not active in DataStore'),
109 | 'unknown': toolkit._('DataStore status unknown'),
110 | }
111 | basic_statuses = ['pending', 'running', 'error']
112 |
113 | if status not in basic_statuses and not toolkit.asbool(toolkit.config.get('ckanext.xloader.debug_badges', False)):
114 | return ''
115 |
116 | last_updated = toolkit.h.render_datetime(xloader_job.get('last_updated'), with_hours=True) \
117 | if xloader_job.get('last_updated') else toolkit._('Last Updated Not Available')
118 |
119 | try:
120 | toolkit.check_access('resource_update', {'user': toolkit.g.user}, {'id': resource.get('id')})
121 | pusher_url = toolkit.h.url_for('xloader.resource_data',
122 | id=resource.get('package_id'),
123 | resource_id=resource.get('id'))
124 |
125 | return Markup(u'''
126 |
127 | {prefix}
128 | {status_display}
129 | '''.format(
130 | pusher_url=pusher_url,
131 | prefix=toolkit._('datastore'),
132 | status=status,
133 | status_display=html_escape(status_translations[status], quote=True),
134 | status_description=html_escape(status_descriptions[status], quote=True),
135 | title=html_escape(last_updated, quote=True)))
136 | except toolkit.NotAuthorized:
137 | return Markup(u'''
138 |
139 | {prefix}
140 | {status_display}
141 |
142 | '''.format(
143 | prefix=toolkit._('datastore'),
144 | status=status,
145 | status_display=html_escape(status_translations[status], quote=True),
146 | status_description=html_escape(status_descriptions[status], quote=True),
147 | title=html_escape(last_updated, quote=True)))
148 |
--------------------------------------------------------------------------------
/ckanext/xloader/command.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | import logging
5 | import ckan.plugins.toolkit as tk
6 |
7 | from ckanext.xloader.jobs import xloader_data_into_datastore_
8 | from ckanext.xloader.utils import XLoaderFormats, get_xloader_user_apitoken
9 |
10 |
class XloaderCmd:
    """Backend for the ``ckan xloader`` CLI commands.

    Submits resources to xloader — either via the job queue or synchronously
    in-process — and reports on the queue.  ``error_occured`` records whether
    any submission failed so the CLI wrapper can exit non-zero.
    """

    def __init__(self, dry_run=False):
        # dry_run: print what would be submitted without submitting anything.
        self.dry_run = dry_run
        self.error_occured = False

    def _setup_xloader_logger(self):
        """Attach a DEBUG-level stream handler to the xloader logger."""
        # whilst the development.ini's loggers are setup now, because this is
        # cli, let's ensure we xloader debug messages are printed for the user
        logger = logging.getLogger('ckanext.xloader')
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            ' %(name)-12s %(levelname)-5s %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
        logger.propagate = False  # in case the config propagates to a parent logger

    def _submit_all_existing(self, sync=False, queue=None):
        """Submit every resource that already has a datastore table."""
        from ckanext.datastore.backend \
            import get_all_resources_ids_in_datastore
        resource_ids = get_all_resources_ids_in_datastore()
        print('Processing %d resources' % len(resource_ids))
        user = tk.get_action('get_site_user')(
            {'ignore_auth': True}, {})
        for resource_id in resource_ids:
            try:
                resource_dict = tk.get_action('resource_show')(
                    {'ignore_auth': True}, {'id': resource_id})
            except tk.ObjectNotFound:
                # A datastore table exists but the resource metadata is gone.
                print(' Skipping resource {} found in datastore but not in '
                      'metadata'.format(resource_id))
                continue
            self._submit_resource(resource_dict, user, indent=2, sync=sync, queue=queue)

    def _submit_all(self, sync=False, queue=None):
        """Submit the resources of every dataset on the site."""
        # submit every package
        # for each package in the package list,
        # submit each resource w/ _submit_package
        # NOTE(review): package_search is capped at rows=1000 here; sites
        # with more datasets will silently skip the remainder.
        package_list = tk.get_action('package_search')(
            {'ignore_auth': True}, {'include_private': True, 'rows': 1000})
        package_list = [pkg['id'] for pkg in package_list['results']]
        print('Processing %d datasets' % len(package_list))
        user = tk.get_action('get_site_user')(
            {'ignore_auth': True}, {})
        for p_id in package_list:
            self._submit_package(p_id, user, indent=2, sync=sync, queue=queue)

    def _submit_package(self, pkg_id, user=None, indent=0, sync=False, queue=None):
        """Submit every resource of one dataset, continuing past failures.

        Exits the process (status 1) if the dataset cannot be found.
        """
        indentation = ' ' * indent
        if not user:
            user = tk.get_action('get_site_user')(
                {'ignore_auth': True}, {})

        try:
            pkg = tk.get_action('package_show')(
                {'ignore_auth': True},
                {'id': pkg_id.strip()})
        except Exception as e:
            print(e)
            print(indentation + 'Dataset "{}" was not found'.format(pkg_id))
            sys.exit(1)

        print(indentation + 'Processing dataset {} with {} resources'.format(
              pkg['name'], len(pkg['resources'])))
        for resource in pkg['resources']:
            try:
                resource['package_name'] = pkg['name']  # for debug output
                self._submit_resource(resource, user, indent=indent + 2, sync=sync, queue=queue)
            except Exception as e:
                # Record the failure but keep going with the other resources.
                self.error_occured = True
                print(str(e))
                print(indentation + 'ERROR submitting resource "{}" '.format(
                    resource['id']))
                continue

    def _submit_resource(self, resource, user, indent=0, sync=False, queue=None):
        '''resource: resource dictionary

        Skips unsupported formats and datastore-backed url_types; otherwise
        either runs the load synchronously in-process (sync=True) or enqueues
        an xloader_submit job.
        '''
        indentation = ' ' * indent

        if not XLoaderFormats.is_it_an_xloader_format(resource['format']):
            print(indentation
                  + 'Skipping resource {r[id]} because format "{r[format]}" is '
                  'not configured to be xloadered'.format(r=resource))
            return
        if resource['url_type'] in ('datapusher', 'xloader'):
            print(indentation
                  + 'Skipping resource {r[id]} because url_type "{r[url_type]}" '
                  'means resource.url points to the datastore '
                  'already, so loading would be circular.'.format(
                      r=resource))
            return
        dataset_ref = resource.get('package_name', resource['package_id'])
        print('{indent}{sync_style} /dataset/{dataset}/resource/{r[id]}\n'
              '{indent} url={r[url]}\n'
              '{indent} format={r[format]}'
              .format(sync_style='Processing' if sync else 'Submitting',
                      dataset=dataset_ref, r=resource, indent=indentation))
        if self.dry_run:
            print(indentation + '(not submitted - dry-run)')
            return
        data_dict = {
            'resource_id': resource['id'],
            'ignore_hash': True,
        }
        if sync:
            # Run the load directly in this process instead of enqueueing.
            data_dict['ckan_url'] = tk.config.get('ckan.site_url')
            input_dict = {
                'metadata': data_dict,
                'api_key': get_xloader_user_apitoken()
            }
            logger = logging.getLogger('ckanext.xloader.cli')
            xloader_data_into_datastore_(input_dict, None, logger)
        else:
            if queue:
                data_dict['queue'] = queue
            success = tk.get_action('xloader_submit')({'user': user['name']}, data_dict)
            if success:
                print(indentation + '...ok')
            else:
                print(indentation + 'ERROR submitting resource')
                self.error_occured = True

    def print_status(self):
        """Print one line per xloader job currently in the RQ queue."""
        import ckan.lib.jobs as rq_jobs
        jobs = rq_jobs.get_queue().jobs
        if not jobs:
            print('No jobs currently queued')
        for job in jobs:
            # NOTE(review): eval() of the job description string is fragile
            # (it assumes the description is exactly
            # "ckanext.xloader.jobs.xloader_data_into_datastore(<args>)")
            # and unsafe if descriptions could ever be attacker-influenced;
            # consider reading job.args instead.
            job_params = eval(job.description.replace(
                'ckanext.xloader.jobs.xloader_data_into_datastore', ''))
            job_metadata = job_params['metadata']
            print('{id} Enqueued={enqueued:%Y-%m-%d %H:%M} res_id={res_id} '
                  'url={url}'.format(
                      id=job._id,
                      enqueued=job.enqueued_at,
                      res_id=job_metadata['resource_id'],
                      url=job_metadata['original_url'],
                  ))
150 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/test_plugin.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | import datetime
4 | import pytest
5 | try:
6 | from unittest import mock
7 | except ImportError:
8 | import mock
9 | from six import text_type as str
10 |
11 | from ckan.tests import helpers, factories
12 | from ckan.logic import _actions
13 | from ckanext.xloader.plugin import _should_remove_unsupported_resource_from_datastore
14 |
15 |
@pytest.mark.usefixtures("clean_db", "with_plugins")
@pytest.mark.ckan_config("ckan.plugins", "datastore xloader")
class TestNotify(object):
    """Tests for the plugin's resource-change notifications: when resource
    creation/updates should (or should not) trigger ``xloader_submit``, and
    when unsupported resources should be purged from the datastore.
    """

    def test_submit_on_resource_create(self, monkeypatch):
        """Creating a CSV resource triggers xloader_submit."""
        func = mock.Mock()
        monkeypatch.setitem(_actions, "xloader_submit", func)

        dataset = factories.Dataset()

        assert not func.called

        helpers.call_action(
            "resource_create",
            {},
            package_id=dataset["id"],
            url="http://example.com/file.csv",
            format="CSV",
        )

        assert func.called

    def test_submit_when_url_changes(self, monkeypatch):
        """Updating a resource to a supported format/URL triggers a submit."""
        func = mock.Mock()
        monkeypatch.setitem(_actions, "xloader_submit", func)

        dataset = factories.Dataset()

        resource = helpers.call_action(
            "resource_create",
            {},
            package_id=dataset["id"],
            url="http://example.com/file.pdf",
        )

        assert not func.called  # because of the format being PDF

        helpers.call_action(
            "resource_update",
            {},
            id=resource["id"],
            package_id=dataset["id"],
            url="http://example.com/file.csv",
            format="CSV",
        )

        assert func.called

    @pytest.mark.ckan_config("ckanext.xloader.validation.requires_successful_report", True)
    def test_require_validation(self, monkeypatch):
        """With requires_successful_report on, submission waits for a
        successful validation report."""
        func = mock.Mock()
        monkeypatch.setitem(_actions, "xloader_submit", func)

        mock_resource_validation_show = mock.Mock()
        monkeypatch.setitem(_actions, "resource_validation_show", mock_resource_validation_show)

        dataset = factories.Dataset()

        resource = helpers.call_action(
            "resource_create",
            {},
            package_id=dataset["id"],
            url="http://example.com/file.csv",
            format="CSV",
            validation_status='failure',
        )

        # TODO: test IPipeValidation
        assert not func.called  # because of the validation_status not being `success`
        # reset_mock() restores the mock's call bookkeeping consistently,
        # unlike manually overwriting the `called` attribute.
        func.reset_mock()

        helpers.call_action(
            "resource_update",
            {},
            id=resource["id"],
            package_id=dataset["id"],
            url="http://example.com/file2.csv",
            format="CSV",
            validation_status='success',
        )

        # TODO: test IPipeValidation
        assert not func.called  # because of the validation_status is `success`

    @pytest.mark.ckan_config("ckanext.xloader.validation.requires_successful_report", True)
    @pytest.mark.ckan_config("ckanext.xloader.validation.enforce_schema", False)
    def test_enforce_validation_schema(self, monkeypatch):
        """With enforce_schema off, only resources that declare a schema
        need a successful validation report before submission."""
        func = mock.Mock()
        monkeypatch.setitem(_actions, "xloader_submit", func)

        mock_resource_validation_show = mock.Mock()
        monkeypatch.setitem(_actions, "resource_validation_show", mock_resource_validation_show)

        dataset = factories.Dataset()

        resource = helpers.call_action(
            "resource_create",
            {},
            package_id=dataset["id"],
            url="http://example.com/file.csv",
            schema='',
            validation_status='',
        )

        # TODO: test IPipeValidation
        assert not func.called  # because of the schema being empty
        func.reset_mock()

        helpers.call_action(
            "resource_update",
            {},
            id=resource["id"],
            package_id=dataset["id"],
            url="http://example.com/file2.csv",
            schema='https://example.com/schema.json',
            validation_status='failure',
        )

        # TODO: test IPipeValidation
        assert not func.called  # because of the validation_status not being `success` and there is a schema
        func.reset_mock()

        helpers.call_action(
            "resource_update",
            {},
            package_id=dataset["id"],
            id=resource["id"],
            url="http://example.com/file3.csv",
            schema='https://example.com/schema.json',
            validation_status='success',
        )

        # TODO: test IPipeValidation
        assert not func.called  # because of the validation_status is `success` and there is a schema

    @pytest.mark.parametrize("toolkit_config_value, mock_xloader_formats, url_type, datastore_active, expected_result", [
        # Test1: Should pass as it is an upload with an active datastore entry but an unsupported format
        (True, False, 'upload', True, True),
        # Test2: Should fail as it is a supported XLoader format.
        (True, True, 'upload', True, False),
        # Test3: Should fail as the config option is turned off.
        (False, False, 'upload', True, False),
        # Test4: Should fail as the url_type is not supported.
        (True, False, 'custom_type', True, False),
        # Test5: Should fail as datastore is inactive.
        (True, False, 'upload', False, False),
        # Test6: Should pass as it is a recognised resource type with an active datastore entry but an unsupported format
        (True, False, '', True, True),
        # Test7: Should pass as it is a recognised resource type with an active datastore entry but an unsupported format
        (True, False, None, True, True),
    ])
    def test_should_remove_unsupported_resource_from_datastore(
            self, toolkit_config_value, mock_xloader_formats, url_type, datastore_active, expected_result):
        """Exercise _should_remove_unsupported_resource_from_datastore across
        config, format, url_type and datastore_active combinations."""

        # Setup mock data
        res_dict = {
            'format': 'some_format',
            'url_type': url_type,
            'datastore_active': datastore_active,
            'extras': {'datastore_active': datastore_active}
        }

        # Assert the result based on the logic paths covered
        with helpers.changed_config('ckanext.xloader.clean_datastore_tables', toolkit_config_value):
            with mock.patch('ckanext.xloader.utils.XLoaderFormats.is_it_an_xloader_format') as mock_is_xloader_format:
                mock_is_xloader_format.return_value = mock_xloader_formats
                assert _should_remove_unsupported_resource_from_datastore(res_dict) == expected_result

    def _pending_task(self, resource_id):
        """Return a task_status dict representing a pending xloader job."""
        return {
            "entity_id": resource_id,
            "entity_type": "resource",
            "task_type": "xloader",
            "last_updated": str(datetime.datetime.utcnow()),
            "state": "pending",
            "key": "xloader",
            "value": "{}",
            "error": "{}",
        }
194 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/test_chunks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import pytest
4 | import tempfile
5 | import logging
6 | from typing import Callable, List, Tuple, Any
7 | from unittest.mock import patch, MagicMock
8 | import csv
9 | import sqlalchemy.orm as orm
10 |
11 | from ckan.tests import factories
12 | from ckanext.xloader import loader
13 | from ckanext.xloader.loader import get_write_engine
14 | from ckanext.xloader.tests.test_loader import TestLoadBase, get_sample_filepath
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
@pytest.fixture()
def Session():
    """Yield a scoped SQLAlchemy session bound to the datastore write engine,
    closing it at teardown."""
    scoped = orm.scoped_session(orm.sessionmaker(bind=get_write_engine()))
    yield scoped
    scoped.close()
25 |
26 |
@pytest.mark.usefixtures("full_reset", "with_plugins")
@pytest.mark.ckan_config("ckan.plugins", "datastore xloader")
class TestChunkedLoading(TestLoadBase):
    """Integration tests for size-based chunked COPY loading in ``load_csv``.

    ``split_copy_by_size`` and ``copy_file`` are patched so tests can force a
    specific chunk size and observe how many COPY invocations occur.
    """

    def _create_mock_split_copy(self, chunk_size: int) -> Callable:
        """Return a wrapper around ``split_copy_by_size`` that forces *chunk_size*.

        The wrapper deliberately ignores the caller-supplied ``max_size`` so a
        test can trigger (or prevent) chunking regardless of the configured
        default.
        """
        original_split_copy = loader.split_copy_by_size

        def mock_split_copy(input_file: Any, engine: Any, logger: Any, resource_id: str,
                            headers: List[str], delimiter: str = ',',
                            max_size: int = 1024**3) -> Any:
            # max_size from the caller is replaced with the test's chunk_size.
            return original_split_copy(input_file, engine, logger, resource_id,
                                       headers, delimiter, chunk_size)

        return mock_split_copy

    def _create_mock_copy_file(self, copy_calls_list: List[Tuple]) -> Callable:
        """Return a wrapper around ``copy_file`` that records each call.

        :param copy_calls_list: list that accumulates the positional args of
            every ``copy_file`` invocation — one entry per chunk copied.
        """
        original_copy_file = loader.copy_file

        def mock_copy_file(*args: Any, **kwargs: Any) -> Any:
            copy_calls_list.append(args)
            return original_copy_file(*args, **kwargs)

        return mock_copy_file

    def _generate_large_csv(self, filepath: str, num_rows: int = 100000, row_size_kb: int = 1) -> Tuple[str, List[str], int]:
        """Write a CSV of *num_rows* rows of roughly *row_size_kb* KB each.

        :returns: ``(filepath, headers, num_rows)`` for convenience in tests.
        """
        headers = ['id', 'name', 'description', 'data']

        # Pad each row with 'x' characters so it is approximately
        # row_size_kb KB; 50 bytes is a rough allowance for the other columns.
        padding_size = (row_size_kb * 1024) - 50
        padding_data = 'x' * max(1, padding_size)

        with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)

            for i in range(num_rows):
                writer.writerow([
                    i + 1,
                    f'Name_{i + 1}',
                    f'Description for row {i + 1}',
                    padding_data
                ])

        return filepath, headers, num_rows

    def test_chunked_processing_large_file(self, Session: Any) -> None:
        """Test that large files are processed in chunks and data integrity is maintained"""

        # Create a temporary large CSV file (~15MB to trigger chunking)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp_file:
            temp_filepath = temp_file.name

        try:
            # Generate file with ~15MB (15000 rows * ~1KB each)
            csv_filepath, expected_headers, expected_rows = self._generate_large_csv(
                temp_filepath, num_rows=15000, row_size_kb=1
            )

            # Verify file size is large enough to trigger chunking
            file_size = os.path.getsize(csv_filepath)
            assert file_size > 10 * 1024 * 1024, f"File size {file_size} should be > 10MB"

            resource = factories.Resource()
            resource_id = resource['id']

            # Set up mocks with 10MB chunk size
            copy_calls = []
            mock_split_copy = self._create_mock_split_copy(10 * 1024 * 1024)
            mock_copy_file = self._create_mock_copy_file(copy_calls)

            with patch('ckanext.xloader.loader.split_copy_by_size', side_effect=mock_split_copy):
                with patch('ckanext.xloader.loader.copy_file', side_effect=mock_copy_file):
                    # Load the CSV with chunked processing; the return value
                    # (field list) is not needed by this test.
                    loader.load_csv(
                        csv_filepath,
                        resource_id=resource_id,
                        mimetype="text/csv",
                        logger=logger,
                    )

            # Verify chunking occurred (should have multiple copy calls)
            assert len(copy_calls) > 1, "Expected multiple chunks but file was not chunked"

            # Verify data integrity - check that all rows were loaded
            records = self._get_records(Session, resource_id)
            assert len(records) == expected_rows, f"Expected {expected_rows} records, got {len(records)}"

            # Verify column structure
            column_names = self._get_column_names(Session, resource_id)
            expected_columns = ['_id', '_full_text'] + expected_headers
            assert column_names == expected_columns

            # Verify first and last records to ensure data integrity
            # Sort records by the 'id' column (index 1) to ensure consistent ordering
            sorted_records = sorted(records, key=lambda x: int(x[1]))
            first_record = sorted_records[0]
            last_record = sorted_records[-1]

            # Check first record (excluding _id and _full_text columns)
            # The _get_records method excludes _full_text by default, so indices are:
            # 0: _id, 1: id, 2: name, 3: description, 4: data

            assert first_record[1] == '1'  # id column (index 1 after _id)
            assert first_record[2] == 'Name_1'  # name column (index 2)

            # Check last record
            assert last_record[1] == str(expected_rows)  # id column
            assert last_record[2] == f'Name_{expected_rows}'  # name column

        finally:
            # Clean up temporary file
            if os.path.exists(temp_filepath):
                os.unlink(temp_filepath)

    def test_small_file_no_chunking(self, Session: Any) -> None:
        """Test that small files are not chunked when chunk size is larger than file"""

        # Use existing small sample file
        csv_filepath = get_sample_filepath("simple.csv")
        resource = factories.Resource()
        resource_id = resource['id']

        # Set up mocks with large chunk size to prevent chunking
        copy_calls = []
        mock_split_copy = self._create_mock_split_copy(10 * 1024 * 1024)  # 10MB
        mock_copy_file = self._create_mock_copy_file(copy_calls)

        with patch('ckanext.xloader.loader.split_copy_by_size', side_effect=mock_split_copy):
            with patch('ckanext.xloader.loader.copy_file', side_effect=mock_copy_file):
                # Return value (field list) intentionally discarded.
                loader.load_csv(
                    csv_filepath,
                    resource_id=resource_id,
                    mimetype="text/csv",
                    logger=logger,
                )

        # Small file should only have one copy call (no chunking)
        assert len(copy_calls) == 1, f"Small file should not be chunked, got {len(copy_calls)} copy calls"

        # Verify data loaded correctly
        records = self._get_records(Session, resource_id)
        assert len(records) == 6  # Known number of records in simple.csv
170 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/brazilian_sample.csv:
--------------------------------------------------------------------------------
1 | NU_ANO_CENSO,CO_MUNICIPIO,MUNIC,SIGLA,CO_UF,SCHOOLS_NU,SCHOOLS_FED_NU,SCHOOLS_ESTADUAL_NU,SCHOOLS_MUN_NU,SCHOOLS_PRIV_NU,SCHOOLS_FED_STUD,SCHOOLS_ESTADUAL_STUD,SCHOOLS_MUN_STUD,SCHOOLS_PRIV_STUD,SCHOOLS_URBAN_NU,SCHOOLS_RURAL_NU,SCHOOLS_URBAN_STUD,SCHOOLS_RURAL_STUD,SCHOOLS_NIVFUND_1_NU,SCHOOLS_NIVFUND_2_NU,SCHOOLS_EIGHTYEARS_NU,SCHOOLS_NINEYEARS_NU,SCHOOLS_EIGHTYEARS_STUD,SCHOOLS_NINEYEARS_STUD,MATFUND_NU,MATFUND_I_NU,MATFUND_T_NU,SCHOOLS_INTERNET_AVG,SCHOOLS_WATER_PUBLIC_AVG,SCHOOLS_WATER_AVG,SCHOOLS_ELECTR_PUB_AVG,SCHOOLS_SEWAGE_PUB_AVG,SCHOOLS_SEWAGE_AVG,PROFFUNDTOT_NU,PROFFUNDINC_PC,PROFFUNDCOMP_PC,PROFMED_PC,PROFSUP_PC,CLASSSIZE,CLASSSIZE_I,CLASSSIZE_T,STUDTEACH,RATE_APROV,RATE_APROV_I,RATE_APROV_T,RATE_FAILURE,RATE_FAILURE_I,RATE_FAILURE_T,RATE_ABANDON,RATE_ABANDON_I,RATE_ABANDON_T,RATE_TRANSFER,RATE_TRANSFER_I,RATE_TRANSFER_T,RATE_OVERAGE,RATE_OVERAGE_I,RATE_OVERAGE_T,PROVA_MEAN_PORT_I,PROVA_MEAN_PORT_T,PROVA_MEAN_MAT_I,PROVA_MEAN_MAT_T,CLASSSIZE_PUB,STUDTEACH_PUB,RATE_APROV_PUB,RATE_APROV_I_PUB,RATE_APROV_T_PUB,RATE_FAILURE_PUB,RATE_FAILURE_I_PUB,RATE_FAILURE_T_PUB,RATE_ABANDON_PUB,RATE_ABANDON_I_PUB,RATE_ABANDON_T_PUB,RATE_TRANSFER_PUB,RATE_TRANSFER_I_PUB,RATE_TRANSFER_T_PUB,RATE_OVERAGE_PUB,RATE_OVERAGE_I_PUB,RATE_OVERAGE_T_PUB,PROVA_MEAN_PORT_I_PUB,PROVA_MEAN_PORT_T_PUB,PROVA_MEAN_MAT_I_PUB,PROFFUNDTOT_NU_PUB,PROVA_MEAN_MAT_T_PUB,EDUCTEACH_PUB,EDUCTEACH_FEDERAL,EDUCTEACH_STATE,EDUCTEACH_MUN,PROVA_MEAN_PORT_I_STATE,PROVA_MEAN_PORT_T_STATE,PROVA_MEAN_MAT_I_STATE,PROVA_MEAN_MAT_T_STATE,PROVA_MEAN_PORT_I_MUN,PROVA_MEAN_PORT_T_MUN,PROVA_MEAN_MAT_I_MUN,PROVA_MEAN_MAT_T_MUN
2 | 01/01/1996 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,128,0,8,119,1,0,3613,3051,130,7,121,3716,3078,127,7,,,,,6794,5036,1758,,,,,,,337,0.26112759,0.17210683,0.43323442,0.13353115,24.833692447908199,,,22.704964,67.080006197818605,65.144188573097907,74.672390253375497,16.7913561569619,19.4894563570641,8.649237411458509,7.60165422117368,11.1540090366186,17.263407056738099,8.5269823,9.2213373,5.3085136,52.472769803217503,,,,,,,25.0011414302354,22.830887000000001,66.8150490097632,64.893674212235595,74.288246611754104,17.0725384713319,19.8404105332814,8.856561911292371,7.74275834336647,11.357671741889,17.9410577459881,8.3696527,8.9979973,5.0570836,53.286314230720798,,,,,,122988,,10.155015000000001,14.826086999999999,11.671533,9.072917,,,,,,,,
3 | 01/01/1997 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,94,0,8,85,1,0,2839,2614,148,6,88,2940,2661,94,7,,,,,5601,3570,2031,,0,1,0.063829787,0,0.93617022,287,0,0,0,0,25.0833657500872,,,21.250907999999999,71.110977629352107,70.1419150990167,75.763059126544903,15.245370982682999,16.496493591540201,8.98147212940713,6.94622497346647,15.7921332337152,12.6757637455453,6.6974254,8.1282864,4.9043164,,,,,,,,25.262045033236401,21.381359,70.672316388471998,69.521445456705493,75.431639575393902,15.5843352556537,16.889994178079299,9.30720180061441,7.13628622250936,16.242641275077901,13.369555548186099,6.6070609,8.2059631,4.3434491,,,,,,,108150,,,14.555555,,,,,,,,,,
4 | 01/01/1998 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,99,0,11,86,2,0,3021,2976,230,6,93,3035,3192,99,11,,,,,6227,3909,2318,,0,1,0.070707068,0,0.969697,297,0.13131313,0.23905724,0.48148149,0.14814815,25.1785270610666,,,23.833083999999999,70.647780398333097,70.898689527007704,74.367528940283705,13.4900411515016,13.638060818557699,9.181863672836959,7.76335222032492,18.221439042002501,13.523162982904299,8.0988264,9.7743425,5.6417899,45.848722840075197,122.78195020025601,94.471234723740395,,,,,25.4629641919806,24.20215,70.573802067316294,70.824099405698007,73.998407111957505,13.5963380925552,13.773877099133999,9.24169826321147,7.85336854029679,18.705919883281801,13.848521521423599,7.976491,9.7092905,5.6522574,47.140236294227797,127.024045994386,96.805895791551293,,,,90085,,10.395683,11,10.990741,10.017647,,,,,,,,
5 | 01/01/1999 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,97,0,11,84,2,0,3116,2621,197,6,91,3120,2814,97,12,,,,,5934,3872,2062,,0.020618556,0.96907216,0.12371134,0,0.88659793,362,0.082872927,0.14640884,0.50552487,0.26519337,24.567049566647501,,,18.29768,69.648254925924505,71.519813426042504,74.226500780418604,15.3810794145337,14.053312169428599,11.945412913064001,5.31873054336247,12.260287808610601,9.913036606242409,9.651934600000001,11.237819999999999,6.5629535,43.090663904769201,41.287557072538199,50.809843587814697,,,,,24.866690846463602,18.443901,69.401824973613202,71.312204366447503,74.226500780418604,15.686910383830501,14.3316018624195,11.945412913064001,5.36299549793561,12.448538350522799,9.913036606242409,9.5482683,11.167047999999999,6.5629535,43.873104357440901,41.985104684362597,51.777890266455998,,,,125964,,11.404692000000001,13,12.019608,10.904254999999999,,,,,,,,
6 | 01/01/2000 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,96,0,11,83,2,0,3006,2832,216,6,90,3000,3054,96,12,,,,,6054,3848,2206,0,0.020833334,0.98958331,0.13541667,0.010416667,1,354,0.048022598,0.14124294,0.5367232,0.27401131,23.448200576487299,,,19.703617000000001,,,,,,,,,,,,,39.478031148207897,0.304947477971829,0.466826025025085,,,,,23.7720972043514,19.964548000000001,,,,,,,,,,,,,40.253511576997198,0.309378559617233,0.473848650082831,,,,112075,,11.498488999999999,12.428572000000001,11.962121,11.190955000000001,,,,,,,,
7 | 01/01/2001 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,94,0,11,82,1,0,3179,3030,149,6,88,3226,3132,94,11,,,,,6358,3941,2417,0,0.031914894,0.92553192,0.14893617,0.010638298,0.91489363,355,0.03943662,0.11549295,0.61971831,0.22535211,23.934857271571801,,,20.544874,,,,,,,,,,,,,38.4869456756186,26.335080147989501,49.929375244514603,,,,,23.998429857027599,20.527096,,,,,,,,,,,,,38.975680552467402,26.532201566371999,49.929375244514603,,,,122647,,11.505747,11,11.798450000000001,11.333333,,,,,,,,
8 | 01/01/2002 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,87,0,11,76,0,0,3102,3112,0,5,82,3087,3127,87,17,,,,,6214,3510,2704,0,0.011494253,0.98850572,0.14942528,0,0.91954023,340,0.020588236,0.0029411765,0.7088235,0.26764706,24.9124169093372,,,20.526834000000001,,,,,,,,,,,,,37.206307733484799,26.134087850013799,45.999435780294696,,,,,24.9124169093372,20.526834000000001,,,,,,,,,,,,,37.206307733484799,26.134087850013799,45.999435780294696,,,,116976,,12,9,12.273438000000001,11.834906,,,,,,,,
9 | 01/01/2003 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,90,0,14,75,1,0,2990,3117,38,7,83,2987,3158,90,17,,,,,6145,3377,2768,0.2,0.022222223,1,0.21111111,0,0.85555553,330,0.036363635,0.0030303029,0.71515149,0.24545455,24.4788346665384,,,20.743715000000002,73.564145784103204,66.042258510608505,53.202522584121603,10.2361585853628,10.611000623557601,5.53406858121018,5.46809248299976,2.42417679682655,6.2575464876816,10.731604000000001,20.922564000000001,35.005862999999998,34.808787714164303,24.204907314713299,42.741019322483503,,,,,24.5720384846698,20.793973999999999,73.564145784103204,66.042258510608505,53.202522584121603,10.2361585853628,10.611000623557601,5.53406858121018,5.46809248299976,2.42417679682655,6.2575464876816,10.731604000000001,20.922564000000001,35.005862999999998,35.009006141230302,24.339144498095099,42.741019322483503,,,,105342,,11.871560000000001,13.777778,12.165355,11.685,,,,,,,,
10 | 01/01/2004 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,80,0,13,66,1,0,2879,2994,37,7,73,2935,2975,80,17,,,,,5910,3174,2736,0.16666667,0.025,1,0.30000001,0,0.89999998,333,0.015015015,0.027027028,0.37537536,0.58258259,24.812663731357102,,,20.061893000000001,70.6676315980514,72.835550853161195,67.109379689216198,10.910058940768,11.702339327436199,10.5794508698459,7.32243167260778,3.38640385403245,11.0340198138212,11.099878,12.075706,11.277148,32.588832626084503,23.330968231678799,39.2893599428049,,,,,24.852433620350901,20.110583999999999,70.587837445437501,72.769414628767805,67.109379689216198,10.9337923766684,11.731064143262801,10.5794508698459,7.36856311682479,3.4077382559734,11.0340198138212,11.109807,12.091783,11.277148,32.794142826521302,23.4779537287964,39.2893599428049,,,,102497,,13.212121,14.6,13.775,12.890476,,,,,,,,
11 | 01/01/2005 12:00:00 AM,1100015,ALTA FLORESTA D'OESTE,RO,,81,0,14,66,1,0,2804,2843,37,8,73,3022,2662,81,17,81,0,5684,0,5684,3046,2638,0.049382716,0.037037037,1,0.33333334,0,0.91358024,334,0.01497006,0.0089820363,0.42814371,0.54790419,24.101050315651499,,,20.160371999999999,69.997465446274404,71.240851300523303,69.625995047378595,12.323755200959599,14.7737897338986,8.58658350425592,6.54434281992422,3.01945352250705,10.509938178431399,11.134436000000001,10.965906,11.277483,30.735397747920299,23.638617608347499,35.446829331059703,,,,,24.1781541832109,20.211655,69.922556526071503,71.174219688094297,69.625995047378595,12.3641886198936,14.830533206398499,8.58658350425592,6.58790897399712,3.03955423865184,10.509938178431399,11.125344999999999,10.955693,11.277483,30.901363696633101,23.758084377821099,35.446829331059703,,,,94217,,13.114803,11,13.990826,12.684685,,,,,,,,
12 |
--------------------------------------------------------------------------------
/ckanext/xloader/config_declaration.yaml:
--------------------------------------------------------------------------------
1 | version: 1
2 | groups:
3 | - annotation: ckanext-xloader settings
4 | options:
5 | - key: ckanext.xloader.site_url
6 | example: http://ckan-dev:5000
7 | default:
8 | description: |
9 | Provide an alternate site URL for the xloader_submit action.
10 | This is useful, for example, when the site is running within a docker network.
          Note: this setting will not alter the URL path (i.e. ckan.root_path).
12 | required: false
13 | - key: ckanext.xloader.site_url_ignore_path_regex
14 | example: "(/PathToS3HostOriginIWantToGoDirectTo|/anotherPath)"
15 | default:
16 | description: |
17 | Provide the ability to ignore paths which can't be mapped to alternative site URL for resource access.
18 | This is useful, for example, when the site is running within a docker network and the cdn front door has
19 | Blob storage mapped to another path on the same domain.
20 | required: false
21 | - key: ckanext.xloader.jobs_db.uri
22 | default: sqlite:////tmp/xloader_jobs.db
23 | description: |
24 | The connection string for the jobs database used by XLoader. The
25 | default of an sqlite file is fine for development. For production use a
26 | Postgresql database.
27 | validators: not_missing
28 | required: true
29 | - key: ckanext.xloader.api_token
30 | example: eyJ0eXAiOiJKV1QiLCJh.eyJqdGkiOiJ0M2VNUFlQWFg0VU.8QgV8em4RA
31 | description: |
32 | Uses a specific API token for the xloader_submit action instead of the
33 | apikey of the site_user.
34 | default: 'NOT_SET'
35 | required: true
36 | - key: ckanext.xloader.formats
37 | example: csv application/csv xls application/vnd.ms-excel
38 | description: |
39 | The formats that are accepted. If the value of the resource.format is
40 | anything else then it won't be 'xloadered' to DataStore (and will therefore
41 | only be available to users in the form of the original download/link).
42 | Case insensitive. Defaults are listed in plugin.py.
43 | required: false
44 | - key: ckanext.xloader.max_content_length
45 | default: 1_000_000_000
46 | example: 100000
47 | description: |
48 | The maximum file size that XLoader will attempt to load.
49 | type: int
50 | required: false
51 | - key: ckanext.xloader.use_type_guessing
52 | default: False
53 | example: False
54 | description: |
55 | By default, xloader will first try to add tabular data to the DataStore
56 | with a direct PostgreSQL COPY. This is relatively fast, but does not
57 | guess column types. If this fails, xloader falls back to a method more
58 | like DataPusher's behaviour. This has the advantage that the column types
59 | are guessed. However it is more error prone and far slower.
60 | To always skip the direct PostgreSQL COPY and use type guessing, set
61 | this option to True.
62 | type: bool
63 | required: false
64 | legacy_key: ckanext.xloader.just_load_with_messytables
65 | - key: ckanext.xloader.strict_type_guessing
66 | default: True
67 | example: False
68 | description: |
69 | Use with ckanext.xloader.use_type_guessing to set strict true or false
          for type guessing. If set to False, the types will always fall back to the string type.
71 |
72 | Strict means that a type will not be guessed if parsing fails for a single cell in the column.
73 | type: bool
74 | - key: ckanext.xloader.max_type_guessing_length
75 | default: 0
76 | example: 100000
77 | description: |
78 | The maximum file size that will be passed to Tabulator if the
79 | use_type_guessing flag is enabled. Larger files will use COPY even if
80 | the flag is set. Defaults to 1/10 of the maximum content length.
81 | type: int
82 | required: false
83 | - key: ckanext.xloader.parse_dates_dayfirst
84 | default: False
85 | example: False
86 | description: |
87 | Whether ambiguous dates should be parsed day first. Defaults to False.
88 | If set to True, dates like '01.02.2022' will be parsed as day = 01,
89 | month = 02.
90 | NB: isoformat dates like '2022-01-02' will be parsed as YYYY-MM-DD, and
91 | this option will not override that.
92 | See https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse
93 | for more details.
94 | type: bool
95 | required: false
96 | - key: ckanext.xloader.parse_dates_yearfirst
97 | default: False
98 | example: False
99 | description: |
100 | Whether ambiguous dates should be parsed year first. Defaults to False.
101 | If set to True, dates like '01.02.03' will be parsed as year = 2001,
102 | month = 02, day = 03. See https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse
103 | for more details.
104 | type: bool
105 | required: false
106 | - key: ckanext.xloader.job_timeout
107 | default: 3600
108 | example: 3600
109 | description: |
110 | The maximum time for the loading of a resource before it is aborted.
111 | Give an amount in seconds. Default is 60 minutes
112 | type: int
113 | required: false
114 | - key: ckanext.xloader.ignore_hash
115 | default: False
116 | example: False
117 | description: |
118 | Ignore the file hash when submitting to the DataStore, if set to True
119 | resources are always submitted (if their format matches), if set to
120 | False (default), resources are only submitted if their hash has changed.
121 | type: bool
122 | required: false
123 | - key: ckanext.xloader.max_excerpt_lines
124 | default: 0
125 | example: 100
126 | description: |
127 | When loading a file that is bigger than `max_content_length`, xloader can
128 | still try and load some of the file, which is useful to display a
129 | preview. Set this option to the desired number of lines/rows that it
130 | loads in this case.
131 | If the file-type is supported (CSV, TSV) an excerpt with the number of
132 | `max_excerpt_lines` lines will be submitted while the `max_content_length`
133 | is not exceeded.
134 | If set to 0 (default) files that exceed the `max_content_length` will
135 | not be loaded into the datastore.
136 | type: int
137 | required: false
138 | - key: ckanext.xloader.ssl_verify
139 | default: True
140 | example: True
141 | description: |
142 | Requests verifies SSL certificates for HTTPS requests. Setting verify to
143 | False should only be enabled during local development or testing. Default
144 | to True.
145 | type: bool
146 | required: false
147 | - key: ckanext.xloader.validation.requires_successful_report
148 | default: False
149 | example: True
150 | description: |
151 | Resources are required to pass Validation from the ckanext-validation
152 | plugin to be able to get XLoadered.
153 | type: bool
154 | required: false
155 | - key: ckanext.xloader.validation.enforce_schema
156 | default: True
157 | example: False
158 | description: |
159 | Resources are expected to have a Validation Schema, or use the default ones if not.
160 |
161 | If this option is set to `False`, Resources that do not have
162 | a Validation Schema will be treated like they do not require Validation.
163 |
164 | See https://github.com/frictionlessdata/ckanext-validation?tab=readme-ov-file#data-schema
165 | for more details.
166 | - key: ckanext.xloader.clean_datastore_tables
167 | default: False
168 | example: True
169 | description: |
170 | Enqueue jobs to remove Datastore tables from Resources that have a format
171 | that is not in ckanext.xloader.formats after a Resource is updated.
172 | type: bool
173 | required: false
174 | - key: ckanext.xloader.show_badges
175 | default: True
176 | example: False
177 | description: |
178 | Controls whether or not the status badges display in the front end.
179 | type: bool
180 | required: false
181 | - key: ckanext.xloader.debug_badges
182 | default: False
183 | example: True
184 | description: |
185 | Controls whether or not the status badges display all of the statuses. By default,
186 | the badges will display "pending", "running", and "error". With debug_badges enabled,
187 | they will also display "complete", "active", "inactive", and "unknown".
188 | type: bool
189 | required: false
190 | - key: ckanext.xloader.search_update_chunks
191 | default: 100000
192 | example: 1000
193 | description: |
194 | The number of rows to process in each batch when populating the full-text
195 | search index. Chunked processing prevents database timeouts and memory
196 | exhaustion when indexing very large datasets (4GB+ files with millions of rows).
197 | Smaller values reduce memory usage but increase processing time. Larger values
198 | improve performance but may cause timeouts on very large tables.
199 | type: int
200 | required: false
201 | - key: ckanext.xloader.max_retries
202 | default: 1
203 | example: 3
204 | description: |
205 | Maximum number of retry attempts for failed jobs due to temporary errors
206 | like database deadlocks or network timeouts. Set to 0 to disable retries.
207 | type: int
208 | required: false
209 | - key: ckanext.xloader.copy_chunk_size
210 | default: 1073741824
211 | example: 536870912
212 | description: |
213 | Maximum size in bytes for each chunk when processing files.
214 | Files are split into chunks to prevent memory exhaustion and
215 | system freezing. Default is 1GB (1073741824 bytes). Smaller values
216 | use less memory but create more chunks.
217 | type: int
218 | required: false
219 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | CHANGELOG
2 | =========
3 | See: https://github.com/ckan/ckanext-xloader/releases if this file has drifted.
4 |
5 | 2.0.1 2025-03-04
6 | ================
7 |
8 | ## Fix
9 |
10 | * #244 Static webassets not included in package
11 | * #245 support apitoken_header_name in 2.11.x.
12 | * #241 loading R/W datasource resources via api (not hardcoded)
13 |
14 | 2.0.0 2024-12-10
15 | ================
16 |
17 | ## Major
18 | Dropped CKAN 2.9.x and Python2.
19 |
20 |
21 | ## Feat:
* Adds Strip White Space fields to the Data Dictionary (defaults to `True` for each field).
23 | This will strip surrounding white space from data values prior to inserting them into the database.
24 | * Adds support for ckanext-validation. Config `ckanext.xloader.validation.requires_successful_report`
25 | controls whether a resource requires a successful validation report to be XLoadered.
26 | By default, a resource would also require a Validation Schema, which can be turned off with
27 | `ckanext.xloader.validation.enforce_schema`.
28 | * Frontend Status Badges by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/224
29 |
30 |
31 | ## Fix:
32 | * Properly handle REDIS queue timeouts to close/delete any temporary files.
33 | * Fix automated PyPI publishing by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/231
34 |
35 | ## What's Changed
36 | * Update README, migrate it to Markdown by @amercader in https://github.com/ckan/ckanext-xloader/pull/235
37 | * chore: switch to pyproject.toml by @duttonw in https://github.com/ckan/ckanext-xloader/pull/236
38 | * Validation Extension Support (Squashed) by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/237
39 | * Strip White Space from Cell Values (Squashed) by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/238
40 | * RQ Job Timeout Handling (Squashed) by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/239
41 | * SQLAlchemy v2 support by @smotornyuk in https://github.com/ckan/ckanext-xloader/pull/225
42 |
43 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.2.0...2.0.0
44 |
45 | 1.2.0 2024-11-21
46 | ================
47 |
48 | ## What's Changed
49 | * Fix PyPI publishing by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/233
50 | * Enhancement/Bugfix: Downstream qld-gov-au fix's by @duttonw in https://github.com/ckan/ckanext-xloader/pull/232
51 | * feat: @JVickery-TBS work on validation integration (await successful validation prior to doing datastore work via 'IPipeValidation'
52 | * fix: handle gracefully if tabulator load fails by trying 'direct load'
53 | * fix: Excel blank header row bug
* fix: Datastore truncate, restart identity so numbering restarts from 0 again (when imported data has same columns and types)
* fix: partial fix on DB deadlock by adding timeouts on DDL events
56 | * test: test_simple_large_file, test_with_blanks, test_with_empty_lines, test_with_extra_blank_cells
57 | * test: test_require_validation, test_enforce_validation_schema
58 | * chore: min version requirements for cve's,
59 | * requests>=2.32.0
60 | * urllib3>=2.2.2
61 | * zipp>=3.19.1
62 |
63 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.2...1.2.0
64 |
65 |
66 | 1.1.1 2024-10-16
67 | ================
68 |
69 | * feat: Add pypi cicd publish via github action via environment controls by @duttonw in https://github.com/ckan/ckanext-xloader/pull/228
70 |
71 |
72 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.0...1.1.1
73 |
74 | 1.1.0 2024-10-16
75 | ================
76 |
77 | Fixes:
78 | * feat: Add pypi cicd publish via github action via environment controls by @duttonw in https://github.com/ckan/ckanext-xloader/pull/228
79 |
80 |
81 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.1.0...1.1.1
82 |
83 |
84 | 1.1.0 2024-10-15
85 | ================
86 |
87 |
88 | Fixes:
89 |
90 | * add README note about running on separate server, #191 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/192
91 | * Use IDomainObjectModification Implementation by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/198
92 | * Hide excessive numbers of resource_data log entries, #187 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/188
93 | * #182 Type guessing fixes by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/186
94 | * Document the ckan.download_proxy setting, #176 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/204
95 | * Conditional DataStore Tab in Resource Edit by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/190
96 | * Make locking behaviour more robust by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/205
97 | * Delete Datastore Table Button by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/197
98 | * Quality of life improvements by @duttonw in https://github.com/ckan/ckanext-xloader/pull/195
99 | * Clean Datastore Tables Job by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/196
100 | * strip extra space for column name by @mutantsan in https://github.com/ckan/ckanext-xloader/pull/210
101 | * Skip empty lines instead of erroring by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/208
102 | * add more options for maintainers to expedite XLoader runs, GitHub #202 by @ThrawnCA in https://github.com/ckan/ckanext-xloader/pull/212
103 | * Add Mixed Integers Type Guessing to NUMERIC Tests by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/217
104 | * PY2 & PY3 String/Binary Fixes by @JVickery-TBS in https://github.com/ckan/ckanext-xloader/pull/203
105 | * In plugin.py, there is a fix for a resource format key error by @Nisha1293 in https://github.com/ckan/ckanext-xloader/pull/209
106 | * CKAN 2.11 support by @amercader in https://github.com/ckan/ckanext-xloader/pull/220
107 |
108 | New Contributors:
109 |
110 | * @JVickery-TBS made their first contribution in https://github.com/ckan/ckanext-xloader/pull/198
111 | * @duttonw made their first contribution in https://github.com/ckan/ckanext-xloader/pull/195
112 | * @mutantsan made their first contribution in https://github.com/ckan/ckanext-xloader/pull/210
113 | * @Nisha1293 made their first contribution in https://github.com/ckan/ckanext-xloader/pull/209
114 |
115 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.0.1...1.1.0
116 |
117 |
118 | 1.0.1 2024-04-04
119 | ================
120 |
121 | Fixes:
122 |
123 | * Include config_declaration.yaml into MANIFEST by @pdelboca in https://github.com/ckan/ckanext-xloader/pull/183
124 |
125 |
126 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/1.0.0...1.0.1
127 |
128 | 1.0.1 2024-04-04
129 | ================
130 |
131 | Fixes:
132 |
133 | * Fixed date parsing while fetching entries for task_status by @muhammed-ajmal in https://github.com/ckan/ckanext-xloader/pull/179
134 | * Drop support for old CKAN versions and add CSRF support by @pdelboca in https://github.com/ckan/ckanext-xloader/pull/180
135 | * Refactor test_jobs.py by @pdelboca in https://github.com/ckan/ckanext-xloader/pull/181
136 |
137 | New Contributors:
138 |
139 | * @muhammed-ajmal made their first contribution in https://github.com/ckan/ckanext-xloader/pull/179
140 |
141 | **Full Changelog**: https://github.com/ckan/ckanext-xloader/compare/0.12.2...1.0.0
142 |
143 |
144 |
145 |
146 | 0.9.0 2021-10-01
147 | ================
148 |
149 | Fixes:
150 |
151 | * Fix SQLAlchemy session exception on CKAN 2.9 #140
152 | * Fix xloader status timestamps #141
153 | * Fix to correctly report exceptions in stdout #141
154 |
155 |
156 | 0.8.1 2021-08-30
157 | ================
158 |
159 | Features:
160 |
161 | * Add ssl_verify option to callback_xloader_hook #136
162 |
163 | Fixes:
164 |
165 | * Fix bytes / str concat #138
166 | * Stream request needs to be explicitly closed #139
167 |
168 |
169 | 0.8.0 2021-06-11
170 | ================
171 |
172 | Features:
173 | * Click CLI for CKAN >= 2.9 #128
174 |
175 | Fixes:
176 | * Submit private datasets when using the `submit all` command #121
177 | * Send user context to the resource patch function #134
178 | * Add documentation for ssl_verify #135
179 |
180 |
181 | 0.7.0 2020-11-23
182 | ================
183 |
184 | Features:
185 | * Python 3 support #113
186 | * CKAN 2.9 support #113
187 |
188 | Fixes:
189 | * Update resource hash after load to datastore #116
190 |
191 |
192 | 0.6.1 2020-05-03
193 | ================
194 |
195 | Features:
196 | * Add 'just_load_with_messytables' option #96
197 |
198 | Fixes:
199 | * When getting the resource from CKAN, it now copes with the edge case that CKAN hasn't quite added the resource yet - now it successfully retries #94
200 |
201 |
202 | 0.6.0 2020-04-27
203 | ================
204 |
205 | Release withdrawn
206 |
207 |
208 | 0.5.0 2019-12-04
209 | ================
210 |
211 | Features:
212 | * migrate_types CLI command added for freezing/migrating data dictionaries created with datapusher #85
213 |
214 | Fixes:
215 | * DataStore tab missing from resource manage page, due to templates missing from PyPI package #74
216 |
217 |
218 | 0.4.1 2019-11-13
219 | ================
220 |
221 | Fixes:
222 | * populate_full_text_trigger error when doing 'datastore set-permissions' #72
223 | * '%' in column name causes TypeError("'dict' object does not support indexing") #65
224 | * numpy >= 1.16 causes 'RuntimeError: implement_array_function' on CKAN startup #79
225 | * CKAN 2.9 compatibility - fix error about 'resource_revision_table' #81
226 |
227 |
228 | v0.4.0 2019-06-21
229 | =================
230 |
231 | Features:
232 | * 'ignore_hash' config option added to bypass the hash check which can skip loads #34
233 | * File size is logged #35
234 | * Retries are now done when downloading CSV #39
235 | * xloader_status action now available to GET (not just POST) #46
236 | * ANALYZE run after load, needed for CKAN 2.9 resource preview speed-up #47
237 | * CLI load of multiple datasets now resumes following an error with a resource #57
238 | * Added 'max_excerpt_lines' config to load an excerpt of files larger than max_content_length #63
239 |
240 | Fixes:
241 | * Unicode characters in CSV headers caused error #28
242 | * Column headers longer than 63 characters caused error #14
243 | * Floats in CSV headers caused error #49
244 | * Integers in xlsx header caused error #53
245 | * Extraneous commas in combination with non-ascii chars caused error #51
246 | * CSV with no rows caused error #38
247 | * Template compatibility with CKAN 2.9 #40
248 | * CLI submitted resources with non-xloader formats #43
249 | * ckanext.xloader.max_content_length wasn't working #62
250 | * KeyError: 'resource_id' when xloading a resource #68
251 |
252 |
253 | v0.3.1 2018-01-22
254 | =================
255 |
256 | * Fix for exception about 'ckan.datapusher.url' not being set when you check xload status #26
257 |
258 |
259 | v0.3.0 2017-11-17
260 | =================
261 |
262 | * Trigger any resource_views to be created straight after load, rather than wait for the index.
263 |
264 |
265 | v0.2.0 2017-11-10
266 | =================
267 |
268 | * Renamed ckanext-xloader
269 | * Added to PyPI
270 | * The user is given access to the data earlier in the job - the column indexing now occurs afterwards, since this is only an optimization for queries and takes much longer than the load itself
271 | * Fixed exception during error-handling for files too long and of non-accepted schemes
272 |
273 |
274 | v0.1 2017-11-03
275 | ===============
276 |
277 | * Initial code, named ckanext-shift.
278 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv:
--------------------------------------------------------------------------------
1 | Category,Category name,Priority,Initiative name,Investment objectives,Primary digital priority,Initiative stage,Actual start date,Approved end date,Date data current at,Percentage complete,Overall status,Project commencement allocation,Approved expenditure,Actual cost to date,Scope change event,Cost re-evaluation event,Delivery delay event,Project journey and reasons for variance,Learn more (URL)
2 | DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Silly Walks project - Stage 2,"Lorum ipsum.",Collaboration,Delivery,01/07/1970,30/06/1971,31/03/1971,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 1971
3 | - Overall 'green' (on track) status
4 | - Revised user journey following results of Silly Walk UX/UI testing
5 | - Transition to support progressing with documentation and walk-through of the solution.
6 | - Ongoing high levels of silly walk usage reflecting the success of search engine marketing. Silly walk focused campaign to further increase awareness and usage is being finalised.
7 |
8 | As at 28 February 1971
9 | - Overall 'green' (on track) status
10 | - Results of Silly Walk UX/UI testing is guiding development of the revised user journey.
11 | - Silly Walk transition to BAU support continuing with workshops, showcases and handover documentation.
12 | - Silly Walk usage is increasing
13 |
14 | As at 31 January 1971
15 | - Continued amber status [closely monitored] with risks under management
16 | - Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from Silly Walk
17 | - Good progress on development of revised Silly Walk user journey.
18 |
19 | As at 31 December 1970
20 | Status AMBER [Closely monitored]
21 | - Search Engine Marketing commenced 19 December 1970 and already showing increased users and proportion of customers benefitting from Silly Walk
22 | - External assurance review completed and reported 'green' rating for confidence of delivery.
23 |
24 | As at 30 November 1970
25 | - Continued amber status pending risk management
26 | - Marketing to commence to increase awareness of platform
27 | - Good progress on development of revised user journey
28 |
29 | As at 31 October 1970
30 | Status AMBER [Closely monitored]
31 | - Silly Walk Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support.
32 | - Communications and engagement are in progress.
33 | - The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 1971. As at 30 September 1970
34 | Status AMBER [Closely monitored]
35 | Project journey events:
36 | - A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
37 | - Silly Walk industries expanded to include all industries.
38 | - Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion to encompass all industries.
39 |
40 | As at 31 August 1970
41 | Status GREEN [On track]
42 | The project is reporting green overall. Ongoing resourcing risk will continue to be monitored and managed for the life of the project, due to a tight labour market.
43 | Project journey events:
44 | - A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
45 | - Further analysis of June/July 1970 marketing campaign has offered recommendations for consideration, to improve target audience awareness and Silly Walk uptake.
46 | - Silly Walk industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised.
47 | - Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion with three additional industries.
48 |
49 | As at 31 July 1970
50 | Status AMBER [Closely monitored]
51 | The project is continuing to report amber overall mainly due to ongoing resourcing challenges.
52 | Project journey events:
53 | - A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness, is progressing.
54 | - Analysis of a major marketing campaign conducted in June/July 1970 showed a significant step-up in number of Silly Walk users.
55 | - The target of 95% of Circus population coverage was met in June 1970 with 100% of Circus population now covered on Silly Walk.
56 | - Agency engagement for extension industries has commenced.
57 |
58 | As at 1 July 1970
59 | Silly Walk commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries.
60 |
61 | As at June 1970
62 | Stage 2 of the project is commencing and will build up the solution delivered in Silly Walk Stage 1. Customer journey will be revised in line with outcome of customer testing. The increased coverage target of at least 95% of the Circus population was met in June 1970, with all local governments included on Silly Walk. Benefits realisation through marketing and promotion of Silly Walk.",https://example.com
63 | DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Flying Circus Modernisation and Transformation Program - Tranche 1,"The Flying Circus Modernisation and Transformation (FCMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of Flying Circus outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/1969,31/08/1971,28/02/1971,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 1971
64 | - Tranche 1 FCMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 1971.
65 | - Customer Engagement and Contract Establishment projects continue to progress focusing on delivery activities for new CRM and Portal enhancements.
66 | - FCMT Tranche 2 Business Case tracking for completion April 1971.
67 |
68 | As at 31 January 1971
69 | - FCMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 1971.
70 | - Customer Engagement and Contract Establishment Projects progressing well with delivery activities for new CRM and Portal enhancements.
71 |
72 | As at 31 December 1970
73 | Status GREEN
74 | - FCMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 1971.
75 | - Customer Engagement and Contract Establishment projects completed partner onboarding and delivery activities underway.
76 | - Planning in progress for Tranche 2, focusing on remaining legacy systems for planned commencement at completion of Tranch 1.
77 |
78 | As at 30 November 1970
79 | Status GREEN
80 | - Tranche 1 delivery date extended to 31 August 1971 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM.
81 | - All projects maintaining momentum and progressing to revised schedule within budget.
82 |
83 | As at 31 October 1970
84 | Status GREEN
85 | -New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 1971.
86 | -SkillsCRM (CEP Project) delivery partner on-boarded and formal delivery stage commenced.
87 | -Contract Establishment and Variation (CEV PRoject) continuing delivery partner select with a view to commencing prior to end of December 1970.
88 |
89 | As at 30 September 1970 Status GREEN.
90 | The FCMT 'Partner Portal' solution was successfully launched on the 17 August 1970. The decommissioning of the outdated legacy application, 'WalkConnect', has completed. Work is now increasing on the next Flying Circus systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project).
91 | Project Journey Events:
92 | - Partner Portal. After the successful launch of Partner Portal and decommissioning of WalkConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed.
93 | - Data, Infrastructure and Reporting.
94 | New 'Data Lake' infrastructure built. Data ingestion processes being trialled. QTS report requirement gathering underway which will showcase new capability once completed. Compliance tool SMCM successfully launched September 30.
95 | -Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 1970. Ramp up of activities continuing with business demonstrations of CRM proof of concept.
96 | -Contract Establishment and Variation (CEV).
97 | Requirements gathering completed. Delivery partner selection process commenced. 'As is' process documentation underway.
98 |
99 | As at 31 August 1970
100 | Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for Flying Circus related organisations occurred 17 August 1970.
101 |
102 | Current Projects underway:
103 | - Partner Portal. Go-live occurred on track 17 August 1970. All registered Flying Circus organisations now able to use the portal to access key applications and send information to DDSSHHESW via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway.
104 | - Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) continuing and requirement gathering of first report planned to use new capabilites commenced.
105 | - Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 1970.
106 | - Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 1970.
107 |
108 | As at 31 July 1970
109 | Status GREEN
110 |
111 | Project journey events:
112 | Implementation of next changes to FCMT applications remain on track for August 1970 with full launch of new secure Partner Portal Digital Channel for Flying Circus related organisations.
113 | FCMT Program scope adjusted to include additional at risk system decommission activties during this financial year. Approved expenditure updated to align with revised scope.
114 |
115 | Current Projects underway
116 | - Partner Portal. Opened for registrations 4 July 1970. Majority of Flying Circus related organisation now registered. Full access (go-live) on track to commence 17 August 1970. Legacy system to be disabled and decommissioned September 1970.
117 | - Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) underway with population and work on first report to commence in September.
118 | - Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 1970.
119 | - Contract Engagement and Variation. Requirements gathering and new process design activities in progress.
120 |
121 | 15 May 1970 Update
122 | Status GREEN
123 |
124 | Implementation of next changes to Flying Circus applications on track for August 1970 with introduction of new secure 'Silly Portal' Digital Channel for Flying Circus related organisations.
125 |
126 | Projects Completed
127 | -Database consolidation - key databases transitioned to supported versions and platforms. Completed November 1969.
128 | -System to System Integration platform. Completed 9 May 1970.
129 |
130 | Current projects underway
131 | -Partner Portal secure digital channel, in final testing. Pilot successfully complete and on track for release in August 1970.
132 | Projects in startup
133 | -Data, Infrastructure and Reporting, planning underway.
134 | -Customer Services Hub (CRM), planning underway.
135 | -Contract Engagement and Variation, planning underway.
136 | -Planning continues for Tranche 2.",https://example.com
137 |
--------------------------------------------------------------------------------
/ckanext/xloader/plugin.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | import logging
4 |
5 | from ckan import plugins
6 | from ckan.plugins import toolkit
7 |
8 | from ckan.model.domain_object import DomainObjectOperation
9 | from ckan.model.resource import Resource
10 |
11 | from . import action, auth, helpers as xloader_helpers, utils
12 | from ckanext.xloader.utils import XLoaderFormats
13 |
14 | try:
15 | from ckanext.validation.interfaces import IPipeValidation
16 | HAS_IPIPE_VALIDATION = True
17 | except ImportError:
18 | HAS_IPIPE_VALIDATION = False
19 |
20 | config_declarations = toolkit.blanket.config_declarations
21 |
22 | if toolkit.check_ckan_version(min_version='2.11'):
23 | from ckanext.datastore.interfaces import IDataDictionaryForm
24 | has_idata_dictionary_form = True
25 | else:
26 | has_idata_dictionary_form = False
27 |
28 | log = logging.getLogger(__name__)
29 |
30 |
@config_declarations
class xloaderPlugin(plugins.SingletonPlugin):
    """Submit datastore-loadable resources to XLoader.

    Watches resource creation/modification, filters out formats and URL
    types that XLoader cannot handle, and enqueues load jobs.  Also exposes
    the xloader actions, auth functions, template helpers, CLI commands and
    blueprints, plus (on supporting CKAN/extension versions) data dictionary
    form fields and validation-report hooks.
    """
    plugins.implements(plugins.IConfigurer)
    plugins.implements(plugins.IConfigurable)
    plugins.implements(plugins.IDomainObjectModification)
    plugins.implements(plugins.IActions)
    plugins.implements(plugins.IAuthFunctions)
    plugins.implements(plugins.ITemplateHelpers)
    plugins.implements(plugins.IResourceController, inherit=True)
    plugins.implements(plugins.IClick)
    plugins.implements(plugins.IBlueprint)
    # Only available on CKAN >= 2.11 (see module-level feature detection).
    if has_idata_dictionary_form:
        plugins.implements(IDataDictionaryForm, inherit=True)
    # Only available when ckanext-validation provides IPipeValidation.
    if HAS_IPIPE_VALIDATION:
        plugins.implements(IPipeValidation)

    # IClick

    def get_commands(self):
        """Return the xloader CLI commands (imported lazily to avoid cycles)."""
        from ckanext.xloader.cli import get_commands

        return get_commands()

    # IBlueprint

    def get_blueprint(self):
        """Return the xloader Flask blueprints (imported lazily to avoid cycles)."""
        from ckanext.xloader.views import get_blueprints

        return get_blueprints()

    # IConfigurer

    def update_config(self, config):
        """Register this extension's templates and webassets with CKAN."""
        toolkit.add_template_directory(config, 'templates')
        toolkit.add_resource(u'webassets', 'ckanext-xloader')

    # IConfigurable

    def configure(self, config_):
        """Cache the ``ckanext.xloader.ignore_hash`` setting on the plugin.

        Uses ``toolkit.asbool`` so all conventional truthy spellings are
        accepted; the previous explicit list (["True", "TRUE", "1", True, 1])
        missed common config values such as "true" and "yes".
        """
        self.ignore_hash = toolkit.asbool(
            config_.get("ckanext.xloader.ignore_hash", False))

    # IPipeValidation

    def receive_validation_report(self, validation_report):
        """Submit a resource to XLoader after its validation report arrives.

        Only acts when xloader is configured to require a successful
        validation report.  If a schema is present (or schemas are enforced)
        and the report is not a success, the resource is not submitted.
        """
        if not utils.requires_successful_validation_report():
            return
        res_dict = toolkit.get_action('resource_show')(
            {'ignore_auth': True},
            {'id': validation_report.get('resource_id')})
        if (toolkit.asbool(toolkit.config.get('ckanext.xloader.validation.enforce_schema', True))
                or res_dict.get('schema', None)) and validation_report.get('status') != 'success':
            # A schema is present, or required to be present
            return
        # if validation is running in async mode, it is running from the redis workers.
        # thus we need to do sync=True to have Xloader put the job at the front of the queue.
        sync = toolkit.asbool(toolkit.config.get(u'ckanext.validation.run_on_update_async', True))
        self._submit_to_xloader(res_dict, sync=sync)

    # IDomainObjectModification

    def notify(self, entity, operation):
        # type: (Package|Resource, DomainObjectOperation) -> None
        """
        Runs before_commit to database for Packages and Resources.
        We only want to check for changed Resources for this.
        We want to check if values have changed, namely the url and the format.
        See: ckan/model/modification.py.DomainObjectModificationExtension
        """
        if operation != DomainObjectOperation.changed \
                or not isinstance(entity, Resource):
            return

        context = {
            "ignore_auth": True,
        }
        resource_dict = toolkit.get_action("resource_show")(
            context,
            {
                "id": entity.id,
            },
        )

        # Unsupported formats may need their stale datastore tables removed.
        if _should_remove_unsupported_resource_from_datastore(resource_dict):
            toolkit.enqueue_job(fn=_remove_unsupported_resource_from_datastore, args=[entity.id])

        if utils.requires_successful_validation_report():
            # If the resource requires validation, stop here if validation
            # has not been performed or did not succeed. The Validation
            # extension will call resource_patch and this method should
            # be called again. However, url_changed will not be in the entity
            # once Validation does the patch.
            log.debug("Deferring xloading resource %s because the "
                      "resource did not pass validation yet.", resource_dict.get('id'))
            return
        elif not getattr(entity, 'url_changed', False):
            # do not submit to xloader if the url has not changed.
            return

        self._submit_to_xloader(resource_dict)

    # IResourceController

    def after_resource_create(self, context, resource_dict):
        """Submit a newly created resource, unless validation must run first."""
        if utils.requires_successful_validation_report():
            log.debug("Deferring xloading resource %s because the "
                      "resource did not pass validation yet.", resource_dict.get('id'))
            return

        self._submit_to_xloader(resource_dict)

    def before_resource_show(self, resource_dict):
        """Normalise the all-records flag to a real boolean for display."""
        resource_dict[
            "datastore_contains_all_records_of_source_file"
        ] = toolkit.asbool(
            resource_dict.get("datastore_contains_all_records_of_source_file")
        )

    def after_resource_update(self, context, resource_dict):
        """ Check whether the datastore is out of sync with the
        'datastore_active' flag. This can occur due to race conditions
        like https://github.com/ckan/ckan/issues/4663
        """
        datastore_active = resource_dict.get('datastore_active', False)
        try:
            context = {'ignore_auth': True}
            if toolkit.get_action('datastore_info')(
                    context=context, data_dict={'id': resource_dict['id']}):
                datastore_exists = True
            else:
                datastore_exists = False
        except toolkit.ObjectNotFound:
            datastore_exists = False

        if datastore_active != datastore_exists:
            # flag is out of sync with datastore; update it
            utils.set_resource_metadata(
                {'resource_id': resource_dict['id'],
                 'datastore_active': datastore_exists})

    # Pre-2.10 CKAN used unprefixed IResourceController hook names; alias
    # them to the new-style implementations above.
    if not toolkit.check_ckan_version("2.10"):

        def after_create(self, context, resource_dict):
            self.after_resource_create(context, resource_dict)

        def before_show(self, resource_dict):
            self.before_resource_show(resource_dict)

        def after_update(self, context, resource_dict):
            self.after_resource_update(context, resource_dict)

    def _submit_to_xloader(self, resource_dict, sync=False):
        """Enqueue (or, with sync=True, prioritise) an xloader job.

        Skips resources whose format is not configured for xloading and
        resources whose URL already points at the datastore (which would
        load circularly).  ValidationError from xloader_submit is logged
        rather than raised so a resource save cannot 500 when xloader is
        offline.
        """
        context = {"ignore_auth": True, "defer_commit": True}
        resource_format = resource_dict.get("format")
        if not XLoaderFormats.is_it_an_xloader_format(resource_format):
            log.debug(
                f"Skipping xloading resource {resource_dict['id']} because "
                f'format "{resource_format}" is not configured to be '
                "xloadered"
            )
            return
        # .get() guards against a missing key (the old direct indexing
        # could raise KeyError for resource dicts without url_type).
        if resource_dict.get("url_type") in ("datapusher", "xloader"):
            log.debug(
                "Skipping xloading resource {id} because "
                'url_type "{url_type}" means resource.url '
                "points to the datastore already, so loading "
                "would be circular.".format(**resource_dict)
            )
            return

        try:
            if sync:
                log.debug(
                    "xloadering resource %s in sync mode", resource_dict["id"]
                )
            else:
                log.debug(
                    "Submitting resource %s to be xloadered", resource_dict["id"]
                )
            toolkit.get_action("xloader_submit")(
                context,
                {
                    "resource_id": resource_dict["id"],
                    "ignore_hash": self.ignore_hash,
                    "sync": sync,
                },
            )
        except toolkit.ValidationError as e:
            # If xloader is offline, we want to catch error instead
            # of raising otherwise resource save will fail with 500
            log.critical(e)

    # IActions

    def get_actions(self):
        """Expose the xloader action functions."""
        return {
            "xloader_submit": action.xloader_submit,
            "xloader_hook": action.xloader_hook,
            "xloader_status": action.xloader_status,
        }

    # IAuthFunctions

    def get_auth_functions(self):
        """Expose the xloader auth functions."""
        return {
            "xloader_submit": auth.xloader_submit,
            "xloader_status": auth.xloader_status,
        }

    # ITemplateHelpers

    def get_helpers(self):
        """Expose the xloader template helpers."""
        return {
            "xloader_status": xloader_helpers.xloader_status,
            "xloader_status_description": xloader_helpers.xloader_status_description,
            "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
            "xloader_badge": xloader_helpers.xloader_badge,
        }

    # IDataDictionaryForm

    def update_datastore_create_schema(self, schema):
        """Add the xloader 'strip_extra_white' field to the datastore schema."""
        default = toolkit.get_validator('default')
        boolean_validator = toolkit.get_validator('boolean_validator')
        to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data')
        schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')]
        return schema

    def update_datastore_info_field(self, field, plugin_data):
        """Merge stored xloader plugin data into the datastore_info field."""
        # expose all our non-secret plugin data in the field
        field.update(plugin_data.get('xloader', {}))
        # CKAN version parody
        if '_info' in plugin_data:
            field.update({'info': plugin_data['_info']})
        return field
265 |
266 |
def _should_remove_unsupported_resource_from_datastore(res_dict):
    """Return True when a resource's datastore table should be cleaned up.

    Requires the ``ckanext.xloader.clean_datastore_tables`` option to be
    enabled, the format to be one xloader does not handle, the resource to
    be an upload (or have no url_type), and a datastore table to exist.
    """
    cleaning_enabled = toolkit.asbool(
        toolkit.config.get('ckanext.xloader.clean_datastore_tables', False))
    if not cleaning_enabled:
        return False
    unsupported_format = not XLoaderFormats.is_it_an_xloader_format(
        res_dict.get('format', u''))
    url_type = res_dict.get('url_type')
    is_upload = url_type == 'upload' or not url_type
    has_datastore = (
        toolkit.asbool(res_dict.get('datastore_active', False))
        or toolkit.asbool(
            res_dict.get('extras', {}).get('datastore_active', False)))
    return unsupported_format and is_upload and has_datastore
275 |
276 |
def _remove_unsupported_resource_from_datastore(resource_id):
    """
    Callback to remove unsupported datastore tables.
    Controlled by config value: ckanext.xloader.clean_datastore_tables.
    Double check the resource format. Only supported Xloader formats should have datastore tables.
    If the resource format is not supported, we should delete the datastore tables.
    """
    context = {"ignore_auth": True}
    try:
        res = toolkit.get_action('resource_show')(context, {"id": resource_id})
    except toolkit.ObjectNotFound:
        log.error('Resource %s does not exist.', resource_id)
        return

    # Re-check eligibility: the resource may have changed since enqueueing.
    if not _should_remove_unsupported_resource_from_datastore(res):
        return

    log.info('Unsupported resource format "%s". Deleting datastore tables for resource %s',
             res.get(u'format', u''), res['id'])
    try:
        toolkit.get_action('datastore_delete')(context, {
            "resource_id": res['id'],
            "force": True})
    except toolkit.ObjectNotFound:
        log.error('Datastore table for resource %s does not exist', res['id'])
    else:
        log.info('Datastore table dropped for resource %s', res['id'])
301 |
--------------------------------------------------------------------------------
/ckanext/xloader/tests/test_jobs.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import io
3 | import os
4 |
5 | from datetime import datetime
6 |
7 | from requests import Response
8 |
9 | from ckan.cli.cli import ckan
10 | from ckan.plugins import toolkit
11 | from ckan.tests import helpers, factories
12 |
13 | from unittest import mock
14 |
15 | from ckanext.xloader import jobs
16 |
17 |
# Small CSV fixture: a two-column header row plus five data rows.
_TEST_FILE_CONTENT = "x, y\n1,2\n2,4\n3,6\n4,8\n5,10"
# Header-less rows (leading newline) appended repeatedly to fake a large file.
_TEST_LARGE_FILE_CONTENT = "\n1,2\n2,4\n3,6\n4,8\n5,10"
20 |
21 |
def get_response(download_url, headers):
    """Mock jobs.get_response() method."""
    response = Response()
    response.headers = headers
    # Serve the small CSV fixture as the response body.
    response.raw = io.BytesIO(_TEST_FILE_CONTENT.encode())
    return response
28 |
29 |
def get_large_response(download_url, headers):
    """Mock jobs.get_response() method to fake a large file."""
    response = Response()
    response.raw = io.BytesIO(_TEST_FILE_CONTENT.encode())
    # Ignore the supplied headers and advertise a ~2GB body instead.
    response.headers = {'content-length': 2000000000}
    return response
36 |
37 |
def get_large_data_response(download_url, headers):
    """Mock jobs.get_response() method."""
    response = Response()
    # Repeat the extra rows to produce a genuinely large body.
    body = _TEST_FILE_CONTENT + _TEST_LARGE_FILE_CONTENT * 500000
    response.raw = io.BytesIO(body.encode())
    response.headers = headers
    return response
45 |
46 |
47 | def _get_temp_files(dir='/tmp'):
48 | return [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
49 |
50 |
@pytest.fixture
def apikey():
    """API token belonging to a freshly created sysadmin user."""
    return factories.SysadminWithToken()["token"]
55 |
56 |
@pytest.fixture
def data(create_with_upload, apikey):
    """Job payload mirroring what xloader_submit enqueues for a resource."""
    dataset = factories.Dataset()
    resource = create_with_upload(
        _TEST_FILE_CONTENT,
        "multiplication_2.csv",
        url="http://data",
        package_id=dataset["id"]
    )
    callback_url = toolkit.url_for(
        "api.action", ver=3, logic_function="xloader_hook", qualified=True
    )
    metadata = {
        'ignore_hash': True,
        'ckan_url': toolkit.config.get('ckan.site_url'),
        'resource_id': resource["id"],
        'set_url_type': False,
        'task_created': datetime.utcnow().isoformat(),
        'original_url': resource["url"],
    }
    return {
        'api_key': apikey,
        'job_type': 'xloader_to_datastore',
        'result_url': callback_url,
        'metadata': metadata,
    }
82 |
83 |
@pytest.mark.usefixtures("clean_db", "with_plugins")
@pytest.mark.ckan_config("ckanext.xloader.job_timeout", 2)
@pytest.mark.ckan_config("ckanext.xloader.copy_chunk_size", 5120)
@pytest.mark.ckan_config("ckan.jobs.timeout", 2)
class TestXLoaderJobs(helpers.FunctionalRQTestBase):
    """End-to-end tests that enqueue xloader_data_into_datastore and run it
    through a real RQ worker (`ckan jobs worker --burst`), asserting on the
    worker's log output and on the resulting resource metadata.

    Downloads are faked by patching ckanext.xloader.jobs.get_response with
    the module-level mocks above.
    """

    def test_xloader_data_into_datastore(self, cli, data):
        # Happy path: small CSV loads fully and the resource is flagged as
        # containing all records of the source file.
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout
            assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout
            assert "Copying to database..." in stdout
            assert "Creating search index..." in stdout
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    # Set the ckanext.xloader.site_url in the config
    @pytest.mark.ckan_config("ckanext.xloader.site_url", 'http://xloader-site-url')
    def test_download_resource_data_with_ckanext_xloader_site_url(self, cli, data):
        # original_url points at the xloader-specific site URL; download
        # should still succeed and load completely.
        data['metadata']['original_url'] = 'http://xloader-site-url/resource.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url')
    def test_download_resource_data_with_ckan_site_url(self, cli, data):
        # original_url on the main CKAN site URL.
        data['metadata']['original_url'] = 'http://ckan-site-url/resource.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url')
    def test_download_resource_data_with_different_original_url(self, cli, data):
        # original_url on an unrelated external host must not be rewritten.
        data['metadata']['original_url'] = 'http://external-site-url/resource.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.xloader.site_url", 'http://xloader-site-url')
    def test_callback_xloader_hook_with_ckanext_xloader_site_url(self, cli, data):
        # The xloader_hook callback URL uses the xloader-specific site URL.
        data['result_url'] = 'http://xloader-site-url/api/3/action/xloader_hook'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    @pytest.mark.ckan_config("ckanext.site_url", 'http://ckan-site-url')
    def test_callback_xloader_hook_with_ckan_site_url(self, cli, data):
        # The xloader_hook callback URL uses the main CKAN site URL.
        data['result_url'] = 'http://ckan-site-url/api/3/action/xloader_hook'
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Express Load completed" in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"]

    def test_xloader_ignore_hash(self, cli, data):
        # With ignore_hash=True (the fixture default) a second identical
        # upload is reloaded; with ignore_hash=False it is skipped.
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Express Load completed" in stdout

        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Copying to database..." in stdout
            assert "Express Load completed" in stdout

        data["metadata"]["ignore_hash"] = False
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Ignoring resource - the file hash hasn't changed" in stdout

    def test_data_too_big_error_if_content_length_bigger_than_config(self, cli, data):
        # A content-length exceeding the configured maximum aborts the load.
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_large_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Data too large to load into Datastore:" in stdout

    def test_data_max_excerpt_lines_config(self, cli, data):
        # When the file is too large, only an excerpt (MAX_EXCERPT_LINES)
        # is loaded and the all-records flag must be False.
        self.enqueue(jobs.xloader_data_into_datastore, [data])
        with mock.patch("ckanext.xloader.jobs.get_response", get_large_response):
            with mock.patch("ckanext.xloader.jobs.MAX_EXCERPT_LINES", 1):
                stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
                assert "Loading excerpt of ~1 lines to DataStore." in stdout

        resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
        assert resource["datastore_contains_all_records_of_source_file"] is False

    def test_data_with_rq_job_timeout(self, cli, data):
        # A slow (large) download must hit the 2s RQ timeout and the
        # temporary download file must be cleaned up afterwards.
        file_suffix = 'multiplication_2.csv'
        self.enqueue(jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=2))
        with mock.patch("ckanext.xloader.jobs.get_response", get_large_data_response):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
            assert "Job timed out after" in stdout
        for f in _get_temp_files():
            # make sure that the tmp file has been closed/deleted in job timeout exception handling
            assert file_suffix not in f

    @pytest.mark.parametrize("error_type,should_retry", [
        # Retryable errors from RETRYABLE_ERRORS
        ("DeadlockDetected", True),
        ("LockNotAvailable", True),
        ("ObjectInUse", True),
        ("XLoaderTimeoutError", True),
        # Retryable HTTP errors (status codes from is_retryable_error)
        ("HTTPError_408", True),
        ("HTTPError_429", True),
        ("HTTPError_500", True),
        ("HTTPError_502", True),
        ("HTTPError_503", True),
        ("HTTPError_504", True),
        ("HTTPError_507", True),
        ("HTTPError_522", True),
        ("HTTPError_524", True),
        # Non-retryable HTTP errors
        ("HTTPError_400", False),
        ("HTTPError_404", False),
        ("HTTPError_403", False),
        # Other non-retryable errors (not in RETRYABLE_ERRORS)
        ("ValueError", False),
        ("TypeError", False),
    ])
    def test_retry_behavior(self, cli, data, error_type, should_retry):
        """Test retry behavior for different error types."""

        def create_mock_error(error_type):
            # Build a fresh instance of the requested error class.
            if error_type == "DeadlockDetected":
                from psycopg2 import errors
                return errors.DeadlockDetected()
            elif error_type == "LockNotAvailable":
                from psycopg2 import errors
                return errors.LockNotAvailable()
            elif error_type == "ObjectInUse":
                from psycopg2 import errors
                return errors.ObjectInUse()
            elif error_type == "XLoaderTimeoutError":
                return jobs.XLoaderTimeoutError('Connection timed out after 30s')
            elif error_type.startswith("HTTPError_"):
                status_code = int(error_type.split("_")[1])
                return jobs.HTTPError("HTTP Error", status_code=status_code, request_url="test", response=None)
            elif error_type == "ValueError":
                return ValueError("Test error")
            elif error_type == "TypeError":
                return TypeError("Test error")

        def mock_download_with_error(*args, **kwargs):
            # Fails on the first call; succeeds on the second only when the
            # error is expected to be retryable (call count kept on the
            # function object itself).
            if not hasattr(mock_download_with_error, 'call_count'):
                mock_download_with_error.call_count = 0
            mock_download_with_error.call_count += 1

            if mock_download_with_error.call_count == 1:
                # First call - raise the test error
                raise create_mock_error(error_type)
            elif should_retry:
                # Second call - return successful response only if retryable
                import tempfile
                tmp_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv')
                tmp_file.write(_TEST_FILE_CONTENT)
                tmp_file.flush()
                return (tmp_file, 'd44fa65eda3675e11710682fdb5f1648')
            else:
                # Non-retryable errors should not get a second chance
                raise create_mock_error(error_type)

        self.enqueue(jobs.xloader_data_into_datastore, [data])

        with mock.patch("ckanext.xloader.jobs._download_resource_data", mock_download_with_error):
            stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output

        if should_retry:
            # Check that retry was attempted
            assert "Job failed due to temporary error" in stdout
            assert "retrying" in stdout
            assert "Express Load completed" in stdout
            # Verify resource was successfully loaded after retry
            resource = helpers.call_action("resource_show", id=data["metadata"]["resource_id"])
            assert resource["datastore_contains_all_records_of_source_file"]
        else:
            # Check that job failed without retry - should have error messages
            assert "xloader error:" in stdout or "error" in stdout.lower()
            assert "Express Load completed" not in stdout
287 |
288 |
@pytest.mark.usefixtures("clean_db")
class TestSetResourceMetadata(object):
    """Unit test for jobs.set_resource_metadata()."""

    def test_simple(self):
        # The update dict written to the resource should be visible
        # through resource_show afterwards.
        resource = factories.Resource()

        update = {
            "datastore_contains_all_records_of_source_file": True,
            "datastore_active": True,
            "ckan_url": "http://www.ckan.org/",
            "resource_id": resource["id"],
        }
        jobs.set_resource_metadata(update)

        refreshed = helpers.call_action("resource_show", id=resource["id"])
        assert refreshed["datastore_contains_all_records_of_source_file"]
        assert refreshed["datastore_active"]
        assert refreshed["ckan_url"] == "http://www.ckan.org/"
307 |
--------------------------------------------------------------------------------
/ckanext/xloader/utils.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | from collections import defaultdict
4 | from decimal import Decimal
5 | import json
6 | import datetime
7 | import logging
8 | import re
9 | from six import text_type as str, binary_type
10 | from urllib.parse import urlunparse, urlparse
11 |
12 | from ckan import model
13 | from ckan.lib import search
14 | import ckan.plugins.toolkit as tk
15 |
16 | from .job_exceptions import JobError
17 |
18 | log = logging.getLogger(__name__)
19 |
20 |
# resource.formats accepted by ckanext-xloader. Must be lowercase here.
# Used as the fallback when the space-separated "ckanext.xloader.formats"
# config option is not set (see XLoaderFormats below).
DEFAULT_FORMATS = [
    "csv",
    "application/csv",
    "xls",
    "xlsx",
    "tsv",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "ods",
    "application/vnd.oasis.opendocument.spreadsheet",
]
33 |
34 |
class XLoaderFormats(object):
    """Lazily-initialised holder for the resource formats xloader accepts."""

    # Cached list of lowercase format strings; populated on first lookup.
    formats = None

    @classmethod
    def is_it_an_xloader_format(cls, format_):
        """Return True if ``format_`` (case-insensitive) should be xloadered.

        BUG FIX: the original guarded on ``cls.formats`` but assigned
        ``cls._formats``, so the guard was always true and the config was
        re-read and re-split on every call. Use one attribute consistently
        so the parsed list is actually cached.
        """
        if cls.formats is None:
            configured = tk.config.get("ckanext.xloader.formats")
            if configured is not None:
                # use config value. preserves empty list as well.
                cls.formats = configured.lower().split()
            else:
                cls.formats = DEFAULT_FORMATS
        if not format_:
            return False
        return format_.lower() in cls.formats
50 |
51 |
def requires_successful_validation_report():
    """Whether config demands a successful validation report before loading."""
    configured = tk.config.get('ckanext.xloader.validation.requires_successful_report', False)
    return tk.asbool(configured)
54 |
55 |
def awaiting_validation(res_dict):
    """
    Checks the existence of a logic action from the ckanext-validation
    plugin, thus supporting any extending of the Validation Plugin class.

    Checks ckanext.xloader.validation.requires_successful_report config
    option value.

    Checks ckanext.xloader.validation.enforce_schema config
    option value. Then checks the Resource's validation_status.
    """
    if not requires_successful_validation_report():
        # validation.requires_successful_report is turned off, return right away
        return False

    # Probe for one of the main ckanext-validation actions so that users
    # who extend the Validation plugin class (and rename the plugin
    # entry-point) are still detected.
    try:
        tk.get_action('resource_validation_show')
    except KeyError:
        # the validation plugin is not loaded but required, log a warning
        log.warning('ckanext.xloader.validation.requires_successful_report '
                    'requires the ckanext-validation plugin to be activated.')
        return False

    enforce_schema = tk.asbool(
        tk.config.get('ckanext.xloader.validation.enforce_schema', True))
    has_schema = bool(res_dict.get('schema', None))

    # Either validation.enforce_schema is turned on, or it is off but the
    # resource carries a schema: in both cases the validation report must
    # explicitly be `success` before the resource may be loaded.
    if (enforce_schema or has_schema) and res_dict.get('validation_status', None) != 'success':
        return True

    # The resource is not waiting for validation, or it has no validation
    # schema and schemas are not being enforced.
    return False
96 |
97 |
def resource_data(id, resource_id, rows=None):
    """Render the DataStore tab for a resource (GET) or trigger a reload (POST).

    :param id: package id/name (used for the redirect/template context)
    :param resource_id: the resource to show/load
    :param rows: optional number of preview rows passed through to the template
    """

    if tk.request.method == "POST":

        context = {
            "ignore_auth": True,
        }
        resource_dict = tk.get_action("resource_show")(
            context,
            {
                "id": resource_id,
            },
        )

        # Block manual reloads while ckanext-validation has not yet
        # produced a successful report (when that is required by config).
        if awaiting_validation(resource_dict):
            tk.h.flash_error(tk._("Cannot upload resource %s to the DataStore "
                                  "because the resource did not pass validation yet.") % resource_id)
            return tk.redirect_to(
                "xloader.resource_data", id=id, resource_id=resource_id
            )

        try:
            tk.get_action("xloader_submit")(
                None,
                {
                    "resource_id": resource_id,
                    "ignore_hash": True,  # user clicked the reload button
                },
            )
        except tk.ValidationError:
            # submission errors are surfaced via the status page, not here
            pass

        # POST/redirect/GET: always bounce back to the status page
        return tk.redirect_to(
            "xloader.resource_data", id=id, resource_id=resource_id
        )

    try:
        pkg_dict = tk.get_action("package_show")(None, {"id": id})
        resource = tk.get_action("resource_show")(None, {"id": resource_id})
    except (tk.ObjectNotFound, tk.NotAuthorized):
        return tk.abort(404, tk._("Resource not found"))

    try:
        xloader_status = tk.get_action("xloader_status")(
            None, {"resource_id": resource_id}
        )
    except tk.ObjectNotFound:
        # no job has ever been submitted for this resource
        xloader_status = {}
    except tk.NotAuthorized:
        return tk.abort(403, tk._("Not authorized to see this page"))

    extra_vars = {
        "status": xloader_status,
        "resource": resource,
        "pkg_dict": pkg_dict,
    }
    if rows:
        extra_vars["rows"] = rows
    return tk.render(
        "xloader/resource_data.html",
        extra_vars=extra_vars,
    )
160 |
161 |
def get_xloader_user_apitoken():
    """ Returns the API Token for authentication.

    xloader actions require an authenticated user to perform the actions. This
    method returns the api_token set in the config file and defaults to the
    site_user.
    """
    api_token = tk.config.get('ckanext.xloader.api_token')
    if not api_token or api_token == 'NOT_SET':
        raise tk.ValidationError({u'ckanext.xloader.api_token': u'NOT_SET, please provide valid api token'})
    return api_token
173 |
174 |
175 | def _modify_url(input_url: str, base_url: str) -> str:
176 | """ Modifies the input URL with base_url provided.
177 |
178 | Args:
179 | input_url (str): The original URL to potentially modify
180 | base_url (str): The base URL to compare/replace against
181 | Returns:
182 | str: The modified URL with replaced scheme and netloc
183 | """
184 | parsed_input_url = urlparse(input_url)
185 | parsed_base_url = urlparse(base_url)
186 | # Do not modify non-HTTP(S) URLs (e.g., ftp://)
187 | if parsed_input_url.scheme not in ("http", "https"):
188 | return input_url
189 | # replace scheme: "http/https" and netloc:"//:@:/"
190 | new_url = urlunparse(
191 | (parsed_base_url.scheme,
192 | parsed_base_url.netloc,
193 | parsed_input_url.path,
194 | parsed_input_url.params,
195 | parsed_input_url.query,
196 | parsed_input_url.fragment))
197 | return new_url
198 |
199 |
def modify_input_url(input_url: str) -> str:
    """Returns a potentially modified CKAN URL.

    This function takes a possible CKAN URL and potentially modifies its base URL while preserving the path,
    query parameters, and fragments. The modification occurs only if three conditions are met:
    1. The base URL of the input matches the configured CKAN site URL (ckan.site_url).
    2. A `ckanext.xloader.site_url` is configured in the settings.
    3. A `ckanext.xloader.site_url_ignore_path_regex` if configured in the settings and does not match.

    Args:
        input_url (str): The original CKAN URL to potentially modify
    Returns:
        str: Either the modified URL with new base URL from xloader_site_url,
             or the original URL if conditions aren't met
    """

    xloader_site_url = tk.config.get('ckanext.xloader.site_url')
    if not xloader_site_url:
        return input_url

    def _base_of(url):
        parts = urlparse(url)
        return f"{parts.scheme}://{parts.netloc}"

    # Don't alter base URLs that do not match the configured CKAN site.
    if _base_of(input_url) != _base_of(tk.config.get('ckan.site_url')):
        return input_url

    # Nor any URL matching the configured ignore regex.
    ignore_regex = tk.config.get('ckanext.xloader.site_url_ignore_path_regex')
    if ignore_regex and re.search(ignore_regex, input_url):
        return input_url

    return _modify_url(input_url, xloader_site_url)
235 |
236 |
def set_resource_metadata(update_dict):
    '''
    Set appropriate datastore_active flag on CKAN resource.

    Called after creation or deletion of DataStore table.

    :param update_dict: extras to merge into the resource; must contain
        'resource_id' identifying the resource to update.
    '''
    # We're modifying the resource extra directly here to avoid a
    # race condition, see issue #3245 for details and plan for a
    # better fix

    # Row-level lock so concurrent jobs can't clobber each other's extras.
    q = model.Session.query(model.Resource). \
        with_for_update(of=model.Resource). \
        filter(model.Resource.id == update_dict['resource_id'])
    resource = q.one()

    # update extras in database for record
    extras = resource.extras
    extras.update(update_dict)
    q.update({'extras': extras}, synchronize_session=False)

    model.Session.commit()

    # get package with updated resource from solr
    # find changed resource, patch it and reindex package
    psi = search.PackageSearchIndex()
    solr_query = search.PackageSearchQuery()
    q = {
        'q': 'id:"{0}"'.format(resource.package_id),
        'fl': 'data_dict',
        'wt': 'json',
        'fq': 'site_id:"%s"' % tk.config.get('ckan.site_id'),
        'rows': 1
    }
    for record in solr_query.run(q)['results']:
        solr_data_dict = json.loads(record['data_dict'])
        # NOTE: 'resource' is rebound here to the Solr resource dict,
        # shadowing the SQLAlchemy object above (no longer needed).
        for resource in solr_data_dict['resources']:
            if resource['id'] == update_dict['resource_id']:
                resource.update(update_dict)
                psi.index_package(solr_data_dict)
                break
277 |
278 |
def column_count_modal(rows):
    """ Return the modal value of columns in the row_set's
    sample. This can be assumed to be the number of columns
    of the table.

    Copied from messytables.
    """
    # Tally how many rows have each non-empty-cell count; rows with one
    # or zero populated cells are ignored as noise.
    counts = defaultdict(int)
    for row in rows:
        populated = sum(1 for cell in row if cell != '')
        if populated > 1:
            counts[populated] += 1
    if not counts:
        return 0
    return max(counts.items(), key=lambda item: item[1])[0]
294 |
295 |
def headers_guess(rows, tolerance=1):
    """ Guess the offset and names of the headers of the row set.
    This will attempt to locate the first row within ``tolerance``
    of the mode of the number of rows in the row set sample.

    The return value is a tuple of the offset of the header row
    and the names of the columns.

    Copied from messytables.
    """
    rows = list(rows)
    modal = column_count_modal(rows)
    threshold = modal - tolerance
    for offset, row in enumerate(rows):
        populated = sum(1 for cell in row if cell != '')
        if populated >= threshold:
            # TODO: use type guessing to check that this row has
            # strings and does not conform to the type schema of
            # the table.
            return offset, row
    return 0, []
316 |
317 |
# Candidate cell types, in priority order (used to break ties in type_guess).
# `str`/`bytes` replace the legacy six aliases text_type/binary_type, which
# are identical on Python 3.
TYPES = [int, bool, str, bytes, datetime.datetime, float, Decimal]


def type_guess(rows, types=TYPES, strict=False):
    """ The type guesser aggregates the number of successful
    conversions of each column to each type, weights them by a
    fixed type priority and select the most probable type for
    each column based on that figure. It returns a list of
    ``CellType``. Empty cells are ignored.

    Strict means that a type will not be guessed
    if parsing fails for a single cell in the column.

    :param rows: iterable of rows (each a sequence of already-typed cells)
    :param types: candidate types to consider, in tie-breaking priority order
    :param strict: if True, a candidate type is rejected for a column as
        soon as any non-empty cell in that column is not an instance of it
    :raises JobError: if no candidate type survives for some column
    """
    guesses = []
    if strict:
        at_least_one_value = []
        for _ri, row in enumerate(rows):
            # Grow the per-column bookkeeping to match the widest row seen.
            for _ in range(len(row) - len(guesses)):
                guesses.append({candidate: 0 for candidate in types})
                at_least_one_value.append(False)
            for ci, cell in enumerate(row):
                if not cell:
                    continue
                # Discard any candidate the cell fails; only types that
                # never fail survive.
                for candidate in list(guesses[ci].keys()):
                    if not isinstance(cell, candidate):
                        guesses[ci].pop(candidate)
                at_least_one_value[ci] = bool(guesses[ci])
        # no need to set guessing weights before this
        # because we only accept a type if it never fails
        for i, guess in enumerate(guesses):
            for candidate in guess:
                guesses[i][candidate] = 1
        # in case there were no values at all in the column,
        # we just set the guessed type to string
        for i, seen in enumerate(at_least_one_value):
            if not seen:
                guesses[i] = {str: 1}
    else:
        for _ri, row in enumerate(rows):
            for _ in range(len(row) - len(guesses)):
                guesses.append(defaultdict(int))
            for ci, cell in enumerate(row):
                # add string guess so that we have at least one guess
                guesses[ci][str] = guesses[ci].get(str, 1)
                if not cell:
                    continue
                for candidate in types:
                    if isinstance(cell, candidate):
                        guesses[ci][candidate] += 1
    _columns = []
    for guess in guesses:
        # this first creates an array of tuples because we want the types to be
        # sorted. Even though it is not specified, python chooses the first
        # element in case of a tie
        # See: http://stackoverflow.com/a/6783101/214950
        guesses_tuples = [(t, guess[t]) for t in types if t in guess]
        if not guesses_tuples:
            raise JobError('Failed to guess types')
        _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0])
    return _columns
383 |
384 |
def datastore_resource_exists(resource_id):
    """Return the (zero-row) datastore_search result for the resource,
    or False when the resource has no DataStore table."""
    context = {'model': model, 'ignore_auth': True}
    data_dict = dict(id=resource_id, limit=0)
    try:
        response = tk.get_action('datastore_search')(context, data_dict)
    except tk.ObjectNotFound:
        return False
    return response or {'fields': []}
393 |
--------------------------------------------------------------------------------
/ckanext/xloader/action.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | from __future__ import absolute_import
4 | import datetime
5 | import json
6 | import logging
7 |
8 | import ckan.lib.jobs as rq_jobs
9 | import ckan.lib.navl.dictization_functions
10 | from ckan.logic import side_effect_free
11 | import ckan.plugins as p
12 | from dateutil.parser import parse as parse_date
13 | from dateutil.parser import isoparse as parse_iso_date
14 |
15 | import ckanext.xloader.schema
16 |
17 | from . import interfaces as xloader_interfaces, jobs, db, utils
18 |
19 | enqueue_job = p.toolkit.enqueue_job
20 | get_queue = rq_jobs.get_queue
21 |
22 | log = logging.getLogger(__name__)
23 | config = p.toolkit.config
24 |
25 | _get_or_bust = p.toolkit.get_or_bust
26 | _validate = ckan.lib.navl.dictization_functions.validate
27 |
28 |
def xloader_submit(context, data_dict):
    ''' Submit a job to be Express Loaded. The Express Loader / 'xloader' is a
    service that imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump ` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the xloader will reload the file
        even if it haven't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when ckanext-xloader is not configured.

    :rtype: bool
    '''
    # Single auth check up front (the original called check_access twice
    # with identical arguments; the duplicate has been removed).
    p.toolkit.check_access('xloader_submit', context, data_dict)
    api_key = utils.get_xloader_user_apitoken()
    custom_queue = data_dict.pop('queue', rq_jobs.DEFAULT_QUEUE_NAME)
    schema = context.get('schema', ckanext.xloader.schema.xloader_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    # If sync is set to True, the xloader callback will be executed right
    # away, instead of a job being enqueued. It will also delete any existing jobs
    # for the given resource. This is only controlled by sysadmins or the system.
    sync = data_dict.pop('sync', False)

    res_id = data_dict['resource_id']
    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except p.toolkit.ObjectNotFound:
        return False

    # Give IXloader implementations a chance to veto the upload.
    for plugin in p.PluginImplementations(xloader_interfaces.IXloader):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    # Check if this resource is already in the process of being xloadered
    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'xloader',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'xloader',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'xloader',
            'key': 'xloader'
        })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckanext.xloader.assume_task_stale_after', 3600)))
        assume_task_stillborn_after = \
            datetime.timedelta(seconds=int(
                config.get('ckanext.xloader.assume_task_stillborn_after', 5)))
        if existing_task.get('state') == 'pending':
            import re  # here because it takes a moment to load
            # Resource ids of xloader jobs currently sitting on the queue,
            # parsed out of the RQ job descriptions.
            queued_res_ids = [
                re.search(r"'resource_id': u?'([^']+)'",
                          job.description).groups()[0]
                for job in get_queue().get_jobs()
                if 'xloader_to_datastore' in str(job)  # filter out test_job etc
            ]
            updated = parse_iso_date(existing_task['last_updated'])
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if (res_id not in queued_res_ids
                    and time_since_last_updated > assume_task_stillborn_after):
                # it's not on the queue (and if it had just been started then
                # its taken too long to update the task_status from pending -
                # the first thing it should do in the xloader job).
                # Let it be restarted.
                log.info('A pending task was found %r, but its not found in '
                         'the queue %r and is %s hours old',
                         existing_task['id'], queued_res_ids,
                         time_since_last_updated)
            elif time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than its still in progress. Let it be restarted.
                log.info('A pending task was found %r, but it is only %s hours'
                         ' old', existing_task['id'], time_since_last_updated)
            else:
                log.info('A pending task was found %s for this resource, so '
                         'skipping this duplicate task', existing_task['id'])
                return False

        # Reuse the existing task record rather than creating a new one.
        task['id'] = existing_task['id']
    except p.toolkit.ObjectNotFound:
        pass

    model = context['model']

    # Record the 'submitting' state in a separate session so it is visible
    # immediately, regardless of the caller's transaction.
    p.toolkit.get_action('task_status_update')(
        {'session': model.meta.create_local_session(), 'ignore_auth': True},
        task
    )

    callback_url = p.toolkit.url_for(
        "api.action",
        ver=3,
        logic_function="xloader_hook",
        qualified=True
    )
    data = {
        'api_key': api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': callback_url,
        'metadata': {
            'ignore_hash': data_dict.get('ignore_hash', False),
            'ckan_url': config['ckan.site_url'],
            'resource_id': res_id,
            'set_url_type': data_dict.get('set_url_type', False),
            'task_created': task['last_updated'],
            'original_url': resource_dict.get('url'),
        }
    }
    if custom_queue != rq_jobs.DEFAULT_QUEUE_NAME:
        # Don't automatically retry if it's a custom run
        data['metadata']['tries'] = jobs.MAX_RETRIES

    # Expand timeout for resources that have to be type-guessed
    timeout = config.get(
        'ckanext.xloader.job_timeout',
        '3600' if utils.datastore_resource_exists(res_id) else '10800')
    log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)

    try:
        job = enqueue_job(
            jobs.xloader_data_into_datastore, [data], queue=custom_queue,
            title="xloader_submit: package: {} resource: {}".format(resource_dict.get('package_id'), res_id),
            rq_kwargs=dict(timeout=timeout, at_front=sync)
        )
    except Exception:
        if sync:
            log.exception('Unable to xloader res_id=%s', res_id)
        else:
            log.exception('Unable to enqueue xloader res_id=%s', res_id)
        return False
    log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id)
    value = json.dumps({'job_id': job.id})

    if sync:
        log.debug('Pushed xloader sync mode job=%s res_id=%s to front of queue', job.id, res_id)

    # Flip the task to 'pending' now that the job is on the queue.
    task['value'] = value
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.utcnow())

    p.toolkit.get_action('task_status_update')(
        {'session': model.meta.create_local_session(), 'ignore_auth': True},
        task
    )

    return True
201 |
202 |
def _enqueue(fn, args=None, kwargs=None, title=None, queue='default',
             timeout=180):
    '''Enqueue a background job, like the latest ckan.lib.jobs.enqueue.

    Earlier CKAN versions lack the ``timeout`` parameter, hence this local
    copy. It can be removed when dropping support for CKAN 2.7.

    :param fn: the function to run in the background worker
    :param args: positional arguments for ``fn`` (defaults to none)
    :param kwargs: keyword arguments for ``fn`` (defaults to none)
    :param title: human-readable label stored in the job's metadata
    :param queue: name of the RQ queue to put the job on
    :param timeout: job timeout in seconds passed through to RQ
    :returns: the enqueued RQ job object
    '''
    job = get_queue(queue).enqueue_call(
        func=fn,
        args=[] if args is None else args,
        kwargs={} if kwargs is None else kwargs,
        timeout=timeout)
    job.meta[u'title'] = title
    job.save()
    # Assemble the log line in pieces so the optional title slots in cleanly.
    parts = [u'Added background job {}'.format(job.id)]
    if title:
        parts.append(u'("{}")'.format(title))
    parts.append(u'to queue "{}"'.format(queue))
    log.info(u' '.join(parts))
    return job
224 |
225 |
def xloader_hook(context, data_dict):
    ''' Update xloader task. This action is typically called by ckanext-xloader
    whenever the status of a job changes.

    :param metadata: metadata provided when submitting job. key-value pairs.
                     Must have resource_id property.
    :type metadata: dict
    :param status: status of the job from the xloader service. Allowed values:
                   pending, running, running_but_viewable, complete, error
                   (which must all be valid values for task_status too)
    :type status: string
    :param error: Error raised during job execution
    :type error: string

    NB here are other params which are in the equivalent object in
    ckan-service-provider (from job_status):
    :param sent_data: Input data for job
    :type sent_data: json encodable data
    :param job_id: An identifier for the job
    :type job_id: string
    :param result_url: Callback url
    :type result_url: url string
    :param data: Results from job.
    :type data: json encodable data
    :param requested_timestamp: Time the job started
    :type requested_timestamp: timestamp
    :param finished_timestamp: Time the job finished
    :type finished_timestamp: timestamp

    '''

    metadata, status = _get_or_bust(data_dict, ['metadata', 'status'])

    res_id = _get_or_bust(metadata, 'resource_id')

    # Pass metadata, not data_dict, as it contains the resource id needed
    # on the auth checks
    p.toolkit.check_access('xloader_submit', context, metadata)

    task = p.toolkit.get_action('task_status_show')(context, {
        'entity_id': res_id,
        'task_type': 'xloader',
        'key': 'xloader'
    })

    task['state'] = status
    task['last_updated'] = str(datetime.datetime.utcnow())
    task['error'] = data_dict.get('error')

    resubmit = False

    if status in ('complete', 'running_but_viewable'):
        # Create default views for resource if necessary (only the ones that
        # require data to be in the DataStore)
        resource_dict = p.toolkit.get_action('resource_show')(
            context, {'id': res_id})

        dataset_dict = p.toolkit.get_action('package_show')(
            context, {'id': resource_dict['package_id']})

        # Give other plugins a chance to react to the finished upload.
        for plugin in p.PluginImplementations(xloader_interfaces.IXloader):
            plugin.after_upload(context, resource_dict, dataset_dict)

        p.toolkit.get_action('resource_create_default_resource_views')(
            context,
            {
                'resource': resource_dict,
                'package': dataset_dict,
                'create_datastore_views': True,
            })

        # Check if the uploaded file has been modified in the meantime
        if (resource_dict.get('last_modified')
                and metadata.get('task_created')):
            try:
                last_modified_datetime = parse_date(
                    resource_dict['last_modified'])
                task_created_datetime = parse_date(metadata['task_created'])
                if last_modified_datetime > task_created_datetime:
                    log.debug('Uploaded file more recent: %s > %s',
                              last_modified_datetime, task_created_datetime)
                    resubmit = True
            except ValueError:
                # Unparseable dates - skip the staleness check rather than
                # fail the whole hook.
                pass
        # Check if the URL of the file has been modified in the meantime.
        # NOTE: this is only checked when last_modified/task_created are
        # absent - the upload-time check above takes precedence.
        elif (resource_dict.get('url')
                and metadata.get('original_url')
                and resource_dict['url'] != metadata['original_url']):
            log.debug('URLs are different: %s != %s',
                      resource_dict['url'], metadata['original_url'])
            resubmit = True

    context['ignore_auth'] = True
    p.toolkit.get_action('task_status_update')(context, task)

    if resubmit:
        # Fixed log message: this extension resubmits via xloader_submit,
        # not the DataPusher service the old message referred to.
        log.debug('Resource %s has been modified, '
                  'resubmitting to XLoader', res_id)
        p.toolkit.get_action('xloader_submit')(
            context, {'resource_id': res_id})
326 |
327 |
@side_effect_free
def xloader_status(context, data_dict):
    ''' Get the status of a ckanext-xloader job for a certain resource.

    :param resource_id: The resource id of the resource that you want the
        status for.
    :type resource_id: string

    :returns: dict with keys ``status``, ``job_id``, ``job_url``,
        ``last_updated``, ``task_info`` and ``error``
    '''

    p.toolkit.check_access('xloader_status', context, data_dict)

    # Accept 'id' as an alias for 'resource_id' for convenience.
    if 'id' in data_dict:
        data_dict['resource_id'] = data_dict['id']
    res_id = _get_or_bust(data_dict, 'resource_id')

    task = p.toolkit.get_action('task_status_show')(context, {
        'entity_id': res_id,
        'task_type': 'xloader',
        'key': 'xloader'
    })

    value = json.loads(task['value'])
    job_id = value.get('job_id')
    url = None
    job_detail = None

    if job_id:
        # get logs from the xloader db
        db.init(config)
        job_detail = db.get_job(job_id)

        if job_detail and job_detail.get('logs'):
            # Use 'log_entry', not 'log', so the module-level logger is not
            # shadowed. Datetimes are converted so the result is JSON-safe.
            for log_entry in job_detail['logs']:
                if 'timestamp' in log_entry and isinstance(
                        log_entry['timestamp'], datetime.datetime):
                    log_entry['timestamp'] = \
                        log_entry['timestamp'].isoformat()
    try:
        error = json.loads(task['error'])
    except (ValueError, TypeError):
        # ValueError: task['error'] is plain (non-JSON) text, which happens
        # occasionally, such as when the job times out.
        # TypeError: task['error'] is None (no error recorded) - json.loads
        # raises TypeError for None, which the old ValueError-only handler
        # missed, crashing the status call.
        error = task['error']
    return {
        'status': task['state'],
        'job_id': job_id,
        'job_url': url,
        'last_updated': task['last_updated'],
        'task_info': job_detail,
        'error': error,
    }
376 |
--------------------------------------------------------------------------------