├── ckanext ├── geodatagov │ ├── saml2 │ │ ├── __init__.py │ │ ├── pki │ │ │ └── README │ │ ├── attributemaps │ │ │ └── saml_uri.py │ │ ├── pkitestcrt │ │ │ ├── mykey.pem │ │ │ └── mycert.pem │ │ └── sp_config.py.template │ ├── __init__.py │ ├── tests │ │ ├── data-samples │ │ │ ├── waf-trim-tags │ │ │ │ └── index.html │ │ │ ├── waf-collection2 │ │ │ │ └── index.html │ │ │ ├── waf-collection1 │ │ │ │ └── index.html │ │ │ ├── waf-fgdc │ │ │ │ └── index.html │ │ │ ├── waf-gmi │ │ │ │ └── index.html │ │ │ ├── waf1 │ │ │ │ └── index.html │ │ │ ├── sample6_bad_data.json │ │ │ └── sample5_data.json │ │ ├── conftest.py │ │ ├── test_fix_dataset.py │ │ ├── test_fix_packages.py │ │ ├── test_json_export.py │ │ ├── test_tracking.py │ │ ├── test_s3test.py │ │ ├── test_update_geo.py │ │ ├── test_category_tags.py │ │ ├── utils.py │ │ ├── test_logic.py │ │ ├── factories.py │ │ ├── test_waf_GMI.py │ │ ├── test_datajson.py │ │ ├── test_sitemap_creation.py │ │ ├── test_fix_spatial.py │ │ ├── test_relink.py │ │ └── test_waf-collection.py │ ├── auth.py │ ├── templates │ │ ├── organization │ │ │ ├── read.html │ │ │ └── snippets │ │ │ │ └── organization_form.html │ │ ├── package │ │ │ ├── search.html │ │ │ └── read.html │ │ ├── snippets │ │ │ └── related_collection.html │ │ └── source │ │ │ └── geodatagov_source_form.html │ ├── harvesters │ │ ├── __init__.py │ │ ├── z3950.py │ │ └── waf_collection.py │ ├── search.py │ ├── helpers.py │ ├── validation │ │ ├── __init__.py │ │ └── xml │ │ │ ├── fgdc-std-001.1-1999 │ │ │ └── fgdc-std-001.1-1999.xsd │ │ │ ├── fgdc-std-012-2002 │ │ │ ├── fgdc-std-001-1998-sect03.xsd │ │ │ ├── fgdc-std-012-2002-sect03.xsd │ │ │ ├── fgdc-std-012-2002-sect05.xsd │ │ │ ├── fgdc-std-012-2002-locainfo.xsd │ │ │ ├── fgdc-std-001-1998-sect05.xsd │ │ │ └── fgdc-std-001-1998-sect09.xsd │ │ │ └── fgdc-std-001.2-2001 │ │ │ ├── fgdc-std-001.2-2001.xsd │ │ │ └── fgdc-std-001.2-2001-sect09.xsd │ ├── rebuild.py │ └── bin │ │ └── scrapewaf.py └── __init__.py ├── docker-entrypoint.d └── 10-setup-db.sh ├── MANIFEST.in ├── dev-requirements.txt ├── solr └── README ├── setup.cfg ├── .gitignore ├── .github ├── pull_request_template.md └── workflows │ ├── test.yml │ └── deploy.yml ├── requirements.txt ├── Dockerfile ├── test.sh ├── test.ini ├── Makefile ├── docker-compose.yml ├── setup.py ├── scripts └── sql │ ├── make_pk.sql │ └── what_to_alter.sql ├── LICENSE.md ├── CONTRIBUTING.md ├── .env ├── README.md └── ADR.md /ckanext/geodatagov/saml2/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rohe0002' 2 | -------------------------------------------------------------------------------- /ckanext/geodatagov/saml2/pki/README: -------------------------------------------------------------------------------- 1 | Add pki files in this folder named: 2 | mycert.pem 3 | mykey.pem 4 | 5 | -------------------------------------------------------------------------------- /docker-entrypoint.d/10-setup-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Init Harvest database tables" 4 | ckan harvester initdb 5 | 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.md 3 | include requirements.txt 4 | recursive-include ckanext/geodatagov *.html *.xsd *.pem *.xslt 5 | 
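The recursive-include line above is what pulls the XSD validation schemas, PEM test certificates, and HTML fixtures into the source distribution. A quick way to confirm nothing was dropped is to list those data files from a freshly built sdist; a minimal sketch in Python (the archive name mirrors the version pinned in setup.py and is illustrative):

    import tarfile

    # Assumes `python -m build --sdist` has already produced the archive;
    # the exact filename below is illustrative.
    with tarfile.open("dist/ckanext-geodatagov-0.3.6.tar.gz") as sdist:
        for name in sorted(sdist.getnames()):
            if name.endswith((".xsd", ".pem", ".html", ".xslt")):
                print(name)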
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | cryptography==44.0.1
2 | factory-boy==2.12.0
3 | mock==1.0.1
4 | flake8
5 | pycodestyle
6 | pytest
7 | pytest-ckan
8 | pytest-cov
9 | 
--------------------------------------------------------------------------------
/solr/README:
--------------------------------------------------------------------------------
1 | This file has been copied from the schema-2.0.xml file in CKAN core,
2 | and includes the following modifications:
3 | 
4 | * Add 'spatial_geom' field to index geometries for the spatial query
5 | 
--------------------------------------------------------------------------------
/ckanext/__init__.py:
--------------------------------------------------------------------------------
1 | # this is a namespace package
2 | try:
3 |     import pkg_resources
4 |     pkg_resources.declare_namespace(__name__)
5 | except ImportError:
6 |     import pkgutil
7 |     __path__ = pkgutil.extend_path(__path__, __name__)
8 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/__init__.py:
--------------------------------------------------------------------------------
1 | # this is a namespace package
2 | try:
3 |     import pkg_resources
4 |     pkg_resources.declare_namespace(__name__)
5 | except ImportError:
6 |     import pkgutil
7 |     __path__ = pkgutil.extend_path(__path__, __name__)
8 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/data-samples/waf-trim-tags/index.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [coverage:run]
2 | dynamic_context = test_function
3 | omit=ckanext/geodatagov/tests/*
4 | 
5 | [flake8]
6 | max-line-length = 127
7 | # TODO disable once future.standard_library is removed
8 | ignore = E402
9 | 
10 | [tool:pytest]
11 | norecursedirs=ckanext/geodatagov/tests/nose
12 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/auth.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | def related_create(context, data_dict=None):
4 |     return {'success': False}
5 | 
6 | 
7 | def related_update(context, data_dict=None):
8 |     return {'success': False}
9 | 
10 | 
11 | def group_catagory_tag_update(context, data_dict=None):
12 |     return {'success': False}
13 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/data-samples/waf-collection2/index.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | Index of /waf-collection2
5 | 
6 | 
7 | 

Index of /waf

8 | 2013_county.ea.iso.xml 9 | 10 | 11 | -------------------------------------------------------------------------------- /ckanext/geodatagov/templates/organization/read.html: -------------------------------------------------------------------------------- 1 | {% ckan_extends %} 2 | 3 | {% block primary_content_inner %} 4 | {% if request.args.get('collection_package_id') %} 5 | {% snippet "snippets/related_collection.html", collection_package_id=request.args.get('collection_package_id') %} 6 | {% endif %} 7 | {{ super() }} 8 | {% endblock %} 9 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import utils 4 | 5 | 6 | @pytest.fixture(scope="session", autouse=True) 7 | def run_once_for_all_tests(): 8 | utils.simple_http_server() 9 | 10 | 11 | @pytest.fixture(scope="class", autouse=True) 12 | def run_for_every_test_class(): 13 | utils.reset_db_and_solr() 14 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/data-samples/waf-collection1/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /waf-collection1 5 | 6 | 7 |

Index of /waf

8 | tl_2013_us_county.shp.iso.xml 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | syntax: glob 3 | *.pyc 4 | *.egg-info 5 | .eggs/* 6 | *.swo 7 | *.swp 8 | *.bak 9 | .coverage 10 | conversiontool/run.sh 11 | conversiontool/errors.log 12 | conversiontool/debug.xml 13 | conversiontool/lib/saxon-license.lic 14 | conversiontool/lib/*.jar 15 | .vscode/ 16 | PyZ3950_parsetab.py 17 | src 18 | venv/* 19 | tmp/localstack/* 20 | .vim/* 21 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/data-samples/waf-fgdc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of CSDGM/FGDC sample data 6 | 7 | 8 |

Index of CSDGM/FGDC sample data

9 | 12 | -------------------------------------------------------------------------------- /ckanext/geodatagov/templates/package/search.html: -------------------------------------------------------------------------------- 1 | {% ckan_extends %} 2 | 3 | {% block primary_content %} 4 | {% if request.args.get('collection_package_id') %} 5 |
6 | {% snippet "snippets/related_collection.html", collection_package_id=request.args.get('collection_package_id') %} 7 |
8 | {% endif %} 9 | 10 | {{ super() }} 11 | 12 | {% endblock %} 13 | 14 | {{ super() }} -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Pull Request 2 | 3 | Related to [LINK TO ISSUE] 4 | 5 | ## About 6 | 7 | 8 | 9 | ## PR TASKS 10 | 11 | - [ ] The actual code changes. 12 | - [ ] Tests written and passed. 13 | - [ ] Any changes to docs? 14 | - [ ] Bumped version number in [setup.py](https://github.com/GSA/ckanext-geodatagov/blob/main/setup.py#L13) (also checked on [PyPi](https://pypi.org/project/ckanext-geodatagov/#history)). 15 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/data-samples/waf-gmi/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of /data/existing/decennial/GEO/CPMB/boundary/2015gz/CartographicShapefiles/necta_500k 6 | 7 | 8 |

Index of /data/existing/decennial/GEO/CPMB/boundary/2015gz/CartographicShapefiles/necta_500k

9 | 12 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_fix_dataset.py: -------------------------------------------------------------------------------- 1 | from ckanext.geodatagov.logic import fix_dataset 2 | 3 | 4 | def test_fix_dataset(): 5 | data_dict = { 6 | "title": "test dataset", 7 | "extras": [ 8 | {"key": "test-key", "value": "test value"}, 9 | {"key": "tags", "value": "taG*01, tag (test) 02"} 10 | ] 11 | } 12 | 13 | data_dict = fix_dataset(data_dict) 14 | 15 | assert "tag01" in [t['name'] for t in data_dict['tags']] 16 | assert "tag-test-02" in [t['name'] for t in data_dict['tags']] 17 | -------------------------------------------------------------------------------- /ckanext/geodatagov/saml2/attributemaps/saml_uri.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rolandh' 2 | 3 | EDUPERSON_OID = "urn:oid:1.3.6.1.4.1.5923.1.1.1." 4 | X500ATTR_OID = "urn:oid:2.5.4." 5 | NOREDUPERSON_OID = "urn:oid:1.3.6.1.4.1.2428.90.1." 6 | NETSCAPE_LDAP = "urn:oid:2.16.840.1.113730.3.1." 7 | UCL_DIR_PILOT = 'urn:oid:0.9.2342.19200300.100.1.' 8 | PKCS_9 = "urn:oid:1.2.840.113549.1.9.1." 9 | UMICH = "urn:oid:1.3.6.1.4.1.250.1.57." 10 | 11 | MAP = { 12 | "identifier": "urn:oasis:names:tc:SAML:2.0:attrname-format:uri", 13 | "fro": { 14 | }, 15 | "to": { 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ckanext/geodatagov/templates/snippets/related_collection.html: -------------------------------------------------------------------------------- 1 | {% set collection_package = h.get_collection_package(collection_package_id) %} 2 | {% set title = title or _('Collection:') %} 3 | {% set wrapper_class = wrapper_class or "well" %} 4 | 5 | {% if collection_package %} 6 |
7 |
8 |

{{ title }}

9 | 12 |
13 |
14 | {% endif %} 15 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/data-samples/waf1/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /data/existing/decennial/GEO/CPMB/boundary/2016Cartographic/division_500 5 | 6 | 7 |

Index of /data/existing/decennial/GEO/CPMB/boundary/2016Cartographic/division_500

8 | 12 | 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e git+https://github.com/ckan/ckanext-harvest.git#egg=ckanext_harvest 2 | -e git+https://github.com/ckan/ckanext-spatial.git#egg=ckanext-spatial 3 | -e git+https://github.com/asl2/PyZ3950.git#egg=PyZ3950 4 | 5 | pyutilib 6 | 7 | # ckanext-harvest dependencies 8 | ckantoolkit>=0.0.7 9 | pika>=1.1.0 10 | pyOpenSSL>22.10 #pinning to fix error with crypto (https://levelup.gitconnected.com/fix-attributeerror-module-lib-has-no-attribute-openssl-521a35d83769) 11 | # redis==2.10.6 # included in ckan core 12 | # requests>=2.11.1 # included in ckan core 13 | 14 | # ckanext-spatial 15 | # ckantoolkit # included as dep of ckanext-harvest 16 | Shapely==2.0.6 17 | OWSLib==0.32.1 18 | lxml>=2.3 19 | argparse 20 | pyparsing>=2.1.10 21 | # requests>=1.1.0 # included in ckan-core 22 | six 23 | geojson==3.1.0 24 | 25 | # PyZ3950 26 | pyasn1 27 | # ply #required in setup.py 28 | 29 | # other requirments 30 | boto3 31 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CKAN_VERSION=2.11 2 | FROM ckan/ckan-dev:${CKAN_VERSION} 3 | ARG CKAN_VERSION 4 | 5 | USER root 6 | 7 | RUN apt-get update && apt-get install -y postgresql-client openjdk-17-jre libgeos-dev 8 | 9 | # Download Saxon jar for FGDC2ISO transform (geodatagov) 10 | ARG saxon_ver=9.9.1-7 11 | ADD \ 12 | https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/${saxon_ver}/Saxon-HE-${saxon_ver}.jar \ 13 | /usr/lib/jvm/java-11-openjdk/saxon/saxon.jar 14 | 15 | ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar 16 | 17 | # Pinned for build issue: https://github.com/pyproj4/pyproj/issues/1321 18 | RUN pip install --upgrade pip 19 | # RUN python3 -m pip install 'cython<3' 20 | # RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1 21 | RUN python3 -m pip install pyproj 22 | 23 | COPY . $APP_DIR/ 24 | 25 | RUN pip install -r $APP_DIR/requirements.txt -r $APP_DIR/dev-requirements.txt -e $APP_DIR/. 
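The Saxon jar added above backs the FGDC-to-ISO transform the harvesters shell out to; the Makefile's java-test target further down shows the raw java invocation. A minimal sketch of the same call from Python (paths are the ones used in that Makefile target; passing -cp explicitly is an assumption for running outside the image, where the CLASSPATH set above already covers it):

    import subprocess

    # Transform an FGDC/CSDGM record into ISO 19115-2 using Saxon.
    result = subprocess.run(
        [
            "java", "-cp", "/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar",
            "net.sf.saxon.Transform",
            "-s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml",
            "-xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt",
        ],
        capture_output=True, text=True, check=True,
    )
    iso_xml = result.stdout  # the transformed ISO record as a string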
26 | -------------------------------------------------------------------------------- /ckanext/geodatagov/harvesters/__init__.py: -------------------------------------------------------------------------------- 1 | # this is a namespace package 2 | try: 3 | import pkg_resources 4 | pkg_resources.declare_namespace(__name__) 5 | except ImportError: 6 | import pkgutil 7 | __path__ = pkgutil.extend_path(__path__, __name__) 8 | 9 | from ckanext.geodatagov.harvesters.base import GeoDataGovHarvester # NOQA F401 10 | from ckanext.geodatagov.harvesters.base import GeoDataGovCSWHarvester # NOQA F401 11 | from ckanext.geodatagov.harvesters.base import GeoDataGovWAFHarvester # NOQA F401 12 | from ckanext.geodatagov.harvesters.base import GeoDataGovDocHarvester # NOQA F401 13 | from ckanext.geodatagov.harvesters.base import GeoDataGovGeoportalHarvester # NOQA F401 14 | from ckanext.geodatagov.harvesters.waf_collection import WAFCollectionHarvester # NOQA F401 15 | from ckanext.geodatagov.harvesters.z3950 import Z3950Harvester # NOQA F401 16 | from ckanext.geodatagov.harvesters.arcgis import ArcGISHarvester # NOQA F401 17 | -------------------------------------------------------------------------------- /ckanext/geodatagov/saml2/pkitestcrt/mykey.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIICXAIBAAKBgQDkJWP7bwOxtH+E15VTaulNzVQ/0cSbM5G7abqeqSNSs0l0veHr 3 | 6/ROgW96ZeQ57fzVy2MCFiQRw2fzBs0n7leEmDJyVVtBTavYlhAVXDNa3stgvh43 4 | qCfLx+clUlOvtnsoMiiRmo7qf0BoPKTj7c0uLKpDpEbAHQT4OF1HRYVxMwIDAQAB 5 | AoGAbx9rKH91DCw/ZEPhHsVXJ6cYHxGcMoAWvnMMC9WUN+bNo4gNL205DLfsxXA1 6 | jqXFXZj3+38vSFumGPA6IvXrN+Wyp3+Lz3QGc4K5OdHeBtYlxa6EsrxPgvuxYDUB 7 | vx3xdWPMjy06G/ML+pR9XHnRaPNubXQX3UxGBuLjwNXVmyECQQD2/D84tYoCGWoq 8 | 5FhUBxFUy2nnOLKYC/GGxBTX62iLfMQ3fbQcdg2pJsB5rrniyZf7UL+9FOsAO9k1 9 | 8DO7G12DAkEA7Hkdg1KEw4ZfjnnjEa+KqpyLTLRQ91uTVW6kzR+4zY719iUJ/PXE 10 | PxJqm1ot7mJd1LW+bWtjLpxs7jYH19V+kQJBAIEpn2JnxdmdMuFlcy/WVmDy09pg 11 | 0z0imdexeXkFmjHAONkQOv3bWv+HzYaVMo8AgCOksfEPHGqN4eUMTfFeuUMCQF+5 12 | E1JSd/2yCkJhYqKJHae8oMLXByNqRXTCyiFioutK4JPYIHfugJdLfC4QziD+Xp85 13 | RrGCU+7NUWcIJhqfiJECQAIgUAzfzhdj5AyICaFPaOQ+N8FVMLcTyqeTXP0sIlFk 14 | JStVibemTRCbxdXXM7OVipz1oW3PBVEO3t/VyjiaGGg= 15 | -----END RSA PRIVATE KEY----- 16 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | env: 5 | CODE_COVERAGE_THRESHOLD_REQUIRED: 33 6 | 7 | jobs: 8 | lint: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-python@v4 13 | with: 14 | python-version: '3.8' 15 | - name: Install requirements 16 | run: pip install flake8 pycodestyle pytest pytest-ckan pytest-cov 17 | - name: Run flake8 18 | run: flake8 . 
--count --max-line-length=127 --statistics --exclude ckan 19 | 20 | test: 21 | needs: lint 22 | strategy: 23 | matrix: 24 | ckan-version: ['2.11'] 25 | fail-fast: false 26 | 27 | name: CKAN ${{ matrix.ckan-version }} 28 | runs-on: ubuntu-latest 29 | 30 | steps: 31 | - uses: actions/checkout@v3 32 | - name: Run everything 33 | run: source .env && CKAN_VERSION=${{ matrix.ckan-version }} make clean build ci test 34 | # - name: Setup tmate session on fail 35 | # if: ${{ failure() }} 36 | # uses: mxschmitt/action-tmate@v3 37 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: 3 | pull_request: 4 | branches: [main] 5 | types: [closed] 6 | workflow_dispatch: 7 | inputs: 8 | version_no: 9 | description: 'Release Version:' 10 | required: true 11 | 12 | jobs: 13 | deploy: 14 | name: Publish to PyPI 15 | runs-on: ubuntu-latest 16 | if: github.event.pull_request.merged == true || github.event_name == 'workflow_dispatch' 17 | steps: 18 | - name: checkout 19 | uses: actions/checkout@v4 20 | - name: Update setup.py if manual release 21 | if: github.event_name == 'workflow_dispatch' 22 | run: | 23 | sed -i "s/version='[0-9]\{1,2\}.[0-9]\{1,4\}.[0-9]\{1,4\}',/version='${{github.event.inputs.version_no}}',/g" setup.py 24 | - name: Create packages 25 | run: | 26 | pip install build 27 | python -m build 28 | - name: pypi-publish 29 | uses: pypa/gh-action-pypi-publish@release/v1 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | -------------------------------------------------------------------------------- /ckanext/geodatagov/saml2/pkitestcrt/mycert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIC8jCCAlugAwIBAgIJAJHg2V5J31I8MA0GCSqGSIb3DQEBBQUAMFoxCzAJBgNV 3 | BAYTAlNFMQ0wCwYDVQQHEwRVbWVhMRgwFgYDVQQKEw9VbWVhIFVuaXZlcnNpdHkx 4 | EDAOBgNVBAsTB0lUIFVuaXQxEDAOBgNVBAMTB1Rlc3QgU1AwHhcNMDkxMDI2MTMz 5 | MTE1WhcNMTAxMDI2MTMzMTE1WjBaMQswCQYDVQQGEwJTRTENMAsGA1UEBxMEVW1l 6 | YTEYMBYGA1UEChMPVW1lYSBVbml2ZXJzaXR5MRAwDgYDVQQLEwdJVCBVbml0MRAw 7 | DgYDVQQDEwdUZXN0IFNQMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDkJWP7 8 | bwOxtH+E15VTaulNzVQ/0cSbM5G7abqeqSNSs0l0veHr6/ROgW96ZeQ57fzVy2MC 9 | FiQRw2fzBs0n7leEmDJyVVtBTavYlhAVXDNa3stgvh43qCfLx+clUlOvtnsoMiiR 10 | mo7qf0BoPKTj7c0uLKpDpEbAHQT4OF1HRYVxMwIDAQABo4G/MIG8MB0GA1UdDgQW 11 | BBQ7RgbMJFDGRBu9o3tDQDuSoBy7JjCBjAYDVR0jBIGEMIGBgBQ7RgbMJFDGRBu9 12 | o3tDQDuSoBy7JqFepFwwWjELMAkGA1UEBhMCU0UxDTALBgNVBAcTBFVtZWExGDAW 13 | BgNVBAoTD1VtZWEgVW5pdmVyc2l0eTEQMA4GA1UECxMHSVQgVW5pdDEQMA4GA1UE 14 | AxMHVGVzdCBTUIIJAJHg2V5J31I8MAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEF 15 | BQADgYEAMuRwwXRnsiyWzmRikpwinnhTmbooKm5TINPE7A7gSQ710RxioQePPhZO 16 | zkM27NnHTrCe2rBVg0EGz7QTd1JIwLPvgoj4VTi/fSha/tXrYUaqc9AqU1kWI4WN 17 | +vffBGQ09mo+6CffuFTZYeOhzP/2stAPwCTU4kxEoiy0KpZMANI= 18 | -----END CERTIFICATE----- 19 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Setup and run extension tests. This script should be run in a _clean_ CKAN 3 | # environment. e.g.: 4 | # 5 | # $ docker compose run --rm app ./test.sh 6 | # 7 | 8 | set -o errexit 9 | set -o pipefail 10 | 11 | test_ini=/srv/app/test.ini 12 | 13 | # Database is listening, but still unavailable. Just keep trying... 14 | while ! 
ckan -c $test_ini db init; do
15 |     echo Retrying in 5 seconds...
16 |     sleep 5
17 | done
18 | 
19 | HOST=db
20 | DB_NAME=ckan
21 | DB_USER=ckan
22 | PASS=ckan
23 | 
24 | # Uncomment if you would like to rapid-prototype with the spatial extension
25 | # Note: make sure the correct branch is referenced in the relevant requirements file
26 | # cd /srv/app/src/ckanext-spatial/
27 | # git pull
28 | # cd -
29 | 
30 | ckan -c $test_ini db upgrade -p harvest
31 | 
32 | 
33 | pytest --ckan-ini=test.ini --cov=ckanext.geodatagov --disable-warnings ckanext/geodatagov/tests/
34 | 
35 | # Run this pytest command if only testing a single test
36 | # pytest --ckan-ini=$test_ini --cov=ckanext.geodatagov --disable-warnings ckanext/geodatagov/tests/test_category_tags.py
37 | # pytest --ckan-ini=$test_ini --cov=ckanext.geodatagov --disable-warnings ckanext/geodatagov/tests/test_category_tags.py ckanext/geodatagov/tests/test_waf-collection.py
38 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/test_fix_packages.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import logging
3 | 
4 | from ckan.tests import factories
5 | 
6 | 
7 | log = logging.getLogger(__name__)
8 | 
9 | 
10 | @pytest.mark.usefixtures("with_plugins")
11 | class TestFixPkg(object):
12 | 
13 |     @classmethod
14 |     def setup_class(cls):
15 |         cls.organization = factories.Organization()
16 | 
17 |     def test_fix_tags(self):
18 |         dataset_extras = [
19 |             {
20 |                 "key": "tags",
21 |                 "value": "tag01, tag02"
22 |             }
23 |         ]
24 |         dataset = factories.Dataset(
25 |             owner_org=self.organization['id'],
26 |             extras=dataset_extras)
27 | 
28 |         assert "tag01" in [t['name'] for t in dataset['tags']]
29 |         assert "tag02" in [t['name'] for t in dataset['tags']]
30 | 
31 |     def test_avoid_duplicated_tags(self):
32 |         dataset_extras = [
33 |             {
34 |                 "key": "tags",
35 |                 "value": "tag01, tag02"
36 |             }
37 |         ]
38 |         dataset = factories.Dataset(
39 |             owner_org=self.organization['id'],
40 |             extras=dataset_extras,
41 |             tags=[{'name': 'tag01'}])
42 | 
43 |         assert len(dataset['tags']) == 2
44 |         assert "tag01" in [t['name'] for t in dataset['tags']]
45 |         assert "tag02" in [t['name'] for t in dataset['tags']]
46 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/templates/package/read.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 | 
3 | {% set pkg_dict = c.pkg_dict %}
4 | 
5 | {% block collection_resources %}
6 | {% set collection_package_id = h.get_pkg_dict_extra(c.pkg_dict, 'collection_package_id', '') %}
7 | {% if h.get_pkg_dict_extra(c.pkg_dict, 'collection_metadata', '') %}
8 | 
9 |

{{ _('Collection') }}

10 |

{{ _('This dataset is a collection of other datasets.') }}

11 |

{{ _('Search datasets within this collection') }}

12 |
13 | {% elif collection_package_id %} 14 | {% set collection_package = h.get_collection_package(collection_package_id) %} 15 |
16 |

{{ _('Collection') }}

17 | {% if collection_package %} 18 |

{{ _('This dataset is part of the following collection:') }}

19 | 22 | {% else %} 23 |

{{ _('This dataset is part of a deleted collection.') }}

24 |

{{ _('Search other datasets within the same collection') }}

25 | {% endif %} 26 |
27 | {% endif %} 28 | {% endblock %} 29 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_json_export.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ckan.tests import factories 4 | 5 | 6 | # import json 7 | # from ckan.common import config 8 | # from ckanext.geodatagov.commands import GeoGovCommand 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class TestJSONExport(object): 14 | 15 | def create_datasets(self): 16 | 17 | org_extras = [{'key': 'organization_type', 'value': 'Federal Government'}] 18 | organization = factories.Organization(extras=org_extras) 19 | dataset1 = factories.Dataset(owner_org=organization['id']) # NOQA 20 | dataset2 = factories.Dataset(owner_org=organization['id']) # NOQA 21 | 22 | # TODO: Fix this test when `jsonl_export` is no longer defunct 23 | ''' 24 | def test_json_output(self): 25 | """ run json_export and analyze results """ 26 | 27 | self.create_datasets() 28 | 29 | # skip AWS bucket if exists 30 | config['ckanext.geodatagov.aws_bucket_name'] = None 31 | 32 | cmd = GeoGovCommand() 33 | path, _ = cmd.jsonl_export() 34 | 35 | parsed_lines = 0 36 | with open(path, 'r') as f: 37 | line = f.readline() 38 | while line: 39 | data = json.loads(line) # NOQA 40 | parsed_lines += 1 41 | line = f.readline() 42 | 43 | log.info('Data is JSON valid: {} parsed lines'.format(parsed_lines)) 44 | assert parsed_lines > 0 45 | ''' 46 | -------------------------------------------------------------------------------- /test.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debug = true 3 | # Uncomment and replace with the address which should receive any error reports 4 | #email_to = you@yourdomain.com 5 | smtp_server = localhost 6 | error_email_from = paste@localhost 7 | 8 | [app:main] 9 | use = config:/srv/app/src/ckan/test-core.ini 10 | ckan.site_title = My Test CKAN Site 11 | ckan.site_description = A test site for testing my CKAN extension 12 | ckan.plugins = tracking harvest geodatagov datagov_harvest ckan_harvester geodatagov_geoportal_harvester z3950_harvester arcgis_harvester waf_harvester_collection geodatagov_csw_harvester geodatagov_doc_harvester geodatagov_waf_harvester spatial_metadata spatial_query resource_proxy spatial_harvest_metadata_api datajson_harvest envvars 13 | ckan.legacy_templates = no 14 | ckan.spatial.validator.profiles = iso19139ngdc 15 | ckanext.spatial.search_backend = solr-bbox 16 | 17 | # Logging configuration 18 | [loggers] 19 | keys = root, ckan, sqlalchemy 20 | 21 | [handlers] 22 | keys = console 23 | 24 | [formatters] 25 | keys = generic 26 | 27 | [logger_root] 28 | level = WARN 29 | handlers = console 30 | 31 | [logger_ckan] 32 | qualname = ckan 33 | handlers = 34 | level = INFO 35 | 36 | [logger_sqlalchemy] 37 | handlers = 38 | qualname = sqlalchemy.engine 39 | level = WARN 40 | 41 | [handler_console] 42 | class = StreamHandler 43 | args = (sys.stdout,) 44 | level = NOTSET 45 | formatter = generic 46 | 47 | [formatter_generic] 48 | format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s 49 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CKAN_VERSION ?= 2.11 2 | COMPOSE_FILE ?= docker-compose.yml 3 | 4 | build: ## Build the docker containers 5 | CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build 6 
| debug:
7 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose run --service-ports app
8 | 
9 | lint: ## Lint the code
10 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan
11 | 
12 | clean: ## Clean workspace and containers
13 | 	find . -name *.pyc -delete
14 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v --remove-orphans
15 | 
16 | test: ## Run tests in a new container
17 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh
18 | 
19 | java-test: ## Test java transformation command (java + saxon installed)
20 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt"
21 | 
22 | up: ## Start the containers
23 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up
24 | 
25 | down: ## Stop the containers
26 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down
27 | 
28 | ci: ## Start the containers in the background
29 | 	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up -d
30 | 
31 | .DEFAULT_GOAL := help
32 | .PHONY: build clean help lint test up
33 | 
34 | # Output documentation for top-level targets
35 | # Thanks to https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
36 | help: ## This help
37 | 	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-10s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
38 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/test_tracking.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pytest
3 | 
4 | import ckan.model as model
5 | from ckan.tests import factories, helpers
6 | from click.testing import CliRunner
7 | 
8 | import ckanext.geodatagov.cli as cli
9 | 
10 | 
11 | log = logging.getLogger(__name__)
12 | 
13 | 
14 | @pytest.mark.usefixtures("with_plugins")
15 | class TestTracking(object):
16 | 
17 |     def create_datasets(self):
18 | 
19 |         organization = factories.Organization()
20 |         self.dataset = factories.Dataset(owner_org=organization["id"])
21 | 
22 |         # total view should be 0 for a new dataset
23 |         package = helpers.call_action("package_show", id=self.dataset["id"], include_tracking=True)
24 |         assert package['tracking_summary']['total'] == 0
25 | 
26 |         # insert two raw tracking data rows
27 |         sql = (
28 |             "INSERT INTO tracking_raw (user_key, url, tracking_type, access_timestamp) VALUES"
29 |             "('aaa','/dataset/{0}','page','2020-10-10'),"
30 |             "('bbb','/dataset/{0}','page','2021-11-11')"
31 |         ).format(self.dataset["name"])
32 | 
33 |         model.Session.execute(sql)
34 |         model.Session.commit()
35 | 
36 |     @pytest.fixture
37 |     def cli_result(self):
38 |         self.create_datasets()
39 | 
40 |         runner = CliRunner()
41 |         raw_cli_output = runner.invoke(
42 |             cli.tracking_update,
43 |             args=[],
44 |         )
45 | 
46 |         return raw_cli_output
47 | 
48 |     def test_tracking_data_in_package_show(self, cli_result):
49 | 
50 |         assert cli_result.exit_code == 0
51 | 
52 |         package = helpers.call_action("package_show", id=self.dataset["id"], include_tracking=True)
53 |         assert package['tracking_summary']['total'] == 2
54 |         assert package['tracking_summary']['recent'] == 1
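The test above seeds tracking_raw by hand and lets the click command roll the rows up before package_show reports them. When debugging the rollup locally, the aggregated rows can be inspected the same way; a hedged sketch in the test's own raw-SQL style (the tracking_summary column names are assumed from the CKAN core schema, and the dataset name is illustrative):

    import ckan.model as model

    dataset_name = "my-dataset"  # illustrative; the test uses self.dataset["name"]

    # The '/dataset/<name>' url form matches what the test inserts into tracking_raw.
    sql = (
        "SELECT tracking_date, count, running_total, recent_views "
        "FROM tracking_summary WHERE url = '/dataset/{0}'"
    ).format(dataset_name)

    for row in model.Session.execute(sql):
        print(tuple(row))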
--------------------------------------------------------------------------------
/ckanext/geodatagov/templates/organization/snippets/organization_form.html:
--------------------------------------------------------------------------------
1 | {% ckan_extends %}
2 | 
3 | {% block custom_fields %}
4 |   {% set extras_email = [] %}
5 |   {% set extras_except_email = [] %}
6 |   {% for extra in data.extras %}
7 |     {% if extra.key == 'email_list' %}
8 |       {# there is only one email value, but using a list here to bypass jinja variable scope limitation #}
9 |       {% do extras_email.append(extra.value) %}
10 |     {% else %}
11 |       {% do extras_except_email.append(extra) %}
12 |     {% endif %}
13 |   {% endfor %}
14 |   {{ form.hidden('extras__0__key', value='email_list') }}
15 |   {{ form.textarea('extras__0__value', label=_('Harvest report email list'), id='field-extras-0-value', value=extras_email[0], error=errors[prefix ~ 'value']) }}
16 | 
17 |   {% for extra in extras_except_email %}
18 |     {% set prefix = 'extras__%d__' % loop.index %}
19 |     {{ form.custom(
20 |       names=(prefix ~ 'key', prefix ~ 'value', prefix ~ 'deleted'),
21 |       id='field-extras-%d' % loop.index,
22 |       label=_('Custom Field'),
23 |       values=(extra.key, extra.value, extra.deleted),
24 |       error=errors[prefix ~ 'key'] or errors[prefix ~ 'value']
25 |     ) }}
26 |   {% endfor %}
27 | 
28 |   {# Add a max of 3 empty columns #}
29 |   {% for extra in range(extras_except_email|count, 3) %}
30 |     {% set index = (loop.index + extras_except_email|count) %}
31 |     {% set prefix = 'extras__%d__' % index %}
32 |     {{ form.custom(
33 |       names=(prefix ~ 'key', prefix ~ 'value', prefix ~ 'deleted'),
34 |       id='field-extras-%d' % index,
35 |       label=_('Custom Field'),
36 |       values=(extra.key, extra.value, extra.deleted),
37 |       error=errors[prefix ~ 'key'] or errors[prefix ~ 'value']
38 |     ) }}
39 |   {% endfor %}
40 | {% endblock %}
41 | 
42 | 
--------------------------------------------------------------------------------
/ckanext/geodatagov/search.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from ckan.common import config
3 | # from ckan.lib.search.common import make_connection
4 | # from ckan.lib.search.query import SearchQuery
5 | 
6 | from ckan.lib.search import make_connection, PackageSearchQuery  # , SolrSettings
7 | 
8 | 
9 | log = logging.getLogger(__name__)
10 | 
11 | 
12 | class GeoPackageSearchQuery(PackageSearchQuery):
13 |     def get_count(self):
14 |         """
15 |         Return the count of all indexed packages.
16 |         """
17 |         query = "*: *"
18 |         fq = "+site_id: \"%s\" " % config.get('ckan.site_id')
19 |         fq += "+state: active "
20 | 
21 |         conn = make_connection()
22 | 
23 |         try:
24 |             data = conn.search(query, fq=fq, rows=0)
25 |         except Exception as e:
26 |             error = 'Error in GeoPackageSearchQuery.get_count: {}'.format(e)
27 |             log.error(error)
28 |             print(error)
29 | 
30 |         return data.hits
31 | 
32 |     def get_paginated_entity_name_modtime(self, max_results=1000, start=0):
33 |         """
34 |         Return a list of the names and metadata_modified values of indexed packages.
35 | """ 36 | query = "*: *" 37 | fq = "+site_id: \"%s\" " % config.get('ckan.site_id') 38 | fq += "+state: active " 39 | 40 | conn = make_connection() 41 | try: 42 | data = conn.search(query, 43 | fq=fq, 44 | rows=max_results, 45 | fl='name,metadata_modified', 46 | start=start, 47 | sort='metadata_created asc') 48 | except Exception as e: 49 | error = 'Error in GeoPackageSearchQuery.get_paginated_entity_name_modtime: {}'.format(e) 50 | log.error(error) 51 | print(error) 52 | 53 | return [{'name': r.get('name'), 54 | 'metadata_modified': r.get('metadata_modified')} 55 | for r in data.docs] 56 | -------------------------------------------------------------------------------- /ckanext/geodatagov/helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from ckan import plugins as p 5 | from ckanext.harvest.model import HarvestSource 6 | from ckan.logic import NotFound, NotAuthorized 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | try: 11 | from ckanext.geodatagov.harvesters.base import VALIDATION_PROFILES 12 | except ImportError as e: 13 | log.critical('Harvester not available %s' % str(e)) 14 | 15 | 16 | def get_validation_profiles(): 17 | return VALIDATION_PROFILES 18 | 19 | 20 | def get_validation_schema(): 21 | try: 22 | from ckanext.datajson.harvester_base import VALIDATION_SCHEMA 23 | except ImportError: 24 | return None 25 | 26 | return VALIDATION_SCHEMA 27 | 28 | 29 | def get_harvest_source_type(harvester_id): 30 | source_type = None 31 | try: 32 | package = p.toolkit.get_action('harvest_source_show')({}, {'id': harvester_id}) 33 | source_type = package['source_type'] 34 | except BaseException: 35 | pass 36 | 37 | return source_type 38 | 39 | 40 | def get_harvest_source_config(harvester_id): 41 | source_config = {} 42 | keys_lookfor = [ 43 | 'default_groups', 44 | 'private_datasets', 45 | 'validator_profiles', 46 | ] 47 | try: 48 | harvest_source = HarvestSource.get(harvester_id) 49 | source_config = json.loads(harvest_source.config) 50 | except BaseException: 51 | pass 52 | 53 | # convert single string element list to string 54 | if source_config: 55 | for key in keys_lookfor: 56 | value = source_config.get(key, '') 57 | if type(value) is list: 58 | source_config[key] = value[0] 59 | return source_config 60 | 61 | 62 | def get_collection_package(collection_package_id): 63 | try: 64 | package = p.toolkit.get_action('package_show')({}, {'id': collection_package_id}) 65 | return package 66 | except (NotFound, NotAuthorized): 67 | pass 68 | 69 | 70 | def string(value): 71 | return str(value) 72 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | image: datagov/ckanext-geodatagov:${CKAN_VERSION} # ensures docker-compose will rebuild the right image in case we change CKAN_VERSION 4 | build: 5 | context: . 
6 | args: 7 | CKAN_VERSION: ${CKAN_VERSION} 8 | env_file: 9 | - .env 10 | environment: 11 | CKAN_SOLR_URL: http://solr:8983/solr/ckan 12 | CKAN_REDIS_URL: redis://redis:6379/1 13 | CKAN_DATAPUSHER_URL: http://localhost:8080/ # datapusher is not really enabled 14 | PYTHONDONTWRITEBYTECODE: 1 15 | ports: 16 | - "5000:5000" 17 | depends_on: 18 | - db 19 | - redis 20 | - solr 21 | - localstack-container 22 | volumes: 23 | - ./ckanext:/srv/app/ckanext/ 24 | - ./test.sh:/srv/app/test.sh 25 | - ./test.ini:/srv/app/test.ini 26 | - ./setup.py:/srv/app/setup.py 27 | - ckan_storage:/var/lib/ckan 28 | - ./docker-entrypoint.d:/docker-entrypoint.d 29 | 30 | db: 31 | image: datagov/catalog.data.gov.db:latest 32 | env_file: 33 | - .env 34 | healthcheck: 35 | test: ["CMD", "pg_isready --username=postgres"] 36 | interval: 10s 37 | timeout: 5s 38 | retries: 5 39 | ports: 40 | - "5432:5432" 41 | volumes: 42 | - pg_data:/var/lib/postgresql/data 43 | 44 | redis: 45 | image: redis:alpine 46 | 47 | solr: 48 | image: datagov/catalog.data.gov.solr:latest 49 | ports: 50 | - "8983:8983" 51 | 52 | localstack-container: 53 | container_name: "localstack-container" 54 | privileged: true 55 | image: localstack/localstack:1.1.0 56 | ports: 57 | - "4566-4583:4566-4583" 58 | - "8081:8081" 59 | environment: 60 | - SERVICES=s3 61 | - DEBUG=1 62 | - DATA_DIR=/tmp/localstack/data 63 | - HOSTNAME= 64 | - DOCKER_HOST=unix:///var/run/docker.sock 65 | - DEFAULT_REGION=us-east-1 66 | - START_WEB=1 67 | volumes: 68 | - "./tmp/localstack:/var/lib/localstack" 69 | - "./tmp/localstack/run/docker.sock:/var/run/docker.sock" 70 | 71 | volumes: 72 | ckan_storage: 73 | pg_data: 74 | solr_data: 75 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_s3test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | import requests 4 | from click.testing import CliRunner, Result 5 | 6 | from ckan.common import config 7 | 8 | import ckanext.geodatagov.cli as cli 9 | 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class TestS3TestCommand(object): 15 | @pytest.fixture 16 | def txt_cli_result(self) -> Result: 17 | 18 | runner = CliRunner() 19 | raw_cli_output = runner.invoke( 20 | cli.s3_test, 21 | args=['txt'], 22 | ) 23 | 24 | return raw_cli_output 25 | 26 | @pytest.fixture 27 | def html_cli_result(self) -> Result: 28 | 29 | runner = CliRunner() 30 | raw_cli_output = runner.invoke( 31 | cli.s3_test, 32 | args=['html'], 33 | ) 34 | 35 | return raw_cli_output 36 | 37 | def test_s3_upload_txt(self, txt_cli_result): 38 | """upload test.txt to s3 and make sure there's no errors""" 39 | # check successful cli run 40 | assert txt_cli_result.exit_code == 0 41 | 42 | endpoint_url = config.get("ckanext.s3sitemap.endpoint_url") 43 | bucket = config.get("ckanext.s3sitemap.aws_bucket_name") 44 | 45 | s3_response = requests.get(f"{endpoint_url}/{bucket}/test.txt") 46 | assert txt_cli_result.output.strip("\n") == s3_response.content.decode("utf8") 47 | 48 | # check content-type 49 | assert 'text/plain' == s3_response.headers['content-type'] 50 | 51 | def test_s3_upload_html(self, html_cli_result): 52 | """upload test.html to s3 and make sure there's no errors""" 53 | # check successful cli run 54 | assert html_cli_result.exit_code == 0 55 | 56 | endpoint_url = config.get("ckanext.s3sitemap.endpoint_url") 57 | bucket = config.get("ckanext.s3sitemap.aws_bucket_name") 58 | 59 | # chcek content 60 | s3_response = 
requests.get(f"{endpoint_url}/{bucket}/test.html") 61 | assert html_cli_result.output.strip("\n") == s3_response.content.decode("utf8") 62 | 63 | # check content-type 64 | assert 'application/html' == s3_response.headers['content-type'] 65 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open # To use a consistent encoding 3 | from os import path 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | # Get the long description from the relevant file 8 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | setup( 12 | name="ckanext-geodatagov", 13 | version="0.3.6", 14 | description="", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | classifiers=[ 18 | "Programming Language :: Python :: 3" 19 | ], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers 20 | keywords="", 21 | author="Data.gov", 22 | author_email="datagovhelp@gsa.gov", 23 | url="https://github.com/GSA/ckanext-geodatagov", 24 | license="", 25 | packages=find_packages(exclude=["ez_setup", "examples", "tests"]), 26 | namespace_packages=["ckanext", "ckanext.geodatagov"], 27 | include_package_data=True, 28 | zip_safe=False, 29 | install_requires=[ 30 | # -*- Extra requirements: -*- 31 | "ckanext-datajson>=0.1.19", 32 | "boto3", 33 | "ply>=3.4", 34 | ], 35 | setup_requires=["wheel"], 36 | entry_points=""" 37 | [ckan.plugins] 38 | # Add plugins here, eg 39 | geodatagov=ckanext.geodatagov.plugin:Demo 40 | s3test=ckanext.geodatagov.plugin:S3Test 41 | datagov_harvest=ckanext.geodatagov.plugin:DataGovHarvest 42 | 43 | geodatagov_csw_harvester=ckanext.geodatagov.harvesters:GeoDataGovCSWHarvester 44 | geodatagov_waf_harvester=ckanext.geodatagov.harvesters:GeoDataGovWAFHarvester 45 | geodatagov_doc_harvester=ckanext.geodatagov.harvesters:GeoDataGovDocHarvester 46 | geodatagov_geoportal_harvester=ckanext.geodatagov.harvesters:GeoDataGovGeoportalHarvester 47 | waf_harvester_collection=ckanext.geodatagov.harvesters:WAFCollectionHarvester 48 | arcgis_harvester=ckanext.geodatagov.harvesters:ArcGISHarvester 49 | z3950_harvester=ckanext.geodatagov.harvesters:Z3950Harvester 50 | 51 | [paste.paster_command] 52 | geodatagov=ckanext.geodatagov.commands:GeoGovCommand 53 | """, 54 | ) 55 | -------------------------------------------------------------------------------- /scripts/sql/make_pk.sql: -------------------------------------------------------------------------------- 1 | drop table old_new_source_id_mapping; 2 | drop table harvest_source_after_load; 3 | drop table tmp_to_delete; 4 | 5 | 6 | ALTER TABLE activity 7 | ADD CONSTRAINT activity_pkey PRIMARY KEY (id); 8 | 9 | ALTER TABLE activity_detail 10 | ADD CONSTRAINT activity_detail_pkey PRIMARY KEY (id); 11 | 12 | ALTER TABLE group_extra_revision 13 | ADD CONSTRAINT group_extra_revision_pkey PRIMARY KEY (id, revision_id); 14 | 15 | ALTER TABLE group_revision 16 | ADD CONSTRAINT group_revision_pkey PRIMARY KEY (id, revision_id); 17 | 18 | ALTER TABLE harvest_object_extra 19 | ADD CONSTRAINT harvest_object_extra_pkey PRIMARY KEY (id); 20 | 21 | ALTER TABLE member_revision 22 | ADD CONSTRAINT member_revision_pkey PRIMARY KEY (id, revision_id); 23 | 24 | ALTER TABLE package_extra 25 | ADD CONSTRAINT package_extra_pkey PRIMARY KEY (id); 26 | 27 | ALTER TABLE package_extra_revision 28 
| ADD CONSTRAINT package_extra_revision_pkey PRIMARY KEY (id, revision_id); 29 | 30 | ALTER TABLE package_relationship_revision 31 | ADD CONSTRAINT package_relationship_revision_pkey PRIMARY KEY (id, revision_id); 32 | 33 | ALTER TABLE package_revision 34 | ADD CONSTRAINT package_revision_pkey PRIMARY KEY (id, revision_id); 35 | 36 | ALTER TABLE package_tag 37 | ADD CONSTRAINT package_tag_pkey PRIMARY KEY (id); 38 | 39 | ALTER TABLE package_tag_revision 40 | ADD CONSTRAINT package_tag_revision_pkey PRIMARY KEY (id, revision_id); 41 | 42 | ALTER TABLE resource_group_revision 43 | ADD CONSTRAINT resource_group_revision_pkey PRIMARY KEY (id, revision_id); 44 | 45 | ALTER TABLE resource_revision 46 | ADD CONSTRAINT resource_revision_pkey PRIMARY KEY (id, revision_id); 47 | 48 | ALTER TABLE revision 49 | ADD CONSTRAINT revision_pkey PRIMARY KEY (id); 50 | 51 | ALTER TABLE system_info_revision 52 | ADD CONSTRAINT system_info_revision_pkey PRIMARY KEY (id, revision_id); 53 | 54 | ALTER TABLE term_translation 55 | ADD CONSTRAINT term_translation_pkey PRIMARY KEY (term, term_translation); 56 | 57 | ALTER TABLE tracking_raw 58 | ADD CONSTRAINT tracking_raw_pkey PRIMARY KEY (user_key, access_timestamp); 59 | 60 | ALTER TABLE tracking_summary 61 | ADD CONSTRAINT tracking_summary_pkey PRIMARY KEY (url, tracking_type, package_id, tracking_date); 62 | 63 | -------------------------------------------------------------------------------- /ckanext/geodatagov/saml2/sp_config.py.template: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from saml2 import BINDING_HTTP_REDIRECT 4 | from saml2.saml import NAME_FORMAT_URI 5 | 6 | BASE= 'https://saml-test.datagov.ckan.org/' 7 | #BASE = 'http://localhost:5000/' 8 | CONFIG_PATH = os.path.dirname(__file__) 9 | 10 | CONFIG = { 11 | 'entityid' : 'urn:mace:umu.se:saml:ckan:sp', 12 | 'description': 'CKAN saml2 authorizor', 13 | 'service': { 14 | 'sp': { 15 | 'name' : 'CKAN SP', 16 | 'endpoints': { 17 | 'assertion_consumer_service': [BASE], 18 | 'single_logout_service' : [(BASE + 'slo', 19 | BINDING_HTTP_REDIRECT)], 20 | }, 21 | 'required_attributes': [ 22 | 'uid', 23 | 'name', 24 | 'mail', 25 | 'status', 26 | 'roles', 27 | 'field_display_name', 28 | 'realname', 29 | 'field_unique_id', 30 | 'field_type_of_user', 31 | 'field_organization_type', 32 | 'field_agency', 33 | 'field_organization', 34 | ], 35 | 'allow_unsolicited': True, 36 | 'optional_attributes': [], 37 | 'idp': ['urn:mace:umu.se:saml:ckan:idp'], 38 | } 39 | }, 40 | 'debug': 0, 41 | 'key_file': CONFIG_PATH + '/pki/mykey.pem', 42 | 'cert_file': CONFIG_PATH + '/pki/mycert.pem', 43 | 'attribute_map_dir': CONFIG_PATH + '/attributemaps', 44 | 'metadata': { 45 | 'local': [CONFIG_PATH + '/idp.xml'], 46 | }, 47 | # -- below used by make_metadata -- 48 | 'organization': { 49 | 'name': 'Exempel AB', 50 | 'display_name': [('Exempel AB','se'),('Example Co.','en')], 51 | 'url':'http://www.example.com/ckan', 52 | }, 53 | 'contact_person': [{ 54 | 'given_name':'John', 55 | 'sur_name': 'Smith', 56 | 'email_address': ['john.smith@example.com'], 57 | 'contact_type': 'technical', 58 | }, 59 | ], 60 | 'name_form': NAME_FORMAT_URI, 61 | 'logger': { 62 | 'rotating': { 63 | 'filename': '/tmp/sp.log', 64 | 'maxBytes': 100000, 65 | 'backupCount': 5, 66 | }, 67 | 'loglevel': 'error', 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_update_geo.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ckanext.geodatagov.logic import translate_spatial 4 | 5 | from utils import populate_locations_table 6 | 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class TestUpdateGeo(object): 12 | 13 | def setup_method(self): 14 | populate_locations_table() 15 | 16 | def test_translations(self): 17 | """ test translate_spatial function """ 18 | 19 | # Test place in locations table 20 | us = ('{"type":"Polygon","coordinates":[[[-124.733253,24.544245],[-124.733253,49.388611],' 21 | '[-66.954811,49.388611],[-66.954811,24.544245],[-124.733253,24.544245]]]}') 22 | assert translate_spatial('United States') == us 23 | california = ('{"type":"Polygon","coordinates":[[[-124.3926,32.5358],[-124.3926,42.0022],' 24 | '[-114.1252,42.0022],[-114.1252,32.5358],[-124.3926,32.5358]]]}') 25 | assert translate_spatial('California') == california 26 | 27 | # test numeric versions 28 | assert translate_spatial('1.0,2.0,3.5,5.5') == ('{"type": "Polygon", "coordinates": ' 29 | '[[[1.0, 2.0], [1.0, 5.5], [3.5, 5.5], ' 30 | '[3.5, 2.0], [1.0, 2.0]]]}') 31 | # Test not existent places 32 | assert translate_spatial('not exists') is None 33 | assert translate_spatial('1.0,3.0') is None 34 | assert translate_spatial('US, Virginia, Fairfax, Reston') is None 35 | assert translate_spatial( 36 | '["CARTESIAN", [{"WestBoundingCoordinate": -69.864167, "NorthBoundingCoordinate": 70.843889, ' 37 | '"EastBoundingCoordinate": -69.864167, "SouthBoundingCoordinate": 70.843889}, ' 38 | '{"WestBoundingCoordinate": -68.156667, "NorthBoundingCoordinate": 70.313889, ' 39 | '"EastBoundingCoordinate": -68.156667, "SouthBoundingCoordinate": 70.313889}, ' 40 | '{"WestBoundingCoordinate": -70.52, "NorthBoundingCoordinate": 69.846667, ' 41 | '"EastBoundingCoordinate": -70.52, "SouthBoundingCoordinate": 69.846667}, ' 42 | '{"WestBoundingCoordinate": -70.52007, "NorthBoundingCoordinate": 70.843889, ' 43 | '"EastBoundingCoordinate": -68.15668, "SouthBoundingCoordinate": 69.84673}]]' 44 | ) is None 45 | -------------------------------------------------------------------------------- /scripts/sql/what_to_alter.sql: -------------------------------------------------------------------------------- 1 | create index idx_harvest_object_guid on harvest_object(guid); 2 | create index idx_harvest_object_pkg_id on harvest_object(package_id); 3 | create index idx_harvest_object_id on harvest_object_extra(harvest_object_id); 4 | create index idx_harvest_object_err on harvest_object_error(harvest_object_id); 5 | create index idx_package_extend_pkg_id on package_extent(package_id); 6 | 7 | create index idx_package_extra_revision_pkg_id on package_extra_revision(package_id); 8 | create index idx_package_extra_revision on package_extra_revision(id); 9 | 10 | 11 | --special 12 | create index idx_revision_id on revision(id); 13 | drop index idx_package_resource_pkg_id_resource_id; 14 | 15 | create index idx_resource_name on resource(name); 16 | 17 | 18 | create index idx_resource_group_pkg_id on resource_group(package_id); 19 | create index idx_resource_group_revision_pkg_id on resource_group_revision(package_id); 20 | create index idx_resource_group_revision_rev_id on resource_group_revision(revision_id); 21 | create index idx_resource_group_revision on resource_group_revision(id); 22 | 23 | create index idx_resource_revision on resource_revision(id); 24 | create index idx_resource_revision_res_grp_id on resource_revision(resource_group_id); 25 | create 
index idx_member_revision_id on member_revision(id); 26 | create index idx_member_revision_group_id on member_revision(group_id); 27 | 28 | 29 | 30 | 31 | drop INDEX idx_package_extra_current; 32 | drop INDEX idx_package_extra_period; 33 | drop INDEX idx_package_extra_period_package; 34 | drop index idx_extra_id_pkg_id; 35 | 36 | drop INDEX idx_package_tag_id ; 37 | 38 | drop INDEX idx_package_tag_current ; 39 | drop INDEX idx_package_tag_revision_pkg_id_tag_id ; 40 | drop INDEX idx_period_package_tag ; 41 | 42 | drop INDEX idx_resource_group_period ; 43 | drop INDEX idx_resource_group_period_package ; 44 | drop INDEX idx_resource_group_current ; 45 | 46 | drop INDEX idx_resource_period; 47 | drop INDEX idx_resource_current; 48 | drop INDEX idx_resource_period_resource_group; 49 | 50 | drop index idx_package_group_period_package_group; 51 | drop index "idx_package_group_current"; 52 | 53 | 54 | drop index idx_pkg_id; 55 | drop index idx_pkg_name; 56 | drop index idx_pkg_rev_id; 57 | drop index idx_pkg_sid; 58 | drop index idx_pkg_slname; 59 | drop index idx_pkg_sname; 60 | drop index idx_pkg_srev_id; 61 | drop index idx_pkg_stitle; 62 | drop index idx_pkg_suname; 63 | drop index idx_pkg_title; 64 | drop index idx_pkg_uname; 65 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_category_tags.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import pytest 4 | 5 | from ckan import plugins as p 6 | from ckan.tests import factories 7 | 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | @pytest.mark.usefixtures("with_plugins") 13 | class TestCategoryTags(object): 14 | 15 | def create_datasets(self): 16 | organization = factories.Organization() 17 | self.group1 = factories.Group() 18 | self.group2 = factories.Group() 19 | self.dataset1 = factories.Dataset(owner_org=organization['id'], groups=[{"name": self.group1["name"]}]) 20 | self.dataset2 = factories.Dataset(owner_org=organization['id'], groups=[{"name": self.group2["name"]}]) 21 | sysadmin = factories.SysadminWithToken() 22 | self.user_name = sysadmin['name'] 23 | 24 | def test_group_catagory_tag_update(self): 25 | self.create_datasets() 26 | context = {'user': self.user_name, 'ignore_auth': True} 27 | 28 | self.dataset1['categories'] = '["cat1"]' 29 | self.dataset1['group_id'] = self.group1["id"] 30 | p.toolkit.get_action('group_catagory_tag_update')(context, self.dataset1) 31 | expected_extra = {"key": "__category_tag_{}".format(self.group1["id"]), 32 | "value": json.dumps(self.dataset1['categories'])} 33 | pkg_dict = p.toolkit.get_action('package_show')(context, {'id': self.dataset1["id"]}) 34 | assert expected_extra in pkg_dict["extras"] 35 | 36 | # test if we preserve category tag extras while we update the dataset 37 | pkg_dict['Title'] = 'Change title 02' 38 | pkg_dict = p.toolkit.get_action('package_update')(context, pkg_dict) 39 | assert expected_extra in pkg_dict["extras"] 40 | 41 | self.dataset2['categories'] = '["cat2"]' 42 | self.dataset2['group_id'] = self.group2["id"] 43 | p.toolkit.get_action('group_catagory_tag_update')(context, self.dataset2) 44 | expected_extra = {"key": "__category_tag_{}".format(self.group2["id"]), 45 | "value": json.dumps(self.dataset2['categories'])} 46 | pkg_dict = p.toolkit.get_action('package_show')(context, {'id': self.dataset2["id"]}) 47 | assert expected_extra in pkg_dict["extras"] 48 | 49 | # test if we preserve category tag extras while we update the 
dataset
50 |         pkg_dict['Title'] = 'Change title 03'
51 |         pkg_dict = p.toolkit.get_action('package_update')(context, pkg_dict)
52 |         assert expected_extra in pkg_dict["extras"]
53 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | This project utilizes code from http://ckan.org/. Therefore, all code and content created by CKAN is [licensed under the GNU Affero General Public License](https://github.com/ckan/ckan/blob/master/LICENSE.txt). All contributions and code added to this project are [dedicated to the public domain worldwide](https://creativecommons.org/publicdomain/zero/1.0/).
2 | 
3 | ## Public Domain
4 | 
5 | This project constitutes a work of the United States Government and is not subject to domestic copyright protection under 17 USC § 105. Additionally, we waive copyright and related rights in the work worldwide through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/).
6 | 
7 | All contributions to this project will be released under the CC0 dedication. By submitting a pull request, you are agreeing to comply with this waiver of copyright interest. See [CONTRIBUTING](https://github.com/GSA/ckanext-geodatagov/blob/master/CONTRIBUTING.md) for more information.
8 | 
9 | ## GNU Affero General Public License
10 | 
11 | This project utilizes code [licensed under the terms of the GNU Affero General Public License](https://github.com/ckan/ckan/blob/master/LICENSE.txt).
12 | 
13 | CKAN is free software: you can redistribute it and/or modify
14 | it under the terms of the GNU Affero General Public License as
15 | published by the Free Software Foundation, either version 3 of the
16 | License, or (at your option) any later version.
17 | 
18 | CKAN is distributed in the hope that it will be useful,
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 | GNU Affero General Public License for more details.
22 | 
23 | Visit http://www.gnu.org/licenses/ to learn more about the GNU Affero General Public License.
24 | 
25 | ### Note
26 | 
27 | CKAN is sometimes packaged directly with other software (listed in
28 | requirements.txt and dev-requirements.txt).
29 | In these cases, we are required to list the licenses of the packaged software
30 | too. They are all AGPL compatible and listed in the [CKAN licensing.txt](https://github.com/ckan/ckan/blob/master/LICENSE.txt).
31 | 
32 | 
33 | ## Other Information
34 | 
35 | In no way are the patent or trademark rights of any person affected by CC0, nor are the rights that other persons may have in the work or in how the work is used, such as publicity or privacy rights.
36 | 
37 | Unless expressly stated otherwise, the person who associated a work with this deed makes no warranties about the work, and disclaims liability for all uses of the work, to the fullest extent permitted by applicable law. When using or citing the work, you should not imply endorsement by the author or the affirmer.
38 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Welcome!
2 | 
3 | We're so glad you're thinking about contributing to Data.gov!
4 |
5 | Before contributing to this extension, we encourage you to read our CONTRIBUTING guide (you are here), our [LICENSE](https://github.com/GSA/ckanext-geodatagov/blob/master/LICENSE.md), and our [README](https://github.com/GSA/ckanext-geodatagov/blob/master/README.md), all of which should be in this repository. If you have any questions, you can email the Data.gov team at [datagov@gsa.gov](mailto:datagov@gsa.gov).
6 |
7 | ## Ways to Contribute
8 |
9 | **The Data.gov team manages all project-wide Data.gov updates, bugs, and feature additions via the public [GSA Data.gov issue tracker](https://github.com/GSA/data.gov/issues).**
10 |
11 | **Please limit issues submitted to this repository to discrete issues with this extension.**
12 |
13 | If you do not already have a GitHub account, you can [sign up for GitHub here](https://github.com/). In the spirit of open source software, everyone is encouraged to help improve this project. Here are some ways you can contribute:
14 | - by reporting bugs
15 | - by suggesting new features
16 | - by translating content to a new language
17 | - by writing or editing documentation
18 | - by writing specifications
19 | - by writing code and documentation (**no pull request is too small**: fix typos, add code comments, clean up inconsistent whitespace)
20 | - by reviewing [pull requests](https://github.com/GSA/ckanext-geodatagov/pulls)
21 | - by closing issues
22 |
23 | #### Submit Great Issues
24 | * Submit project-wide issues to the [GSA Data.gov issue tracker](https://github.com/GSA/data.gov/issues). When in doubt, submit issues in that repo.
25 | * Before submitting a new [issue](https://github.com/GSA/ckanext-geodatagov/issues), check to make sure [a similar issue isn't already open](https://github.com/GSA/ckanext-geodatagov/issues?q=is%3Aissue+is%3Aopen). If one is, contribute to that issue thread with your feedback.
26 | * When submitting a bug report, please try to provide as much detail as possible, e.g. a screenshot or [gist](https://gist.github.com/) that demonstrates the problem, the technology you are using, and any relevant links.
27 |
28 | #### Ready for your Help
29 | Issues labeled :sparkles:[`help wanted`](https://github.com/GSA/ckanext-geodatagov/labels/help%20wanted):sparkles: make it easy for you to find ways you can contribute today.
30 |
31 | ## Public Domain
32 |
33 | This project constitutes a work of the United States Government and is not subject to domestic copyright protection under 17 USC § 105. Additionally, we waive copyright and related rights in the work worldwide through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/).
34 |
35 | All contributions to this project will be released under the CC0
36 | dedication. By submitting a pull request, you are agreeing to comply
37 | with this waiver of copyright interest.
38 |
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/utils.py:
--------------------------------------------------------------------------------
1 | import http.server
2 | import logging
3 | import socketserver
4 | from threading import Thread
5 | import os
6 |
7 | from ckan.tests.helpers import reset_db
8 | from ckan.model.meta import Session, metadata
9 | import ckan.lib.search as search
10 |
11 |
12 | log = logging.getLogger(__name__)
13 |
14 | PORT = 8999
15 |
16 |
17 | def simple_http_server(port=PORT):
18 |     '''Serves test XML files over HTTP'''
19 |
20 |     # Make sure we serve from the tests' XML directory
21 |     os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),
22 |                           'data-samples'))
23 |
24 |     Handler = http.server.SimpleHTTPRequestHandler
25 |
26 |     class TestServer(socketserver.TCPServer):
27 |         allow_reuse_address = True
28 |
29 |     skip_connection = False
30 |     try:
31 |         httpd = TestServer(("", port), Handler)
32 |     except Exception as e:
33 |         print('Serve error {}'.format(e))
34 |         skip_connection = True
35 |
36 |     if skip_connection is False:
37 |         info = 'Serving test HTTP server at port {}'.format(port)
38 |         print(info)
39 |         log.info(info)
40 |
41 |         httpd_thread = Thread(target=httpd.serve_forever)
42 |         httpd_thread.daemon = True  # setDaemon() is deprecated since Python 3.10
43 |         httpd_thread.start()
44 |
45 |
46 | def populate_locations_table():
47 |     # download locations.sql.gz if not present
48 |     if not os.path.exists('/tmp/locations.sql.gz'):
49 |         os.system(
50 |             "wget https://github.com/GSA/datagov-deploy/raw/71936f004be1882a506362670b82c710c64ef796/"
51 |             "ansible/roles/software/ec2/ansible/files/locations.sql.gz "
52 |             "-O /tmp/locations.sql.gz"
53 |         )
54 |     # echo "Creating locations table"
55 |     os.system("PGPASSWORD=ckan psql -h db -U ckan -d ckan -c 'DROP TABLE IF EXISTS locations;'")
56 |     os.system("PGPASSWORD=ckan psql -h db -U ckan -d ckan -c 'DROP SEQUENCE IF EXISTS locations_id_seq;'")
57 |     os.system("gunzip -c /tmp/locations.sql.gz | PGPASSWORD=ckan psql -h db -U ckan -d ckan -v ON_ERROR_STOP=1")
58 |
59 |
60 | def reset_db_and_solr():
61 |     # https://github.com/ckan/ckan/issues/4764
62 |     # drop extension postgis so we can reset db
63 |     try:
64 |         os.system(
65 |             "PGPASSWORD=ckan psql -h db -U ckan -d ckan -c "
66 |             "'SELECT pg_terminate_backend(pg_stat_activity.pid) "
67 |             " FROM pg_stat_activity WHERE "
68 |             " datname = current_database() AND"
69 |             " pid <> pg_backend_pid();'"
70 |         )
71 |     except Exception:
72 |         pass
73 |     os.system("PGPASSWORD=ckan psql -h db -U ckan -d ckan -c 'drop extension IF EXISTS postgis cascade;'")
74 |     try:
75 |         reset_db()
76 |     except Exception:
77 |         pass
78 |     os.system("PGPASSWORD=ckan psql -h db -U ckan -d ckan -c 'create extension postgis;'")
79 |     # add back tables from extensions
80 |     metadata.create_all(bind=Session.bind)
81 |
82 |     search.clear_all()
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/data-samples/sample6_bad_data.json:
--------------------------------------------------------------------------------
1 | {
2 |   "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
3 |   "@type": "dcat:Catalog",
4 |   "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
5 |   "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
6 |   "dataset": [
7 |     {
8 |       "@type": "dcat:Dataset",
9 |       "accessLevel": "public",
10 |       "accrualPeriodicity": "R/P1D",
11 |       "bureauCode": [
12 |         "581:00"
13 |       ],
14 |       "contactPoint": {
15 |         "@type":
"vcard:Contact", 16 | "fn": "devops@cfpb.gov", 17 | "hasEmail": "mailto:devops@cfpb.gov" 18 | }, 19 | "describedBy": "https://cfpb.github.io/api/ccdb/api.html", 20 | "description": "The Consumer Complaint Database is a collection of complaints about consumer financial products and services that we sent to companies for response. Complaints are published after the company responds, confirming a commercial relationship with the consumer, or after 15 days, whichever comes first. Complaints referred to other regulators, such as complaints about depository institutions with less than $10 billion in assets, are not published in the Consumer Complaint Database. The database generally updates daily.", 21 | "distribution": [ 22 | { 23 | "@type": "dcat:Distribution", 24 | "downloadURL": "https://files.consumerfinance.gov/ccdb/complaints.csv.zip", 25 | "mediaType": "text/csv" 26 | }, 27 | { 28 | "@type": "dcat:Distribution", 29 | "downloadURL": "https://files.consumerfinance.gov/ccdb/complaints.json.zip", 30 | "mediaType": "application/json" 31 | }, 32 | { 33 | "@type": "dcat:Distribution", 34 | "format": "API", 35 | "accessURL": "https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/" 36 | } 37 | ], 38 | "identifier": "CCDB", 39 | "keyword": [ 40 | "consumer", 41 | "finance", 42 | "complaint", 43 | "bank account", 44 | "bank service", 45 | "credit card", 46 | "credit report", 47 | "debt collection", 48 | "money transfer", 49 | "mortgage", 50 | "student loan", 51 | "loan" 52 | ], 53 | "landingPage": "https://www.consumerfinance.gov/data-research/consumer-complaints/", 54 | "modified": "2020-01-13", 55 | "programCode": [ 56 | "000:000" 57 | ], 58 | "publisher": { 59 | "@type": "org:Organization", 60 | "name": "Consumer Financial Protection Bureau" 61 | }, 62 | "spatial": { 63 | "type": "Polygon", 64 | "coordinates": [ 65 | [ 66 | [ 67 | -124.733253, 68 | 24.544245 69 | ], 70 | [ 71 | -124.733253, 72 | 49.388611 73 | ], 74 | [ 75 | -66.954811, 76 | 49.388611 77 | ], 78 | [ 79 | -66.954811, 80 | 24.544245 81 | ] 82 | [ 83 | -124.733253, 84 | 24.544245 85 | ] 86 | ] 87 | ] 88 | }, 89 | "title": "Consumer Complaint Database" 90 | } 91 | ] 92 | } -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_logic.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from ckan.tests.helpers import FunctionalTestBase 4 | from ckan.tests import factories 5 | from ckanext.geodatagov.logic import rollup_save_action 6 | 7 | from utils import populate_locations_table 8 | 9 | 10 | class TestLogic(FunctionalTestBase): 11 | 12 | def setup_method(self): 13 | populate_locations_table() 14 | 15 | def create_datasets(self): 16 | self.group1 = factories.Group() 17 | organization = factories.Organization() 18 | 19 | self.dataset1 = factories.Dataset( # NOQA 20 | title="Dataset 1", 21 | owner_org=organization['id'], 22 | groups=[ 23 | {"name": self.group1['name']}, 24 | ], 25 | extras=[]) 26 | 27 | sysadmin = factories.SysadminWithToken() 28 | self.user_name = sysadmin['name'] 29 | 30 | def test_rollup_save_action(self): 31 | """ test rollup_save_action for expected results """ 32 | test_data = [ 33 | {'key': 'harvest_object_id', 'value': 'to_be_ignored'}, 34 | {'key': 'spatial', 'value': 'US'}, 35 | {'key': 'extras_rollup', 'value': '{"some_extras_rollup": 123}'}, 36 | {'key': 'everything_else', 'value': 'others'} 37 | ] 38 | ignored_extra = test_data[0] 39 | # spatial_extra = test_data[1] 40 
| rollup_extra = test_data[2] 41 | other_extra = test_data[3] 42 | 43 | self.create_datasets() 44 | context = {'user': self.user_name, 'ignore_auth': True} 45 | 46 | self.dataset1['extras'] = test_data 47 | 48 | rollup_save_action(context, self.dataset1) 49 | # print(self.dataset1['extras']) 50 | # [ 51 | # {'value': 'to_be_ignored', 'key': 'harvest_object_id'}, 52 | # {'value': u'{"type":"Polygon","coordinates":[[...]]}', 53 | # 'key': 'spatial'}, 54 | # {'value': '{"some_extras_rollup": 1, 55 | # "everything_else": "others", 56 | # "old-spatial": "US" 57 | # }', 58 | # 'key': 'extras_rollup'} 59 | # ] 60 | new_extras = self.dataset1['extras'] 61 | new_extras_rollup = json.loads(next( 62 | item for item in new_extras if item['key'] == 'extras_rollup' 63 | )['value']) 64 | 65 | # harvest_object_id in one of EXTRAS_ROLLUP_KEY_IGNORE 66 | # it should not go into new_extras_rollup 67 | assert ignored_extra in new_extras 68 | assert ignored_extra['key'] not in new_extras_rollup.keys() 69 | 70 | # old spatial sees translation 71 | assert 'old-spatial' in new_extras_rollup.keys() 72 | assert 'Polygon' in next( 73 | item for item in new_extras if item['key'] == 'spatial' 74 | )['value'] 75 | 76 | # all others should go into new_extras_rollup 77 | assert json.loads(rollup_extra['value'])['some_extras_rollup'] \ 78 | == new_extras_rollup['some_extras_rollup'] 79 | assert other_extra['key'] in new_extras_rollup.keys() 80 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # DB image settings 2 | POSTGRES_PASSWORD=ckan 3 | POSTGRES_USER=ckan 4 | POSTGRES_DB=ckan 5 | DATASTORE_READONLY_PASSWORD=datastore 6 | 7 | # Basic 8 | CKAN_SITE_ID=default 9 | CKAN_SITE_URL=http://ckan:5000 10 | CKAN_PORT=5000 11 | CKAN_SYSADMIN_NAME=admin 12 | CKAN_SYSADMIN_PASSWORD=password 13 | CKAN_SYSADMIN_EMAIL=your_email@example.com 14 | TZ=UTC 15 | 16 | # Database connections (TODO: avoid duplication) 17 | CKAN_SQLALCHEMY_URL=postgresql://ckan:ckan@db/ckan 18 | # CKAN_SQLALCHEMY_URL=postgresql://ckan_default:pass@db/ckan_test # ckan/ckan-postgres-dev: 19 | CKAN_DATASTORE_WRITE_URL=postgresql://ckan:ckan@db/datastore 20 | CKAN_DATASTORE_READ_URL=postgresql://ckan:ckan@db/datastore 21 | 22 | # Test database connections 23 | TEST_CKAN_SQLALCHEMY_URL=postgres://ckan:ckan@db/ckan_test 24 | # TEST_CKAN_SQLALCHEMY_URL=postgres://ckan_default:pass@db/ckan_test # ckan/ckan-postgres-dev: 25 | TEST_CKAN_DATASTORE_WRITE_URL=postgresql://ckan:ckan@db/datastore_test 26 | TEST_CKAN_DATASTORE_READ_URL=postgresql://ckan:ckan@db/datastore_test 27 | 28 | # Other services connections 29 | CKAN_SOLR_URL=http://solr:8983/solr/ckan 30 | CKAN_REDIS_URL=redis://redis:6379/1 31 | CKAN_DATAPUSHER_URL=http://datapusher:8800 32 | CKAN__DATAPUSHER__CALLBACK_URL_BASE=http://ckan:5000 33 | 34 | TEST_CKAN_SOLR_URL=http://solr:8983/solr/ckan 35 | TEST_CKAN_REDIS_URL=redis://redis:6379/1 36 | 37 | # Core settings 38 | CKAN__STORAGE_PATH=/var/lib/ckan 39 | 40 | CKAN_SMTP_SERVER=smtp.corporateict.domain:25 41 | CKAN_SMTP_STARTTLS=True 42 | CKAN_SMTP_USER=user 43 | CKAN_SMTP_PASSWORD=pass 44 | CKAN_SMTP_MAIL_FROM=ckan@localhost 45 | 46 | # Extensions 47 | CKAN__PLUGINS=tracking harvest datagov_harvest ckan_harvester geodatagov z3950_harvester arcgis_harvester geodatagov_geoportal_harvester waf_harvester_collection geodatagov_csw_harvester geodatagov_doc_harvester geodatagov_waf_harvester spatial_metadata spatial_query s3test datajson 
datajson_harvest envvars 48 | 49 | # Harvest settings 50 | CKAN__HARVEST__MQ__TYPE=redis 51 | CKAN__HARVEST__MQ__HOSTNAME=redis 52 | CKAN__HARVEST__MQ__PORT=6379 53 | CKAN__HARVEST__MQ__REDIS_DB=1 54 | CKAN__HARVEST__LOG_LEVEL=info 55 | CKAN__HARVEST__LOG_SCOPE=0 56 | 57 | CKAN__HARVEST__STATUS_MAIL__ALL=True 58 | 59 | CKANEXT__GEODATAGOV__BUREAU_CSV__URL=https://resources.data.gov/schemas/dcat-us/v1.1/omb_bureau_codes.csv 60 | CKANEXT__GEODATAGOV__BUREAU_CSV__URL_DEFAULT=https://resources.data.gov/schemas/dcat-us/v1.1/omb_bureau_codes.csv 61 | 62 | CKAN__SPATIAL__SRID=4326 63 | CKAN__SPATIAL__VALIDATOR__PROFILES=iso19139ngdc 64 | 65 | CKAN___BROKER_BACKEND=redis 66 | CKAN___BROKER_HOST=redis://redis/1 67 | CKAN___CELERY_RESULT_BACKEND=redis 68 | CKAN___REDIS_HOST=redis 69 | CKAN___REDIS_PORT=6379 70 | CKAN___REDIS_DB=0 71 | CKAN___REDIS_CONNECT_RETRY=True 72 | 73 | ## S3 settings 74 | # The maximum content size, in bytes, for uploads 75 | CKAN__STORAGE__MAX_CONTENT_LENGTH=650000000 76 | CKAN_STORAGE_PATH=/var/lib/ckan/files 77 | CKANEXT__S3SITEMAP__AWS_ACCESS_KEY_ID=_placeholder 78 | CKANEXT__S3SITEMAP__AWS_BUCKET_NAME=catalog-sitemap 79 | CKANEXT__S3SITEMAP__AWS_S3_URL=_placeholder 80 | CKANEXT__S3SITEMAP__AWS_SECRET_ACCESS_KEY=_placeholder 81 | CKANEXT__S3SITEMAP__AWS_STORAGE_PATH=local 82 | CKANEXT__S3SITEMAP__REGION_NAME=us-east-1 83 | CKANEXT__S3SITEMAP__HOST_NAME=http://localstack-container:4566 84 | CKANEXT__S3SITEMAP__PUBLIC_HOST_NAME=http://localhost:4566 85 | # endpoint used to create boto3.resource('s3') 86 | CKANEXT__S3SITEMAP__ENDPOINT_URL=http://localstack-container:4566 87 | CKANEXT__S3SITEMAP__SIGNATURE_VERSION=s3v4 88 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ckanext.spatial.validation import BaseValidator, XsdValidator, FGDCSchema 4 | 5 | 6 | class MinimalFGDCValidator(BaseValidator): 7 | 8 | name = 'fgdc_minimal' 9 | title = 'FGDC Minimal Validation' 10 | 11 | _elements = [ 12 | ('Identification Citation Title', '/metadata/idinfo/citation/citeinfo/title'), 13 | ('Identification Citation Originator', '/metadata/idinfo/citation/citeinfo/origin'), 14 | ('Identification Citation Publication Date', '/metadata/idinfo/citation/citeinfo/pubdate'), 15 | ('Identification Description Abstract', '/metadata/idinfo/descript/abstract'), 16 | ('Identification Spatial Domain West Bounding Coordinate', '/metadata/idinfo/spdom/bounding/westbc'), 17 | ('Identification Spatial Domain East Bounding Coordinate', '/metadata/idinfo/spdom/bounding/eastbc'), 18 | ('Identification Spatial Domain North Bounding Coordinate', '/metadata/idinfo/spdom/bounding/northbc'), 19 | ('Identification Spatial Domain South Bounding Coordinate', '/metadata/idinfo/spdom/bounding/southbc'), 20 | ('Metadata Reference Information Contact Address Type', '/metadata/metainfo/metc/cntinfo/cntaddr/addrtype'), 21 | ('Metadata Reference Information Contact Address State', '/metadata/metainfo/metc/cntinfo/cntaddr/state'), 22 | ] 23 | 24 | @classmethod 25 | def is_valid(cls, xml): 26 | 27 | errors = [] 28 | 29 | for title, xpath in cls._elements: 30 | element = xml.xpath(xpath) 31 | if len(element) == 0 or not element[0].text: 32 | errors.append(('Element not found: {0}'.format(title), None)) 33 | if len(errors): 34 | return False, errors 35 | 36 | return True, [] 37 | 38 | 39 | class FGDCValidator(XsdValidator): 40 | ''' 41 | Base 
class for FGDC XSD validators
42 |     '''
43 |
44 |     @classmethod
45 |     def is_valid(cls, xml):
46 |         xsd_filepath = os.path.join(os.path.dirname(__file__),
47 |                                     cls._xsd_path, cls._xsd_file)
48 |         return cls._is_valid(xml, xsd_filepath, 'FGDC Schema ({0})'.format(cls._xsd_file))
49 |
50 |
51 | class FGDC1998Schema(FGDCSchema):
52 |     '''
53 |     XSD based validation for FGDC metadata documents, version FGDC-STD-001-1998
54 |
55 |     This is the same version present on ckanext-spatial
56 |
57 |     '''
58 |
59 |     name = 'fgdc_std_001_1998'
60 |     title = 'FGDC CSDGM Version 2.0, 1998 (FGDC-STD-001-1998)'
61 |
62 |
63 | class FGDC1999Schema(FGDCValidator):
64 |     '''
65 |     XSD based validation for FGDC metadata documents, version FGDC-STD-001.1-1999
66 |
67 |     Source: http://www.ncddc.noaa.gov/metadata-standards/metadata-xml/
68 |
69 |     '''
70 |     _xsd_path = 'xml/fgdc-std-001.1-1999'
71 |     _xsd_file = 'fgdc-std-001.1-1999.xsd'
72 |
73 |     name = 'fgdc_std_001.1_1999'
74 |     title = 'FGDC CSDGM Biological Data Profile (FGDC-STD-001.1-1999)'
75 |
76 |
77 | class FGDC2001Schema(FGDCValidator):
78 |     '''
79 |     XSD based validation for FGDC metadata documents, version FGDC-STD-001.2-2001
80 |
81 |     Source: http://www.ncddc.noaa.gov/metadata-standards/metadata-xml/
82 |
83 |     '''
84 |     _xsd_path = 'xml/fgdc-std-001.2-2001'
85 |     _xsd_file = 'fgdc-std-001.2-2001.xsd'
86 |
87 |     name = 'fgdc_std_001.2_2001'
88 |     title = 'FGDC CSDGM Metadata Profile for Shoreline Data (FGDC-STD-001.2-2001)'
89 |
90 |
91 | class FGDC2002Schema(FGDCValidator):
92 |     '''
93 |     XSD based validation for FGDC metadata documents, version FGDC-STD-012-2002
94 |
95 |     Source: http://www.ncddc.noaa.gov/metadata-standards/metadata-xml/
96 |
97 |     '''
98 |     _xsd_path = 'xml/fgdc-std-012-2002'
99 |     _xsd_file = 'fgdc-std-012-2002.xsd'
100 |
101 |     name = 'fgdc_std_012_2002'
102 |     title = 'FGDC Extensions for Remote Sensing (FGDC-STD-012-2002)'
--------------------------------------------------------------------------------
/ckanext/geodatagov/rebuild.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Collection, Optional
3 |
4 | import ckan.logic as logic
5 | import ckan.model as model
6 | from ckan.lib.search import index_for, query_for, text_traceback
7 | from ckan.lib.search.common import config
8 | from ckan.types import Context
9 |
10 | log = logging.getLogger(__name__)
11 |
12 |
13 | def rebuild(
14 |     package_id: Optional[str] = None,
15 |     only_missing: bool = False,
16 |     force: bool = False,
17 |     defer_commit: bool = False,
18 |     package_ids: Optional[Collection[str]] = None,
19 |     quiet: bool = False,
20 |     clear: bool = False,
21 | ):
22 |     """
23 |     Rebuilds the search index.
24 |
25 |     If a dataset id is provided, only this dataset will be reindexed.
26 |     When reindexing all datasets, if only_missing is True, only the
27 |     datasets not already indexed will be processed. If force is True
28 |     and an exception occurs, the exception will be logged and the
29 |     process will carry on.
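
    Example (a sketch; assumes this module is importable as
    ckanext.geodatagov.rebuild, per this repository's layout):

        from ckanext.geodatagov.rebuild import rebuild

        rebuild(only_missing=True)        # index only datasets missing from Solr
        rebuild(package_id='my-dataset')  # reindex a single dataset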
30 | """ 31 | log.info("Rebuilding search index...") 32 | 33 | package_index = index_for(model.Package) 34 | context: Context = {"ignore_auth": True, "validate": False, "use_cache": False} 35 | 36 | if package_id: 37 | pkg_dict = logic.get_action("package_show")(context, {"id": package_id}) 38 | log.info("Indexing package %r...", pkg_dict["name"]) 39 | package_index.remove_dict(pkg_dict) 40 | package_index.insert_dict(pkg_dict) 41 | elif package_ids is not None: 42 | for package_id in package_ids: 43 | pkg_dict = logic.get_action("package_show")(context, {"id": package_id}) 44 | log.info("Indexing package %r...", pkg_dict["name"]) 45 | try: 46 | package_index.update_dict(pkg_dict, True) 47 | except Exception as e: 48 | log.error("Error while indexing package %s: %s" % (package_id, repr(e))) 49 | if force: 50 | log.error(text_traceback()) 51 | continue 52 | else: 53 | raise 54 | # If no package_id or package_ids is provided, rebuild the index for all packages 55 | else: 56 | packages = model.Session.query(model.Package.id) 57 | if config.get("ckan.search.remove_deleted_packages"): 58 | packages = packages.filter(model.Package.state != "deleted") 59 | 60 | package_ids = [r[0] for r in packages.all()] 61 | 62 | if only_missing: 63 | log.info("Indexing only missing packages...") 64 | package_query = query_for(model.Package) 65 | indexed_pkg_ids = set( 66 | package_query.get_all_entity_ids(max_results=len(package_ids)) 67 | ) 68 | # Packages not indexed 69 | package_ids = set(package_ids) - indexed_pkg_ids 70 | 71 | if len(package_ids) == 0: 72 | log.info("All datasets are already indexed") 73 | return 74 | else: 75 | log.info("Rebuilding the whole index...") 76 | # When refreshing, the index is not previously cleared 77 | if clear: 78 | package_index.clear() 79 | 80 | total_packages = len(package_ids) 81 | for counter, pkg_id in enumerate(package_ids): 82 | if not quiet: 83 | log.info( 84 | "\rIndexing dataset {0}/{1}".format(counter + 1, total_packages) 85 | ) 86 | try: 87 | package_index.update_dict( 88 | logic.get_action("package_show")(context, {"id": pkg_id}), 89 | defer_commit, 90 | ) 91 | except Exception as e: 92 | log.error("Error while indexing dataset %s: %s" % (pkg_id, repr(e))) 93 | if force: 94 | log.error(text_traceback()) 95 | continue 96 | else: 97 | raise 98 | 99 | model.Session.commit() 100 | log.info("Finished rebuilding search index.") 101 | -------------------------------------------------------------------------------- /ckanext/geodatagov/harvesters/z3950.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import hashlib 3 | from PyZ3950 import zoom 4 | 5 | from ckan import model 6 | 7 | from ckan.plugins.core import SingletonPlugin, implements 8 | from ckan.plugins import IConfigurer 9 | 10 | from ckanext.harvest.interfaces import IHarvester 11 | from ckanext.harvest.model import HarvestObject 12 | from ckanext.harvest.model import HarvestObjectExtra as HOExtra 13 | 14 | from ckanext.geodatagov.harvesters import GeoDataGovHarvester 15 | 16 | from ckan.lib.navl.validators import not_empty, convert_int, ignore_empty 17 | from ckan.logic.validators import boolean_validator 18 | 19 | from ckan.plugins.toolkit import add_template_directory, add_resource, requires_ckan_version 20 | from ckanext.geodatagov.helpers import string 21 | 22 | requires_ckan_version("2.9") 23 | 24 | 25 | class Z3950Harvester(GeoDataGovHarvester, SingletonPlugin): 26 | ''' 27 | A Harvester for z3950. 
28 |     '''
29 |
30 |     implements(IConfigurer)
31 |     implements(IHarvester)
32 |
33 |     # IConfigurer
34 |     def update_config(self, config):
35 |         add_template_directory(config, 'templates')
36 |         add_resource('fanstatic_library', 'geodatagov')
37 |
38 |     def info(self):
39 |         return {
40 |             'name': 'z3950',
41 |             'title': 'Z39.50',
42 |             'description': 'A remote database supporting the Z39.50 protocol'
43 |         }
44 |
45 |     def extra_schema(self):
46 |         return {'private_datasets': [ignore_empty, boolean_validator],
47 |                 'database': [not_empty, string],
48 |                 'port': [not_empty, convert_int]}
49 |
50 |     def gather_stage(self, harvest_job):
51 |
52 |         log = logging.getLogger(__name__ + '.z3950.gather')
53 |         log.debug('z3950Harvester gather_stage for job: %r', harvest_job)
54 |
55 |         self.harvest_job = harvest_job
56 |
57 |         # Get source URL
58 |         source_url = harvest_job.source.url
59 |
60 |         self._set_source_config(harvest_job.source.config)
61 |
62 |         # get current objects out of db
63 |         query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
64 |             filter(HarvestObject.current.is_(True)).\
65 |             filter(HarvestObject.harvest_source_id == harvest_job.source.id)
66 |
67 |         guid_to_package_id = dict((res[0], res[1]) for res in query)
68 |         current_guids = set(guid_to_package_id.keys())
69 |         current_guids_in_harvest = set()
70 |
71 |         # Get contents
72 |         try:
73 |             conn = zoom.Connection(source_url, int(self.source_config.get('port', 210)))
74 |             conn.databaseName = self.source_config.get('database', '')
75 |             conn.preferredRecordSyntax = 'XML'
76 |             conn.elementSetName = 'T'
77 |             query = zoom.Query('CCL', 'metadata')
78 |             res = conn.search(query)
79 |             ids = []
80 |             for num, result in enumerate(res):
81 |                 hash = hashlib.md5(result.data).hexdigest()
82 |                 if hash in current_guids:
83 |                     current_guids_in_harvest.add(hash)
84 |                 else:
85 |                     obj = HarvestObject(job=harvest_job, guid=hash, extras=[
86 |                         HOExtra(key='status', value='new'),
87 |                         HOExtra(key='original_document', value=result.data.decode('latin-1')),
88 |                         HOExtra(key='original_format', value='fgdc')
89 |                     ])
90 |                     obj.save()
91 |                     ids.append(obj.id)
92 |             for guid in (current_guids - current_guids_in_harvest):
93 |                 obj = HarvestObject(job=harvest_job,
94 |                                     guid=guid,
95 |                                     package_id=guid_to_package_id[guid],
96 |                                     extras=[HOExtra(key='status', value='delete')])
97 |                 obj.save()
98 |                 ids.append(obj.id)
99 |             return ids
100 |         except Exception as e:
101 |             self._save_gather_error('Unable to get content for URL: %s: %r' %
102 |                                     (source_url, e), harvest_job)
103 |             return None
104 |
105 |     def fetch_stage(self, harvest_object):
106 |         return True
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/factories.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import factory
3 |
4 | import ckanext.harvest.model as harvest_model
5 | try:
6 |     from ckan.new_tests.factories import _get_action_user_name
7 | except ImportError:
8 |     from ckan.tests.factories import _get_action_user_name
9 | from ckan.plugins import toolkit
10 |
11 |
12 | log = logging.getLogger(__name__)
13 |
14 |
15 | class HarvestSource(factory.Factory):
16 |     FACTORY_FOR = harvest_model.HarvestSource
17 |     _return_type = 'dict'
18 |
19 |     class Meta:
20 |         model = harvest_model.HarvestSource
21 |
22 |     name = factory.Sequence(lambda n: 'test_source_{n}'.format(n=n))
23 |     title = factory.Sequence(lambda n: 'test title {n}'.format(n=n))
24 |     url = factory.Sequence(lambda n:
'http://{n}.test.com'.format(n=n)) 25 | source_type = 'undefined' 26 | id = '{0}_id'.format(name).lower() 27 | 28 | @classmethod 29 | def _create(cls, target_class, *args, **kwargs): 30 | if args: 31 | assert False, "Positional args aren't supported, use keyword args." 32 | context = {'user': _get_action_user_name(kwargs)} 33 | if kwargs.get('owner_org', False): 34 | context['owner_org'] = kwargs['owner_org'] 35 | # If there is an existing source for this URL, and we can't create 36 | # another source with that URL, just return the original one. 37 | log.info('Factory HarvestSource : {} : {}'.format(context, kwargs)) 38 | try: 39 | source_dict = toolkit.get_action('harvest_source_show')( 40 | context, dict(url=kwargs['url'])) 41 | except (KeyError, toolkit.ObjectNotFound): 42 | source_dict = toolkit.get_action('harvest_source_create')( 43 | context, kwargs) 44 | if cls._return_type == 'dict': 45 | return source_dict 46 | else: 47 | return cls.FACTORY_FOR.get(source_dict['id']) 48 | 49 | 50 | class HarvestSourceObj(HarvestSource): 51 | _return_type = 'obj' 52 | 53 | 54 | class CSWHarvestSourceObj(HarvestSourceObj): 55 | source_type = 'csw' 56 | 57 | 58 | class WafCollectionHarvestSourceObj(HarvestSourceObj): 59 | source_type = 'waf-collection' 60 | 61 | 62 | class WafHarvestSourceObj(HarvestSourceObj): 63 | source_type = 'waf' 64 | 65 | 66 | class DataJsonHarvestSourceObj(HarvestSourceObj): 67 | source_type = 'datajson' 68 | 69 | 70 | class HarvestJob(factory.Factory): 71 | FACTORY_FOR = harvest_model.HarvestJob 72 | _return_type = 'dict' 73 | 74 | class Meta: 75 | model = harvest_model.HarvestJob 76 | 77 | source = factory.SubFactory(HarvestSourceObj) 78 | 79 | @classmethod 80 | def _create(cls, target_class, *args, **kwargs): 81 | if args: 82 | assert False, "Positional args aren't supported, use keyword args." 83 | context = {'user': _get_action_user_name(kwargs)} 84 | if 'source_id' not in kwargs: 85 | kwargs['source_id'] = kwargs['source'].id 86 | if 'run' not in kwargs: 87 | kwargs['run'] = False 88 | job_dict = toolkit.get_action('harvest_job_create')( 89 | context, kwargs) 90 | if cls._return_type == 'dict': 91 | return job_dict 92 | else: 93 | return cls.FACTORY_FOR.get(job_dict['id']) 94 | 95 | 96 | class HarvestJobObj(HarvestJob): 97 | _return_type = 'obj' 98 | 99 | 100 | class HarvestObject(factory.Factory): 101 | FACTORY_FOR = harvest_model.HarvestObject 102 | _return_type = 'dict' 103 | 104 | class Meta: 105 | model = harvest_model.HarvestObject 106 | 107 | # source = factory.SubFactory(HarvestSourceObj) 108 | job = factory.SubFactory(HarvestJobObj) 109 | 110 | @classmethod 111 | def _create(cls, target_class, *args, **kwargs): 112 | if args: 113 | assert False, "Positional args aren't supported, use keyword args." 
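        # As with HarvestJob above: resolve the acting user, derive the
        # job/source ids from the parent job, then delegate creation to
        # ckanext-harvest's harvest_object_create action.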
114 | context = {'user': _get_action_user_name(kwargs)} 115 | if 'job_id' not in kwargs: 116 | kwargs['job_id'] = kwargs['job'].id 117 | kwargs['source_id'] = kwargs['job'].source.id 118 | # Remove 'job' to avoid it getting added as a HarvestObjectExtra 119 | if 'job' in kwargs: 120 | kwargs.pop('job') 121 | job_dict = toolkit.get_action('harvest_object_create')( 122 | context, kwargs) 123 | if cls._return_type == 'dict': 124 | return job_dict 125 | else: 126 | return cls.FACTORY_FOR.get(job_dict['id']) 127 | 128 | 129 | class HarvestObjectObj(HarvestObject): 130 | _return_type = 'obj' 131 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/data-samples/sample5_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld", 3 | "@type": "dcat:Catalog", 4 | "conformsTo": "https://project-open-data.cio.gov/v1.1/schema", 5 | "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json", 6 | "dataset": [ 7 | { 8 | "@type": "dcat:Dataset", 9 | "accessLevel": "public", 10 | "accrualPeriodicity": "R/P1D", 11 | "bureauCode": [ 12 | "581:00" 13 | ], 14 | "contactPoint": { 15 | "@type": "vcard:Contact", 16 | "fn": "devops@cfpb.gov", 17 | "hasEmail": "mailto:devops@cfpb.gov" 18 | }, 19 | "describedBy": "https://cfpb.github.io/api/ccdb/api.html", 20 | "description": "The Consumer Complaint Database is a collection of complaints about consumer financial products and services that we sent to companies for response. Complaints are published after the company responds, confirming a commercial relationship with the consumer, or after 15 days, whichever comes first. Complaints referred to other regulators, such as complaints about depository institutions with less than $10 billion in assets, are not published in the Consumer Complaint Database. 
The database generally updates daily.", 21 | "distribution": [ 22 | { 23 | "@type": "dcat:Distribution", 24 | "downloadURL": "https://files.consumerfinance.gov/ccdb/complaints.csv.zip", 25 | "mediaType": "text/csv" 26 | }, 27 | { 28 | "@type": "dcat:Distribution", 29 | "downloadURL": "https://files.consumerfinance.gov/ccdb/complaints.json.zip", 30 | "mediaType": "application/json" 31 | }, 32 | { 33 | "@type": "dcat:Distribution", 34 | "format": "API", 35 | "accessURL": "https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/" 36 | } 37 | ], 38 | "identifier": "CCDB", 39 | "keyword": [ 40 | "consumer", 41 | "finance", 42 | "complaint", 43 | "bank account", 44 | "bank service", 45 | "credit card", 46 | "credit report", 47 | "debt collection", 48 | "money transfer", 49 | "mortgage", 50 | "student loan", 51 | "loan" 52 | ], 53 | "landingPage": "https://www.consumerfinance.gov/data-research/consumer-complaints/", 54 | "modified": "2020-01-13", 55 | "programCode": [ 56 | "000:000" 57 | ], 58 | "publisher": { 59 | "@type": "org:Organization", 60 | "name": "Consumer Financial Protection Bureau" 61 | }, 62 | "spatial": "United States", 63 | "title": "Consumer Complaint Database" 64 | }, 65 | { 66 | "@type": "dcat:Dataset", 67 | "accessLevel": "public", 68 | "accrualPeriodicity": "R/P1Y", 69 | "bureauCode": [ 70 | "581:00" 71 | ], 72 | "contactPoint": { 73 | "@type": "vcard:Contact", 74 | "fn": "devops@cfpb.gov", 75 | "hasEmail": "mailto:devops@cfpb.gov" 76 | }, 77 | "describedBy": "https://api.consumerfinance.gov/data/hmda", 78 | "description": "The Home Mortgage Disclosure Act (HMDA) requires many financial institutions to maintain, report, and publicly disclose information about mortgages", 79 | "distribution": [ 80 | { 81 | "@type": "dcat:Distribution", 82 | "downloadURL": "https://api.consumerfinance.gov/data/hmda/slice/hmda_lar.csv", 83 | "mediaType": "text/csv" 84 | }, 85 | { 86 | "@type": "dcat:Distribution", 87 | "downloadURL": "https://api.consumerfinance.gov/data/hmda/slice/hmda_lar.csv", 88 | "mediaType": "text/csv" 89 | }, 90 | { 91 | "@type": "dcat:Distribution", 92 | "downloadURL": "https://api.consumerfinance.gov/data/hmda/slice/hmda_lar.json", 93 | "mediaType": "application/json" 94 | }, 95 | { 96 | "@type": "dcat:Distribution", 97 | "downloadURL": "https://api.consumerfinance.gov/data/hmda/slice/hmda_lar.xml", 98 | "mediaType": "application/xml" 99 | }, 100 | { 101 | "@type": "dcat:Distribution", 102 | "format": "API", 103 | "accessURL": "https://api.consumerfinance.gov/data/hmda/slice/hmda_lar.json" 104 | } 105 | ], 106 | "identifier": "hmda_lar", 107 | "keyword": [ 108 | "consumer", 109 | "finance", 110 | "mortgage", 111 | "HMDA", 112 | "Home Mortgage Disclosure Act", 113 | "loan" 114 | ], 115 | "landingPage": "https://www.consumerfinance.gov/hmda/", 116 | "modified": "2014-09-22", 117 | "programCode": [ 118 | "000:000" 119 | ], 120 | "publisher": { 121 | "@type": "org:Organization", 122 | "name": "Consumer Financial Protection Bureau" 123 | }, 124 | "spatial": "United States", 125 | "temporal": "2007-02-01T00:00:00Z/2014-12-31T00:00:00Z", 126 | "title": "Home Mortgage Disclosure Act Data for the years 2007-2014" 127 | } 128 | ] 129 | } -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_waf_GMI.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import pytest 4 | 5 | import ckanext.harvest.model as harvest_model 6 | from 
ckan import model
7 | from ckanext.geodatagov.harvesters.base import GeoDataGovWAFHarvester
8 | from ckan.tests.factories import Organization
9 |
10 | from factories import HarvestJobObj, WafHarvestSourceObj
11 | from utils import PORT, reset_db_and_solr
12 |
13 | log = logging.getLogger(__name__)
14 |
15 |
16 | @pytest.mark.usefixtures("with_plugins")
17 | class TestWafHarvester(object):
18 |
19 |     def setup_method(self):
20 |         reset_db_and_solr()
21 |
22 |         self.organization = Organization()
23 |
24 |     def run_gather(self, url, source_config):
25 |         sc = json.loads(source_config)
26 |
27 |         source = WafHarvestSourceObj(url=url,
28 |                                      owner_org=self.organization['id'],
29 |                                      config=source_config,
30 |                                      **sc)
31 |
32 |         log.info('Created source {}'.format(repr(source)))
33 |         self.job = HarvestJobObj(source=source)
34 |         self.harvester = GeoDataGovWAFHarvester()
35 |
36 |         # gather stage
37 |         log.info('GATHERING %s', url)
38 |         obj_ids = self.harvester.gather_stage(self.job)
39 |         log.info('job.gather_errors=%s', self.job.gather_errors)
40 |         if len(self.job.gather_errors) > 0:
41 |             raise Exception(self.job.gather_errors[0])
42 |
43 |         log.info('obj_ids=%s', obj_ids)
44 |         if obj_ids is None or len(obj_ids) == 0:
45 |             # nothing to see
46 |             return
47 |
48 |         self.harvest_objects = []
49 |         for obj_id in obj_ids:
50 |             harvest_object = harvest_model.HarvestObject.get(obj_id)
51 |             log.info('ho guid=%s', harvest_object.guid)
52 |             log.info('ho content=%s', harvest_object.content)
53 |             self.harvest_objects.append(harvest_object)
54 |
55 |         # this is a list of harvest object IDs, one per dataset
56 |         return obj_ids
57 |
58 |     def run_fetch(self):
59 |         # fetch stage
60 |         for harvest_object in self.harvest_objects:
61 |             log.info('FETCHING %s' % harvest_object.id)
62 |             result = self.harvester.fetch_stage(harvest_object)
63 |
64 |             log.info('ho errors=%s', harvest_object.errors)
65 |             log.info('result 1=%s', result)
66 |             if len(harvest_object.errors) > 0:
67 |                 raise Exception(harvest_object.errors[0])
68 |
69 |     def run_import(self):
70 |         # import stage
71 |         datasets = []
72 |         for harvest_object in self.harvest_objects:
73 |             log.info('IMPORTING %s' % harvest_object.id)
74 |             result = self.harvester.import_stage(harvest_object)
75 |
76 |             log.info('ho errors 2=%s', harvest_object.errors)
77 |             log.info('result 2=%s', result)
78 |             if len(harvest_object.errors) > 0:
79 |                 raise Exception(harvest_object.errors[0])
80 |
81 |             log.info('ho pkg id=%s', harvest_object.package_id)
82 |             dataset = model.Package.get(harvest_object.package_id)
83 |             datasets.append(dataset)
84 |             log.info('dataset name=%s', dataset.name)
85 |
86 |         return datasets
87 |
88 |     def get_datasets_from_waf_gmi_sample(self):
89 |         """ harvest waf-gmi/ folder as waf source """
90 |         url = f'http://127.0.0.1:{PORT}/waf-gmi/index.html'
91 |
92 |         self.config1 = '{"private_datasets": "false"}'
93 |         self.run_gather(url=url, source_config=self.config1)
94 |         self.run_fetch()
95 |         datasets = self.run_import()
96 |
97 |         return datasets
98 |
99 |     def test_waf_gmi_datasets_count(self):
100 |         """ Get datasets from waf-gmi/ folder as waf source
101 |         and test we have one dataset with the expected name """
102 |
103 |         datasets = self.get_datasets_from_waf_gmi_sample()
104 |         assert len(datasets) == 1
105 |
106 |     def test_waf_gmi_datasets_privacy(self):
107 |         """ Harvest waf-gmi/ folder as waf source and check the datasets are public """
108 |
109 |         datasets = self.get_datasets_from_waf_gmi_sample()
110 |         for dataset in datasets:
111 |             assert dataset.private is False
112 |
113 |     def test_waf_gmi_names(self):
114 |         """ Harvest waf-gmi/ folder as waf source and test we have the names we expect """
115 |
116 |         expected_names = [
117 |             '2014-cartographic-boundary-file-new-england-city-and-town-area-for-united-states-1-500000'
118 |         ]
119 |         datasets = self.get_datasets_from_waf_gmi_sample()
120 |         for dataset in datasets:
121 |             assert dataset.name in expected_names
--------------------------------------------------------------------------------
/ckanext/geodatagov/validation/xml/fgdc-std-001.1-1999/fgdc-std-001.1-1999.xsd:
--------------------------------------------------------------------------------
1 | FGDC Biological Data Working Group, and USGS Biological Resources Division. 1999. Content Standard for Digital Geospatial Metadata - Biological Data Profile, FGDC-STD-001.1-1999. Federal Geographic Data Committee.
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/test_datajson.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pytest
3 | import logging
4 |
5 | from ckan.tests.factories import Organization
6 | from ckan import model
7 | import ckanext.harvest.model as harvest_model
8 | from ckanext.datajson.harvester_datajson import DataJsonHarvester
9 |
10 | from factories import (DataJsonHarvestSourceObj,
11 |                        HarvestJobObj)
12 | from utils import PORT, populate_locations_table
13 |
14 | log = logging.getLogger(__name__)
15 |
16 |
17 | @pytest.mark.usefixtures("with_plugins")
18 | class TestDataJsonHarvester(object):
19 |
20 |     @classmethod
21 |     def setup_class(cls):
22 |         populate_locations_table()
23 |
24 |     def run_gather(self, url):
25 |         source = DataJsonHarvestSourceObj(url=url, owner_org=self.organization['id'])
26 |         job = HarvestJobObj(source=source)
27 |
28 |         self.harvester = DataJsonHarvester()
29 |
30 |         # gather stage
31 |         log.info('GATHERING %s', url)
32 |         obj_ids = self.harvester.gather_stage(job)
33 |         log.info('job.gather_errors=%s', job.gather_errors)
34 |         if len(job.gather_errors) > 0:
35 |             raise Exception(job.gather_errors[0])
36 |
37 |         log.info('obj_ids=%s', obj_ids)
38 |         if obj_ids is None or len(obj_ids) == 0:
39 |             # nothing to see
40 |             return
41 |
42 |         self.harvest_objects = []
43 |         for obj_id in obj_ids:
44 |             harvest_object = harvest_model.HarvestObject.get(obj_id)
45 |             log.info('ho guid=%s', harvest_object.guid)
46 |             log.info('ho content=%s', harvest_object.content)
47 |             self.harvest_objects.append(harvest_object)
48 |
49 |         # this is a list of harvest object IDs, one per dataset
50 |         return obj_ids
51 |
52 |     def run_fetch(self):
53 |         # fetch stage
54 |         for harvest_object in self.harvest_objects:
55 |             log.info('FETCHING %s' % harvest_object.id)
56 |             result = self.harvester.fetch_stage(harvest_object)
57 |
58 |             log.info('ho errors=%s', harvest_object.errors)
59 |             log.info('result 1=%s', result)
60 |             if len(harvest_object.errors) > 0:
61 |                 raise Exception(harvest_object.errors[0])
62 |
63 |     def run_import(self):
64 |         # import stage
65 |         datasets = []
66 |         for harvest_object in self.harvest_objects:
67 |             log.info('IMPORTING %s' % harvest_object.id)
68 |             result = self.harvester.import_stage(harvest_object)
69 |
70 |             log.info('ho errors 2=%s', harvest_object.errors)
71 |             log.info('result 2=%s', result)
72 |             if len(harvest_object.errors) > 0:
73 |                 raise Exception(harvest_object.errors[0])
74 |
75 |             log.info('ho pkg id=%s', harvest_object.package_id)
76 |             dataset = model.Package.get(harvest_object.package_id)
77 |             datasets.append(dataset)
78 |             log.info('dataset name=%s', dataset.name)
79 |
80 |         return datasets
81 |
82 |     def test_sample5_data(self):
83 |         self.organization = Organization()
84 |
85 |         # testing with data from https://www.consumerfinance.gov/data.json
86 |         url = f'http://127.0.0.1:{PORT}/sample5_data.json'
87 |         obj_ids = self.run_gather(url=url)
88 |         assert len(obj_ids) == 2
89 |         self.run_fetch()
90 |         datasets = self.run_import()
91 |         assert len(datasets) == 2
92 |         titles = ['Consumer Complaint Database',
93 |                   'Home Mortgage Disclosure Act Data for the years 2007-2014']
94 |         for dataset in datasets:
95 |             assert dataset.title in titles
96 |             # test we get the spatial as we want: https://github.com/GSA/catalog.data.gov/issues/55
97 |             # we expect a data transformation here
98 |             pkg = dataset.as_dict()
99 |             extras = json.loads(pkg["extras"]['extras_rollup'])
100 |
101 |             assert pkg["extras"]["spatial"] == ('{"type":"Polygon",'
102 |                                                 '"coordinates":[[[-124.733253,24.544245],'
103 |                                                 '[-124.733253,49.388611],'
104 |                                                 '[-66.954811,49.388611],'
105 |                                                 '[-66.954811,24.544245],'
106 |                                                 '[-124.733253,24.544245]]]}')
107 |             assert extras['old-spatial'] == 'United States'
108 |             assert extras['programCode'] == ['000:000']
109 |
110 |     def test_bad_data_JSONDecodeError(self):
111 |         """
112 |         Test for JSONDecodeError when the data.json file is not valid JSON.
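        (sample6_bad_data.json omits the comma between the last two
        coordinate pairs of its "spatial" polygon, so json.loads fails
        while the gather stage parses the catalog.)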
113 | """ 114 | self.organization = Organization() 115 | 116 | # testing with data from https://www.consumerfinance.gov/data.json 117 | url = f"http://127.0.0.1:{PORT}/sample6_bad_data.json" 118 | with pytest.raises(Exception) as error: 119 | self.run_gather(url=url) 120 | 121 | assert "JSONDecodeError" in error.value.args[0].message 122 | -------------------------------------------------------------------------------- /ckanext/geodatagov/harvesters/waf_collection.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | import hashlib 5 | 6 | import requests 7 | from ckan import model 8 | from ckan.lib.navl.validators import not_empty # , ignore_empty 9 | 10 | import ckanext.harvest.queue as queue 11 | from ckanext.geodatagov.harvesters.base import ( 12 | GeoDataGovWAFHarvester, 13 | ) # , validate_profiles; , validate_profiles 14 | from ckanext.harvest.model import HarvestObject 15 | from ckanext.harvest.model import HarvestObjectExtra as HOExtra 16 | from ckanext.geodatagov.helpers import string 17 | 18 | 19 | class WAFCollectionHarvester(GeoDataGovWAFHarvester): 20 | def info(self): 21 | return { 22 | "name": "waf-collection", 23 | "title": "Web Accessible Folder (WAF) Homogeneous Collection", 24 | "description": "A Web Accessible Folder (WAF) displaying a list" 25 | "of spatial metadata documents with a collection record", 26 | } 27 | 28 | def extra_schema(self): 29 | extra_schema = super(WAFCollectionHarvester, self).extra_schema() 30 | extra_schema["collection_metadata_url"] = [not_empty, string] 31 | log.debug( 32 | "Getting extra schema for WAFCollectionHarvester: {}".format(extra_schema) 33 | ) 34 | return extra_schema 35 | 36 | def get_package_dict(self, iso_values, harvest_object): 37 | 38 | package_dict = super(WAFCollectionHarvester, self).get_package_dict( 39 | iso_values, harvest_object 40 | ) 41 | if not package_dict: 42 | return None 43 | 44 | collection_package_id = self._get_object_extra( 45 | harvest_object, "collection_package_id" 46 | ) 47 | if collection_package_id: 48 | package_dict["extras"].append( 49 | dict(key="collection_package_id", value=collection_package_id) 50 | ) 51 | 52 | collection_metadata = self._get_object_extra( 53 | harvest_object, "collection_metadata" 54 | ) 55 | if collection_metadata: 56 | package_dict["extras"].append( 57 | dict(key="collection_metadata", value=collection_metadata) 58 | ) 59 | status = self._get_object_extra(harvest_object, "status") 60 | if status == "change": 61 | self.force_import = True 62 | else: 63 | self.force_import = False 64 | 65 | return package_dict 66 | 67 | def gather_stage(self, harvest_job): 68 | log.debug("WafHarvester gather_stage for job: %r", harvest_job) 69 | 70 | self.harvest_job = harvest_job 71 | 72 | # Get source URL 73 | source_url = harvest_job.source.url 74 | 75 | self._set_source_config(harvest_job.source.config) 76 | 77 | collection_metadata_url = self.source_config.get("collection_metadata_url") 78 | 79 | if not collection_metadata_url: 80 | self._save_gather_error("collection url does not exist", harvest_job) 81 | return None 82 | 83 | try: 84 | # Ignore F841 unused variable because if commented, code does nothing 85 | response = requests.get(source_url, timeout=60) # NOQA 86 | content = response.content # NOQA 87 | except Exception as e: 88 | self._save_gather_error( 89 | "Unable to get content for URL: %s: %r" % (source_url, e), harvest_job 90 | ) 91 | return None 92 | 93 | guid = 
93 |         guid = hashlib.md5(collection_metadata_url.encode("utf8", "ignore")).hexdigest()
94 |
95 |         existing_harvest_object = (
96 |             model.Session.query(
97 |                 HarvestObject.guid, HarvestObject.package_id, HOExtra.value
98 |             )
99 |             .join(HOExtra, HarvestObject.extras)
100 |             .filter(HOExtra.key == "collection_metadata")
101 |             .filter(HOExtra.value == "true")
102 |             .filter(HarvestObject.current.is_(True))
103 |             .filter(HarvestObject.harvest_source_id == harvest_job.source.id)
104 |             .first()
105 |         )
106 |
107 |         if existing_harvest_object:
108 |             status = "change"
109 |             guid = existing_harvest_object.guid
110 |             package_id = existing_harvest_object.package_id
111 |         else:
112 |             status, package_id = "new", None
113 |
114 |         obj = HarvestObject(
115 |             job=harvest_job,
116 |             extras=[
117 |                 HOExtra(key="collection_metadata", value="true"),
118 |                 HOExtra(key="waf_location", value=collection_metadata_url),
119 |                 HOExtra(key="status", value=status),
120 |             ],
121 |             guid=guid,
122 |             package_id=package_id,
123 |         )
124 |
125 |         queue.fetch_and_import_stages(self, obj)
126 |
127 |         if obj.state == "ERROR":
128 |             self._save_gather_error(
129 |                 "Collection object failed to harvest, not harvesting", harvest_job
130 |             )
131 |             return None
132 |
133 |         return GeoDataGovWAFHarvester.gather_stage(
134 |             self, harvest_job, collection_package_id=obj.package_id
135 |         )
--------------------------------------------------------------------------------
/ckanext/geodatagov/tests/test_sitemap_creation.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import xml.etree.ElementTree as ET
3 | import pytest
4 |
5 | from ckan.tests import factories
6 | from click.testing import CliRunner, Result
7 |
8 | import ckanext.geodatagov.cli as cli
9 |
10 |
11 | log = logging.getLogger(__name__)
12 |
13 | # TODO - test for output, test checking complete s3 cycle
14 |
15 |
16 | class TestSitemapExport(object):
17 |
18 |     def create_datasets(self) -> None:
19 |
20 |         organization = factories.Organization()
21 |         self.dataset1 = factories.Dataset(owner_org=organization["id"])
22 |         self.dataset2 = factories.Dataset(owner_org=organization["id"])
23 |         self.dataset3 = factories.Dataset(owner_org=organization["id"])
24 |         self.dataset4 = factories.Dataset(owner_org=organization["id"])
25 |
26 |     @pytest.fixture
27 |     def cli_result(self) -> Result:
28 |         self.create_datasets()
29 |
30 |         runner = CliRunner()
31 |         raw_cli_output = runner.invoke(
32 |             cli.sitemap_to_s3,
33 |             args=[
34 |                 "--upload_to_s3",
35 |                 "False",
36 |                 "--page_size",
37 |                 "100",
38 |                 "--max_per_page",
39 |                 "100",
40 |             ],
41 |         )
42 |
43 |         return raw_cli_output
44 |
45 |     @staticmethod
46 |     def test_cli_output(cli_result: Result) -> None:
47 |         # check successful cli run
48 |         assert cli_result.exit_code == 0
49 |
50 |         # the example output I have only has one element in it;
51 |         # this and _handle_cli_output will need to be updated for examples with more elements.
52 |         # checks only one list element in output string
53 |         assert cli_result.output.count("file_num") == 1
54 |
55 |     @staticmethod
56 |     def _handle_cli_output(cli_result: Result) -> list:
57 |         """Parses cli output Result to an iterable file_list"""
58 |
59 |         file_list = cli_result.output.split("}\"\n")
60 |         file_list = list(set([f + "}\"" for f in file_list]) - {'}\"'})
61 |
62 |         return file_list
63 |
64 |     def test_create_sitemap(self, cli_result):
65 |         """run sitemap-to-s3 and analyze results"""
66 |
67 |         file_list = self._handle_cli_output(cli_result)
68 |
69 |         files = 0
70 |         datasets = 0
71 |         for site_file in file_list:
72 |             # site_file is dumped as string
73 |             site_file = eval(eval(site_file))
74 |
75 |             files += 1
76 |             """ expected something like
77 |             <?xml version="1.0" encoding="UTF-8"?>
78 |             <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
79 |                 <url>
80 |                     <loc>http://ckan:5000/dataset/test_dataset_01</loc>
81 |                     <lastmod>2020-09-29</lastmod>
82 |                 </url>
83 |                 <url>
84 |                     <loc>http://ckan:5000/dataset/test_dataset_02</loc>
85 |                     <lastmod>2020-09-29</lastmod>
86 |                 </url>
87 |                 ...
88 |             </urlset>
89 |             """
90 |             log.info("Opening file {}".format(site_file["filename_s3"]))
91 |             root = ET.fromstring(site_file["xml"])
92 |             log.info("XML Root {}".format(root))
93 |             assert root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset"
94 |
95 |             prev_last_mod = ""
96 |
97 |             dataset1_found = False
98 |             dataset2_found = False
99 |             dataset3_found = False
100 |             dataset4_found = False
101 |
102 |             for url in root:
103 |                 for child in url:
104 |                     if child.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}loc":
105 |                         dataset_url = child.text
106 |                         dataset_name = dataset_url.split("/")[-1]
107 |                         if dataset_name == self.dataset1["name"]:
108 |                             dataset1_found = True
109 |                         elif dataset_name == self.dataset2["name"]:
110 |                             dataset2_found = True
111 |                         elif dataset_name == self.dataset3["name"]:
112 |                             dataset3_found = True
113 |                         elif dataset_name == self.dataset4["name"]:
114 |                             dataset4_found = True
115 |                         datasets += 1
116 |                     elif (
117 |                         child.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod"
118 |                     ):
119 |                         last_mod = child.text
120 |                         log.info("{} >= {} ".format(prev_last_mod, last_mod))
121 |                         assert last_mod >= prev_last_mod
122 |                         prev_last_mod = last_mod
123 |                     else:
124 |                         raise Exception("Unexpected tag")
125 |
126 |         assert files == 1
127 |         assert site_file["filename_s3"] == "sitemap-0.xml"
128 |         assert datasets >= 4  # at least these four
129 |         assert dataset1_found
130 |         assert dataset2_found
131 |         assert dataset3_found
132 |         assert dataset4_found
--------------------------------------------------------------------------------
/ckanext/geodatagov/bin/scrapewaf.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | import requests
4 | import pyparsing as parse
5 | import urllib.parse
6 | import dateutil.parser
7 | from ckanext.spatial.harvesters.base import guess_standard
8 |
9 |
10 | def add_status():
11 |     records = open('wafurls.txt')
12 |     results = open('wafurlsstatus.txt', 'w+')
13 |     headers = 'count,count_with_date,server,status_code,error,standard,id,unapproved,url'
14 |     results.write(headers + '\n')
15 |     writer = csv.DictWriter(
16 |         results, headers.split(',')
17 |     )
18 |
19 |     for row in records:
20 |         row_dict = dict(zip('id unapproved url'.split(), row.split()))
21 |         try:
22 |             response = requests.get(row_dict['url'], timeout=60)
23 |             content = response.content
24 |             server = str(response.headers.get('server'))
25 |             if server == 'Microsoft-IIS/7.5':
26 |                 scraper = 'iis'
27 |             elif 'apache' in server.lower() or 'nginx' in server.lower() or not response.headers.get('server'):
28 |                 scraper = 'apache'
29 |             else:
30 |                 scraper = 'other'
31 |
32 |             row_dict['status_code'] = str(response.status_code)
33 |             row_dict['server'] = server
34 |
35 |             if content and response.status_code == 200:
36 |                 extracted_waf = extract_waf(content, row_dict['url'], scraper)
37 |                 row_dict['count'] = str(len(extracted_waf))
38 |                 row_dict['count_with_date'] = str(len([i for i in extracted_waf if i[1]]))
39 |                 if extracted_waf:
40 |                     try:
41 |                         content_doc = requests.get(extracted_waf[0][0], timeout=60).content
42 |                         standard = guess_standard(content_doc)
43 |                         row_dict['standard'] = standard
44 |                     except Exception as e:
45 |                         print('Error guessing format. Error is', e)
46 |             else:
47 |                 row_dict['count'] = "0"
48 |                 row_dict['count_with_date'] = "0"
49 |         except Exception as e:
50 |             row_dict['error'] = str(e)
51 |             row_dict['count'] = "0"
52 |             row_dict['count_with_date'] = "0"
53 |
54 |         writer.writerow(row_dict)
55 |         results.flush()
56 |
57 |
58 | apache = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
59 |     + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url') \
60 |     + parse.SkipTo("</a>", include=True).suppress() \
61 |     + parse.Optional(parse.Literal('</td><td align="right">')).suppress() \
62 |     + parse.Optional(parse.Combine(
63 |         parse.Word(parse.alphanums + '-') + parse.Word(parse.alphanums + ':'),
64 |         adjacent=False, joinString=' ').setResultsName('date'))
65 |
66 | iis = parse.SkipTo("<br>").suppress() \
67 |     + parse.OneOrMore("<br>").suppress() \
68 |     + parse.Optional(parse.Combine(
69 |         parse.Word(parse.alphanums + '/') + parse.Word(parse.alphanums + ':') + parse.Word(parse.alphas),
70 |         adjacent=False, joinString=' ').setResultsName('date')) \
71 |     + parse.Word(parse.nums).suppress() \
72 |     + parse.Literal('<A HREF=').suppress() \
73 |     + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')
--------------------------------------------------------------------------------
/ckanext/geodatagov/templates/source/geodatagov_source_form.html:
--------------------------------------------------------------------------------
1 | {% import 'macros/form.html' as form %}
2 |
3 | {% block extra_config %}
4 |
5 | {% set database = source_config.get('database') or data.database %}
6 | {% set port = source_config.get('port') or data.port %}
7 | {% set collection_metadata_url = source_config.get('collection_metadata_url') or data.collection_metadata_url %}
8 | {% set extra_search_criteria = source_config.get('extra_search_criteria') or data.extra_search_criteria %}
9 |
10 | {{ form.input('database', id='field-database', label=_('Database'), placeholder=_('eg. My Database'), value=database, error=errors.database, classes=['control-full', 'control-group']) }}
11 | {{ form.input('port', id='field-port', label=_('Port'), placeholder=_('eg. 3452'), value=port, error=errors.port, classes=['control-full', 'control-group']) }}
12 |
13 | {{ form.input('collection_metadata_url', id='field-collection_metadata_url', label=_('Collection Metadata Url'), placeholder=_('eg. http://example.com/collection.xml'), value=collection_metadata_url, error=errors.collection_metadata_url, classes=['control-full', 'control-group']) }}
14 | {{ form.input('extra_search_criteria', id='field-extra_search_criteria', label=_('Extra Search Criteria'), placeholder=_('eg. accountid:0123456789ABCDEF'), value=extra_search_criteria, error=errors.extra_search_criteria, classes=['control-full', 'control-group']) }}
15 |
16 | {% set validator_profiles = source_config.get('validator_profiles') or data.validator_profiles %}
17 | {% set validator_schema = source_config.get('validator_schema') or data.validator_schema %}
18 |
19 | <div class="control-group">
20 |   <label class="control-label" for="field-validator_profiles">{{ _('Validation Profile') }}</label>
21 |   <div class="controls">
22 |     <select id="field-validator_profiles" name="validator_profiles">
23 |       <option value=""></option>
24 |       {% for key, value in h.get_validation_profiles() %}
25 |       {% set checked = key == (validator_profiles or '') %}
26 |       <option value="{{ key }}"{% if checked %} selected{% endif %}>{{ value }}</option>
27 |       {% endfor %}
28 |     </select>
29 |   </div>
30 | </div>
31 |
32 | <div class="control-group">
33 |   <label class="control-label" for="field-validator_schema">{{ _('Validation Schema') }}</label>
34 |   <div class="controls">
35 |     <select id="field-validator_schema" name="validator_schema">
36 |       <option value=""></option>
37 |       {% if h.get_validation_schema() %}
38 |       {% for key, value in h.get_validation_schema() %}
39 |       {% set checked = key == (validator_schema or '') %}
40 |       <option value="{{ key }}"{% if checked %} selected{% endif %}>{{ value }}</option>
41 |       {% endfor %}
42 |       {% endif %}
43 |     </select>
44 |   </div>
45 | </div>
46 |
47 | {% set private_datasets = data.private_datasets or source_config.get('private_datasets') %}
48 |
49 | <div class="control-group">
50 |   <label class="control-label" for="field-private_datasets">{{ _('Private datasets') }}</label>
51 |   <div class="controls">
52 |     <select id="field-private_datasets" name="private_datasets">
53 |       <option value="False"{% if private_datasets in ('False', 'false') %} selected{% endif %}>{{ _('False') }}</option>
54 |       <option value="True"{% if private_datasets in ('True', 'true') %} selected{% endif %}>{{ _('True') }}</option>
55 |     </select>
56 |   </div>
57 | </div>
58 |
59 | {% set existing_group = source_config.default_groups or data.default_groups %}
60 | {% set groups_available = h.groups_available() %}
61 | {% if groups_available %}
62 | <div class="control-group">
63 |   <label class="control-label" for="field-default_groups">{{ _('Default groups') }}</label>
64 |   <div class="controls">
65 |     <select id="field-default_groups" name="default_groups">
66 |       <option value=""></option>
67 |       {% for group in groups_available %}
68 |       <option value="{{ group.id }}"{% if existing_group == group.id %} selected{% endif %}>{{ group.display_name }}</option>
69 |       {% endfor %}
70 |     </select>
71 |   </div>
72 | </div>
73 | {% endif %}
74 |
75 | {% endblock extra_config %}
76 |
77 |
78 | {% block delete_button %}
79 |   {% set locale = h.dump_json({'content': _('Warning: Apart from deleting this source, this command will remove all its datasets, as well as all previous job reports. Are you sure you want to continue?')}) %}
80 |   <a class="btn btn-danger pull-left" data-module="confirm-action" data-module-i18n="{{ locale }}">
{% block delete_button_text %}{{ _('Delete') }}{% endblock %} 94 | 95 | {% endblock %} 96 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-012-2002/fgdc-std-001-1998-sect03.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_fix_spatial.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import ckan.plugins as p 4 | import ckan.tests.factories as factories 5 | import ckan.tests.helpers as helpers 6 | 7 | from utils import populate_locations_table 8 | 9 | 10 | @pytest.mark.usefixtures("with_plugins") 11 | class TestSpatialField(object): 12 | 13 | @classmethod 14 | def setup_class(cls): 15 | populate_locations_table() 16 | cls.user = factories.Sysadmin(name='spatial_user') 17 | 18 | def test_numeric_spatial_transformation(self): 19 | old_geo = '10.0,0.0,15.0,5.0' 20 | 21 | context = {'user': self.user['name'], 'ignore_auth': True} 22 | pkg = { 23 | 'title': 'Spatial num', 24 | 'name': 'spatial-num', 25 | 'extras': [ 26 | {'key': 'spatial', 'value': old_geo} 27 | ] 28 | } 29 | dataset = p.toolkit.get_action('package_create')(context, pkg) 30 | 31 | expected_spatial = ('{"type": "Polygon", "coordinates": [[[10.0, 0.0], [10.0, 5.0], [15.0, 5.0], ' 32 | '[15.0, 0.0], [10.0, 0.0]]]}') 33 | 34 | spatial_extra_exists = False 35 | for extra in dataset['extras']: 36 | if extra['key'] == 'spatial': 37 | spatial_extra_exists = True 38 | assert extra['value'] == expected_spatial 39 | 40 | assert spatial_extra_exists is True 41 | 42 | result = helpers.call_action( 43 | 'package_search', 44 | extras={'ext_bbox': '9,-1,16,4'}) 45 | assert result['count'] == 1 46 | assert result['results'][0]['id'] == dataset['id'] 47 | 48 | def test_string_spatial_transformation(self): 49 | 50 | old_geo = 'California' 51 | # require locations table to be installed 52 | 53 | context = {'user': self.user['name'], 'ignore_auth': True} 54 | pkg = { 55 | 'title': 'Spatial String', 56 | 'name': 'spatial-str', 57 | 'extras': [ 58 | {'key': 'spatial', 'value': old_geo} 59 | ] 60 | } 61 | dataset = p.toolkit.get_action('package_create')(context, pkg) 62 | 63 | expected_spatial = ('{"type":"Polygon",' 64 | '"coordinates":[[[-124.3926,32.5358],[-124.3926,42.0022],[-114.1252,42.0022],' 65 | '[-114.1252,32.5358],[-124.3926,32.5358]]]}') 66 | spatial_extra_exists = False 67 | for extra in dataset['extras']: 68 | if extra['key'] == 'spatial': 69 | spatial_extra_exists = True 70 | assert extra['value'] == expected_spatial 71 | 72 | 
assert spatial_extra_exists is True 73 | 74 | result = helpers.call_action( 75 | 'package_search', 76 | extras={'ext_bbox': '-125,31,-113,43'}) 77 | 78 | assert result['count'] == 1 79 | assert result['results'][0]['id'] == dataset['id'] 80 | 81 | def test_list_spatial_transformation(self): 82 | 83 | old_geo = '[[20.0, 10.0], [25.0, 15.0]]' 84 | 85 | context = {'user': self.user['name'], 'ignore_auth': True} 86 | pkg = { 87 | 'title': 'Spatial List', 88 | 'name': 'spatial-list', 89 | 'extras': [ 90 | {'key': 'spatial', 'value': old_geo} 91 | ] 92 | } 93 | dataset = p.toolkit.get_action('package_create')(context, pkg) 94 | 95 | expected_spatial = ('{"type": "Polygon", "coordinates": [[[20.0, 10.0], [20.0, 15.0], [25.0, 15.0], ' 96 | '[25.0, 10.0], [20.0, 10.0]]]}') 97 | spatial_extra_exists = False 98 | for extra in dataset['extras']: 99 | if extra['key'] == 'spatial': 100 | spatial_extra_exists = True 101 | assert extra['value'] == expected_spatial 102 | 103 | assert spatial_extra_exists is True 104 | 105 | result = helpers.call_action( 106 | 'package_search', 107 | extras={'ext_bbox': '19,9,26,16'}) 108 | 109 | assert result['count'] == 1 110 | assert result['results'][0]['id'] == dataset['id'] 111 | 112 | def test_spatial_plus_sign(self): 113 | 114 | old_geo = '-179.231086,-14.601813,+179.859681,+71.441059' 115 | 116 | context = {'user': self.user['name'], 'ignore_auth': True} 117 | pkg = { 118 | 'title': 'Spatial Plus Sign', 119 | 'name': 'spatial-plus', 120 | 'extras': [ 121 | {'key': 'spatial', 'value': old_geo} 122 | ] 123 | } 124 | dataset = p.toolkit.get_action('package_create')(context, pkg) 125 | 126 | expected_spatial = ('{"type": "Polygon", "coordinates": [[[-179.231086, -14.601813], ' 127 | '[-179.231086, 71.441059], [179.859681, 71.441059], [179.859681, ' 128 | '-14.601813], [-179.231086, -14.601813]]]}') 129 | spatial_extra_exists = False 130 | for extra in dataset['extras']: 131 | if extra['key'] == 'spatial': 132 | spatial_extra_exists = True 133 | assert extra['value'] == expected_spatial 134 | 135 | assert spatial_extra_exists is True 136 | 137 | def test_bad_string_transformation(self): 138 | 139 | old_geo = 'US Domestic' 140 | # require locations table to be installed 141 | 142 | context = {'user': self.user['name'], 'ignore_auth': True} 143 | pkg = { 144 | 'title': 'Spatial US Domestic', 145 | 'name': 'spatial-usd', 146 | 'extras': [ 147 | {'key': 'spatial', 'value': old_geo} 148 | ] 149 | } 150 | dataset = p.toolkit.get_action('package_create')(context, pkg) 151 | 152 | expected_spatial = "" 153 | spatial_extra_exists = False 154 | for extra in dataset['extras']: 155 | if extra['key'] == 'spatial': 156 | spatial_extra_exists = True 157 | assert extra['value'] == expected_spatial 158 | if extra['key'] == 'old-spatial': 159 | assert extra['value'] == old_geo 160 | 161 | assert spatial_extra_exists is True 162 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-012-2002/fgdc-std-012-2002-sect03.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | Bit representation of data value in raster cell. 
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | Specification for the independent axes in the coordinate system in which spatial data are located. 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | Number of axes used in spatial data matrix. 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | Description of individual axis in spatial data matrix. 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | Designation assigned to an axis. 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | The maximum number of data points along the corresponding axis. 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Github Actions](https://github.com/GSA/ckanext-geodatagov/actions/workflows/test.yml/badge.svg)](https://github.com/GSA/ckanext-geodatagov/actions) 2 | [![PyPI version](https://badge.fury.io/py/ckanext-geodatagov.svg)](https://badge.fury.io/py/ckanext-geodatagov) 3 | 4 | # Data.gov 5 | 6 | [Data.gov](http://data.gov) is an open data website created by the [U.S. General Services Administration](https://github.com/GSA/) that is based on two robust open source projects: [CKAN](http://ckan.org) and [WordPress](http://wordpress.org). The data catalog at [catalog.data.gov](https://catalog.data.gov) is powered by CKAN, while the content seen at [Data.gov](https://data.gov) is powered by WordPress. 7 | 8 | **For all code, bugs, and feature requests related to Data.gov, see the project-wide Data.gov [issue tracker](https://github.com/GSA/data.gov/issues).** 9 | 10 | Currently this repository is used only for version control of the CKAN extension for geospatial data, but you can see all of the Data.gov-relevant repos listed in the [GSA Data.gov README file](https://github.com/GSA/data.gov/blob/master/README.md). 11 | 12 | ## CKAN Extension for Geospatial Data 13 | 14 | Most Data.gov-specific CKAN customizations are contained within this extension, but the extension also provides additional geospatial capabilities. 15 | 16 | ### Customization 17 | 18 | Due to the CKAN 2.3 and 2.8 migrations, some features should be removed or moved to the official community versions: 19 | - [Stop rolling up the extras](https://github.com/GSA/ckanext-geodatagov/issues/178) 20 | - [Move to the official search by geolocation](https://github.com/GSA/datagov-deploy/issues/2440) (probably sharing our version, which has improvements) 21 | - Do a general analysis of this extension to detect other customized functionality that should be discontinued. 22 | 23 | ### Requirements 24 | 25 | Package | Notes 26 | ---------------------------------------------------------------------- | ------------- 27 | [ckanext-harvest](https://github.com/ckan/ckanext-harvest/) | -- 28 | [ckanext-spatial](https://github.com/ckan/ckanext-spatial) | -- 29 | [PyZ3950](https://github.com/asl2/PyZ3950) | -- 30 | [werkzeug](https://github.com/nickumia-reisys/werkzeug) | This only affects the tests. For all intents and purposes, this should be tracking [upstream](https://github.com/pallets/werkzeug) 31 | 32 | This extension is compatible with these versions of CKAN.
33 | 34 | CKAN version | Compatibility 35 | ------------ | ------------- 36 | <=2.8 | no 37 | 2.9 | 0.1.37 (last supported) 38 | 2.10 | >=0.2.0 39 | 40 | ## Tests 41 | 42 | All the tests live in the [/ckanext/geodatagov/tests](/ckanext/geodatagov/tests) folder. [GitHub Actions](https://github.com/GSA/ckanext-geodatagov/blob/main/.github/workflows/test.yml) is configured to run the tests against CKAN 2.10 when you open a pull request. 43 | 44 | ## Using the Docker Dev Environment 45 | 46 | ### Build Environment 47 | 48 | To start the environment, run: 49 | ```docker compose build``` 50 | ```docker compose up``` 51 | 52 | CKAN will start at localhost:5000 53 | 54 | To shut down the environment, run: 55 | 56 | ```docker compose down``` 57 | 58 | To docker exec into the CKAN image, run: 59 | 60 | ```docker compose exec app /bin/bash``` 61 | 62 | ### Testing 63 | 64 | The tests follow the guidelines for [testing CKAN 65 | extensions](https://docs.ckan.org/en/2.10/extensions/testing-extensions.html#testing-extensions). 66 | 67 | To run the extension tests, start the containers with `make up`, then: 68 | 69 | $ make test 70 | 71 | Lint the code: 72 | 73 | $ make lint 74 | 75 | ### Debugging 76 | 77 | We have not determined a good way to do native IDE debugging; however, you can use the built-in 78 | Python pdb debugger. Simply run `make debug`, which will run docker with an interactive shell. 79 | Add `import pdb; pdb.set_trace()` anywhere you want to start debugging, and if the code is triggered 80 | you should see a command prompt waiting in the shell. While you are learning, use a pdb cheat sheet 81 | like [this one](https://kapeli.com/cheat_sheets/Python_Debugger.docset/Contents/Resources/Documents/index). 82 | 83 | When you edit/add/remove code, the server is smart enough to restart. If you are editing logic that is 84 | not part of the webserver (ckan command, etc.) then you should be able to run the command after edits 85 | and get the same debugger prompt. 86 | 87 | ### Matrix builds 88 | 89 | The existing development environment assumes a full catalog.data.gov test setup. This makes 90 | it difficult to develop and test against new versions of CKAN (or really any 91 | dependency) because everything is tightly coupled and would require us to 92 | upgrade everything at once, which doesn't really work. A new make target 93 | `test-new` is introduced with a new docker-compose file. 94 | 95 | The "new" development environment drops as many dependencies as possible. It is 96 | not meant to have feature parity with 97 | [GSA/catalog.data.gov](https://github.com/GSA/catalog.data.gov/). Tests should 98 | mock external dependencies where possible. 99 | 100 | In order to support multiple versions of CKAN, or even upgrade to new versions 101 | of CKAN, we support development and testing through the `CKAN_VERSION` 102 | environment variable. 103 | 104 | $ make CKAN_VERSION=2.11 test 105 | 106 | ### Command line interface 107 | 108 | The following operations can be run from the command line as described below: 109 | 110 | geodatagov sitemap-to-s3 [{upload_to_s3}] [{page_size}] [{max_per_page}] 111 | - Generates sitemap and uploads to s3 112 | 113 | geodatagov db-solr-sync [{dryrun}] [{cleanup_solr}] [{update_solr}] 114 | - DB Solr sync. 115 | 116 | geodatagov tracking-update [{start_date}] 117 | - ckan tracking update with customized options and output 118 | For example, you can run the sitemap export from inside the app container (a hedged invocation: it assumes these subcommands are registered under the standard `ckan` CLI, and it passes no optional arguments):
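    # Hedged example: run the sitemap export from inside the app container;
    # with no optional arguments supplied, the command's defaults apply.
    $ docker compose exec app ckan geodatagov sitemap-to-s3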
119 | ## Credit / Copying 120 | 121 | Original work written by the HealthData.gov team. It has been modified in support of Data.gov. 122 | 123 | As a work of the United States Government, this package is in the public 124 | domain within the United States. Additionally, we waive copyright and 125 | related rights in the work worldwide through the CC0 1.0 Universal 126 | public domain dedication (which can be found at http://creativecommons.org/publicdomain/zero/1.0/). 127 | 128 | ## Ways to Contribute 129 | We're so glad you're thinking about contributing to ckanext-geodatagov! 130 | 131 | Before contributing to ckanext-geodatagov we encourage you to read our 132 | [CONTRIBUTING](CONTRIBUTING.md) guide, our [LICENSE](LICENSE.md), and our README 133 | (you are here), all of which should be in this repository. If you have any 134 | questions, you can email the Data.gov team at 135 | [datagov@gsa.gov](mailto:datagov@gsa.gov). 136 | -------------------------------------------------------------------------------- /ADR.md: -------------------------------------------------------------------------------- 1 | 2 | ADRs for CKANEXT_GEODATAGOV 3 | ============================================== 4 | 5 | # 1. Fix encoding issue for waf harvester 6 | 7 | Date: 2021-07-16 8 | 9 | ## Status 10 | 11 | Accepted 12 | 13 | ## Context 14 | 15 | We are using the upstream ckan version of ckanext-spatial. They upgraded the extension to PY3; however, their harvester tests were removed. The waf harvester was not being encoded properly to support PY2 and PY3, so our tests were failing. 16 | 17 | ## Decision 18 | 19 | We decided to fix the bug and submit a PR [upstream](https://github.com/ckan/ckanext-spatial/pull/252). 20 | 21 | ## Consequences 22 | 23 | - Until the fix is merged upstream, the ckanext-geodatagov repo will be tracking a pinned version of a ckanext-spatial fork, which adds complexity. 24 | - All of the customization of the GSA fork of ckanext-spatial is disregarded. The GSA fork was messy already. 25 | 26 | 27 | 28 | # 2. Fix JSON Serialization of dictionary 29 | 30 | Date: 2021-07-19 31 | 32 | ## Status 33 | 34 | Accepted 35 | 36 | ## Context 37 | 38 | We are using the upstream ckan version of ckanext-harvest. They upgraded the extension to PY3; however, there was a PY3-compatibility issue that caused our tests to fail. 39 | 40 | ## Decision 41 | 42 | We decided to fix the bug and submit a PR [upstream](https://github.com/ckan/ckanext-harvest/pull/450). 43 | 44 | ## Consequences 45 | 46 | - Until the fix is merged upstream, the ckanext-geodatagov repo will be tracking a pinned version of a ckanext-harvest fork, 47 | which adds complexity. 48 | - All of the customization of the GSA fork of ckanext-harvest is disregarded. The GSA fork was messy already. 49 | 50 | 51 | # 3. Use catalog.data.gov Solr Image 52 | 53 | Date: 2021-06-21 54 | 55 | ## Status 56 | 57 | Accepted 58 | 59 | ## Context 60 | 61 | The Solr dev image that ckanext-datajson uses was incompatible with ckanext-geodatagov. There was a 'solrsearch issue' that popped up with no clear resolution. 62 | 63 | ## Decision 64 | 65 | Using the catalog.data.gov Solr image stopped Solr from throwing exceptions. 66 | 67 | ## Consequences 68 | 69 | - Consequences unknown. 70 | - The ckanext repos shouldn't each be using varying versions of Solr/Postgres/etc. 71 | 72 | 73 | # 4. Fix CKAN Test Suite, specifically reset_db() 74 | 75 | Date: 2021-06-24 76 | 77 | ## Status 78 | 79 | Accepted 80 | 81 | ## Context 82 | 83 | If all of the tables are not initialized, the 'reset_db' function attempts to delete all of the tables and reinitialize everything.
Because geodatagov requires the postgis tables, which have a complicated initialization, the ckan function doesn't support their maintenance (the current code doesn't support it; that doesn't mean it can't or won't). This is the [logic](https://github.com/ckan/ckan/blob/e2d9d1610e63d2256739a09ba2a18e59a29a45db/ckan/model/__init__.py#L225-L236) that breaks it. Either way, if reset_db() is called too early, the postgis tables will be deleted and will break the code. If reset_db() is called too late, the db can't initialize and the code breaks. 84 | 85 | ## Decision 86 | 87 | Implement two customizations. 88 | - https://github.com/GSA/ckanext-geodatagov/pull/190/commits/627a8ad689d50b446527ea39ff4b9290203929a9 89 | - https://github.com/GSA/ckanext-geodatagov/pull/190/commits/8e34ee0164ac1ce454d4c8944ee5fbc5d025b2ed 90 | 91 | ## Consequences 92 | 93 | - Consequences unknown. 94 | - If the commands called in test_category_tags.py are called anywhere else, the tests fail. 95 | - If the commands are repeated in multiple files, the tests fail. 96 | - If any test needs to be run in isolation, the test_category_tags.py test needs to precede it; otherwise the independent test will fail. 97 | 98 | 99 | # 5. Track PY2 pip requirements separately from PY3 100 | 101 | Date: 2021-07-08 102 | 103 | ## Status 104 | 105 | Accepted 106 | 107 | ## Context 108 | 109 | There are a few libraries that either operate differently in py2 and py3 or have different levels of support for py2 and py3, so we needed to track two separate versions. 110 | 111 | PY2: 112 | - https://github.com/asl2/PyZ3950.git#egg=PyZ3950 113 | - OWSLib == 0.8.6 114 | - pyproj 1.9.6 115 | - factory-boy==2.1.1 116 | - werkzeug (no customization; it installed based on other dependencies) 117 | 118 | PY3: 119 | - https://github.com/danizen/PyZ3950.git#egg=PyZ3950 120 | - OWSLib >= 0.18.0 121 | - pyproj 2.6.1 122 | - factory-boy==2.12.0 123 | - https://github.com/nickumia-reisys/werkzeug@e1f6527604ab30e4b46b5430a5fb97e7a7055cd7#egg=werkzeug 124 | 125 | The PY3 upgrades for ckanext-harvest and ckanext-spatial had small bugs that were submitted as PRs upstream; until they are accepted, the local changes need to be tracked. 126 | - https://github.com/nickumia-reisys/ckanext-harvest.git@9d1f647d247c16b6c3acba26e321e9500cafb18c#egg=ckanext-harvest 127 | - https://github.com/GSA/ckanext-spatial.git@93c430ffc36ba7e306652fd511efd0d1e7081381#egg=ckanext-spatial 128 | 129 | ## Decision 130 | 131 | See [commit](https://github.com/GSA/ckanext-geodatagov/pull/190/commits/0cbd146d286fc1467fd2f3fba4800f7ba66b76ce) 132 | 133 | ## Consequences 134 | 135 | - A lot of specificity 136 | 137 | 138 | # 6. Remove csw harvester tests 139 | 140 | Date: 2021-07-16 141 | 142 | ## Status 143 | 144 | Accepted 145 | 146 | ## Context 147 | 148 | We don't have any customizations to the csw harvesting capability, so we no longer need to test our unique cases. 149 | 150 | ## Decision 151 | 152 | Remove [tests](https://github.com/GSA/ckanext-geodatagov/pull/190/commits/18927273785a8b2f06939c259f909c0d1ae36faf). 153 | 154 | ## Consequences 155 | 156 | - Neither ckanext-spatial nor ckanext-harvest tests csw harvesting, so there are missing tests overall. 157 | 158 | 159 | # 7. Rewrite source form test 160 | 161 | Date: 2021-07-19 162 | 163 | ## Status 164 | 165 | Unreviewed 166 | 167 | ## Context 168 | 169 | The CKAN test suite no longer supports forms in web pages, so custom parsing needs to be done to extract form options and data. A minimal sketch of the idea, using only the standard library parser (hypothetical code, not the project's actual helper), looks like this:
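```python
# Hypothetical sketch, not the project's actual helper: collect the
# value of every <option> tag from a rendered harvest-source form page.
from html.parser import HTMLParser


class OptionCollector(HTMLParser):

    def __init__(self):
        super().__init__()
        self.values = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs
        if tag == "option":
            self.values.append(dict(attrs).get("value"))


parser = OptionCollector()
parser.feed('<select name="profile"><option value="iso19139ngdc">ISO</option></select>')
assert parser.values == ["iso19139ngdc"]
```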
The new tests leverage [this](https://docs.python.org/3/library/html.parser.html). The CKAN test suite changed the return type of the test app from [2.8](https://github.com/ckan/ckan/blob/2.8/ckan/tests/helpers.py#L147-L159) to [2.9](https://github.com/ckan/ckan/blob/2.9/ckan/tests/helpers.py#L194-L240). 170 | 171 | ## Decision 172 | 173 | Write [custom test functions](https://github.com/GSA/ckanext-geodatagov/pull/190/commits/18927273785a8b2f06939c259f909c0d1ae36faf). 174 | 175 | ## Consequences 176 | 177 | - Neither ckanext-spatial nor ckanext-harvest tests csw harvesting, so there are missing tests overall. 178 | 179 | 180 | # 8. Remove test_source_form test 181 | 182 | Date: 2022-12-12 183 | 184 | ## Status 185 | 186 | Unreviewed 187 | 188 | ## Context 189 | 190 | The test was trying to create a harvest source with a post request to `/harvest/new`; however, we suspect something in ckanext-harvest changed and broke this functionality. Since we are doing harvest tests in catalog.data.gov, we thought it was acceptable to remove this test altogether. 191 | 192 | ## Decision 193 | 194 | Remove the test. 195 | 196 | ## Consequences 197 | 198 | - Fewer tests? 199 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-012-2002/fgdc-std-012-2002-sect05.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | Function converting set of values on one scale to another. 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | A function in successive powers of the independent variable, or the ratio of such functions, used in a transformation, one example of which is scaling, derivation of a set of values on one scale or coordinate system from the value in another, in the sense derived value = polynomial (initial value). 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | The polynomial function when not a ratio, and the dividend of the ratio when it is. 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | The number of nonzero terms in the numerator of the polynomial. 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | The divisor of a polynomial function that is a ratio. (<i>Note: if absent, assumed equal to 1.</i>) 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | The number of nonzero terms in the denominator of the polynomial. 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | The coefficient of one term in the numerator or denominator of a polynomial function.(<i>Note: For a polynomial numerator or denominator of order m, there will be m+1 coefficients. Any of these coefficients, except that of the m power term, may be zero. When the function is linear, the coefficient of the zero-power term is the offset and the coefficient of the first power term is the scale factor.</i>) 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | Text description of the function used to derive a set of values on one scale from their value in another, using a function that is not a polynomial.
140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-001.2-2001/fgdc-std-001.2-2001.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | Federal Geographic Data Committee's Shoreline Metadata Profile of the Content Standard for Digital Geospatial Metadata (FGDC-STD-001.2-2001), June 2001 version of the standard. Data about the content, quality, condition, and other characteristics of data. 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-012-2002/fgdc-std-012-2002-locainfo.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Information about the location of a set of one or more points.(<i>Note: this section provides a means of describing position in a coordinate system relevant to the calling element and is used by other sections of the metadata extensions. This section is never used alone. It differs from the Spatial Reference Information in that it provides positions in a coordinate system relevant to metadata elements, whereas the Spatial Reference Information refers only to positions at which the data are located.</i>) 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | Number of coordinate positions. 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | Definition of axes of coordinate system in which location of positions is provided. 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | Coordinate system which is not georeferenced and for which georeferencing information is unavailable or irrelevant. 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | Coordinate system that can be georeferenced. 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | Physical dimension corresponding to value of unity in x and y coordinate directions as defined in Coordinate System or referencing element, where the coordinates correspond to physical dimensions. 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | Physical dimension corresponding to value of unity in z coordinate directions Coordinate System or referencing element, where the coordinates correspond to physical dimensions. 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | Location of a coordinate point described by the referencing element. 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | Location of point along x-axis. 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | Location of point along y-axis. 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | Location of point along z-axis. 
150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_relink.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import datetime 4 | import pytest 5 | 6 | from ckan.common import config 7 | from ckan.lib.search.common import make_connection 8 | import ckan.model as model 9 | from ckanext.geodatagov.rebuild import rebuild 10 | from ckan.tests import factories 11 | from click.testing import CliRunner 12 | from ckanext.harvest.model import HarvestObject 13 | from ckanext.harvest.tests import factories as harvest_factories 14 | from ckanext.harvest.logic import HarvestJobExists 15 | 16 | import ckanext.geodatagov.cli as cli 17 | 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class TestRelink(object): 23 | 24 | @classmethod 25 | def setup_class(cls): 26 | organization = factories.Organization() 27 | # create two harvest sources 28 | cls.source1 = harvest_factories.HarvestSourceObj( 29 | url="http://test1", 30 | name="test-ho-id1", 31 | title="Test relink 1", 32 | source_type="ckan", 33 | frequency="MANUAL" 34 | ) 35 | cls.source2 = harvest_factories.HarvestSourceObj( 36 | url="http://test2", 37 | name="test-ho-id2", 38 | title="Test relink 2", 39 | source_type="ckan", 40 | frequency="MANUAL" 41 | ) 42 | 43 | # dataset 1 is for source 1 44 | cls.dataset1 = factories.Dataset(owner_org=organization["id"]) 45 | # with false hoid1 and true hoid2 46 | cls.dataset1_hoid1 = HarvestObject( 47 | package_id=cls.dataset1['id'], 48 | job=create_harvest_job(cls.source1), 49 | import_finished=datetime.datetime.utcnow(), 50 | state='COMPLETE', 51 | report_status='', 52 | current=False 53 | ) 54 | cls.dataset1_hoid2 = HarvestObject( 55 | package_id=cls.dataset1['id'], 56 | job=create_harvest_job(cls.source2), 57 | import_finished=datetime.datetime.utcnow(), 58 | state='COMPLETE', 59 | current=True 60 | ) 61 | cls.dataset1_hoid1.save() 62 | cls.dataset1_hoid2.save() 63 | 64 | # dataset 2 is for source 2 65 | cls.dataset2 = factories.Dataset(owner_org=organization["id"]) 66 | # with false hoid1 and true hoid2 67 | cls.dataset2_hoid1 = HarvestObject( 68 | package_id=cls.dataset2['id'], 69 | job=create_harvest_job(cls.source2), 70 | import_finished=datetime.datetime.utcnow(), 71 | state='COMPLETE', 72 | report_status='', 73 | current=False 74 | ) 75 | cls.dataset2_hoid2 = HarvestObject( 76 | package_id=cls.dataset2['id'], 77 | job=create_harvest_job(cls.source2), 78 | import_finished=datetime.datetime.utcnow(), 79 | state='COMPLETE', 80 | current=True 81 | ) 82 | cls.dataset2_hoid1.save() 83 | cls.dataset2_hoid2.save() 84 | 85 | rebuild() 86 | 87 | # check solr is using the current=True harvest object hoid2 88 | assert get_solr_hoid(cls.dataset1['id']) == cls.dataset1_hoid2.id 89 | assert get_solr_hoid(cls.dataset2['id']) == cls.dataset2_hoid2.id 90 | 91 | # make all harvest objects current=False, but hoid1 with newer import_finished 92 | cls.dataset1_hoid1.current = False 93 | cls.dataset1_hoid1.import_finished = datetime.datetime.utcnow() 94 | cls.dataset1_hoid1.save() 95 | cls.dataset1_hoid2.current = False 96 | cls.dataset1_hoid2.save() 97 | 98 | cls.dataset2_hoid1.current = False 99 | cls.dataset2_hoid1.import_finished = datetime.datetime.utcnow() 100 | cls.dataset2_hoid1.save() 101 | cls.dataset2_hoid2.current = False 102 | cls.dataset2_hoid2.save() 103 | 104 | @pytest.fixture 105 | def cli_result_source1(self): 106 | 
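"""Invoke the harvest_object_relink CLI for source1 only and return the raw click result."""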
runner = CliRunner() 107 | raw_cli_output = runner.invoke( 108 | cli.harvest_object_relink, 109 | args=[self.source1.id], 110 | ) 111 | 112 | return raw_cli_output 113 | 114 | @pytest.fixture 115 | def cli_result_all(self): 116 | runner = CliRunner() 117 | raw_cli_output = runner.invoke( 118 | cli.harvest_object_relink, 119 | args=[], 120 | ) 121 | 122 | return raw_cli_output 123 | 124 | @pytest.mark.order1 125 | def test_relink_source1(self, cli_result_source1): 126 | """run harvest_object_relink and analyze results""" 127 | # check successful cli run 128 | assert cli_result_source1.exit_code == 0 129 | 130 | # check harvest object with newer import_finished is now current 131 | assert get_hoid_current(self.dataset1_hoid1.id) is True 132 | assert get_hoid_current(self.dataset1_hoid2.id) is False 133 | 134 | # check that solr has current harvest object for source1 dataset 135 | assert get_solr_hoid(self.dataset1['id']) == self.dataset1_hoid1.id 136 | 137 | # check that solr has not changed for source2 dataset 138 | assert get_solr_hoid(self.dataset2['id']) == self.dataset2_hoid2.id 139 | 140 | @pytest.mark.order2 141 | def test_relink_all(self, cli_result_all): 142 | """run harvest_object_relink and analyze results""" 143 | # check successful cli run 144 | assert cli_result_all.exit_code == 0 145 | 146 | # check harvest object with newer import_finished is now current 147 | assert get_hoid_current(self.dataset2_hoid1.id) is True 148 | assert get_hoid_current(self.dataset2_hoid2.id) is False 149 | 150 | # check that solr has current harvest object for both sources' datasets 151 | assert get_solr_hoid(self.dataset1['id']) == self.dataset1_hoid1.id 152 | assert get_solr_hoid(self.dataset2['id']) == self.dataset2_hoid1.id 153 | 154 | 155 | def get_hoid_current(id): 156 | """ 157 | Return the current value for a particular harvest object in DB. 158 | """ 159 | return model.Session.query( 160 | HarvestObject.current).filter(HarvestObject.id == id).first()[0] 161 | 162 | 163 | def get_solr_hoid(id): 164 | """ 165 | Return the harvest_object_id for a particular package id in Solr. 
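Queries Solr directly (via make_connection) and reads the harvest_object_id extra out of the indexed validated_data_dict.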
166 | """ 167 | query = "*:*" 168 | fq = "+site_id:\"%s\" " % config.get('ckan.site_id') 169 | fq += "+state:active " 170 | fq += "+id:%s" % (id) 171 | 172 | conn = make_connection() 173 | data = conn.search(query, fq=fq, rows=10, fl='validated_data_dict') 174 | 175 | harvest_object_id = None 176 | if data.docs: 177 | data_dict = json.loads(data.docs[0].get("validated_data_dict")) 178 | for extra in data_dict.get("extras", []): 179 | if extra["key"] == "harvest_object_id": 180 | harvest_object_id = extra["value"] 181 | break 182 | 183 | return harvest_object_id 184 | 185 | 186 | def create_harvest_job(source): 187 | """ 188 | Create a fictitious harvest job object and return it 189 | """ 190 | try: 191 | job = harvest_factories.HarvestJobObj(source=source) 192 | except HarvestJobExists: # not sure why 193 | job = source.get_jobs()[0] 194 | 195 | job.save() 196 | 197 | return job 198 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-012-2002/fgdc-std-001-1998-sect05.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-012-2002/fgdc-std-001-1998-sect09.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Information about the date and time of an event. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | Means of encoding a single date and time. 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | The year (and optionally month, or month and day). 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | The hour (and optionally minute, or minute and second) of the day. 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | Means of encoding multiple individual dates and times. 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | Means of encoding a range of dates and times. 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | The first year (and optionally month, or month and day) of the event. 
109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | The first hour (and optionally minute, or minute and second) of the day for the event. 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | The last year (and optionally month, or month and day) for the event. 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | The last hour (and optionally minute, or minute and second) of the day for the event. 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /ckanext/geodatagov/tests/test_waf-collection.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import pytest 4 | 5 | from ckan import model 6 | from ckanext.geodatagov.harvesters.waf_collection import WAFCollectionHarvester 7 | from ckanext.spatial.validation import all_validators 8 | import ckanext.harvest.model as harvest_model 9 | from ckan.tests.factories import Organization 10 | from ckan.tests.helpers import call_action 11 | 12 | from factories import HarvestJobObj, WafCollectionHarvestSourceObj 13 | from utils import PORT, reset_db_and_solr 14 | 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | @pytest.mark.usefixtures("with_plugins") 20 | class TestWafCollectionHarvester(object): 21 | 22 | def setup_method(self): 23 | reset_db_and_solr() 24 | 25 | self.organization = Organization() 26 | 27 | def run_gather(self, url, source_config): 28 | 29 | sc = json.loads(source_config) 30 | existing_profiles = [v.name for v in all_validators] 31 | log.info('Existing validator profiles: {}'.format(existing_profiles)) 32 | source = WafCollectionHarvestSourceObj(url=url, 33 | owner_org=self.organization['id'], 34 | # config=source_config, 35 | **sc) 36 | self.job = HarvestJobObj(source=source) 37 | 38 | self.harvester = WAFCollectionHarvester() 39 | 40 | # gather stage 41 | log.info('GATHERING %s', url) 42 | obj_ids = self.harvester.gather_stage(self.job) 43 | log.info('job.gather_errors=%s', self.job.gather_errors) 44 | if len(self.job.gather_errors) > 0: 45 | raise Exception(self.job.gather_errors[0]) 46 | 47 | log.info('obj_ids=%s', obj_ids) 48 | if obj_ids is None or len(obj_ids) == 0: 49 | # nothing to see 50 | return 51 | 52 | self.harvest_objects = [] 53 | for obj_id in obj_ids: 54 | harvest_object = harvest_model.HarvestObject.get(obj_id) 55 | log.info('ho guid=%s', harvest_object.guid) 56 | log.info('ho content=%s', harvest_object.content) 57 | self.harvest_objects.append(harvest_object) 58 | 59 | # this is a list of harvestObjects IDs. 
One for each dataset 60 | return obj_ids 61 | 62 | def run_fetch(self): 63 | # fetch stage 64 | for harvest_object in self.harvest_objects: 65 | log.info('FETCHING %s' % harvest_object.id) 66 | result = self.harvester.fetch_stage(harvest_object) 67 | 68 | log.info('ho errors=%s', harvest_object.errors) 69 | log.info('result 1=%s', result) 70 | if len(harvest_object.errors) > 0: 71 | raise Exception(harvest_object.errors[0]) 72 | 73 | def run_import(self): 74 | # import stage 75 | datasets = [] 76 | for harvest_object in self.harvest_objects: 77 | log.info('IMPORTING %s' % harvest_object.id) 78 | result = self.harvester.import_stage(harvest_object) 79 | 80 | log.info('ho errors 2=%s', harvest_object.errors) 81 | log.info('result 2=%s', result) 82 | if len(harvest_object.errors) > 0: 83 | raise Exception(harvest_object.errors[0]) 84 | 85 | log.info('ho pkg id=%s', harvest_object.package_id) 86 | dataset = model.Package.get(harvest_object.package_id) 87 | datasets.append(dataset) 88 | log.info('dataset name=%s', dataset.name) 89 | 90 | return datasets 91 | 92 | def get_datasets_from_waf_collection1_sample(self): 93 | """ harvest waf-collection1/ folder as waf-collection source """ 94 | url = f'http://127.0.0.1:{PORT}/waf-collection1/index.html' 95 | 96 | collection_metadata = f"http://127.0.0.1:{PORT}/waf-collection1/cfg/SeriesCollection_tl_2013_county.shp.iso.xml" 97 | config = '{"collection_metadata_url": "%s", "validator_profiles": ["iso19139ngdc"], "private_datasets": false}' %\ 98 | collection_metadata 99 | self.run_gather(url=url, source_config=config) 100 | self.run_fetch() 101 | datasets = self.run_import() 102 | self.job.status = 'Finished' 103 | self.job.save() 104 | 105 | return datasets 106 | 107 | def test_waf_collection1_datasets_count(self): 108 | """ Get datasets from waf-collection1/ folder as waf-collection source 109 | and test we have one dataset with the expected name """ 110 | 111 | datasets = self.get_datasets_from_waf_collection1_sample() 112 | assert len(datasets) == 1 113 | dataset = datasets[0] 114 | assert dataset.name == 'tiger-line-shapefile-2013-nation-u-s-current-county-and-equivalent-national-shapefile' 115 | 116 | def test_waf_collection1_datasets_as_child(self): 117 | """ Harvest waf-collection1/ folder as waf-collection source 118 | and test we get one dataset and this dataset is a "child" (it has a "collection_package_id" extra) 119 | and is not a "parent" (it does not include the collection_metadata extra) """ 120 | 121 | datasets = self.get_datasets_from_waf_collection1_sample() 122 | dataset = datasets[0] 123 | 124 | extras = json.loads(dataset.extras['extras_rollup']) 125 | print(f'extras: {extras}') 126 | keys = list(extras.keys()) 127 | assert 'collection_package_id' in keys 128 | assert 'collection_metadata' not in keys 129 | 130 | def test_waf_collection1_parent_exists(self): 131 | """ Harvest waf-collection1/ folder as waf-collection source 132 | and test parent dataset exists (includes the collection_metadata=true extra) """ 133 | 134 | datasets = self.get_datasets_from_waf_collection1_sample() 135 | dataset = datasets[0] 136 | extras = json.loads(dataset.extras['extras_rollup']) 137 | 138 | parent = call_action('package_show', context={'user': 'dummy'}, id=extras['collection_package_id']) 139 | parent_keys = [extra['key'] for extra in parent['extras']] 140 | assert 'collection_metadata' in parent_keys 141 | assert 'true' == [extra['value'] for extra in parent['extras'] if extra['key'] == 'collection_metadata'][0] 142 | 143 | def
test_waf_collection1_parent_title(self): 144 | """ Harvest waf-collection1/ folder as waf-collection source 145 | and test parent dataset has the expected title and name """ 146 | 147 | datasets = self.get_datasets_from_waf_collection1_sample() 148 | dataset = datasets[0] 149 | extras = json.loads(dataset.extras['extras_rollup']) 150 | 151 | parent = call_action('package_show', context={'user': 'dummy'}, id=extras['collection_package_id']) 152 | 153 | assert parent['title'] == ('TIGER/Line Shapefile, 2013, ' 154 | 'Series Information File for the Current county and Equivalent National Shapefile') 155 | assert parent['name'] == ('tiger-line-shapefile-2013-' 156 | 'series-information-file-for-the-current-county-and-equivalent-nationa') 157 | 158 | def test_waf_collection_transformation_failed(self): 159 | url = f'http://127.0.0.1:{PORT}/waf-collection2/index.html' 160 | 161 | collection_metadata = f"http://127.0.0.1:{PORT}/waf-collection2/cfg/SeriesCollection_tl_2013_county.shp.iso.xml" 162 | config = '{"collection_metadata_url": "%s", "validator_profiles": ["iso19139ngdc"], "private_datasets": false}' %\ 163 | collection_metadata 164 | self.run_gather(url=url, source_config=config) 165 | 166 | self.run_fetch() 167 | 168 | # we don't handle ISO 19110 169 | with pytest.raises(Exception) as e: 170 | self.run_import() 171 | assert 'Transformation to ISO failed' in str(e.value) 172 | -------------------------------------------------------------------------------- /ckanext/geodatagov/validation/xml/fgdc-std-001.2-2001/fgdc-std-001.2-2001-sect09.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Information about the date and time of an event. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | Means of encoding a single date and time. 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | The year (and optionally month, or month and day). 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | The hour and minute, and (optionally second) of the day. 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | Means of encoding multiple individual dates and times. 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | Means of encoding a range of dates and times. 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | The first year (and optionally month, or month and day) of the event. 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | The first hour and minute, or (optionally second) of the day for the event. 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | The last year (and optionally month, or month and day) for the event. 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | The last hour and minute, or (optionally second) of the day for the event. 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | --------------------------------------------------------------------------------