├── results └── .gitempty ├── data └── .gitignore ├── aws-scripts ├── unittests ├── composer_install └── install_dependencies ├── backup └── readme.md ├── tests ├── bootstrap.php └── Base.php ├── .gitignore ├── .travis.yml ├── docker-compose.yml ├── .circleci └── config.yml ├── src └── CKAN │ └── Manager │ ├── Adapters │ ├── FilePutContentsWrapper.php │ └── FileGetContentsWrapper.php │ ├── Dataset.php │ └── ExploreApi.php ├── cli ├── epa-gov_process │ ├── 2_find_matches.php │ ├── README.md │ ├── 1_export_everything.php │ ├── 3_rename_datasets.php │ ├── 4_assign_groups_and_tags.php │ ├── compare_qa_vs_prod_epa.php │ └── __compare_json_vs_prod_epa.php ├── doc-gov_process │ ├── 3_find_matches.php │ ├── 1_export_everything.php │ ├── 4_add_legacy_dms_and_make_private.php │ ├── 2_compare_prod_vs_prod.php │ ├── 0_compare_prod_vs_uat.php │ └── 5_compare_prod_vs_qa.php ├── nrc-gov_process │ ├── 2_find_matches.php │ ├── 1_export_everything.php │ └── compare_prod_vs_uat_nrc.php ├── tools │ ├── find_matches_one_file.php │ ├── find_matches_separate_files.php │ ├── diff.php │ ├── organizations_json_to_csv.php │ └── convert_json_to_csv.php ├── harvest_stats_csv.php ├── cleanup_tags_of_datasets_by_topic.php ├── dev_test.php ├── faa-gov │ └── export_faa.php ├── check_staging_vs_prod.php ├── active_users.php ├── mark_source_datajson_by_identifier.php ├── organization_purge.php ├── inventory │ └── redacted_stats.php ├── orphaned_tags_seeker.php ├── organization_patch.php ├── organizations_stats.php ├── export_datasets_by_topic_with_tags.php ├── resource_create.php ├── export_private_datasets.php ├── export_resource_list.php ├── search_by_topics_csv.php ├── search_by_terms_csv.php ├── update_harvest.php ├── breakdown_by_group.php ├── export.php ├── search_by_organizations_csv.php ├── find_socrata_txt_pairs.php ├── interactive_in_catalog_resources.php ├── reorganize_datasets.php ├── export_tracking_by_org.php ├── export_orgs.php ├── export_packages_by_org_with_tagging.php ├── doj-gov │ 
└── 1_export_everything.php ├── archive_dataset_list.php ├── pbgc-gov │ ├── 1_export_everything.php │ └── compare_uat_vs_prod_pbgc.php ├── noaa-gov │ ├── 1_export_everything.php │ ├── compare_qa_vs_prod_noaa.php │ └── compare_uat_vs_prod_noaa.php ├── fix_modified_inventory.php ├── add_legacy_dms_and_make_private.php ├── mark_as_private.php ├── search_by_titles_csv.php ├── rename.php ├── tagging │ ├── remove_groups_and_tags.php │ ├── brother_assign.php │ ├── generate_brothers_assign_csv.php │ └── assign_groups_and_tags.php ├── undelete_datasets.php ├── rename_then_mark_public.php ├── update_modified_date.php ├── add_license_url.php ├── delete_datasets.php ├── add_resource_to_dataset.php ├── export_orgs_full.php ├── rename_then_delete.php ├── update_organization.php ├── rename_then_mark_private.php ├── restore_script.php ├── update_extra_fields.php ├── export_by_list.php ├── update_field.php ├── export_full_by_list.php ├── socrata_log_redirects.php ├── compare_prod_vs_prod.php ├── export_short.php ├── compare_prod_vs_uat.php ├── recheck_socrata_redirects.php ├── compare_basic.php ├── check_aapi.php └── ntsb-gov_process │ └── compare_uat_vs_prod_ntsb.php ├── phpunit.xml ├── appspec.yml ├── inc ├── config.sample.php └── common.php ├── .editorconfig ├── docker ├── install-composer.sh └── Dockerfile ├── composer.json └── README.md /results/.gitempty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.log 3 | *.csv -------------------------------------------------------------------------------- /aws-scripts/unittests: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /var/www/html 3 | phpunit 4 | 5 | -------------------------------------------------------------------------------- 
/aws-scripts/composer_install: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /var/www/html 3 | composer install -n 4 | -------------------------------------------------------------------------------- /backup/readme.md: -------------------------------------------------------------------------------- 1 | Place json backups here for each organization. They should use the same name (URL slug) from CKAN, eg ocsit-gsa-gov.json -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 13 | 14 | $CkanManager->findMatches(); 15 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 9 | 10 | 11 | ./tests/ 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /appspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.0 2 | os: linux 3 | files: 4 | - source: / 5 | destination: /var/www/html/ 6 | hooks: 7 | BeforeInstall: 8 | - location: aws-scripts/install_dependencies 9 | timeout: 300 10 | runas: root 11 | AfterInstall: 12 | - location: aws-scripts/composer_install 13 | timeout: 300 14 | runas: codedeployuser 15 | - location: aws-scripts/unittests 16 | timeout: 3600 17 | runas: codedeployuser 18 | -------------------------------------------------------------------------------- /cli/doc-gov_process/3_find_matches.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 13 | 14 | $CkanManager->findMatchesByAgency('doc'); 15 | -------------------------------------------------------------------------------- /cli/nrc-gov_process/2_find_matches.php: 
-------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 13 | 14 | $CkanManager->findMatchesByAgency('nrc'); 15 | -------------------------------------------------------------------------------- /cli/tools/find_matches_one_file.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 13 | 14 | $CkanManager->findMatchesOneFile(); 15 | -------------------------------------------------------------------------------- /cli/tools/find_matches_separate_files.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 13 | 14 | $CkanManager->findMatchesSeparateFiles(); 15 | -------------------------------------------------------------------------------- /cli/tools/diff.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 19 | $CkanManager->harvestStats(); 20 | 21 | // show running time on finish 22 | timer(); 23 | -------------------------------------------------------------------------------- /inc/config.sample.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 19 | 20 | $topicTitle = 'ecosystems0617'; 21 | $CkanManager->cleanUpTagsByTopic($topicTitle); 22 | 23 | // show running time on finish 24 | timer(); 25 | -------------------------------------------------------------------------------- /cli/dev_test.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 24 | $CkanManager->test_dev(); 25 | 26 | // show running time on finish 27 | timer(); 28 | -------------------------------------------------------------------------------- /cli/tools/organizations_json_to_csv.php: -------------------------------------------------------------------------------- 1 | writeRow([ 14 | // 'from', 15 | // 'to' 16 | // ]); 17 | 18 | 
foreach ($json['result'] as $organization) { 19 | $writer->writeRow([$organization['name']]); 20 | } 21 | } 22 | 23 | // show running time on finish 24 | timer(); 25 | -------------------------------------------------------------------------------- /docker/install-composer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # https://getcomposer.org/doc/faqs/how-to-install-composer-programmatically.md 3 | 4 | set -eu 5 | 6 | EXPECTED_SIGNATURE="$(wget -q -O - https://composer.github.io/installer.sig)" 7 | php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');" 8 | ACTUAL_SIGNATURE="$(php -r "echo hash_file('sha384', 'composer-setup.php');")" 9 | 10 | if [ "$EXPECTED_SIGNATURE" != "$ACTUAL_SIGNATURE" ] 11 | then 12 | >&2 echo 'ERROR: Invalid installer signature' 13 | rm composer-setup.php 14 | exit 1 15 | fi 16 | 17 | php composer-setup.php --quiet 18 | RESULT=$? 19 | rm composer-setup.php 20 | 21 | # next two lines are local to our docker setup 22 | mv composer.phar /usr/local/bin/composer 23 | chmod +x /usr/local/bin/composer 24 | 25 | exit $RESULT -------------------------------------------------------------------------------- /cli/tools/convert_json_to_csv.php: -------------------------------------------------------------------------------- 1 | writeRow([ 14 | 'from', 15 | 'to' 16 | ]); 17 | 18 | foreach ($dataset_names['name'] as $name => $count) { 19 | $newName = preg_replace("/^deleted-/", '', $name); 20 | $writer->writeRow([ 21 | $name, 22 | $newName 23 | ]); 24 | } 25 | } 26 | 27 | // show running time on finish 28 | timer(); 29 | -------------------------------------------------------------------------------- /cli/faa-gov/export_faa.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 20 | 21 | $brief = $CkanManager->exportShort('organization:dot-gov AND (dataset_type:dataset) AND publisher:"Federal Aviation 
Administration"'); 22 | 23 | $headers = array_keys($brief[array_keys($brief)[0]]); 24 | $csv->writeRow($headers); 25 | $csv->writeFromArray($brief); 26 | 27 | // show running time on finish 28 | timer(); 29 | -------------------------------------------------------------------------------- /cli/check_staging_vs_prod.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 23 | $CkanManagerProduction->resultsDir = $results_dir; 24 | 25 | $groups = $CkanManagerStaging->groupsArray(); 26 | 27 | foreach ($groups as $category) { 28 | $CkanManagerStaging->checkGroupAgainstProd($category, $CkanManagerProduction); 29 | } 30 | 31 | // show running time on finish 32 | timer(); 33 | -------------------------------------------------------------------------------- /cli/active_users.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 33 | 34 | $CkanManager->activeUsers(); 35 | 36 | // show running time on finish 37 | timer(); 38 | -------------------------------------------------------------------------------- /cli/mark_source_datajson_by_identifier.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 24 | 25 | $CkanManager->tagByExtraField('identifier', 'source_datajson_identifier'); 26 | 27 | // show running time on finish 28 | timer(); 29 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gsa/ckan-php-manager", 3 | "description": "CKAN php manager by GSA", 4 | "minimum-stability": "dev", 5 | "license": "GPL-3.0+", 6 | "authors": [ 7 | { 8 | "name": "Alex Perfilov", 9 | "email": "alexandr.perfilov@reisystems.com", 10 | "role": "Developer" 11 | } 12 | ], 13 | "autoload": { 14 | "psr-0": { 15 | "CKAN\\Manager\\": "src/" 16 | } 17 | }, 18 | 
"repositories": [ 19 | { 20 | "type": "git", 21 | "url": "https://github.com/GSA/ckan-php-client.git" 22 | } 23 | ], 24 | "prefer-stable": true, 25 | "require": { 26 | "php": "^7.0", 27 | "ext-json": "*", 28 | "kevinlebrun/colors.php": "~1", 29 | "gsa/ckan-php-client": "dev-master", 30 | "jwage/easy-csv": "~0" 31 | }, 32 | "require-dev": { 33 | "doctrine/instantiator": "1.0.5", 34 | "phpunit/phpunit": "~6" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /cli/organization_purge.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 35 | 36 | $CkanManager->purgeOrganization('fs-fed-us'); 37 | 38 | // show running time on finish 39 | timer(); 40 | -------------------------------------------------------------------------------- /cli/inventory/redacted_stats.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 23 | 24 | $organization_list = $CkanManager->organization_list(true); 25 | //foreach ($organization_list as $organization) { 26 | // $members = $CkanManager-> 27 | //} 28 | 29 | var_dump($organization_list); 30 | // 31 | //$headers = array_keys($brief[array_keys($brief)[0]]); 32 | //$csv->writeRow($headers); 33 | //$csv->writeFromArray($brief); 34 | 35 | // show running time on finish 36 | timer(); 37 | -------------------------------------------------------------------------------- /cli/orphaned_tags_seeker.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 33 | 34 | $CkanManager->orphanedTagsSeek($limit, $start); 35 | 36 | // show running time on finish 37 | timer(); 38 | -------------------------------------------------------------------------------- /tests/Base.php: -------------------------------------------------------------------------------- 1 | reflection = new \ReflectionClass($this->testClass); 12 | } 13 | 14 | 
public function getMethod($method) 15 | { 16 | $method = $this->reflection->getMethod($method); 17 | $method->setAccessible(true); 18 | 19 | return $method; 20 | } 21 | 22 | public function getProperty($property) 23 | { 24 | $property = $this->reflection->getProperty($property); 25 | $property->setAccessible(true); 26 | 27 | return $property->getValue($this->testClass); 28 | } 29 | 30 | public function setProperty($property, $value) 31 | { 32 | $property = $this->reflection->getProperty($property); 33 | $property->setAccessible(true); 34 | $property->setValue($this->testClass, $value); 35 | 36 | return; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /cli/organization_patch.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 35 | 36 | $fields = array( 37 | 'name' => 'fs-fed-us-legacy' 38 | ); 39 | 40 | $CkanManager->patchOrganization('fs-fed-us', $fields); 41 | 42 | // show running time on finish 43 | timer(); 44 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:latest 2 | 3 | RUN apk --no-cache upgrade 4 | RUN apk add --no-cache apache2 \ 5 | bash \ 6 | curl \ 7 | git \ 8 | jq \ 9 | mariadb \ 10 | openrc \ 11 | php7 \ 12 | php7-apache2 \ 13 | php7-curl \ 14 | php7-iconv \ 15 | php7-json \ 16 | php7-mbstring \ 17 | php7-mysqli \ 18 | php7-openssl \ 19 | php7-pcntl \ 20 | php7-pdo \ 21 | php7-phar \ 22 | php7-posix \ 23 | php7-session \ 24 | php7-simplexml \ 25 | php7-sodium \ 26 | php7-sqlite3 \ 27 | php7-tokenizer \ 28 | php7-xml \ 29 | php7-xmlreader \ 30 | php7-xmlwriter \ 31 | php7-zlib \ 32 | wget \ 33 | zip 34 | 35 | ARG APP_DIR=/var/www/app 36 | 37 | # Install composer 38 | COPY docker/install-composer.sh /tmp/install-composer.sh 39 | RUN /tmp/install-composer.sh 40 | 41 | # Add 
composer-installed libs to path 42 | ENV PATH=/var/www/app/vendor/bin:$PATH 43 | 44 | ADD composer.json composer.lock $APP_DIR/ 45 | 46 | WORKDIR $APP_DIR 47 | RUN composer install 48 | -------------------------------------------------------------------------------- /cli/organizations_stats.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 33 | 34 | $CkanManager->organizations_stats(); 35 | 36 | if ($CkanManager->logOutput) { 37 | file_put_contents($results_dir . '/log.csv', $CkanManager->logOutput); 38 | } 39 | 40 | // show running time on finish 41 | timer(); 42 | -------------------------------------------------------------------------------- /cli/export_datasets_by_topic_with_tags.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 41 | 42 | $CkanManager->exportDatasetsWithTagsByGroup($topic); 43 | 44 | // show running time on finish 45 | timer(); 46 | -------------------------------------------------------------------------------- /cli/resource_create.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 17 | 18 | $logFile = $results_dir . 
'/_log.csv'; 19 | 20 | $CkanManager->resourceCreate([ 21 | 'package_id' => 'department-of-the-interior-enterprise-data-inventory', 22 | // 'package_id' => 'u-s-widget-manufacturing-statistics-92174', 23 | 'url' => 'http://data.doi.gov/WAF/edi.json', 24 | 'name' => 'EDI Json', 25 | 'format' => 'application/json' 26 | ]); 27 | 28 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX); 29 | //$CkanManager->logOutput = ''; 30 | 31 | // show running time on finish 32 | timer(); 33 | -------------------------------------------------------------------------------- /cli/export_private_datasets.php: -------------------------------------------------------------------------------- 1 | getTreeArray(); 36 | 37 | $CkanManager->resultsDir = $results_dir; 38 | 39 | $CkanManager->getPrivateList($termsArray, $start, $limit); 40 | 41 | // show running time on finish 42 | timer(); 43 | -------------------------------------------------------------------------------- /cli/export_resource_list.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 35 | $CkanManager->exportResourceList(); 36 | 37 | // show running time on finish 38 | timer(); 39 | -------------------------------------------------------------------------------- /cli/search_by_topics_csv.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 42 | 43 | $CkanManager->searchByTopics($topics_list); 44 | 45 | // show running time on finish 46 | timer(); 47 | -------------------------------------------------------------------------------- /cli/search_by_terms_csv.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 42 | 43 | $CkanManager->searchByTerms($keywords_list); 44 | 45 | // show running time on finish 46 | timer(); 47 | -------------------------------------------------------------------------------- 
/cli/update_harvest.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 25 | 26 | $harvest_sources = file_get_contents(CKANMNGR_DATA_DIR . '/harvest_sources_automated_remainders-c.json'); 27 | $harvest_sources = json_decode($harvest_sources, true); 28 | 29 | $time = time(); 30 | $log_file = "$time.log"; 31 | 32 | foreach ($harvest_sources['result']['results'] as $harvest_source) { 33 | $CkanManager->updateHarvest($harvest_source['name'], 'frequency', 'MANUAL'); 34 | } 35 | 36 | file_put_contents($results_dir . '/' . $log_file, $CkanManager->logOutput); 37 | 38 | // show running time on finish 39 | timer(); -------------------------------------------------------------------------------- /cli/breakdown_by_group.php: -------------------------------------------------------------------------------- 1 | breakdownByGroup($csv_agencies, $csv_categories); 46 | 47 | // show running time on finish 48 | timer(); 49 | -------------------------------------------------------------------------------- /cli/export.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 46 | $CkanManager->exportDatasetsBySearch($strip_search); 47 | 48 | // show running time on finish 49 | timer(); 50 | -------------------------------------------------------------------------------- /cli/search_by_organizations_csv.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 42 | 43 | $CkanManager->searchByOrganizations($organizations_list); 44 | 45 | // show running time on finish 46 | timer(); 47 | -------------------------------------------------------------------------------- /cli/find_socrata_txt_pairs.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 26 | 27 | /** 28 | * 29 | */ 30 | define('ERROR_REPORTING', E_ALL & ~E_NOTICE); 31 | 32 | // 
https://explore.data.gov/api/views/bxfh-jivs.json 33 | /** 34 | * 35 | */ 36 | define('SOCRATA_URL', 'https://explore.data.gov/api/views/'); 37 | 38 | if (!is_readable($socrata_file_path = CKANMNGR_DATA_DIR . '/socrata.txt')) { 39 | die($socrata_file_path . ' not readable'); 40 | } 41 | 42 | $socrata_list = file_get_contents($socrata_file_path); 43 | $socrata_list = preg_replace('/[\\r\\n]+/', "\n", $socrata_list); 44 | $socrata_list = explode("\n", $socrata_list); 45 | 46 | $CkanManager->getSocrataPairs($socrata_list); 47 | 48 | // show running time on finish 49 | timer(); 50 | -------------------------------------------------------------------------------- /cli/interactive_in_catalog_resources.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 44 | 45 | $CkanManager->getInteractiveResources(); 46 | 47 | // show running time on finish 48 | timer(); 49 | -------------------------------------------------------------------------------- /cli/reorganize_datasets.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_TAG); 24 | 25 | /** 26 | * sometimes there is no parent term (ex. Department of Labor) 27 | */ 28 | if (!defined('PARENT_TERM')) { 29 | define('PARENT_TERM', '_'); 30 | } 31 | 32 | /** 33 | * Create results dir for logs 34 | */ 35 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_LEGACY_' . 
PARENT_TERM; 36 | mkdir($results_dir); 37 | 38 | $CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY); 39 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY); 40 | 41 | $CkanManager->resultsDir = $results_dir; 42 | 43 | $CkanManager->reorganizeDatasets(ORGANIZATION_TO_TAG, $termsArray, CKANMNGR_BACKUP_DIR); 44 | 45 | // show running time on finish 46 | timer(); 47 | -------------------------------------------------------------------------------- /cli/export_tracking_by_org.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 23 | 24 | /** 25 | * sometimes there is no parent term (ex. Department of Labor) 26 | */ 27 | if (!defined('PARENT_TERM')) { 28 | define('PARENT_TERM', '_'); 29 | } 30 | 31 | /** 32 | * Create results dir for logs and json results 33 | */ 34 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_TRACKING_' . PARENT_TERM; 35 | mkdir($results_dir); 36 | 37 | /** 38 | * Search for packages by terms found 39 | */ 40 | 41 | /** 42 | * Production 43 | */ 44 | $CkanManager = new CkanManager(CKAN_API_URL); 45 | 46 | /** 47 | * Staging 48 | */ 49 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 50 | 51 | $CkanManager->resultsDir = $results_dir; 52 | 53 | $CkanManager->exportTrackingByOrgTerms($termsArray); 54 | 55 | // show running time on finish 56 | timer(); 57 | -------------------------------------------------------------------------------- /cli/export_orgs.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 22 | 23 | /** 24 | * sometimes there is no parent term (ex. Department of Labor) 25 | */ 26 | if (!defined('PARENT_TERM')) { 27 | define('PARENT_TERM', '_'); 28 | } 29 | 30 | /** 31 | * Create results dir for logs and json results 32 | */ 33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . 
PARENT_TERM; 34 | mkdir($results_dir); 35 | 36 | /** 37 | * Search for packages by terms found 38 | */ 39 | 40 | /** 41 | * Production 42 | */ 43 | //$CkanManager = new CkanManager(CKAN_API_URL); 44 | $CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 45 | 46 | /** 47 | * Staging 48 | */ 49 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 50 | 51 | $CkanManager->resultsDir = $results_dir; 52 | 53 | $CkanManager->exportOrganizations($termsArray); 54 | 55 | // show running time on finish 56 | timer(); 57 | -------------------------------------------------------------------------------- /cli/nrc-gov_process/1_export_everything.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | /** 22 | * sometimes there is no parent term (ex. Department of Labor) 23 | */ 24 | if (!defined('PARENT_TERM')) { 25 | define('PARENT_TERM', '_'); 26 | } 27 | 28 | /** 29 | * Create results dir for logs and json results 30 | */ 31 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . 
PARENT_TERM; 32 | mkdir($results_dir); 33 | 34 | /** 35 | * Search for packages by terms found 36 | */ 37 | 38 | /** 39 | * Production 40 | */ 41 | $CkanManager = new CkanManager(CKAN_API_URL); 42 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 43 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 44 | 45 | /** 46 | * Staging 47 | */ 48 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 49 | 50 | $CkanManager->resultsDir = $results_dir; 51 | 52 | $CkanManager->exportPackagesByOrgTerms($termsArray); 53 | 54 | // show running time on finish 55 | timer(); 56 | -------------------------------------------------------------------------------- /cli/epa-gov_process/1_export_everything.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | /** 22 | * sometimes there is no parent term (ex. Department of Labor) 23 | */ 24 | if (!defined('PARENT_TERM')) { 25 | define('PARENT_TERM', '_'); 26 | } 27 | 28 | /** 29 | * Create results dir for logs and json results 30 | */ 31 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . 
PARENT_TERM; 32 | mkdir($results_dir); 33 | 34 | /** 35 | * Search for packages by terms found 36 | */ 37 | 38 | /** 39 | * Production 40 | */ 41 | $CkanManager = new CkanManager(CKAN_API_URL); 42 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 43 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 44 | 45 | /** 46 | * Staging 47 | */ 48 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 49 | 50 | $CkanManager->resultsDir = $results_dir; 51 | 52 | $CkanManager->exportPackagesByOrgTerms($termsArray); 53 | 54 | // show running time on finish 55 | timer(); 56 | -------------------------------------------------------------------------------- /cli/export_packages_by_org_with_tagging.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | 22 | /** 23 | * sometimes there is no parent term (ex. Department of Labor) 24 | */ 25 | if (!defined('PARENT_TERM')) { 26 | define('PARENT_TERM', '_'); 27 | } 28 | 29 | /** 30 | * Create results dir for logs and json results 31 | */ 32 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . 
PARENT_TERM; 33 | mkdir($results_dir); 34 | 35 | /** 36 | * Search for packages by terms found 37 | */ 38 | 39 | /** 40 | * Production 41 | */ 42 | $CkanManager = new CkanManager(CKAN_API_URL); 43 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 44 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 45 | 46 | /** 47 | * Staging 48 | */ 49 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 50 | 51 | $CkanManager->resultsDir = $results_dir; 52 | 53 | $CkanManager->exportPackagesByOrgTerms($termsArray); 54 | 55 | // show running time on finish 56 | timer(); 57 | -------------------------------------------------------------------------------- /cli/doc-gov_process/1_export_everything.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | /** 22 | * sometimes there is no parent term (ex. Department of Labor) 23 | */ 24 | if (!defined('PARENT_TERM')) { 25 | define('PARENT_TERM', '_'); 26 | } 27 | 28 | /** 29 | * Create results dir for logs and json results 30 | */ 31 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . 
PARENT_TERM; 32 | mkdir($results_dir); 33 | 34 | $CkanManager = new CkanManager(CKAN_API_URL); 35 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 36 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 37 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 38 | 39 | $CkanManager->resultsDir = $results_dir; 40 | 41 | /** 42 | * We are skipping noaa-gov and nist-gov within current process 43 | */ 44 | unset($termsArray['noaa-gov']); 45 | unset($termsArray['nist-gov']); 46 | 47 | $CkanManager->exportPackagesByOrgTerms($termsArray); 48 | 49 | // show running time on finish 50 | timer(); 51 | -------------------------------------------------------------------------------- /cli/doj-gov/1_export_everything.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | $termsArray = ['doj-gov' => 'Department of Justice']; 22 | 23 | /** 24 | * sometimes there is no parent term (ex. Department of Labor) 25 | */ 26 | if (!defined('PARENT_TERM')) { 27 | define('PARENT_TERM', '_'); 28 | } 29 | 30 | /** 31 | * Create results dir for logs and json results 32 | */ 33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_DOJ' . 
PARENT_TERM; 34 | mkdir($results_dir); 35 | 36 | /** 37 | * Search for packages by terms found 38 | */ 39 | 40 | /** 41 | * Production 42 | */ 43 | $CkanManager = new CkanManager(CKAN_API_URL); 44 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL); 45 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 46 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 47 | 48 | /** 49 | * Staging 50 | */ 51 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 52 | 53 | $CkanManager->resultsDir = $results_dir; 54 | 55 | $CkanManager->exportPackagesByOrgTerms($termsArray); 56 | 57 | // show running time on finish 58 | timer(); 59 | -------------------------------------------------------------------------------- /cli/archive_dataset_list.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 21 | foreach (glob(CKANMNGR_DATA_DIR . '/private*.csv') as $csv_file) { 22 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 23 | echo $status; 24 | 25 | $basename = str_replace('.csv', '', basename($csv_file)); 26 | 27 | // fix wrong END-OF-LINE 28 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 29 | 30 | file_put_contents($results_dir . '/' . $basename . 
'.log', $status, FILE_APPEND | LOCK_EX); 31 | 32 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 33 | while (true) { 34 | $row = $csv->getRow(); 35 | if (!$row) { 36 | break; 37 | } 38 | // skip headers 39 | if (in_array(strtolower($row['0']), ['dataset', 'uid', 'uuid', 'name', 'url', 'data.gov url'])) { 40 | continue; 41 | } 42 | 43 | $dataset = basename($row['0']); 44 | $CkanManager->makeDatasetPrivate($dataset, $basename); 45 | } 46 | } 47 | 48 | // show running time on finish 49 | timer(); 50 | -------------------------------------------------------------------------------- /cli/pbgc-gov/1_export_everything.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | $termsArray = ['pbgc-gov' => 'Pension Benefit Guaranty Corporation']; 22 | 23 | /** 24 | * sometimes there is no parent term (ex. Department of Labor) 25 | */ 26 | if (!defined('PARENT_TERM')) { 27 | define('PARENT_TERM', '_'); 28 | } 29 | 30 | /** 31 | * Create results dir for logs and json results 32 | */ 33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_PBGC' . 
PARENT_TERM; 34 | mkdir($results_dir); 35 | 36 | /** 37 | * Search for packages by terms found 38 | */ 39 | 40 | /** 41 | * Production 42 | */ 43 | $CkanManager = new CkanManager(CKAN_API_URL); 44 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL); 45 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 46 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 47 | 48 | /** 49 | * Staging 50 | */ 51 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 52 | 53 | $CkanManager->resultsDir = $results_dir; 54 | 55 | $CkanManager->exportPackagesByOrgTerms($termsArray); 56 | 57 | // show running time on finish 58 | timer(); 59 | -------------------------------------------------------------------------------- /cli/noaa-gov/1_export_everything.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT); 20 | 21 | $termsArray = ['noaa-gov' => 'National Oceanic and Atmospheric Administration']; 22 | 23 | /** 24 | * sometimes there is no parent term (ex. Department of Labor) 25 | */ 26 | if (!defined('PARENT_TERM')) { 27 | define('PARENT_TERM', '_'); 28 | } 29 | 30 | /** 31 | * Create results dir for logs and json results 32 | */ 33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_NOAA' . 
PARENT_TERM; 34 | mkdir($results_dir); 35 | 36 | /** 37 | * Search for packages by terms found 38 | */ 39 | 40 | /** 41 | * Production 42 | */ 43 | $CkanManager = new CkanManager(CKAN_API_URL); 44 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL); 45 | //$CkanManager = new CkanManager(CKAN_QA_API_URL); 46 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY); 47 | 48 | /** 49 | * Staging 50 | */ 51 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL); 52 | 53 | $CkanManager->resultsDir = $results_dir; 54 | 55 | $CkanManager->exportPackagesByOrgTerms($termsArray); 56 | 57 | // show running time on finish 58 | timer(); 59 | -------------------------------------------------------------------------------- /cli/fix_modified_inventory.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 18 | 19 | 20 | foreach (glob(CKANMNGR_DATA_DIR . '/metadata*.csv') as $csv_file) { 21 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 22 | echo $status; 23 | 24 | // fix wrong END-OF-LINE 25 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 26 | 27 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); 28 | 29 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 30 | while (true) { 31 | $row = $csv->getRow(); 32 | if (!$row) { 33 | break; 34 | } 35 | // skip headers 36 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 37 | continue; 38 | } 39 | 40 | // no anchors please 41 | list($dataset,) = explode('#', basename(trim($row['0']))); 42 | 43 | if (!$dataset) { 44 | continue; 45 | } 46 | 47 | $CkanManager->fixModified($dataset); 48 | file_put_contents($results_dir . 
'/log.csv', $CkanManager->logOutput, FILE_APPEND | LOCK_EX); 49 | $CkanManager->logOutput = ''; 50 | } 51 | } 52 | 53 | // show running time on finish 54 | timer(); 55 | -------------------------------------------------------------------------------- /inc/common.php: -------------------------------------------------------------------------------- 1 | bold 54 | . $clr($minutes_spent . ' minutes ' . $seconds_spent . ' seconds ')->green->bold . PHP_EOL; 55 | } 56 | -------------------------------------------------------------------------------- /cli/add_legacy_dms_and_make_private.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_TAG); 40 | 41 | /** 42 | * sometimes there is no parent term (ex. Department of Labor) 43 | */ 44 | if (!defined('PARENT_TERM')) { 45 | die('PARENT_TERM not found'); 46 | } 47 | 48 | /** 49 | * Create results dir for logs 50 | */ 51 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_LEGACY_' . PARENT_TERM; 52 | mkdir($results_dir); 53 | 54 | /** 55 | * Adding Legacy dms tag 56 | */ 57 | $CkanManager = new CkanManager(CKAN_API_URL, LIST_ONLY ? null : CKAN_API_KEY); 58 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY); 59 | 60 | $CkanManager->resultsDir = $results_dir; 61 | $CkanManager->tagLegacyDms($termsArray, 'metadata_from_legacy_dms'); 62 | 63 | // show running time on finish 64 | timer(); 65 | -------------------------------------------------------------------------------- /cli/mark_as_private.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 27 | 28 | foreach (glob(CKANMNGR_DATA_DIR . '/private_*.csv') as $csv_file) { 29 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . 
PHP_EOL; 30 | echo $status; 31 | 32 | // fix wrong END-OF-LINE 33 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 34 | 35 | $basename = str_replace('.csv', '', basename($csv_file)); 36 | file_put_contents($results_dir . '/' . $basename . '_private.log', $status, FILE_APPEND | LOCK_EX); 37 | 38 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 39 | $i = 1; 40 | while (true) { 41 | $row = $csv->getRow(); 42 | if (!$row) { 43 | break; 44 | } 45 | // skip headers 46 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 47 | continue; 48 | } 49 | 50 | $datasetName = basename($row['0']); 51 | 52 | printf('[%04d] ', $i++); 53 | $CkanManager->makeDatasetPrivate($datasetName, $basename); 54 | } 55 | } 56 | 57 | // show running time on finish 58 | timer(); 59 | -------------------------------------------------------------------------------- /cli/search_by_titles_csv.php: -------------------------------------------------------------------------------- 1 | writeRow(['url', 'exact match', 'title', 'found by title']); 41 | 42 | $i = 0; 43 | while (true) { 44 | if (!($i++ % 10)) { 45 | echo $i . 
PHP_EOL; 46 | } 47 | $row = $csv_source->getRow(); 48 | if (!$row) { 49 | break; 50 | } 51 | // skip headers 52 | if (in_array(trim(strtolower($row[0])), ['url', 'from', 'source url'])) { 53 | continue; 54 | } 55 | 56 | $title = $row[0]; 57 | 58 | /** 59 | * Search for packages by terms found 60 | */ 61 | $CkanManager->searchByTitle($title, $csv_destination); 62 | } 63 | 64 | // fix wrong END-OF-LINE 65 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 66 | } 67 | 68 | // show running time on finish 69 | timer(); 70 | -------------------------------------------------------------------------------- /cli/doc-gov_process/4_add_legacy_dms_and_make_private.php: -------------------------------------------------------------------------------- 1 | getTreeArrayFor(ORGANIZATION_TO_TAG); 40 | 41 | /** 42 | * sometimes there is no parent term (ex. Department of Labor) 43 | */ 44 | if (!defined('PARENT_TERM')) { 45 | die('PARENT_TERM not found'); 46 | } 47 | 48 | /** 49 | * Create results dir for logs 50 | */ 51 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_LEGACY_' . PARENT_TERM; 52 | mkdir($results_dir); 53 | 54 | /** 55 | * Adding Legacy dms tag 56 | */ 57 | $CkanManager = new CkanManager(CKAN_API_URL, LIST_ONLY ? null : CKAN_API_KEY); 58 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY); 59 | 60 | $CkanManager->resultsDir = $results_dir; 61 | 62 | /** 63 | * We are skipping noaa-gov and nist-gov within current process 64 | */ 65 | unset($termsArray['noaa-gov']); 66 | unset($termsArray['nist-gov']); 67 | 68 | $CkanManager->tagLegacyDms($termsArray, 'metadata_from_legacy_dms'); 69 | 70 | // show running time on finish 71 | timer(); 72 | -------------------------------------------------------------------------------- /cli/rename.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 31 | 32 | foreach (glob(CKANMNGR_DATA_DIR . 
'/rename*.csv') as $csv_file) { 33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 34 | echo $status; 35 | 36 | // fix wrong END-OF-LINE 37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 38 | 39 | $basename = str_replace('.csv', '', basename($csv_file)); 40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX); 41 | 42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 43 | $i = 1; 44 | while (true) { 45 | $row = $csv->getRow(); 46 | if (!$row) { 47 | break; 48 | } 49 | // skip headers 50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 51 | continue; 52 | } 53 | 54 | $datasetName = trim(basename($row['0'])); 55 | $newDatasetName = basename($row['1']); 56 | 57 | printf('[%04d] ', $i++); 58 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename); 59 | } 60 | } 61 | 62 | // show running time on finish 63 | timer(); 64 | -------------------------------------------------------------------------------- /cli/tagging/remove_groups_and_tags.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 23 | foreach (glob(CKANMNGR_DATA_DIR . '/remove_*.csv') as $csv_file) { 24 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 25 | echo $status; 26 | 27 | $basename = str_replace('.csv', '', basename($csv_file)); 28 | 29 | // fix wrong END-OF-LINE 30 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 31 | 32 | file_put_contents($results_dir . '/' . $basename . 
'_remove.log', $status, FILE_APPEND | LOCK_EX); 33 | 34 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 35 | while (true) { 36 | $row = $csv->getRow(); 37 | if (!$row) { 38 | break; 39 | } 40 | // skip headers 41 | if (in_array(strtolower($row['0']), 42 | ['dataset', 'uid', 'uuid', 'name', 'url', 'data.gov url', 'dataset link'])) { 43 | continue; 44 | } 45 | 46 | // no anchors please 47 | list($dataset,) = explode('#', basename(trim($row['0']))); 48 | $category = trim(isset($row['1']) ? ($row['1'] ?: '') : ''); 49 | $tags = trim(isset($row['2']) ? ($row['2'] ?: '') : ''); 50 | $CkanManager->removeTagsAndGroupsFromDatasets([$dataset], $category, $tags, $basename); 51 | } 52 | } 53 | 54 | // show running time on finish 55 | timer(); 56 | -------------------------------------------------------------------------------- /cli/undelete_datasets.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 35 | 36 | /** 37 | * CSV 38 | * datasetName, orgId 39 | */ 40 | 41 | foreach (glob(CKANMNGR_DATA_DIR . '/undelete*.csv') as $csv_file) { 42 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 43 | echo $status; 44 | 45 | // fix wrong END-OF-LINE 46 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 47 | 48 | $basename = str_replace('.csv', '', basename($csv_file)); 49 | $logFile = $results_dir . '/' . $basename . 
'_log.csv'; 50 | 51 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 52 | $i = 1; 53 | while (true) { 54 | $row = $csv->getRow(); 55 | if (!$row) { 56 | break; 57 | } 58 | // skip headers 59 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 60 | continue; 61 | } 62 | 63 | $datasetName = basename($row['0']); 64 | 65 | printf('[%04d] ', $i++); 66 | $CkanManager->undeleteDataset($datasetName); 67 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX); 68 | $CkanManager->logOutput = ''; 69 | } 70 | } 71 | 72 | // show running time on finish 73 | timer(); 74 | -------------------------------------------------------------------------------- /cli/rename_then_mark_public.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 31 | 32 | foreach (glob(CKANMNGR_DATA_DIR . '/rename*.csv') as $csv_file) { 33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 34 | echo $status; 35 | 36 | // fix wrong END-OF-LINE 37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 38 | 39 | $basename = str_replace('.csv', '', basename($csv_file)); 40 | file_put_contents($results_dir . '/' . $basename . 
'_rename.log', $status, FILE_APPEND | LOCK_EX); 41 | 42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 43 | $i = 1; 44 | while (true) { 45 | $row = $csv->getRow(); 46 | if (!$row) { 47 | break; 48 | } 49 | // skip headers 50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 51 | continue; 52 | } 53 | 54 | $datasetName = trim(basename($row['0'])); 55 | $newDatasetName = basename($row['1']); 56 | 57 | printf('[%04d] ', $i++); 58 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename); 59 | $CkanManager->makeDatasetPublic($newDatasetName, $basename); 60 | } 61 | } 62 | 63 | // show running time on finish 64 | timer(); 65 | -------------------------------------------------------------------------------- /cli/update_modified_date.php: -------------------------------------------------------------------------------- 1 | results_dir = $results_dir; 41 | //foreach (glob(DATA_DIR . '/update_modified_date.csv') as $csv_file) { 42 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 43 | // echo $status; 44 | // 45 | // $basename = str_replace('.csv', '', basename($csv_file)); 46 | // 47 | // // fix wrong END-OF-LINE 48 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 49 | // 50 | // file_put_contents($results_dir . '/' . $basename . 
'update_modified_date.log', $status, FILE_APPEND | LOCK_EX); 51 | // 52 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false); 53 | // while (true) { 54 | // $row = $csv->getRow(); 55 | // if (!$row) { 56 | // break; 57 | // } 58 | //// skip headers 59 | // if (in_array(strtolower($row['0']), ["Name of Dataset", "Agency", "Name"])) { 60 | // continue; 61 | // } 62 | // 63 | // $package_id = $row['2']; 64 | // $CkanManager->update_dataset_update_date($package_id, $basename); 65 | // } 66 | //} 67 | // 68 | //// show running time on finish 69 | //timer(); 70 | -------------------------------------------------------------------------------- /cli/add_license_url.php: -------------------------------------------------------------------------------- 1 | results_dir = $results_dir; 40 | //foreach (glob(DATA_DIR . '/public_package.csv') as $csv_file) { 41 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 42 | // echo $status; 43 | // 44 | // $basename = str_replace('.csv', '', basename($csv_file)); 45 | // 46 | // // fix wrong END-OF-LINE 47 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 48 | // 49 | // file_put_contents($results_dir . '/' . $basename . 
'_add_licese_url.log', $status, FILE_APPEND | LOCK_EX); 50 | // 51 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false); 52 | // while (true) { 53 | // $row = $csv->getRow(); 54 | // if (!$row) { 55 | // break; 56 | // } 57 | //// skip headers 58 | // if (in_array(strtolower($row['0']), ['#', 'id', 'name', 'title', 'license_id'])) { 59 | // continue; 60 | // } 61 | // 62 | // $package_id = $row['2']; 63 | // $license_id = $row['4']; 64 | // $CkanManager->update_dataset_license($package_id, $license_id, $basename); 65 | // } 66 | //} 67 | // 68 | //// show running time on finish 69 | //timer(); 70 | -------------------------------------------------------------------------------- /cli/delete_datasets.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 35 | 36 | /** 37 | * CSV 38 | * datasetName, orgId 39 | */ 40 | 41 | foreach (glob(CKANMNGR_DATA_DIR . '/delete*.csv') as $csv_file) { 42 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 43 | echo $status; 44 | 45 | // fix wrong END-OF-LINE 46 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 47 | 48 | $basename = str_replace('.csv', '', basename($csv_file)); 49 | $logFile = $results_dir . '/' . $basename . 
'_log.csv'; 50 | 51 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 52 | $i = 1; 53 | while (true) { 54 | $row = $csv->getRow(); 55 | if (!$row) { 56 | break; 57 | } 58 | // skip headers 59 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 60 | continue; 61 | } 62 | 63 | $datasetName = basename($row['0']); 64 | // $organizationName = basename($row['1']); 65 | 66 | printf('[%04d] ', $i++); 67 | $CkanManager->deleteDataset($datasetName);//, $organizationName 68 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX); 69 | $CkanManager->logOutput = ''; 70 | } 71 | } 72 | 73 | // show running time on finish 74 | timer(); 75 | -------------------------------------------------------------------------------- /cli/add_resource_to_dataset.php: -------------------------------------------------------------------------------- 1 | results_dir = $results_dir; 40 | //foreach (glob(DATA_DIR . '/webservices.csv') as $csv_file) { 41 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 42 | // echo $status; 43 | // 44 | // $basename = str_replace('.csv', '', basename($csv_file)); 45 | // 46 | // // fix wrong END-OF-LINE 47 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 48 | // 49 | // file_put_contents($results_dir . '/' . $basename . 
'_add_resource.log', $status, FILE_APPEND | LOCK_EX); 50 | // 51 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false); 52 | // while (true) { 53 | // $row = $csv->getRow(); 54 | // if (!$row) { 55 | // break; 56 | // } 57 | //// skip headers 58 | // if (in_array(strtolower($row['0']), ['#', 'id', 'package_id', 'key', 'value', 'revision_id', 'state'])) { 59 | // continue; 60 | // } 61 | // 62 | // $package_id = $row['2']; 63 | // $api_url = $row['4']; 64 | // $CkanManager->add_resource_to_dataset($package_id, $api_url, $basename); 65 | // } 66 | //} 67 | // 68 | //// show running time on finish 69 | //timer(); 70 | -------------------------------------------------------------------------------- /cli/export_orgs_full.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 18 | 19 | 20 | foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) { 21 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 22 | echo $status; 23 | 24 | // fix wrong END-OF-LINE 25 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 26 | 27 | $basename = str_replace('.csv', '', basename($csv_file)); 28 | $logFile = $results_dir . '/' . $basename . 
'.log'; 29 | // file_put_contents($logFile, $status, FILE_APPEND | LOCK_EX); 30 | 31 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 32 | $i = 1; 33 | while (true) { 34 | $row = $csv->getRow(); 35 | if (!$row) { 36 | break; 37 | } 38 | // skip headers 39 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 40 | continue; 41 | } 42 | 43 | $organization = basename($row['0']); 44 | 45 | printf('[%04d] ', $i++); 46 | // Options available: 47 | // CkanManager::EXPORT_PUBLIC_ONLY 48 | // CkanManager::EXPORT_PRIVATE_ONLY 49 | // CkanManager::EXPORT_DMS_ONLY 50 | // CkanManager::EXPORT_DMS_ONLY | CkanManager::EXPORT_PRIVATE_ONLY 51 | // CkanManager::EXPORT_DMS_ONLY | CkanManager::EXPORT_PUBLIC_ONLY 52 | $CkanManager->fullOrganizationExport($organization, 53 | // CkanManager::EXPORT_DMS_ONLY | CkanManager::EXPORT_PUBLIC_ONLY); 54 | CkanManager::EXPORT_PRIVATE_ONLY); 55 | } 56 | 57 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX); 58 | } 59 | 60 | // show running time on finish 61 | timer(); 62 | -------------------------------------------------------------------------------- /cli/rename_then_delete.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 31 | 32 | foreach (glob(CKANMNGR_DATA_DIR . '/rdelete*.csv') as $csv_file) { 33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 34 | echo $status; 35 | 36 | // fix wrong END-OF-LINE 37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 38 | 39 | $basename = str_replace('.csv', '', basename($csv_file)); 40 | file_put_contents($results_dir . '/' . $basename . 
'_rename.log', $status, FILE_APPEND | LOCK_EX); 41 | 42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 43 | $i = 1; 44 | while (true) { 45 | $row = $csv->getRow(); 46 | if (!$row) { 47 | break; 48 | } 49 | // skip headers 50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from', 'name'])) { 51 | continue; 52 | } 53 | 54 | $datasetName = trim(basename($row['0'])); 55 | $newDatasetName = substr($datasetName, 0, 70) . $i . '_delete'; 56 | // $newDatasetName = $datasetName.'_del_legacy'; 57 | 58 | printf('[%04d] ', $i++); 59 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename); 60 | $CkanManager->deleteDataset($newDatasetName); 61 | } 62 | } 63 | 64 | // show running time on finish 65 | timer(); 66 | -------------------------------------------------------------------------------- /cli/epa-gov_process/3_rename_datasets.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 31 | 32 | foreach (glob(CKANMNGR_DATA_DIR . '/rename*.csv') as $csv_file) { 33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 34 | echo $status; 35 | 36 | // fix wrong END-OF-LINE 37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 38 | 39 | $basename = str_replace('.csv', '', basename($csv_file)); 40 | file_put_contents($results_dir . '/' . $basename . 
'_rename.log', $status, FILE_APPEND | LOCK_EX); 41 | 42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 43 | $i = 1; 44 | while (true) { 45 | $row = $csv->getRow(); 46 | if (!$row) { 47 | break; 48 | } 49 | // skip headers 50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 51 | continue; 52 | } 53 | 54 | $datasetName = basename($row['0']); 55 | $newDatasetName = basename($row['1']); 56 | 57 | // if (strlen($newDatasetName) > 100) { 58 | // $suffix = substr(md5($datasetName),0,3); 59 | // $newDatasetName = substr($newDatasetName,0,85).$suffix.'_epa_deleted'; 60 | // } 61 | 62 | printf('[%04d] ', $i++); 63 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename); 64 | } 65 | } 66 | 67 | // show running time on finish 68 | timer(); 69 | -------------------------------------------------------------------------------- /src/CKAN/Manager/Dataset.php: -------------------------------------------------------------------------------- 1 | dataset = $dataset; 25 | if (isset($dataset['extras'])) { 26 | foreach ($dataset['extras'] as $extra) { 27 | $this->extras[$extra['key']] = $extra['value']; 28 | } 29 | } 30 | } 31 | 32 | /** 33 | * @return array 34 | */ 35 | public function get_groups_and_tags(){ 36 | $groups = []; 37 | if (isset($this->dataset['groups'])) { 38 | foreach ($this->dataset['groups'] as $group) { 39 | if (strlen(trim($group['title']))) { 40 | $tags = []; 41 | if (isset($this->extras['__category_tag_'.$group['id']])) { 42 | $tags = trim($this->extras['__category_tag_'.$group['id']],'[]'); 43 | $tags = explode('","', $tags); 44 | foreach ($tags as &$tag) { 45 | $tag = trim($tag, '" '); 46 | } 47 | } 48 | $groups[trim($group['title'])] = $tags; 49 | } 50 | } 51 | } 52 | return $groups; 53 | } 54 | 55 | /** 56 | * Sometimes harvested ckan title does not exactly matches, but dataset is same, ex. 
double spaces 57 | * To avoid these cases, we remove all non-word chars, leaving only alphabetic and digit chars 58 | * Ex. 59 | * Input: Tree dog dataset , agriculture, 1997 ?????!!! 60 | * Output: treedogdatasetagriculture1997 61 | * 62 | * @param $string 63 | * 64 | * @return mixed|string 65 | */ 66 | public static function simplifyTitle( 67 | $string 68 | ) { 69 | $string = preg_replace('/[\W]+/', '', $string); 70 | $string = strtolower($string); 71 | 72 | return $string; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /cli/update_organization.php: -------------------------------------------------------------------------------- 1 | results_dir = $results_dir; 40 | //foreach (glob(DATA_DIR . '/update_doe_datasets.csv') as $csv_file) { 41 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 42 | // echo $status; 43 | // 44 | // $basename = str_replace('.csv', '', basename($csv_file)); 45 | // 46 | // // fix wrong END-OF-LINE 47 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 48 | // 49 | // file_put_contents($results_dir . '/' . $basename . 
'_update_organization.log', $status, FILE_APPEND | LOCK_EX); 50 | // 51 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false); 52 | // while (true) { 53 | // $row = $csv->getRow(); 54 | // if (!$row) { 55 | // break; 56 | // } 57 | //// skip headers 58 | // if (in_array(strtolower($row['0']), ['url', 'exact match', 'title', 'found by title'])) { 59 | // continue; 60 | // } 61 | // 62 | // $package_id = str_replace("https://inventory.data.gov/dataset/", "", $row[0]); 63 | // //$organization_id = "ers-usda-gov"; 64 | // $package_name = "1bef2082-a4ca-45c5-b307-3d8bfce384df"; 65 | // $CkanManager->update_dataset_parent($package_id, $package_name, $basename); 66 | // } 67 | //} 68 | // 69 | //// show running time on finish 70 | //timer(); 71 | -------------------------------------------------------------------------------- /cli/rename_then_mark_private.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 31 | 32 | foreach (glob(CKANMNGR_DATA_DIR . '/prename*.csv') as $csv_file) { 33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 34 | echo $status; 35 | 36 | // fix wrong END-OF-LINE 37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 38 | 39 | $basename = str_replace('.csv', '', basename($csv_file)); 40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX); 41 | 42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 43 | $i = 1; 44 | while (true) { 45 | $row = $csv->getRow(); 46 | if (!$row) { 47 | break; 48 | } 49 | // skip headers 50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) { 51 | continue; 52 | } 53 | 54 | $datasetName = trim(basename($row['0'])); 55 | $newDatasetName = substr($datasetName, 0, 70) . $i . 
'_legacy'; 56 | // $newDatasetName = str_replace('_legacy_legacy','_legacy',$newDatasetName); 57 | // $newDatasetName = str_replace('_legacy_legacy','_legacy',$newDatasetName); 58 | // $newDatasetName = basename($row['1']); 59 | 60 | printf('[%04d] ', $i++); 61 | 62 | // echo $newDatasetName.PHP_EOL; 63 | // continue; 64 | 65 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename); 66 | $CkanManager->makeDatasetPrivate($newDatasetName, $basename); 67 | } 68 | } 69 | 70 | // show running time on finish 71 | timer(); 72 | -------------------------------------------------------------------------------- /cli/restore_script.php: -------------------------------------------------------------------------------- 1 | getRow(); 40 | if (!$row) { 41 | break; 42 | } 43 | // skip headers 44 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url'])) { 45 | continue; 46 | } 47 | 48 | $datasetName = basename($row['0']); 49 | 50 | $StagingClient->say(str_pad($datasetName, 100, ' . '), ''); 51 | 52 | try { 53 | $DatasetArray = $StagingClient->getDataset($datasetName); 54 | // no exception, cool 55 | $StagingClient->say(str_pad('Staging OK', 15, ' . '), ''); 56 | 57 | $ProductionClient->diffUpdate($datasetName, $DatasetArray); 58 | // var_dump($DatasetArray);die(); 59 | } catch (CKAN\NotFoundHttpException $ex) { 60 | $StagingClient->say(str_pad('Staging 404', 15, ' . ')); 61 | } catch (\Exception $ex) { 62 | $StagingClient->say(str_pad('Staging Error: ' . $ex->getMessage(), 15, ' . ')); 63 | } 64 | 65 | // debug 66 | // die(); 67 | } 68 | } 69 | 70 | // show running time on finish 71 | timer(); 72 | -------------------------------------------------------------------------------- /cli/update_extra_fields.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 28 | foreach (glob(CKANMNGR_DATA_DIR . '/extra-*.csv') as $csv_file) { 29 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . 
PHP_EOL; 30 | echo $status; 31 | 32 | $basename = str_replace('.csv', '', basename($csv_file)); 33 | 34 | // fix wrong END-OF-LINE 35 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 36 | 37 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); 38 | 39 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 40 | while (true) { 41 | $row = $csv->getRow(); 42 | if (!$row) { 43 | break; 44 | } 45 | // skip headers 46 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 47 | continue; 48 | } 49 | 50 | // no anchors please 51 | list($dataset,) = explode('#', basename(trim($row['0']))); 52 | 53 | if (!$dataset) { 54 | continue; 55 | } 56 | 57 | // double trouble check 58 | if (strpos($row['0'], '://')) { 59 | if (!strpos($row['0'], '/dataset/')) { 60 | file_put_contents( 61 | $results_dir . '/' . $basename . '_tags.log.csv', 62 | $row['0'] . ',WRONG URL' . PHP_EOL, 63 | FILE_APPEND | LOCK_EX 64 | ); 65 | continue; 66 | } 67 | } 68 | 69 | $CkanManager->updateExtraFields( 70 | [$dataset], 71 | $row['1'], 72 | $row['2'], 73 | $row['3'], 74 | $basename 75 | ); 76 | } 77 | } 78 | 79 | // show running time on finish 80 | timer(); 81 | -------------------------------------------------------------------------------- /cli/export_by_list.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 27 | foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) { 28 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . 
PHP_EOL; 29 | echo $status; 30 | 31 | $basename = str_replace('.csv', '', basename($csv_file)); 32 | 33 | // fix wrong END-OF-LINE 34 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 35 | 36 | 37 | $csv = new Reader($csv_file, 'r+', false); 38 | while (true) { 39 | $row = $csv->getRow(); 40 | if (!$row) { 41 | break; 42 | } 43 | 44 | // skip headers 45 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 46 | continue; 47 | } 48 | 49 | if ($start > 0) { 50 | $start--; 51 | continue; 52 | } 53 | 54 | // no anchors please 55 | list($dataset,) = explode('#', basename(trim($row['0']))); 56 | 57 | // echo $dataset.PHP_EOL; 58 | 59 | if (!$dataset) { 60 | continue; 61 | } 62 | 63 | // double trouble check 64 | if (strpos($row['0'], '://')) { 65 | if (!strpos($row['0'], '/dataset/')) { 66 | file_put_contents( 67 | $results_dir . '/' . $basename . '_export.log.csv', 68 | $row['0'] . ',WRONG URL' . PHP_EOL, 69 | FILE_APPEND | LOCK_EX 70 | ); 71 | continue; 72 | } 73 | } 74 | 75 | $lines = $CkanManager->exportPackage($dataset); 76 | 77 | foreach ($lines as $line) { 78 | $tags_csv->writeRow($line); 79 | } 80 | } 81 | } 82 | 83 | 84 | //$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)'); 85 | //$csv->writeFromArray($brief); 86 | 87 | // show running time on finish 88 | timer(); 89 | -------------------------------------------------------------------------------- /cli/update_field.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 28 | foreach (glob(CKANMNGR_DATA_DIR . '/license_update*.csv') as $csv_file) { 29 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . 
PHP_EOL; 30 | echo $status; 31 | 32 | $basename = str_replace('.csv', '', basename($csv_file)); 33 | 34 | // fix wrong END-OF-LINE 35 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 36 | 37 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); 38 | 39 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 40 | while (true) { 41 | $row = $csv->getRow(); 42 | if (!$row) { 43 | break; 44 | } 45 | // skip headers 46 | if (in_array(trim(strtolower($row['0'])), ['title','name','url','identifier','topics','categories'])) { 47 | continue; 48 | } 49 | 50 | // no anchors please 51 | list($dataset,) = explode('#', basename(trim($row['0']))); 52 | 53 | if (!$dataset) { 54 | continue; 55 | } 56 | 57 | // double trouble check 58 | if (strpos($row['0'], '://')) { 59 | if (!strpos($row['0'], '/dataset/')) { 60 | file_put_contents( 61 | $results_dir . '/' . $basename . '_tags.log.csv', 62 | $row['0'] . ',WRONG URL' . PHP_EOL, 63 | FILE_APPEND | LOCK_EX 64 | ); 65 | continue; 66 | } 67 | } 68 | $package_id = $row['1']; 69 | $license_id = "cc-zero"; 70 | $CkanManager->updateLicenseId($package_id, $license_id); 71 | } 72 | } 73 | 74 | // show running time on finish 75 | timer(); 76 | -------------------------------------------------------------------------------- /cli/export_full_by_list.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 24 | foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) { 25 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . 
PHP_EOL; 26 | echo $status; 27 | 28 | $basename = str_replace('.csv', '', basename($csv_file)); 29 | 30 | // fix wrong END-OF-LINE 31 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 32 | 33 | 34 | $csv = new Reader($csv_file, 'r+', false); 35 | $return = []; 36 | while (true) { 37 | $row = $csv->getRow(); 38 | if (!$row) { 39 | break; 40 | } 41 | 42 | // skip headers 43 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 44 | continue; 45 | } 46 | 47 | if ($start > 0) { 48 | $start--; 49 | continue; 50 | } 51 | 52 | // no anchors please 53 | list($dataset_name,) = explode('#', basename(trim($row['0']))); 54 | 55 | if (!$dataset_name) { 56 | continue; 57 | } 58 | 59 | // double trouble check 60 | if (strpos($row['0'], '://')) { 61 | if (!strpos($row['0'], '/dataset/')) { 62 | file_put_contents( 63 | $results_dir . '/' . $basename . '_export.log.csv', 64 | $row['0'] . ',WRONG URL' . PHP_EOL, 65 | FILE_APPEND | LOCK_EX 66 | ); 67 | continue; 68 | } 69 | } 70 | 71 | printf('%50s',$dataset_name); 72 | $dataset = $CkanManager->tryPackageShow($dataset_name); 73 | if ($dataset) { 74 | printf('%10s','OK'); 75 | $return[] = $dataset; 76 | } else { 77 | printf('%10s','FAIL'); 78 | } 79 | echo PHP_EOL; 80 | } 81 | // overwrite rather than append: a second JSON document appended to an existing file would make it unparseable 82 | file_put_contents( 83 | $results_dir . '/' . $basename . '.json', 84 | json_encode($return, JSON_PRETTY_PRINT), 85 | LOCK_EX 86 | ); 87 | } 88 | 89 | 90 | //$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)'); 91 | //$csv->writeFromArray($brief); 92 | 93 | // show running time on finish 94 | timer(); 95 | -------------------------------------------------------------------------------- /cli/epa-gov_process/4_assign_groups_and_tags.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 31 | foreach (glob(CKANMNGR_DATA_DIR . 
'/assign*.csv') as $csv_file) { 32 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 33 | echo $status; 34 | 35 | $basename = str_replace('.csv', '', basename($csv_file)); 36 | 37 | // fix wrong END-OF-LINE 38 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 39 | 40 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); 41 | 42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 43 | while (true) { 44 | $row = $csv->getRow(); 45 | if (!$row) { 46 | break; 47 | } 48 | 49 | // skip headers 50 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 51 | continue; 52 | } 53 | 54 | if ($start > 0) { 55 | $start--; 56 | continue; 57 | } 58 | 59 | // format group tags 60 | $categories = []; 61 | if (isset($row['2']) && $row['2']) { 62 | $categories = explode(';', trim($row['2'])); 63 | $categories = array_map('trim', $categories); 64 | 65 | } 66 | 67 | // no anchors please 68 | list($dataset,) = explode('#', basename(trim($row['0']))); 69 | 70 | if (!$dataset) { 71 | continue; 72 | } 73 | 74 | // double trouble check 75 | if (strpos($row['0'], '://')) { 76 | if (!strpos($row['0'], '/dataset/')) { 77 | file_put_contents( 78 | $results_dir . '/' . $basename . '_tags.log.csv', 79 | $row['0'] . ',WRONG URL' . 
PHP_EOL, 80 | FILE_APPEND | LOCK_EX 81 | ); 82 | continue; 83 | } 84 | } 85 | 86 | $CkanManager->assignGroupsAndCategoriesToDatasets( 87 | [$dataset], 88 | trim($row['1']), 89 | $basename, 90 | $categories 91 | ); 92 | } 93 | } 94 | 95 | // show running time on finish 96 | timer(); 97 | -------------------------------------------------------------------------------- /cli/socrata_log_redirects.php: -------------------------------------------------------------------------------- 1 | fail 37 | curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); 38 | // We don't want the header (use curl_getinfo()) 39 | curl_setopt($curl_ch, CURLOPT_HEADER, false); 40 | // Track the handle's request string 41 | curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); 42 | // Attempt to retrieve the modification date of the remote document. 43 | curl_setopt($curl_ch, CURLOPT_FILETIME, true); 44 | // Initialize cURL headers 45 | 46 | foreach (glob(CKANMNGR_DATA_DIR . '/socrata_*.csv') as $csv_file) { 47 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 48 | echo $status; 49 | 50 | $basename = str_replace('.csv', '', basename($csv_file)); 51 | 52 | // fix wrong END-OF-LINE 53 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 54 | 55 | $csv_source = new Reader($csv_file, 'r+', false); 56 | $csv_destination = new Writer($results_dir . '/' . $basename . '_long.csv'); 57 | 58 | $i = 0; 59 | while (true) { 60 | if (!($i++ % 50)) { 61 | echo $i . PHP_EOL; 62 | } 63 | $row = $csv_source->getRow(); 64 | if (!$row) { 65 | break; 66 | } 67 | // skip headers 68 | if (in_array(trim(strtolower($row[0])), ['socrata code', 'from'])) { 69 | $csv_destination->writeRow($row); 70 | continue; 71 | } 72 | 73 | $socrata_id = $row[0]; 74 | $ckan_url = $row[1]; 75 | 76 | // writing short redirect 77 | $socrata_short_url = 'https://explore.data.gov/d/' . 
$socrata_id; 78 | $csv_destination->writeRow([$socrata_short_url, $ckan_url]); 79 | 80 | $socrata_long_url = get_long_socrata_url($curl_ch, $socrata_short_url); 81 | if (!$socrata_long_url) { 82 | echo 'No result: ' . $socrata_short_url . PHP_EOL; 83 | } 84 | $csv_destination->writeRow([$socrata_long_url, $ckan_url]); 85 | } 86 | } 87 | 88 | /** 89 | * @param resource|\CurlHandle $curl_ch cURL handle the request is issued on 90 | * @param string $url URL to request 91 | * 92 | * @return string|false redirect URL that curl_getinfo() reports for $url, or false when it reports none 93 | */ 94 | function get_long_socrata_url($curl_ch, $url) 95 | { 96 | // Point the shared handle at $url, run the request, then read the redirect target from the transfer info. 97 | curl_setopt($curl_ch, CURLOPT_URL, $url); 98 | $method = 'GET'; 99 | 100 | // Set cURL method. 101 | curl_setopt($curl_ch, CURLOPT_CUSTOMREQUEST, $method); 102 | 103 | // Execute request and get response headers. 104 | curl_exec($curl_ch); 105 | $info = curl_getinfo($curl_ch); 106 | if (!empty($info['redirect_url'])) { 107 | return $info['redirect_url']; 108 | } 109 | // curl_getinfo() sets 'redirect_url' to '' when nothing redirected, so an isset() test would never get here 110 | return false; 111 | } 112 | 113 | // show running time on finish 114 | timer(); 115 | -------------------------------------------------------------------------------- /cli/nrc-gov_process/compare_prod_vs_uat_nrc.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'topics', 35 | 'categories', 36 | ]); 37 | 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | $prod_nuclear = $ProdCkanManager->exportBrief('organization:(nrc-gov)' . 42 | ' AND -metadata_type:geospatial AND dataset_type:dataset'); 43 | $prod->writeFromArray($prod_nuclear); 44 | } else { 45 | $prod = new Reader($results_dir . '/prod.csv'); 46 | $prod_nuclear = $prod->getAll(); 47 | } 48 | 49 | echo 'uat.csv' . PHP_EOL; 50 | if (!is_file($results_dir . '/uat.csv')) { 51 | $uat = new Writer($results_dir . 
'/uat.csv'); 52 | 53 | $uat->writeRow([ 54 | 'title', 55 | 'title_simple', 56 | 'name', 57 | 'url', 58 | 'topics', 59 | 'categories', 60 | ]); 61 | 62 | $UatCkanManager = new CkanManager(CKAN_UAT_API_URL); 63 | $UatCkanManager->resultsDir = $results_dir; 64 | 65 | $uat_nuclear = $UatCkanManager->exportBrief('extras_harvest_source_title:NRC data.json', '', 66 | 'http://uat-catalog-fe-data.reisys.com/dataset/'); 67 | $uat->writeFromArray($uat_nuclear); 68 | 69 | } else { 70 | $uat = new Reader($results_dir . '/uat.csv'); 71 | $uat_nuclear = $uat->getAll(); 72 | } 73 | 74 | $uat_nuclear_by_title = []; 75 | 76 | foreach ($uat_nuclear as $name => $dataset) { 77 | $title = $dataset['title_simple']; 78 | 79 | $uat_nuclear_by_title[$title] = isset($uat_nuclear_by_title[$title]) ? $uat_nuclear_by_title[$title] : []; 80 | $uat_nuclear_by_title[$title][] = $dataset; 81 | } 82 | 83 | echo 'prod_vs_uat.csv' . PHP_EOL; 84 | is_file($results_dir . '/prod_vs_uat_nuclear_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_nuclear_geospatial.csv'); 85 | $csv = new Writer($results_dir . 
'/prod_vs_uat_nuclear_geospatial.csv'); 86 | $csv->writeRow([ 87 | 'Prod Title', 88 | 'Prod URL', 89 | 'Prod Topics', 90 | 'Prod Categories', 91 | 'Matched', 92 | 'UAT Title', 93 | 'UAT URL', 94 | ]); 95 | 96 | foreach ($prod_nuclear as $name => $prod_dataset) { 97 | if (isset($uat_nuclear_by_title[$prod_dataset['title_simple']])) { 98 | foreach ($uat_nuclear_by_title[$prod_dataset['title_simple']] as $uat_dataset) { 99 | $csv->writeRow([ 100 | $prod_dataset['title'], 101 | $prod_dataset['url'], 102 | $prod_dataset['topics'], 103 | $prod_dataset['categories'], 104 | true, 105 | $uat_dataset['title'], 106 | $uat_dataset['url'], 107 | ]); 108 | } 109 | continue; 110 | } 111 | 112 | $csv->writeRow([ 113 | $prod_dataset['title'], 114 | $prod_dataset['url'], 115 | $prod_dataset['topics'], 116 | $prod_dataset['categories'], 117 | false, 118 | '', 119 | '', 120 | ]); 121 | } 122 | 123 | // show running time on finish 124 | timer(); 125 | -------------------------------------------------------------------------------- /cli/compare_prod_vs_prod.php: -------------------------------------------------------------------------------- 1 | writeRow([ 38 | 'title', 39 | 'title_simple', 40 | 'name', 41 | 'url', 42 | 'topics', 43 | 'categories', 44 | ]); 45 | 46 | $Prod1CkanManager = new CkanManager(CKAN_API_URL); 47 | $Prod1CkanManager->resultsDir = $results_dir; 48 | 49 | $prod1_data = $Prod1CkanManager->exportBrief('organization:(' . $prod1_org . ') AND dataset_type:dataset'); 50 | $prod1->writeFromArray($prod1_data); 51 | } else { 52 | $prod1 = new Reader($prod1_csv_path); 53 | $prod1_data = $prod1->getAll(); 54 | } 55 | 56 | echo $prod2_org . '.csv' . 
PHP_EOL; 57 | if (!is_file($prod2_csv_path)) { 58 | $prod2 = new Writer($prod2_csv_path); 59 | 60 | $prod2->writeRow([ 61 | 'title', 62 | 'title_simple', 63 | 'name', 64 | 'url', 65 | 'topics', 66 | 'categories', 67 | ]); 68 | 69 | $Prod2CkanManager = new CkanManager(CKAN_API_URL); 70 | $Prod2CkanManager->resultsDir = $results_dir; 71 | 72 | $prod2_data = $Prod2CkanManager->exportBrief('organization:(' . $prod2_org . ') AND dataset_type:dataset'); 73 | $prod2->writeFromArray($prod2_data); 74 | 75 | } else { 76 | $prod2 = new Reader($prod2_csv_path); 77 | $prod2_data = $prod2->getAll(); 78 | } 79 | 80 | 81 | $prod2_by_title = []; 82 | 83 | foreach ($prod2_data as $name => $dataset) { 84 | $title = $dataset['title_simple']; 85 | 86 | $prod2_by_title[$title] = isset($prod2_by_title[$title]) ? $prod2_by_title[$title] : []; 87 | $prod2_by_title[$title][] = $dataset; 88 | } 89 | 90 | echo $prod1_org . '_VS_' . $prod2_org . '.csv' . PHP_EOL; 91 | is_file($comparison_csv_path) && unlink($comparison_csv_path); 92 | $csv = new Writer($comparison_csv_path); 93 | $csv->writeRow([ 94 | $prod1_org . ' Title', 95 | $prod1_org . ' URL', 96 | $prod1_org . ' Topics', 97 | $prod1_org . ' Categories', 98 | 'Matched', 99 | $prod2_org . ' Title', 100 | $prod2_org . 
' URL', 101 | 'URL Match', 102 | ]); 103 | 104 | foreach ($prod1_data as $name => $prod1_dataset) { 105 | if (isset($prod2_by_title[$prod1_dataset['title_simple']])) { 106 | foreach ($prod2_by_title[$prod1_dataset['title_simple']] as $prod2_dataset) { 107 | $csv->writeRow([ 108 | $prod1_dataset['title'], 109 | $prod1_dataset['url'], 110 | $prod1_dataset['topics'], 111 | $prod1_dataset['categories'], 112 | true, 113 | $prod2_dataset['title'], 114 | $prod2_dataset['url'], 115 | true, 116 | ]); 117 | } 118 | continue; 119 | } 120 | 121 | $csv->writeRow([ 122 | $prod1_dataset['title'], 123 | $prod1_dataset['url'], 124 | $prod1_dataset['topics'], 125 | $prod1_dataset['categories'], 126 | false, 127 | '', 128 | '', 129 | false, 130 | ]); 131 | } 132 | 133 | // show running time on finish 134 | timer(); 135 | -------------------------------------------------------------------------------- /cli/doc-gov_process/2_compare_prod_vs_prod.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'topics', 35 | 'categories', 36 | ]); 37 | 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' . 42 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' . 43 | ' AND -metadata_type:geospatial AND dataset_type:dataset AND -harvest_source_id:[\'\' TO *]'); 44 | $prod->writeFromArray($prod_commerce); 45 | } else { 46 | $prod = new Reader($results_dir . '/prod.csv'); 47 | $prod_commerce = $prod->getAll(); 48 | } 49 | 50 | echo 'new.csv' . PHP_EOL; 51 | if (!is_file($results_dir . '/new.csv')) { 52 | $new = new Writer($results_dir . 
'/new.csv'); 53 | 54 | $new->writeRow([ 55 | 'title', 56 | 'title_simple', 57 | 'name', 58 | 'url', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | 63 | $UatCkanManager = new CkanManager(CKAN_API_URL); 64 | $UatCkanManager->resultsDir = $results_dir; 65 | 66 | $new_commerce = $UatCkanManager->exportBrief('extras_harvest_source_title:Commerce Non Spatial Data.json Harvest Source'); 67 | $new->writeFromArray($new_commerce); 68 | 69 | } else { 70 | $new = new Reader($results_dir . '/new.csv'); 71 | $new_commerce = $new->getAll(); 72 | } 73 | 74 | $new_commerce_by_title = []; 75 | 76 | foreach ($new_commerce as $name => $dataset) { 77 | $title = $dataset['title_simple']; 78 | 79 | $new_commerce_by_title[$title] = isset($new_commerce_by_title[$title]) ? $new_commerce_by_title[$title] : []; 80 | $new_commerce_by_title[$title][] = $dataset; 81 | } 82 | 83 | echo 'prod_vs_new.csv' . PHP_EOL; 84 | is_file($results_dir . '/prod_vs_prod_commerce.csv') && unlink($results_dir . '/prod_vs_prod_commerce.csv'); 85 | $csv = new Writer($results_dir . 
'/prod_vs_prod_commerce.csv'); 86 | $csv->writeRow([ 87 | 'Prod Title', 88 | 'Prod URL', 89 | 'Prod Topics', 90 | 'Prod Categories', 91 | 'Matched', 92 | 'NEW Title', 93 | 'NEW URL', 94 | 'URL Match', 95 | ]); 96 | 97 | foreach ($prod_commerce as $name => $prod_dataset) { 98 | if (isset($new_commerce_by_title[$prod_dataset['title_simple']])) { 99 | foreach ($new_commerce_by_title[$prod_dataset['title_simple']] as $new_dataset) { 100 | $csv->writeRow([ 101 | $prod_dataset['title'], 102 | $prod_dataset['url'], 103 | $prod_dataset['topics'], 104 | $prod_dataset['categories'], 105 | true, 106 | $new_dataset['title'], 107 | $new_dataset['url'], 108 | true, 109 | ]); 110 | } 111 | continue; 112 | } 113 | 114 | $csv->writeRow([ 115 | $prod_dataset['title'], 116 | $prod_dataset['url'], 117 | $prod_dataset['topics'], 118 | $prod_dataset['categories'], 119 | false, 120 | '', 121 | '', 122 | false, 123 | ]); 124 | } 125 | 126 | // show running time on finish 127 | timer(); 128 | -------------------------------------------------------------------------------- /cli/doc-gov_process/0_compare_prod_vs_uat.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'topics', 35 | 'categories', 36 | ]); 37 | 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' . 42 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' . 43 | ' AND -metadata_type:geospatial AND dataset_type:dataset AND -harvest_source_id:[\'\' TO *]'); 44 | $prod->writeFromArray($prod_commerce); 45 | } else { 46 | $prod = new Reader($results_dir . '/prod.csv'); 47 | $prod_commerce = $prod->getAll(); 48 | } 49 | 50 | echo 'uat.csv' . PHP_EOL; 51 | if (!is_file($results_dir . 
'/uat.csv')) { 52 | $uat = new Writer($results_dir . '/uat.csv'); 53 | 54 | $uat->writeRow([ 55 | 'title', 56 | 'title_simple', 57 | 'name', 58 | 'url', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | 63 | $UatCkanManager = new CkanManager(CKAN_UAT_API_URL); 64 | $UatCkanManager->resultsDir = $results_dir; 65 | 66 | $uat_commerce = $UatCkanManager->exportBrief('extras_harvest_source_title:Commerce JSON', '', 67 | 'http://uat-catalog-fe-data.reisys.com/dataset/'); 68 | $uat->writeFromArray($uat_commerce); 69 | 70 | } else { 71 | $uat = new Reader($results_dir . '/uat.csv'); 72 | $uat_commerce = $uat->getAll(); 73 | } 74 | 75 | $uat_commerce_by_title = []; 76 | 77 | foreach ($uat_commerce as $name => $dataset) { 78 | $title = $dataset['title_simple']; 79 | 80 | $uat_commerce_by_title[$title] = isset($uat_commerce_by_title[$title]) ? $uat_commerce_by_title[$title] : []; 81 | $uat_commerce_by_title[$title][] = $dataset; 82 | } 83 | 84 | echo 'prod_vs_uat.csv' . PHP_EOL; 85 | is_file($results_dir . '/prod_vs_uat_commerce.csv') && unlink($results_dir . '/prod_vs_uat_commerce.csv'); 86 | $csv = new Writer($results_dir . 
'/prod_vs_uat_commerce.csv'); 87 | $csv->writeRow([ 88 | 'Prod Title', 89 | 'Prod URL', 90 | 'Prod Topics', 91 | 'Prod Categories', 92 | 'Matched', 93 | 'UAT Title', 94 | 'UAT URL', 95 | 'URL Match', 96 | ]); 97 | 98 | foreach ($prod_commerce as $name => $prod_dataset) { 99 | if (isset($uat_commerce_by_title[$prod_dataset['title_simple']])) { 100 | foreach ($uat_commerce_by_title[$prod_dataset['title_simple']] as $uat_dataset) { 101 | $csv->writeRow([ 102 | $prod_dataset['title'], 103 | $prod_dataset['url'], 104 | $prod_dataset['topics'], 105 | $prod_dataset['categories'], 106 | true, 107 | $uat_dataset['title'], 108 | $uat_dataset['url'], 109 | true, 110 | ]); 111 | } 112 | continue; 113 | } 114 | 115 | $csv->writeRow([ 116 | $prod_dataset['title'], 117 | $prod_dataset['url'], 118 | $prod_dataset['topics'], 119 | $prod_dataset['categories'], 120 | false, 121 | '', 122 | '', 123 | false, 124 | ]); 125 | } 126 | 127 | // show running time on finish 128 | timer(); 129 | -------------------------------------------------------------------------------- /cli/doc-gov_process/5_compare_prod_vs_qa.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'topics', 35 | 'categories', 36 | ]); 37 | 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' . 42 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' . 43 | ' AND -metadata_type:geospatial AND dataset_type:dataset'); 44 | $prod->writeFromArray($prod_commerce); 45 | } else { 46 | $prod = new Reader($results_dir . '/prod.csv'); 47 | $prod_commerce = $prod->getAll(); 48 | } 49 | 50 | echo 'qa.csv' . PHP_EOL; 51 | if (!is_file($results_dir . '/qa.csv')) { 52 | $qa = new Writer($results_dir . 
'/qa.csv'); 53 | 54 | $qa->writeRow([ 55 | 'title', 56 | 'title_simple', 57 | 'name', 58 | 'url', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | 63 | $UatCkanManager = new CkanManager(CKAN_QA_API_URL); 64 | $UatCkanManager->resultsDir = $results_dir; 65 | 66 | $qa_commerce = $UatCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' . 67 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' . 68 | ' AND -metadata_type:geospatial AND dataset_type:dataset', '', 69 | 'http://qa-catalog-fe-data.reisys.com/dataset/'); 70 | $qa->writeFromArray($qa_commerce); 71 | 72 | } else { 73 | $qa = new Reader($results_dir . '/qa.csv'); 74 | $qa_commerce = $qa->getAll(); 75 | } 76 | 77 | $qa_commerce_by_title = []; 78 | 79 | foreach ($qa_commerce as $name => $dataset) { 80 | $title = $dataset['title_simple']; 81 | 82 | $qa_commerce_by_title[$title] = isset($qa_commerce_by_title[$title]) ? $qa_commerce_by_title[$title] : []; 83 | $qa_commerce_by_title[$title][] = $dataset; 84 | } 85 | 86 | echo 'prod_vs_qa.csv' . PHP_EOL; 87 | is_file($results_dir . '/prod_vs_qa_commerce.csv') && unlink($results_dir . '/prod_vs_qa_commerce.csv'); 88 | $csv = new Writer($results_dir . 
'/prod_vs_qa_commerce.csv'); 89 | $csv->writeRow([ 90 | 'Prod Title', 91 | 'Prod URL', 92 | 'Prod Topics', 93 | 'Prod Categories', 94 | 'Matched', 95 | 'QA Title', 96 | 'QA URL', 97 | 'URL Match', 98 | ]); 99 | 100 | foreach ($prod_commerce as $name => $prod_dataset) { 101 | if (isset($qa_commerce_by_title[$prod_dataset['title_simple']])) { 102 | foreach ($qa_commerce_by_title[$prod_dataset['title_simple']] as $qa_dataset) { 103 | $csv->writeRow([ 104 | $prod_dataset['title'], 105 | $prod_dataset['url'], 106 | $prod_dataset['topics'], 107 | $prod_dataset['categories'], 108 | true, 109 | $qa_dataset['title'], 110 | $qa_dataset['url'], 111 | true, 112 | ]); 113 | } 114 | continue; 115 | } 116 | 117 | $csv->writeRow([ 118 | $prod_dataset['title'], 119 | $prod_dataset['url'], 120 | $prod_dataset['topics'], 121 | $prod_dataset['categories'], 122 | false, 123 | '', 124 | '', 125 | false, 126 | ]); 127 | } 128 | 129 | // show running time on finish 130 | timer(); 131 | -------------------------------------------------------------------------------- /cli/tagging/brother_assign.php: -------------------------------------------------------------------------------- 1 | getRow(); 29 | if (!$row) { 30 | break; 31 | } 32 | if (1 == sizeof($row)) { 33 | continue; 34 | } 35 | $original = get_dataset_basename(array_shift($row)); 36 | $brothers[$original] = $row; 37 | } 38 | } 39 | 40 | //var_dump($brothers); 41 | //die(); 42 | 43 | $CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY); 44 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY); 45 | //$CkanManager = new CkanManager(CKAN_DEV_API_URL, CKAN_DEV_API_KEY); 46 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL, CKAN_UAT_API_KEY); 47 | //$CkanManager = new CkanManager(CKAN_QA_API_URL, CKAN_QA_API_KEY); 48 | 49 | /** 50 | * Sample csv 51 | * dataset,group,categories 52 | * https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment" 53 | * 
download-crossing-inventory-data-highway-rail-crossing,Agriculture, "Natural Resources and Environment;Plants and Plant Systems Agriculture" 54 | */ 55 | 56 | $CkanManager->resultsDir = $results_dir; 57 | foreach (glob(CKANMNGR_DATA_DIR . '/assign*.csv') as $csv_file) { 58 | $csv_source = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 59 | echo $CkanManager->color->green($csv_source); 60 | 61 | $basename = str_replace('.csv', '', basename($csv_file)); 62 | 63 | // fix wrong END-OF-LINE 64 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 65 | 66 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); 67 | 68 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 69 | while (true) { 70 | $row = $csv->getRow(); 71 | if (!$row) { 72 | break; 73 | } 74 | 75 | // skip headers 76 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 77 | continue; 78 | } 79 | 80 | if ($start > 0) { 81 | $start--; 82 | continue; 83 | } 84 | 85 | // format group tags 86 | $categories = []; 87 | if (isset($row['2']) && $row['2']) { 88 | $categories = explode(';', trim($row['2'])); 89 | $categories = array_map('trim', $categories); 90 | 91 | } 92 | 93 | // no anchors please 94 | $dataset = get_dataset_basename($row['0']); 95 | 96 | if (!$dataset) { 97 | continue; 98 | } 99 | 100 | echo "\tOriginal: ".$dataset . PHP_EOL; 101 | // $CkanManager->assignGroupsAndCategoriesToDatasets( 102 | // [$dataset], 103 | // trim($row['1']), 104 | // $categories, 105 | // $basename 106 | // ); 107 | if (isset($brothers[$dataset])) { 108 | foreach ($brothers[$dataset] as $brother) { 109 | if (!strlen(trim($brother))) { 110 | continue; 111 | } 112 | $brother = get_dataset_basename($brother); 113 | if (!$brother) { 114 | continue; 115 | } 116 | echo "\tUat (s):" . 
PHP_EOL; 117 | $CkanManager->assignGroupsAndCategoriesToDatasets( 118 | [$brother], 119 | trim($row['1']), 120 | $basename, 121 | $categories 122 | ); 123 | } 124 | } 125 | } 126 | } 127 | 128 | // show running time on finish 129 | timer(); 130 | -------------------------------------------------------------------------------- /cli/export_short.php: -------------------------------------------------------------------------------- 1 | writeRow([ 27 | // 'ckan id', 28 | // 'title', 29 | // 'name', 30 | // 'url', 31 | // 'identifier', 32 | // 'org title', 33 | // 'org name', 34 | // 'topics', 35 | // 'categories', 36 | //]); 37 | 38 | $CkanManager->resultsDir = $results_dir; 39 | 40 | //$brief = $CkanManager->exportShort('extras_license:"https\://creativecommons.org/publicdomain/zero/1.0/" AND (dataset_type:dataset)'); 41 | //$brief = $CkanManager->exportShort('','((collection_package_id:* OR *:*) AND license_id:"cc-by-sa" AND license:"https\://creativecommons.org/publicdomain/zero/1.0/") AND (dataset_type:dataset)'); 42 | //$brief = $CkanManager->exportShort('%28%28collection_package_id:*%20OR%20*:*%29+AND+license_id:"cc-by-sa"+AND+license:"https://creativecommons.org/publicdomain/zero/1.0/"%29'); 43 | //$brief = $CkanManager->exportShort('organization:wake-county AND (dataset_type:dataset)'); 44 | //$brief = $CkanManager->exportShort('organization:gsa-gov AND harvest_source_title:Open* AND (dataset_type:dataset)', 45 | //$brief = $CkanManager->exportShort('organization:doe-gov AND (dataset_type:dataset)'); 46 | //$brief = $CkanManager->exportShort('organization:dhs-gov AND (harvest_source_title:DHS*) AND (dataset_type:dataset)'); 47 | //$brief = $CkanManager->exportShort('organization:epa-gov AND (harvest_source_title:*Gateway) AND (dataset_type:dataset)'); 48 | //$brief = $CkanManager->exportShort('organization:epa-gov AND (metadata_type:geospatial) AND (dataset_type:dataset)'); 49 | //$brief = $CkanManager->exportShort('organization:nasa-gov AND 
(harvest_source_title:NASA*) AND (dataset_type:dataset)'); 50 | //$brief = $CkanManager->exportShort('organization:ntsb-gov AND (dataset_type:dataset)'); 51 | //$brief = $CkanManager->exportShort('organization:noaa-gov AND metadata_type:geospatial AND (dataset_type:dataset) AND groups:*'); 52 | //$brief = $CkanManager->exportShort('metadata-source:dms AND (dataset_type:dataset)'); 53 | //$brief = $CkanManager->exportShort('organization:doj-gov AND (dataset_type:dataset)'); 54 | // 'http://uat-catalog-fe-data.reisys.com/dataset/'); 55 | //$brief = $CkanManager->exportShort('(extra_harvest_source_title:Open+*) AND (dataset_type:dataset)'); 56 | //$brief = $CkanManager->exportShort('organization:gsa-gov AND (dataset_type:dataset)'); 57 | //$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)'); 58 | //$brief = $CkanManager->exportShort('organization:doe-gov AND (harvest_source_title:Energy*) AND (dataset_type:dataset)'); 59 | //$brief = $CkanManager->exportShort('organization:state-of-oklahoma AND (dataset_type:dataset)'); 60 | //$brief = $CkanManager->exportShort('organization:state-of-oklahoma AND -metadata_modified:[2016-02-24T23:59:59.999Z TO 2016-02-27T00:00:00Z] AND (dataset_type:dataset)'); 61 | //$brief = $CkanManager->exportShort('organization:noaa-gov AND metadata-source:dms AND (dataset_type:dataset)'); 62 | //$brief = $CkanManager->exportShort('organization:dot-gov AND (dataset_type:dataset) AND publisher:"Federal Aviation Administration"'); 63 | //$brief = $CkanManager->exportShort('organization:nd-gov AND (dataset_type:dataset)'); 64 | //$brief = $CkanManager->exportShort('organization:opm-gov AND (dataset_type:dataset)'); 65 | //$brief = $CkanManager->exportShort('organization:fs-fed-us AND (dataset_type:dataset)'); 66 | //$brief = $CkanManager->exportShort('metadata-source:dms AND (dataset_type:dataset)'); 67 | $brief = $CkanManager->exportShort('organization:usa-net AND (dataset_type:dataset)'); 68 | 
69 | // an export that matched nothing has no first row to take the column names from, so write nothing then 70 | $headers = $brief ? array_keys(reset($brief)) : []; 71 | if ($brief) { $csv->writeRow($headers); $csv->writeFromArray($brief); } 72 | 73 | // show running time on finish 74 | timer(); 75 | -------------------------------------------------------------------------------- /cli/compare_prod_vs_uat.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'topics', 35 | 'categories', 36 | ]); 37 | 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | // $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' . 42 | // ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' . 43 | // ' AND -metadata_type:geospatial AND dataset_type:dataset AND -harvest_source_id:[\'\' TO *]'); 44 | 45 | 46 | // https://catalog.data.gov/organization/nd-gov?harvest_source_title=North+Dakota+GIS+Hub+Data+Portal 47 | $prod_commerce = $ProdCkanManager->exportBrief('organization:nd-gov AND dataset_type:dataset' . 48 | ' AND harvest_source_title:North*'); 49 | $prod->writeFromArray($prod_commerce); 50 | } else { 51 | $prod = new Reader($results_dir . '/prod.csv'); 52 | $prod_commerce = $prod->getAll(); 53 | } 54 | 55 | echo 'uat.csv' . PHP_EOL; 56 | if (!is_file($results_dir . '/uat.csv')) { 57 | $uat = new Writer($results_dir . 
'/uat.csv'); 58 | 59 | $uat->writeRow([ 60 | 'title', 61 | 'title_simple', 62 | 'name', 63 | 'url', 64 | 'topics', 65 | 'categories', 66 | ]); 67 | 68 | $UatCkanManager = new CkanManager(CKAN_UAT_API_URL); 69 | $UatCkanManager->resultsDir = $results_dir; 70 | 71 | // $uat_commerce = $UatCkanManager->exportBrief('extras_harvest_source_title:Commerce JSON', '', 72 | // 'http://uat-catalog-fe-data.reisys.com/dataset/'); 73 | 74 | // http://uat-catalog-fe-data.reisys.com/organization/test-org-082615?harvest_source_title=ND.gov+New+Data.json+HS 75 | 76 | $uat_commerce = $UatCkanManager->exportBrief('organization:test-org-082615 AND harvest_source_title:ND*', '', 77 | 'http://uat-catalog-fe-data.reisys.com/dataset/'); 78 | $uat->writeFromArray($uat_commerce); 79 | 80 | } else { 81 | $uat = new Reader($results_dir . '/uat.csv'); 82 | $uat_commerce = $uat->getAll(); 83 | } 84 | 85 | $uat_commerce_by_title = []; 86 | 87 | foreach ($uat_commerce as $name => $dataset) { 88 | $title = $dataset['title_simple']; 89 | 90 | $uat_commerce_by_title[$title] = isset($uat_commerce_by_title[$title]) ? $uat_commerce_by_title[$title] : []; 91 | $uat_commerce_by_title[$title][] = $dataset; 92 | } 93 | 94 | echo 'prod_vs_uat.csv' . PHP_EOL; 95 | is_file($results_dir . '/prod_vs_uat_commerce.csv') && unlink($results_dir . '/prod_vs_uat_commerce.csv'); 96 | $csv = new Writer($results_dir . 
'/prod_vs_uat_commerce.csv'); 97 | $csv->writeRow([ 98 | 'Prod Title', 99 | 'Prod URL', 100 | 'Prod Topics', 101 | 'Prod Categories', 102 | 'Matched', 103 | 'UAT Title', 104 | 'UAT URL', 105 | 'URL Match', 106 | ]); 107 | 108 | foreach ($prod_commerce as $name => $prod_dataset) { 109 | if (isset($uat_commerce_by_title[$prod_dataset['title_simple']])) { 110 | foreach ($uat_commerce_by_title[$prod_dataset['title_simple']] as $uat_dataset) { 111 | $csv->writeRow([ 112 | $prod_dataset['title'], 113 | $prod_dataset['url'], 114 | $prod_dataset['topics'], 115 | $prod_dataset['categories'], 116 | true, 117 | $uat_dataset['title'], 118 | $uat_dataset['url'], 119 | true, 120 | ]); 121 | } 122 | continue; 123 | } 124 | 125 | $csv->writeRow([ 126 | $prod_dataset['title'], 127 | $prod_dataset['url'], 128 | $prod_dataset['topics'], 129 | $prod_dataset['categories'], 130 | false, 131 | '', 132 | '', 133 | false, 134 | ]); 135 | } 136 | 137 | // show running time on finish 138 | timer(); 139 | -------------------------------------------------------------------------------- /cli/recheck_socrata_redirects.php: -------------------------------------------------------------------------------- 1 | fail 33 | curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); 34 | // We don't want the header (use curl_getinfo()) 35 | curl_setopt($curl_ch, CURLOPT_HEADER, false); 36 | // Track the handle's request string 37 | curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); 38 | // Attempt to retrieve the modification date of the remote document. 39 | curl_setopt($curl_ch, CURLOPT_FILETIME, true); 40 | // Initialize cURL headers 41 | 42 | 43 | foreach (glob(CKANMNGR_DATA_DIR . '/redirects_*.csv') as $csv_file) { 44 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . 
PHP_EOL; 45 | echo $status; 46 | 47 | $basename = str_replace('.csv', '', basename($csv_file)); 48 | 49 | // fix wrong END-OF-LINE 50 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 51 | 52 | $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); 53 | $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv'); 54 | 55 | $csv_destination->writeRow(['from', 'to', 'status', 'real_redirect']); 56 | 57 | $i = 0; 58 | while (true) { 59 | if (!($i++ % 100)) { 60 | echo $i . PHP_EOL; 61 | } 62 | $row = $csv_source->getRow(); 63 | if (!$row) { 64 | break; 65 | } 66 | // skip headers 67 | if (in_array(trim(strtolower($row[0])), ['socrata code', 'from', 'source url'])) { 68 | // $csv_destination->writeRow($row); 69 | continue; 70 | } 71 | 72 | $socrata_url = $row[0]; 73 | $redirect_url = $row[1]; 74 | 75 | $redirect = try_get_redirect($curl_ch, $socrata_url); 76 | if (!$redirect) { 77 | echo 'No redirect: ' . $socrata_url . PHP_EOL; 78 | $csv_destination->writeRow([$socrata_url, $redirect_url, 'no redirect', '']); 79 | continue; 80 | } 81 | 82 | if (url_compare($redirect, $redirect_url)) { 83 | $csv_destination->writeRow([$socrata_url, $redirect_url, 'correct', '']); 84 | } else { 85 | echo 'Wrong redirect: ' . $socrata_url . PHP_EOL; 86 | $csv_destination->writeRow([$socrata_url, $redirect_url, 'wrong redirect', '' . 
$redirect]); 87 | continue; 88 | } 89 | } 90 | } 91 | 92 | /** Compares two URLs after removing every 'http:' / 'https:' occurrence and trimming slashes and spaces from both ends. 93 | * @param string $url1 94 | * @param string $url2 95 | * 96 | * @return bool true when the normalised strings are identical 97 | */ 98 | function url_compare($url1, $url2) 99 | { 100 | $url1_strip = trim(str_replace(['http:', 'https:'], '', $url1), '/ '); 101 | $url2_strip = trim(str_replace(['http:', 'https:'], '', $url2), '/ '); 102 | 103 | return ($url1_strip === $url2_strip); 104 | } 105 | 106 | /** Requests $url on the given cURL handle and reports its redirect target (curl 'redirect_url' info, else an HTML meta refresh). 107 | * @param resource|\CurlHandle $curl_ch cURL handle, reused across calls 108 | * @param string $url URL to probe 109 | * 110 | * @return string|false redirect target URL, or false when no redirect was detected 111 | */ 112 | function try_get_redirect($curl_ch, $url) 113 | { 114 | curl_setopt($curl_ch, CURLOPT_URL, $url); 115 | $method = 'GET'; 116 | 117 | // Set cURL method. 118 | curl_setopt($curl_ch, CURLOPT_CUSTOMREQUEST, $method); 119 | 120 | // Execute request and get response headers. 121 | $response = curl_exec($curl_ch); 122 | $info = curl_getinfo($curl_ch); 123 | if (isset($info['redirect_url']) && $info['redirect_url']) { 124 | return $info['redirect_url']; 125 | } 126 | 127 | if (is_string($response) && stripos($response, 'http-equiv="refresh"') !== false) { // curl_exec() may return a bool; offset 0 is a valid match 128 | $pattern = '/content="0;\s*URL=(http[^"\s]+)"/i'; // case-insensitive like the stripos() guard; stop at the closing quote 129 | preg_match($pattern, $response, $matches, PREG_OFFSET_CAPTURE, 3); 130 | if ($matches && isset($matches[1]) && isset($matches[1][0])) { 131 | return $matches[1][0]; 132 | } 133 | } 134 | 135 | return false; 136 | } 137 | 138 | // show running time on finish 139 | timer(); 140 | -------------------------------------------------------------------------------- /cli/tagging/generate_brothers_assign_csv.php: -------------------------------------------------------------------------------- 1 | getRow(); 29 | if (!$row) { 30 | break; 31 | } 32 | if (1 == sizeof($row)) { 33 | continue; 34 | } 35 | $original = get_dataset_basename(array_shift($row)); 36 | $brothers[$original] = $row; 37 | } 38 | } 39 | 40 | //var_dump($brothers); 41 | //die(); 42 | 43 | //$CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY); 44 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY); 45 | 
//$CkanManager = new CkanManager(CKAN_DEV_API_URL, CKAN_DEV_API_KEY); 46 | $CkanManager = new CkanManager(CKAN_UAT_API_URL, CKAN_UAT_API_KEY); 47 | //$CkanManager = new CkanManager(CKAN_QA_API_URL, CKAN_QA_API_KEY); 48 | 49 | /** 50 | * Sample csv 51 | * dataset,group,categories 52 | * https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment" 53 | * download-crossing-inventory-data-highway-rail-crossing,Agriculture, "Natural Resources and Environment;Plants and Plant Systems Agriculture" 54 | */ 55 | 56 | $CkanManager->resultsDir = $results_dir; 57 | foreach (glob(CKANMNGR_DATA_DIR . '/assign*.csv') as $csv_file) { 58 | $csv_source = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 59 | echo $CkanManager->color->green($csv_source); 60 | 61 | $basename = str_replace('.csv', '', basename($csv_file)); 62 | 63 | // fix wrong END-OF-LINE 64 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 65 | 66 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX); 67 | 68 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 69 | $output = new EasyCSV\Writer($results_dir.'/'.$basename.'_clones.csv'); 70 | while (true) { 71 | $row = $csv->getRow(); 72 | if (!$row) { 73 | break; 74 | } 75 | 76 | // skip headers 77 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 78 | continue; 79 | } 80 | 81 | if ($start > 0) { 82 | $start--; 83 | continue; 84 | } 85 | 86 | // format group tags 87 | $categories = isset($row['2'])?trim($row['2']):''; 88 | // if (isset($row['2']) && $row['2']) { 89 | // $categories = explode(';', trim($row['2'])); 90 | // $categories = array_map('trim', $categories); 91 | // } 92 | 93 | // no anchors please 94 | $dataset = get_dataset_basename($row['0']); 95 | 96 | if (!$dataset) { 97 | continue; 98 | } 99 | 100 | // echo "\tOriginal: ".$dataset . 
PHP_EOL; 101 | // $CkanManager->assignGroupsAndCategoriesToDatasets( 102 | // [$dataset], 103 | // trim($row['1']), 104 | // $categories, 105 | // $basename 106 | // ); 107 | $output->writeRow([$dataset,trim($row['1']),$categories]); 108 | echo join(' , ',[$dataset,trim($row['1']),$categories]).PHP_EOL; 109 | 110 | 111 | if (isset($brothers[$dataset])) { 112 | foreach ($brothers[$dataset] as $brother) { 113 | if (!strlen(trim($brother))) { 114 | continue; 115 | } 116 | $brother = get_dataset_basename($brother); 117 | if (!$brother) { 118 | continue; 119 | } 120 | $output->writeRow([$brother,trim($row['1']),$categories]); 121 | echo join(' , ',[$brother,trim($row['1']),$categories]).PHP_EOL; 122 | // echo "\tUat (s):" . PHP_EOL; 123 | // $CkanManager->assignGroupsAndCategoriesToDatasets( 124 | // [$brother], 125 | // trim($row['1']), 126 | // $categories, 127 | // $basename 128 | // ); 129 | } 130 | } 131 | } 132 | } 133 | 134 | // show running time on finish 135 | timer(); 136 | -------------------------------------------------------------------------------- /cli/epa-gov_process/compare_qa_vs_prod_epa.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'guid', 35 | 'topics', 36 | 'categories', 37 | ]); 38 | 39 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 40 | $ProdCkanManager->resultsDir = $results_dir; 41 | 42 | $prod_epa = $ProdCkanManager->exportBrief('organization:epa-gov'); 43 | $prod->writeFromArray($prod_epa); 44 | } else { 45 | $prod = new Reader($results_dir . '/prod.csv'); 46 | $prod_epa = $prod->getAll(); 47 | } 48 | 49 | echo 'qa.csv' . PHP_EOL; 50 | if (!is_file($results_dir . '/qa.csv')) { 51 | $qa = new Writer($results_dir . 
'/qa.csv'); 52 | 53 | $qa->writeRow([ 54 | 'title', 55 | 'title_simple', 56 | 'name', 57 | 'url', 58 | 'guid', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | 63 | $QaCkanManager = new CkanManager(CKAN_QA_API_URL); 64 | $QaCkanManager->resultsDir = $results_dir; 65 | 66 | $qa_epa = $QaCkanManager->exportBrief('organization:epa-gov', '', 'http://qa-catalog-fe-data.reisys.com/dataset/'); 67 | $qa->writeFromArray($qa_epa); 68 | 69 | } else { 70 | $qa = new Reader($results_dir . '/qa.csv'); 71 | $qa_epa = $qa->getAll(); 72 | } 73 | 74 | $qa_epa_by_title = $qa_epa_by_guid = []; 75 | 76 | foreach ($qa_epa as $name => $dataset) { 77 | $title = $dataset['title_simple']; 78 | 79 | $qa_epa_by_title[$title] = isset($qa_epa_by_title[$title]) ? $qa_epa_by_title[$title] : []; 80 | $qa_epa_by_title[$title][] = $dataset; 81 | 82 | $guid = trim($dataset['guid']); 83 | if ($guid) { 84 | $qa_epa_by_guid[$guid] = isset($qa_epa_by_guid[$guid]) ? $qa_epa_by_guid[$guid] : []; 85 | $qa_epa_by_guid[$guid][] = $dataset; 86 | } 87 | } 88 | 89 | echo 'prod_vs_qa.csv' . PHP_EOL; 90 | is_file($results_dir . '/prod_vs_qa_epa.csv') && unlink($results_dir . '/prod_vs_qa_epa.csv'); 91 | $csv = new Writer($results_dir . 
'/prod_vs_qa_epa.csv'); 92 | $csv->writeRow([ 93 | 'Prod Title', 94 | 'Prod URL', 95 | 'Prod GUID', 96 | 'Prod Topics', 97 | 'Prod Categories', 98 | 'Matched', 99 | 'QA Title', 100 | 'QA URL', 101 | 'QA GUID', 102 | 'URL Match', 103 | 'GUID Match', 104 | ]); 105 | 106 | foreach ($prod_epa as $name => $prod_dataset) { 107 | if (isset($qa_epa_by_guid[$prod_dataset['guid']])) { 108 | foreach ($qa_epa_by_guid[$prod_dataset['guid']] as $qa_dataset) { 109 | $csv->writeRow([ 110 | $prod_dataset['title'], 111 | $prod_dataset['url'], 112 | $prod_dataset['guid'], 113 | $prod_dataset['topics'], 114 | $prod_dataset['categories'], 115 | true, 116 | $qa_dataset['title'], 117 | $qa_dataset['url'], 118 | $qa_dataset['guid'], 119 | (bool)($prod_dataset['name'] == $qa_dataset['name']), 120 | true, 121 | ]); 122 | } 123 | continue; 124 | } 125 | 126 | if (isset($qa_epa_by_title[$prod_dataset['title_simple']])) { 127 | foreach ($qa_epa_by_title[$prod_dataset['title_simple']] as $qa_dataset) { 128 | $csv->writeRow([ 129 | $prod_dataset['title'], 130 | $prod_dataset['url'], 131 | $prod_dataset['guid'], 132 | $prod_dataset['topics'], 133 | $prod_dataset['categories'], 134 | true, 135 | $qa_dataset['title'], 136 | $qa_dataset['url'], 137 | $qa_dataset['guid'], 138 | true, 139 | (bool)($prod_dataset['guid'] == $qa_dataset['guid']), 140 | ]); 141 | } 142 | continue; 143 | } 144 | 145 | $csv->writeRow([ 146 | $prod_dataset['title'], 147 | $prod_dataset['url'], 148 | $prod_dataset['guid'], 149 | $prod_dataset['topics'], 150 | $prod_dataset['categories'], 151 | false, 152 | '', 153 | '', 154 | '', 155 | false, 156 | false, 157 | ]); 158 | } 159 | 160 | // show running time on finish 161 | timer(); 162 | -------------------------------------------------------------------------------- /cli/epa-gov_process/__compare_json_vs_prod_epa.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 
'guid', 35 | 'topics', 36 | 'categories', 37 | ]); 38 | 39 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 40 | $ProdCkanManager->resultsDir = $results_dir; 41 | 42 | $json_backup_epa = $ProdCkanManager->exportBrief('organization:epa-gov AND metadata_type:geospatial'); 43 | $json->writeFromArray($json_backup_epa); 44 | } else { 45 | $json = new Reader($results_dir . '/json.csv'); 46 | $json_backup_epa = $json->getAll(); 47 | } 48 | 49 | echo 'prod.csv' . PHP_EOL; 50 | if (!is_file($results_dir . '/prod.csv')) { 51 | $prod = new Writer($results_dir . '/prod.csv'); 52 | 53 | $prod->writeRow([ 54 | 'title', 55 | 'title_simple', 56 | 'name', 57 | 'url', 58 | 'guid', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | 63 | $QaCkanManager = new CkanManager(CKAN_UAT_API_URL); 64 | $QaCkanManager->resultsDir = $results_dir; 65 | 66 | $prod_epa = $QaCkanManager->exportBrief('organization:epa-gov AND metadata_type:geospatial'); 67 | $prod->writeFromArray($prod_epa); 68 | 69 | } else { 70 | $prod = new Reader($results_dir . '/prod.csv'); 71 | $prod_epa = $prod->getAll(); 72 | } 73 | 74 | $prod_epa_by_title = $prod_epa_by_guid = []; 75 | 76 | foreach ($prod_epa as $name => $dataset) { 77 | $title = $dataset['title_simple']; 78 | 79 | $prod_epa_by_title[$title] = isset($prod_epa_by_title[$title]) ? $prod_epa_by_title[$title] : []; 80 | $prod_epa_by_title[$title][] = $dataset; 81 | 82 | $guid = trim($dataset['guid']); 83 | if ($guid) { 84 | $prod_epa_by_guid[$guid] = isset($prod_epa_by_guid[$guid]) ? $prod_epa_by_guid[$guid] : []; 85 | $prod_epa_by_guid[$guid][] = $dataset; 86 | } 87 | } 88 | 89 | echo 'json_vs_prod.csv' . PHP_EOL; 90 | is_file($results_dir . '/json_vs_prod_epa.csv') && unlink($results_dir . '/json_vs_prod_epa.csv'); 91 | $csv = new Writer($results_dir . 
'/json_vs_prod_epa.csv'); 92 | $csv->writeRow([ 93 | 'Backup Title', 94 | 'Backup URL', 95 | 'Backup GUID', 96 | 'Backup Topics', 97 | 'Backup Categories', 98 | 'Matched', 99 | 'Prod Title', 100 | 'Prod URL', 101 | 'Prod GUID', 102 | 'URL Match', 103 | 'GUID Match', 104 | ]); 105 | 106 | foreach ($json_backup_epa as $name => $backup_dataset) { 107 | if (isset($prod_epa_by_guid[$backup_dataset['guid']])) { 108 | foreach ($prod_epa_by_guid[$backup_dataset['guid']] as $prod_dataset) { 109 | $csv->writeRow([ 110 | $backup_dataset['title'], 111 | $backup_dataset['url'], 112 | $backup_dataset['guid'], 113 | $backup_dataset['topics'], 114 | $backup_dataset['categories'], 115 | true, 116 | $prod_dataset['title'], 117 | $prod_dataset['url'], 118 | $prod_dataset['guid'], 119 | (bool)($backup_dataset['name'] == $prod_dataset['name']), 120 | true, 121 | ]); 122 | } 123 | continue; 124 | } 125 | 126 | if (isset($prod_epa_by_title[$backup_dataset['title_simple']])) { 127 | foreach ($prod_epa_by_title[$backup_dataset['title_simple']] as $prod_dataset) { 128 | $csv->writeRow([ 129 | $backup_dataset['title'], 130 | $backup_dataset['url'], 131 | $backup_dataset['guid'], 132 | $backup_dataset['topics'], 133 | $backup_dataset['categories'], 134 | true, 135 | $prod_dataset['title'], 136 | $prod_dataset['url'], 137 | $prod_dataset['guid'], 138 | true, 139 | (bool)($backup_dataset['guid'] == $prod_dataset['guid']), 140 | ]); 141 | } 142 | continue; 143 | } 144 | 145 | $csv->writeRow([ 146 | $backup_dataset['title'], 147 | $backup_dataset['url'], 148 | $backup_dataset['guid'], 149 | $backup_dataset['topics'], 150 | $backup_dataset['categories'], 151 | false, 152 | '', 153 | '', 154 | '', 155 | false, 156 | false, 157 | ]); 158 | } 159 | 160 | // show running time on finish 161 | timer(); 162 | -------------------------------------------------------------------------------- /src/CKAN/Manager/ExploreApi.php: -------------------------------------------------------------------------------- 1 
| api_url = $api_url; 35 | 36 | // Create cURL object. 37 | $this->curl_handler = curl_init(); 38 | // Follow any Location: headers that the server sends. 39 | curl_setopt($this->curl_handler, CURLOPT_FOLLOWLOCATION, true); 40 | // However, don't follow more than five Location: headers. 41 | curl_setopt($this->curl_handler, CURLOPT_MAXREDIRS, 5); 42 | // Automatically set the Referrer: field in requests 43 | // following a Location: redirect. 44 | curl_setopt($this->curl_handler, CURLOPT_AUTOREFERER, true); 45 | // Return the transfer as a string instead of dumping to screen. 46 | curl_setopt($this->curl_handler, CURLOPT_RETURNTRANSFER, true); 47 | // If it takes more than 5 minutes => fail 48 | curl_setopt($this->curl_handler, CURLOPT_TIMEOUT, 60 * 5); 49 | // We don't want the header (use curl_getinfo()) 50 | curl_setopt($this->curl_handler, CURLOPT_HEADER, false); 51 | // Track the handle's request string 52 | curl_setopt($this->curl_handler, CURLINFO_HEADER_OUT, true); 53 | // Attempt to retrieve the modification date of the remote document. 54 | curl_setopt($this->curl_handler, CURLOPT_FILETIME, true); 55 | // Initialize cURL headers 56 | $this->set_headers(); 57 | } 58 | 59 | /** 60 | * Sets the custom cURL headers: stores the Date (current UTC time) and Accept* request headers in $this->ch_headers. 61 | * @access private 62 | * @return void 63 | * @since Version 0.1.0 64 | */ 65 | private function set_headers() 66 | { 67 | $date = new \DateTime('now', new \DateTimeZone('UTC')); // 'now' instead of null: passing null is deprecated since PHP 8.1 68 | $this->ch_headers = [ 69 | 'Date: ' . $date->format('D, d M Y H:i:s') . ' GMT', // RFC 1123 70 | 'Accept: application/json', 71 | 'Accept-Charset: utf-8', 72 | 'Accept-Encoding: gzip' // NOTE(review): no CURLOPT_ENCODING is set in the visible handle setup, so a gzip-encoded body may be returned undecoded - confirm 73 | ]; 74 | } 75 | 76 | /** 77 | * @param $json_id 78 | * 79 | * @return mixed 80 | */ 81 | public function get_json($json_id) 82 | { 83 | return $this->make_request( 84 | 'GET', 85 | 'views/' . $json_id . 
'.json' 86 | ); 87 | } 88 | 89 | /** Sends one HTTP request to $this->api_url . $uri on the shared cURL handle. 90 | * @param string $method // HTTP method (GET, POST) 91 | * @param string $uri // URI fragment appended to the API base URL 92 | * @param string|null $data // Optional. String in JSON-format that will be in request body; a POST without data is sent as GET 93 | * 94 | * @return string|false // Raw result of curl_exec() (not decoded); FALSE when the transfer failed (http_code 0). 95 | * @throws \Exception // on an unsupported method or any HTTP status other than 200 / 0 96 | */ 97 | private function make_request($method, $uri, $data = null) 98 | { 99 | $method = strtoupper($method); 100 | if (!in_array($method, ['GET', 'POST'], true)) { 101 | throw new \Exception('Method ' . $method . ' is not supported'); 102 | } 103 | // Set cURL URI. 104 | curl_setopt($this->curl_handler, CURLOPT_URL, $this->api_url . $uri); 105 | if ($method === 'POST') { 106 | if ($data) { 107 | curl_setopt($this->curl_handler, CURLOPT_POSTFIELDS, urlencode($data)); 108 | } else { 109 | $method = 'GET'; 110 | } 111 | } 112 | 113 | // Set cURL method. 114 | curl_setopt($this->curl_handler, CURLOPT_CUSTOMREQUEST, $method); 115 | 116 | // Set headers. 117 | curl_setopt($this->curl_handler, CURLOPT_HTTPHEADER, $this->ch_headers); 118 | // Execute request and get response headers. 119 | $response = curl_exec($this->curl_handler); 120 | $info = curl_getinfo($this->curl_handler); 121 | // Check HTTP response code 122 | if ($info['http_code'] !== 200) { 123 | switch ($info['http_code']) { 124 | case 0: 125 | var_dump($info); 126 | break; 127 | case 404: 128 | throw new \Exception('404 Not Found: ' . $uri . ($data ? PHP_EOL . $data : '')); 129 | // no break needed: $data is null for GET requests, so the message names the URI instead 130 | default: 131 | throw new \Exception( 132 | $info['http_code'] . ': ' . PHP_EOL . $data . 
PHP_EOL 133 | ); 134 | } 135 | } 136 | 137 | return $response; 138 | } 139 | 140 | /** 141 | * Since it's possible to leave cURL open, this is the last chance to close it 142 | */ 143 | public function __destruct() 144 | { 145 | if ($this->curl_handler) { 146 | curl_close($this->curl_handler); 147 | unset($this->curl_handler); 148 | } 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ckan-php-manager 2 | ================ 3 | 4 | [![Build Status](https://travis-ci.org/GSA/ckan-php-manager.svg?branch=master)](https://travis-ci.org/GSA/ckan-php-manager) 5 | [![Codacy Badge](https://api.codacy.com/project/badge/a07828e07ef9416583a88beedf6ff072)](https://www.codacy.com/app/alexandr-perfilov/ckan-php-manager) 6 | [![Join the chat at https://gitter.im/GSA/ckan-php-manager](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/GSA/ckan-php-manager?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 7 | 8 | A bunch of scripts to perform tasks using CKAN API and https://github.com/GSA/ckan-php-client 9 | 10 | ## Requirements 11 | 12 | * PHP 7.0+ : 13 | 14 | ## Installation 15 | 16 | ### Clone repository 17 | $ git clone https://github.com/GSA/ckan-php-manager.git 18 | 19 | ### Composer 20 | Use [composer](#composer) to install/update dependencies 21 | 22 | If you don't have Composer, [install](https://getcomposer.org/download/) it: 23 | 24 | $ curl -sS https://getcomposer.org/installer | php 25 | $ mv composer.phar /usr/local/bin/composer 26 | 27 | #### Install dependencies: 28 | 29 | $ composer install 30 | 31 | ### Configuration 32 | Copy config.sample.php to config.php. Update it with your custom values, if needed. 
33 | 34 | $ cp inc/config.sample.php inc/config.php 35 | 36 | ## Usage 37 | 38 | ### Export all packages by Agency name, including all Sub Agencies 39 | 40 | * Update `cli/export_packages_by_org.php`, setting ORGANIZATION_TO_EXPORT to the title of the organization to export 41 | * Run the exporter using php 42 | 43 | ``` 44 | $ php cli/export_packages_by_org.php 45 | ``` 46 | 47 | The script takes all terms, including sub-agencies, from http://www.data.gov/app/themes/roots-nextdatagov/assets/Json/fed_agency.json and makes CKAN requests, 48 | looking for packages that belong to the listed organizations. 49 | 50 | Results can be found in the /results/{timestamp} dir after the script has finished, including `_{term}.log` with package counts for each agency. 51 | 52 | ### DMS legacy tag 53 | 54 | To add the tag `add_legacy_dms_and_make_private` to all datasets of some group: 55 | 56 | * Update ORGANIZATION_TO_TAG in the `cli/add_legacy_dms_and_make_private.php` 57 | * Double check CKAN_URL and CKAN_API_KEY for editing datasets 58 | * Run script 59 | 60 | ``` 61 | $ php cli/add_legacy_dms_and_make_private.php 62 | ``` 63 | 64 | ### Assign groups and category tags to datasets 65 | 66 | * Put csv files into the /data dir, named `assign_*.csv` (must have the `assign_` prefix) 67 | The format of these files must be: 68 | `dataset, group, categories` 69 | 70 | The first line is a header row; keep it as the first line in each file: 71 | `dataset,group,categories` 72 | 73 | Then put one dataset per line. 74 | 75 | 1. Dataset can be: 76 | * Dataset url, e.g. https://catalog.data.gov/dataset/food-access-research-atlas 77 | * Dataset name, e.g. download-crossing-inventory-data-highway-rail-crossing 78 | * Dataset id 79 | 80 | 2. Group 81 | Just one group per line. If you need to add multiple groups, create another csv row with the same dataset and another group, 82 | because all the categories are tagged with the current row's group. 
Make sure your group exists in your CKAN instance (to list all 83 | existing groups, go to http://catalog.data.gov/api/3/action/group_list?all_fields=true , replacing `catalog.data.gov` with your 84 | CKAN domain) 85 | 86 | 3. Categories 87 | one or multiple categories for the current row's group, separated by semicolons `;` 88 | 89 | Example csv file: 90 | 91 | ``` 92 | dataset, group, categories 93 | https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment" 94 | aerial-image-of-alaskas-arctic-coastal-plain-1955,Climate,"Arctic; Arctic Ocean, Sea Ice and Coasts; Permafrost and Arctic Landscapes" 95 | 28d30c1f-75a5-4042-b0fc-de26cc7d70f2,Climate,Arctic; Arctic Development and Transport 96 | ``` 97 | * Double check CKAN_URL and CKAN_API_KEY for editing datasets, defined in `inc/config.php` 98 | * Run script 99 | 100 | ``` 101 | $ php cli/tagging/assign_groups_and_tags.php 102 | ``` 103 | * Detailed logs and results are stored in the folder `results/[time-stamp]_ASSIGN_GROUPS` 104 | 105 | ### Remove groups and category tags from datasets (revert previous script changes) 106 | 107 | * Prepare the same kind of csv files as for the previous script and put them into the /data dir, named `remove_*.csv` (must have the `remove_` prefix) 108 | 109 | ``` 110 | $ php cli/tagging/remove_groups_and_tags.php 111 | ``` 112 | * This command will remove the listed categories from the dataset of the row. If an empty list of categories is provided, this command will remove the group and all categories from the dataset. 113 | 114 | ## CKAN API DOCs 115 | 116 | http://docs.ckan.org/en/latest/api/index.html 117 | 118 | 119 | ## Docker setup 120 | 121 | To minimize requirements on a system, we've added a minimal setup with 122 | docker-compose. This should replace the above usage instructions as the default 123 | workflow. 124 | 125 | $ docker-compose build 126 | $ docker-compose run --rm app php cli/harvest_stats_csv.php 127 | 128 | Run the tests. 
129 | 130 | $ docker-compose run --rm app phpunit 131 | -------------------------------------------------------------------------------- /cli/compare_basic.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'identifier', 35 | 'guid', 36 | 'topics', 37 | 'categories', 38 | ]); 39 | 40 | $CkanManager = new CkanManager(CKAN_API_URL); 41 | $CkanManager->resultsDir = $results_dir; 42 | 43 | $cmp1 = $CkanManager->exportBrief('organization:((eop-gov) OR (omb-eop-gov) OR (ondcp-eop-gov) OR (ceq-eop-gov) ' . 44 | 'OR (ostp-eop-gov) OR (ustr-eop-gov) OR (wh-eop-gov)) DMS AND dataset_type:dataset'); 45 | $cmp1_csv->writeFromArray($cmp1); 46 | } else { 47 | $cmp1_csv = new Reader($results_dir . '/cmp1.csv'); 48 | $cmp1_csv->getHeaders(); 49 | $cmp1 = $cmp1_csv->getAll(); 50 | } 51 | 52 | echo 'cmp2.csv' . PHP_EOL; 53 | if (!is_file($results_dir . '/cmp2.csv')) { 54 | $cmp2_csv = new Writer($results_dir . '/cmp2.csv'); 55 | 56 | $cmp2_csv->writeRow([ 57 | 'title', 58 | 'title_simple', 59 | 'name', 60 | 'url', 61 | 'identifier', 62 | 'guid', 63 | 'topics', 64 | 'categories', 65 | ]); 66 | 67 | $CkanManager = new CkanManager(CKAN_API_URL); 68 | $CkanManager->resultsDir = $results_dir; 69 | 70 | $cmp2 = $CkanManager->exportBrief('organization:((eop-gov) OR (omb-eop-gov) OR (ondcp-eop-gov) OR (ceq-eop-gov) ' . 71 | 'OR (ostp-eop-gov) OR (ustr-eop-gov) OR (wh-eop-gov)) -DMS AND dataset_type:dataset'); 72 | $cmp2_csv->writeFromArray($cmp2); 73 | 74 | } else { 75 | $cmp2_csv = new Reader($results_dir . '/cmp2.csv'); 76 | $cmp2 = $cmp2_csv->getAll(); 77 | } 78 | 79 | $cmp2_by_title = $cmp2_by_guid = []; 80 | 81 | foreach ($cmp2 as $name => $dataset) { 82 | $title = $dataset['title_simple']; 83 | 84 | $cmp2_by_title[$title] = isset($cmp2_by_title[$title]) ? 
$cmp2_by_title[$title] : []; 85 | $cmp2_by_title[$title][] = $dataset; 86 | 87 | $guid = trim($dataset['guid']); 88 | if ($guid) { 89 | $cmp2_by_guid[$guid] = isset($cmp2_by_guid[$guid]) ? $cmp2_by_guid[$guid] : []; 90 | $cmp2_by_guid[$guid][] = $dataset; 91 | } 92 | } 93 | 94 | echo 'comparison.csv' . PHP_EOL; 95 | is_file($results_dir . '/comparison.csv') && unlink($results_dir . '/comparison.csv'); 96 | $csv = new Writer($results_dir . '/comparison.csv'); 97 | $cmp1_header = "DMS"; 98 | $cmp2_header = "NON-DMS"; 99 | $csv->writeRow([ 100 | $cmp1_header . ' Title', 101 | $cmp1_header . ' URL', 102 | $cmp1_header . ' GUID', 103 | $cmp1_header . ' Topics', 104 | $cmp1_header . ' Categories', 105 | 'Matched', 106 | $cmp2_header . ' Title', 107 | $cmp2_header . ' URL', 108 | $cmp2_header . ' GUID', 109 | 'URL Match', 110 | 'GUID Match', 111 | ]); 112 | 113 | foreach ($cmp1 as $name => $cmp1_dataset) { 114 | if (isset($cmp2_by_guid[$cmp1_dataset['guid']])) { 115 | foreach ($cmp2_by_guid[$cmp1_dataset['guid']] as $cmp2_dataset) { 116 | $csv->writeRow([ 117 | $cmp1_dataset['title'], 118 | $cmp1_dataset['url'], 119 | $cmp1_dataset['guid'], 120 | $cmp1_dataset['topics'], 121 | $cmp1_dataset['categories'], 122 | true, 123 | $cmp2_dataset['title'], 124 | $cmp2_dataset['url'], 125 | $cmp2_dataset['guid'], 126 | (bool)($cmp1_dataset['name'] && $cmp1_dataset['name'] == $cmp2_dataset['name']), 127 | true, 128 | ]); 129 | } 130 | continue; 131 | } 132 | 133 | if (isset($cmp2_by_title[$cmp1_dataset['title_simple']])) { 134 | foreach ($cmp2_by_title[$cmp1_dataset['title_simple']] as $cmp2_dataset) { 135 | $csv->writeRow([ 136 | $cmp1_dataset['title'], 137 | $cmp1_dataset['url'], 138 | $cmp1_dataset['guid'], 139 | $cmp1_dataset['topics'], 140 | $cmp1_dataset['categories'], 141 | true, 142 | $cmp2_dataset['title'], 143 | $cmp2_dataset['url'], 144 | $cmp2_dataset['guid'], 145 | true, 146 | (bool)($cmp1_dataset['guid'] && $cmp1_dataset['guid'] == $cmp2_dataset['guid']), 147 | ]); 148 
| } 149 | continue; 150 | } 151 | 152 | $csv->writeRow([ 153 | $cmp1_dataset['title'], 154 | $cmp1_dataset['url'], 155 | $cmp1_dataset['guid'], 156 | $cmp1_dataset['topics'], 157 | $cmp1_dataset['categories'], 158 | false, 159 | '', 160 | '', 161 | '', 162 | false, 163 | false, 164 | ]); 165 | } 166 | 167 | // show running time on finish 168 | timer(); 169 | -------------------------------------------------------------------------------- /cli/noaa-gov/compare_qa_vs_prod_noaa.php: -------------------------------------------------------------------------------- 1 | writeRow([ 29 | 'title', 30 | 'title_simple', 31 | 'name', 32 | 'url', 33 | 'identifier', 34 | 'guid', 35 | 'topics', 36 | 'categories', 37 | ]); 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | $prod_noaa = $ProdCkanManager->exportBrief('organization:noaa-gov AND metadata_type:geospatial AND dataset_type:dataset'); 42 | $prod->writeFromArray($prod_noaa); 43 | file_put_contents($results_dir . '/prod.json', json_encode($prod_noaa, JSON_PRETTY_PRINT)); 44 | } else { 45 | $prod_noaa = json_decode(file_get_contents($results_dir . '/prod.json'), true); // associative arrays: rows are read below as $prod_dataset['key'] 46 | } 47 | 48 | echo 'qa.json' . PHP_EOL; 49 | if (!is_file($results_dir . '/qa.json')) { 50 | $qa = new Writer($results_dir . '/qa.csv'); 51 | 52 | $qa->writeRow([ 53 | 'title', 54 | 'title_simple', 55 | 'name', 56 | 'url', 57 | 'identifier', 58 | 'guid', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | $QaCkanManager = new CkanManager(CKAN_QA_API_URL); 63 | $QaCkanManager->resultsDir = $results_dir; 64 | 65 | $qa_noaa = $QaCkanManager->exportBrief('organization:noaa-gov', '', 66 | 'http://qa-catalog-fe-data.reisys.com/dataset/'); 67 | $qa->writeFromArray($qa_noaa); 68 | file_put_contents($results_dir . '/qa.json', json_encode($qa_noaa, JSON_PRETTY_PRINT)); 69 | } else { 70 | $qa_noaa = json_decode(file_get_contents($results_dir . 
'/qa.json'), true); // associative arrays: rows are read below as $dataset['key'] 71 | } 72 | 73 | $qa_noaa_by_title = $qa_noaa_by_guid = []; 74 | 75 | // index QA datasets by simplified title and by trimmed guid (several datasets may share a key) 76 | foreach ($qa_noaa as $name => $dataset) { $title = $dataset['title_simple']; 77 | 78 | $qa_noaa_by_title[$title] = isset($qa_noaa_by_title[$title]) ? $qa_noaa_by_title[$title] : []; 79 | $qa_noaa_by_title[$title][] = $dataset; 80 | 81 | $guid = trim($dataset['guid']); 82 | if ($guid) { 83 | $qa_noaa_by_guid[$guid] = isset($qa_noaa_by_guid[$guid]) ? $qa_noaa_by_guid[$guid] : []; 84 | $qa_noaa_by_guid[$guid][] = $dataset; 85 | } 86 | } 87 | 88 | echo 'prod_vs_qa.csv' . PHP_EOL; 89 | is_file($results_dir . '/prod_vs_qa_noaa_geospatial.csv') && unlink($results_dir . '/prod_vs_qa_noaa_geospatial.csv'); 90 | $csv = new Writer($results_dir . '/prod_vs_qa_noaa_geospatial.csv'); 91 | $csv->writeRow([ 92 | 'Prod Title', 93 | 'Prod URL', 94 | 'Prod GUID', 95 | 'Prod Topics', 96 | 'Prod Categories', 97 | 'Matched', 98 | 'QA Title', 99 | 'QA URL', 100 | 'QA GUID', 101 | 'URL Match', 102 | 'Title Match', 103 | 'GUID Match', 104 | ]); 105 | 106 | foreach ($prod_noaa as $name => $prod_dataset) { 107 | if (isset($qa_noaa_by_guid[trim($prod_dataset['guid'])])) { // the index keys are trimmed guids, so trim the lookup key too 108 | foreach ($qa_noaa_by_guid[trim($prod_dataset['guid'])] as $qa_dataset) { 109 | $csv->writeRow([ 110 | $prod_dataset['title'], 111 | $prod_dataset['url'], 112 | $prod_dataset['guid'], 113 | $prod_dataset['topics'], 114 | $prod_dataset['categories'], 115 | true, 116 | $qa_dataset['title'], 117 | $qa_dataset['url'], 118 | $qa_dataset['guid'], 119 | (bool)($prod_dataset['name'] && $prod_dataset['name'] == $qa_dataset['name']), 120 | (bool)($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $qa_dataset['title_simple']), 121 | true, 122 | ]); 123 | } 124 | continue; 125 | } 126 | 127 | if (isset($qa_noaa_by_title[$prod_dataset['title_simple']])) { 128 | foreach ($qa_noaa_by_title[$prod_dataset['title_simple']] as $qa_dataset) { 129 | $csv->writeRow([ 130 | $prod_dataset['title'], 131 | $prod_dataset['url'], 132 | 
$prod_dataset['guid'], 133 | $prod_dataset['topics'], 134 | $prod_dataset['categories'], 135 | true, 136 | $qa_dataset['title'], 137 | $qa_dataset['url'], 138 | $qa_dataset['guid'], 139 | (bool)($prod_dataset['name'] && $prod_dataset['name'] == $qa_dataset['name']), 140 | true, 141 | (bool)($prod_dataset['guid'] == $qa_dataset['guid']), 142 | ]); 143 | } 144 | continue; 145 | } 146 | 147 | $csv->writeRow([ 148 | $prod_dataset['title'], 149 | $prod_dataset['url'], 150 | $prod_dataset['guid'], 151 | $prod_dataset['topics'], 152 | $prod_dataset['categories'], 153 | false, 154 | '', 155 | '', 156 | '', 157 | false, 158 | false, 159 | ]); 160 | } 161 | 162 | // show running time on finish 163 | timer(); 164 | -------------------------------------------------------------------------------- /cli/tagging/assign_groups_and_tags.php: -------------------------------------------------------------------------------- 1 | resultsDir = $results_dir; 32 | foreach (glob(CKANMNGR_DATA_DIR . '/assign_*.csv') as $csv_file) { 33 | $csv_source = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 34 | echo $CkanManager->color->green($csv_source); 35 | 36 | $basename = str_replace('.csv', '', basename($csv_file)); 37 | 38 | // fix wrong END-OF-LINE 39 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 40 | 41 | // file_put_contents($resultsDir . '/' . $basename . 
'_tags.log', $status, FILE_APPEND | LOCK_EX); 42 | 43 | $csv = new EasyCSV\Reader($csv_file, 'r+', false); 44 | while (true) { 45 | $row = $csv->getRow(); 46 | if (!$row) { 47 | break; 48 | } 49 | 50 | // skip headers 51 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) { 52 | continue; 53 | } 54 | 55 | if ($start > 0) { 56 | $start--; 57 | continue; 58 | } 59 | 60 | // format group tags 61 | $categories = []; 62 | if (isset($row['2']) && $row['2']) { 63 | $categories = explode(';', trim($row['2'])); 64 | $categories = array_map('trim', $categories); 65 | 66 | } 67 | 68 | // no anchors please 69 | list($dataset,) = explode('#', basename(trim($row['0']))); 70 | 71 | if (!$dataset) { 72 | continue; 73 | } 74 | 75 | // double trouble check 76 | if (strpos($row['0'], '://')) { 77 | if (!strpos($row['0'], '/dataset/')) { 78 | if (strpos($row['0'], 'dataset?q=')) { 79 | parse_str(parse_url($row['0'], PHP_URL_QUERY), $query_array); 80 | if (isset($query_array['q'])) { 81 | $query = $query_array['q']; 82 | if (isset($query_array['organization'])) { 83 | $org = $query_array['organization']; 84 | $organizationList = new OrganizationList(); 85 | $org = $organizationList->getTreeArrayFor($organizationList->getNameFor($org)); 86 | if (!is_array($org) || !sizeof($org)) { 87 | continue; 88 | } 89 | $org = join(' OR ', array_keys($org)); 90 | // var_dump($organizationList->getTreeArrayFor($organizationList->getNameFor($org))); 91 | // continue; 92 | $query = "$query AND organization:($org)"; 93 | 94 | 95 | // echo $query.PHP_EOL; 96 | } 97 | $packages = $CkanManager->tryPackageSearch($query, '', 200); 98 | $CkanManager->say(sizeof($packages) . " found searching: $query,API SEARCH"); 99 | file_put_contents( 100 | $results_dir . '/' . $basename . '_tags.log.csv', 101 | sizeof($packages) . " found searching: $query,API SEARCH" . 
PHP_EOL, 102 | FILE_APPEND | LOCK_EX 103 | ); 104 | // print $query_array['q']; 105 | if (!sizeof($packages)) { 106 | continue; 107 | } 108 | 109 | foreach ($packages as $package) { 110 | $CkanManager->assignGroupsAndCategoriesToDatasets( 111 | [$package['name']], 112 | trim($row['1']), 113 | $basename, 114 | $categories 115 | ); 116 | continue; 117 | } 118 | } 119 | continue; 120 | } 121 | 122 | 123 | continue; 124 | } 125 | } 126 | 127 | $CkanManager->assignGroupsAndCategoriesToDatasets( 128 | [$dataset], 129 | trim($row['1']), 130 | $basename, 131 | $categories 132 | ); 133 | } 134 | } 135 | 136 | // show running time on finish 137 | timer(); 138 | -------------------------------------------------------------------------------- /cli/check_aapi.php: -------------------------------------------------------------------------------- 1 | fail 32 | curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); 33 | // We don't want the header (use curl_getinfo()) 34 | curl_setopt($curl_ch, CURLOPT_HEADER, false); 35 | // Track the handle's request string 36 | curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true); 37 | // Attempt to retrieve the modification date of the remote document. 38 | curl_setopt($curl_ch, CURLOPT_FILETIME, true); 39 | // Initialize cURL headers 40 | 41 | foreach (glob(CKANMNGR_DATA_DIR . '/check_*.csv') as $csv_file) { 42 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL; 43 | echo $status; 44 | 45 | $basename = str_replace('.csv', '', basename($csv_file)); 46 | 47 | // fix wrong END-OF-LINE 48 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file))); 49 | 50 | $csv_source = new EasyCSV\Reader($csv_file, 'r+', false); 51 | $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv'); 52 | 53 | $csv_destination->writeRow(['dataset', 'status', 'aapi found']); 54 | 55 | $i = 0; 56 | while (true) { 57 | if (!($i++ % 100)) { 58 | echo $i . 
PHP_EOL; 59 | } 60 | $row = $csv_source->getRow(); 61 | if (!$row) { 62 | break; 63 | } 64 | // skip headers 65 | if (in_array(trim(strtolower($row[0])), ['data.gov url'])) { 66 | continue; 67 | } 68 | 69 | $url = strtolower($row[0]); 70 | 71 | if (!strpos($url, '/dataset/')) { 72 | $csv_destination->writeRow([$url, 'not a dataset', '0']); 73 | continue; 74 | } 75 | 76 | $dataset = try_get_dataset($curl_ch, str_replace('/dataset/', '/api/rest/dataset/', $url)); 77 | 78 | if (200 !== $dataset['info']['http_code']) { 79 | // Redirect check 80 | $dataset2 = try_get_dataset($curl_ch, $url); 81 | if ((404 == $dataset['info']['http_code']) && (200 == $dataset2['info']['http_code'])) { 82 | $response = $dataset2['response']; 83 | if (stripos($response, 'http-equiv="refresh"')) { 84 | $pattern = '/content="0;URL=(http[\S\/\-\.]+)"/'; 85 | preg_match($pattern, $response, $matches, PREG_OFFSET_CAPTURE, 3); 86 | if ($matches && isset($matches[1]) && isset($matches[1][0])) { 87 | $url2 = $matches[1][0]; 88 | 89 | $dataset3 = try_get_dataset($curl_ch, str_replace('/dataset/', '/api/rest/dataset/', $url2)); 90 | if (200 == $dataset3['info']['http_code']) { 91 | $aapi_found = strpos($dataset3['response'], 'aapi0916'); 92 | $csv_destination->writeRow([$url, 'ok (redirect)', ($aapi_found ? '1' : '0')]); 93 | continue; 94 | } 95 | } 96 | } 97 | } 98 | $csv_destination->writeRow([$url, $dataset['info']['http_code'], '0']); 99 | continue; 100 | } else { 101 | if (!strpos($dataset['response'], '"type": "dataset",')) { 102 | $csv_destination->writeRow([$url, 'not a dataset', '0']); 103 | continue; 104 | } 105 | $aapi_found = strpos($dataset['response'], 'aapi0916'); 106 | $csv_destination->writeRow([$url, 'ok', ($aapi_found ? 
'1' : '0')]); 107 | continue; 108 | } 109 | } 110 | } 111 | 112 | /** 113 | * @param $url1 114 | * @param $url2 115 | * 116 | * @return bool 117 | */ 118 | function url_compare($url1, $url2) 119 | { 120 | $url1_strip = trim(str_replace(['http:', 'https:'], '', $url1), '/ '); 121 | $url2_strip = trim(str_replace(['http:', 'https:'], '', $url2), '/ '); 122 | 123 | return ($url1_strip === $url2_strip); 124 | } 125 | 126 | /** 127 | * @param $curl_ch 128 | * @param $url 129 | * @return array 130 | */ 131 | function try_get_dataset($curl_ch, $url) 132 | { 133 | curl_setopt($curl_ch, CURLOPT_URL, $url); 134 | $method = 'GET'; 135 | 136 | // Set cURL method. 137 | curl_setopt($curl_ch, CURLOPT_CUSTOMREQUEST, $method); 138 | 139 | // Execute request and get response headers. 140 | $response = curl_exec($curl_ch); 141 | $info = curl_getinfo($curl_ch); 142 | 143 | $return = [ 144 | 'response' => $response, 145 | 'info' => $info 146 | ]; 147 | 148 | return $return; 149 | } 150 | 151 | // show running time on finish 152 | timer(); 153 | -------------------------------------------------------------------------------- /cli/ntsb-gov_process/compare_uat_vs_prod_ntsb.php: -------------------------------------------------------------------------------- 1 | writeRow([ 30 | 'title', 31 | 'title_simple', 32 | 'name', 33 | 'url', 34 | 'guid', 35 | 'topics', 36 | 'categories', 37 | ]); 38 | 39 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 40 | $ProdCkanManager->resultsDir = $results_dir; 41 | 42 | $prod_ntsb = $ProdCkanManager->exportBrief('organization:ntsb-gov AND dataset_type:dataset'); 43 | $prod->writeFromArray($prod_ntsb); 44 | } else { 45 | $prod = new Reader($results_dir . '/prod.csv'); 46 | $prod_ntsb = $prod->getAll(); 47 | } 48 | 49 | echo 'uat.csv' . PHP_EOL; 50 | if (!is_file($results_dir . '/uat.csv')) { 51 | $uat = new Writer($results_dir . 
'/uat.csv'); 52 | 53 | $uat->writeRow([ 54 | 'title', 55 | 'title_simple', 56 | 'name', 57 | 'url', 58 | 'guid', 59 | 'topics', 60 | 'categories', 61 | ]); 62 | 63 | $QaCkanManager = new CkanManager(CKAN_UAT_API_URL); 64 | $QaCkanManager->resultsDir = $results_dir; 65 | 66 | $uat_ntsb = $QaCkanManager->exportBrief('organization:ntsb-gov AND (harvest_source_title:NTSB*) AND dataset_type:dataset', 67 | '', 'http://uat-catalog-fe-data.reisys.com/dataset/'); 68 | $uat->writeFromArray($uat_ntsb); 69 | 70 | } else { 71 | $uat = new Reader($results_dir . '/uat.csv'); 72 | $uat_ntsb = $uat->getAll(); 73 | } 74 | 75 | $uat_ntsb_by_title = $uat_ntsb_by_guid = []; 76 | 77 | foreach ($uat_ntsb as $name => $dataset) { 78 | $title = $dataset['title_simple']; 79 | 80 | $uat_ntsb_by_title[$title] = isset($uat_ntsb_by_title[$title]) ? $uat_ntsb_by_title[$title] : []; 81 | $uat_ntsb_by_title[$title][] = $dataset; 82 | 83 | $guid = trim($dataset['guid']); 84 | if ($guid) { 85 | $uat_ntsb_by_guid[$guid] = isset($uat_ntsb_by_guid[$guid]) ? $uat_ntsb_by_guid[$guid] : []; 86 | $uat_ntsb_by_guid[$guid][] = $dataset; 87 | } 88 | } 89 | 90 | echo 'prod_vs_uat.csv' . PHP_EOL; 91 | is_file($results_dir . '/prod_vs_uat_ntsb.csv') && unlink($results_dir . '/prod_vs_uat_ntsb.csv'); 92 | $csv = new Writer($results_dir . 
'/prod_vs_uat_ntsb.csv'); 93 | $csv->writeRow([ 94 | 'Prod Title', 95 | 'Prod URL', 96 | 'Prod GUID', 97 | 'Prod Topics', 98 | 'Prod Categories', 99 | 'Matched', 100 | 'UAT Title', 101 | 'UAT URL', 102 | 'UAT GUID', 103 | 'URL Match', 104 | 'GUID Match', 105 | ]); 106 | 107 | $matched = []; 108 | 109 | foreach ($prod_ntsb as $name => $prod_dataset) { 110 | if (isset($uat_ntsb_by_guid[$prod_dataset['guid']])) { 111 | foreach ($uat_ntsb_by_guid[$prod_dataset['guid']] as $uat_dataset) { 112 | $csv->writeRow([ 113 | $prod_dataset['title'], 114 | $prod_dataset['url'], 115 | $prod_dataset['guid'], 116 | $prod_dataset['topics'], 117 | $prod_dataset['categories'], 118 | true, 119 | $uat_dataset['title'], 120 | $uat_dataset['url'], 121 | $uat_dataset['guid'], 122 | (bool)($prod_dataset['name'] == $uat_dataset['name']), 123 | true, 124 | ]); 125 | $matched[] = $uat_dataset['title_simple']; 126 | } 127 | continue; 128 | } 129 | 130 | if (isset($uat_ntsb_by_title[$prod_dataset['title_simple']])) { 131 | foreach ($uat_ntsb_by_title[$prod_dataset['title_simple']] as $uat_dataset) { 132 | $csv->writeRow([ 133 | $prod_dataset['title'], 134 | $prod_dataset['url'], 135 | $prod_dataset['guid'], 136 | $prod_dataset['topics'], 137 | $prod_dataset['categories'], 138 | true, 139 | $uat_dataset['title'], 140 | $uat_dataset['url'], 141 | $uat_dataset['guid'], 142 | true, 143 | (bool)($prod_dataset['guid'] == $uat_dataset['guid']), 144 | ]); 145 | $matched[] = $uat_dataset['title_simple']; 146 | } 147 | continue; 148 | } 149 | 150 | $csv->writeRow([ 151 | $prod_dataset['title'], 152 | $prod_dataset['url'], 153 | $prod_dataset['guid'], 154 | $prod_dataset['topics'], 155 | $prod_dataset['categories'], 156 | false, 157 | '', 158 | '', 159 | '', 160 | false, 161 | false, 162 | ]); 163 | } 164 | 165 | foreach ($uat_ntsb as $name => $uat_dataset) { 166 | if (!in_array($uat_dataset['title_simple'], $matched)) { 167 | $csv->writeRow([ 168 | '', 169 | '', 170 | '', 171 | '', 172 | '', 173 | false, 
174 | $uat_dataset['title'], 175 | $uat_dataset['url'], 176 | $uat_dataset['guid'], 177 | false, 178 | false, 179 | ]); 180 | } 181 | } 182 | 183 | // show running time on finish 184 | timer(); 185 | -------------------------------------------------------------------------------- /cli/noaa-gov/compare_uat_vs_prod_noaa.php: -------------------------------------------------------------------------------- 1 | writeRow([ 29 | 'title', 30 | 'title_simple', 31 | 'name', 32 | 'url', 33 | 'identifier', 34 | 'guid', 35 | 'topics', 36 | 'categories', 37 | ]); 38 | $ProdCkanManager = new CkanManager(CKAN_API_URL); 39 | $ProdCkanManager->resultsDir = $results_dir; 40 | 41 | $prod_noaa = $ProdCkanManager->exportBrief('organization:noaa-gov AND metadata_type:geospatial AND dataset_type:dataset'); 42 | file_put_contents($results_dir . '/prod.json', json_encode($prod_noaa, JSON_PRETTY_PRINT)); 43 | $prod->writeFromArray($prod_noaa); 44 | echo PHP_EOL.'datasets from prod: '.sizeof($prod_noaa).PHP_EOL.PHP_EOL; 45 | } else { 46 | $prod_noaa = json_decode(file_get_contents($results_dir . '/prod.json')); 47 | echo PHP_EOL.'datasets from prod: '.sizeof($prod_noaa).PHP_EOL.PHP_EOL; 48 | } 49 | 50 | echo 'uat.json' . PHP_EOL; 51 | if (!is_file($results_dir . '/uat.json')) { 52 | $uat = new Writer($results_dir . '/uat.csv'); 53 | 54 | $uat->writeRow([ 55 | 'title', 56 | 'title_simple', 57 | 'name', 58 | 'url', 59 | 'identifier', 60 | 'guid', 61 | 'topics', 62 | 'categories', 63 | ]); 64 | $uatCkanManager = new CkanManager(CKAN_UAT_API_URL); 65 | $uatCkanManager->resultsDir = $results_dir; 66 | 67 | $uat_noaa = $uatCkanManager->exportBrief('organization:noaa-gov AND extras_harvest_source_title:NOAA New CSW AND dataset_type:dataset', 68 | '', 'http://uat-catalog-fe-data.reisys.com/dataset/'); 69 | file_put_contents($results_dir . 
'/uat.json', json_encode($uat_noaa, JSON_PRETTY_PRINT)); 70 | $uat->writeFromArray($uat_noaa); 71 | echo PHP_EOL.'datasets from uat: '.sizeof($uat_noaa).PHP_EOL.PHP_EOL; 72 | } else { 73 | $uat_noaa = json_decode(file_get_contents($results_dir . '/uat.json')); 74 | echo PHP_EOL.'datasets from uat: '.sizeof($uat_noaa).PHP_EOL.PHP_EOL; 75 | } 76 | 77 | $uat_noaa_by_title = $uat_noaa_by_guid = []; 78 | 79 | foreach ($uat_noaa as $name => $dataset) { 80 | $title = $dataset['title_simple']; 81 | 82 | $uat_noaa_by_title[$title] = isset($uat_noaa_by_title[$title]) ? $uat_noaa_by_title[$title] : []; 83 | $uat_noaa_by_title[$title][] = $dataset; 84 | 85 | $guid = trim($dataset['guid']); 86 | if ($guid) { 87 | $uat_noaa_by_guid[$guid] = isset($uat_noaa_by_guid[$guid]) ? $uat_noaa_by_guid[$guid] : []; 88 | $uat_noaa_by_guid[$guid][] = $dataset; 89 | } 90 | } 91 | 92 | echo 'prod_vs_uat.csv' . PHP_EOL; 93 | is_file($results_dir . '/prod_vs_uat_noaa_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_noaa_geospatial.csv'); 94 | $csv = new Writer($results_dir . 
'/prod_vs_uat_noaa_geospatial.csv'); 95 | $csv->writeRow([ 96 | 'Prod Title', 97 | 'Prod URL', 98 | 'Prod GUID', 99 | 'Prod Topics', 100 | 'Prod Categories', 101 | 'Matched', 102 | 'UAT Title', 103 | 'UAT URL', 104 | 'UAT GUID', 105 | 'URL Match', 106 | 'Title Match', 107 | 'GUID Match', 108 | ]); 109 | 110 | foreach ($prod_noaa as $name => $prod_dataset) { 111 | if (isset($uat_noaa_by_guid[$prod_dataset['guid']])) { 112 | foreach ($uat_noaa_by_guid[$prod_dataset['guid']] as $uat_dataset) { 113 | $csv->writeRow([ 114 | $prod_dataset['title'], 115 | $prod_dataset['url'], 116 | $prod_dataset['guid'], 117 | $prod_dataset['topics'], 118 | $prod_dataset['categories'], 119 | true, 120 | $uat_dataset['title'], 121 | $uat_dataset['url'], 122 | $uat_dataset['guid'], 123 | (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']), 124 | (bool)($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $uat_dataset['title_simple']), 125 | true, 126 | ]); 127 | } 128 | continue; 129 | } 130 | 131 | if (isset($uat_noaa_by_title[$prod_dataset['title_simple']])) { 132 | foreach ($uat_noaa_by_title[$prod_dataset['title_simple']] as $uat_dataset) { 133 | $csv->writeRow([ 134 | $prod_dataset['title'], 135 | $prod_dataset['url'], 136 | $prod_dataset['guid'], 137 | $prod_dataset['topics'], 138 | $prod_dataset['categories'], 139 | true, 140 | $uat_dataset['title'], 141 | $uat_dataset['url'], 142 | $uat_dataset['guid'], 143 | (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']), 144 | true, 145 | (bool)($prod_dataset['guid'] == $uat_dataset['guid']), 146 | ]); 147 | } 148 | continue; 149 | } 150 | 151 | $csv->writeRow([ 152 | $prod_dataset['title'], 153 | $prod_dataset['url'], 154 | $prod_dataset['guid'], 155 | $prod_dataset['topics'], 156 | $prod_dataset['categories'], 157 | false, 158 | '', 159 | '', 160 | '', 161 | false, 162 | false, 163 | ]); 164 | } 165 | 166 | // show running time on finish 167 | timer(); 168 | 
--------------------------------------------------------------------------------
/cli/pbgc-gov/compare_uat_vs_prod_pbgc.php:
--------------------------------------------------------------------------------
<?php
/**
 * Compare the PBGC datasets exported from production with those exported from UAT.
 *
 * Each export is cached as JSON in $results_dir (prod.json / uat.json) and is
 * also written to CSV the first time it is fetched. Every production dataset is
 * then looked up in the UAT export, first by GUID and then by simplified title,
 * and one row per (prod, uat) pair - or a single "unmatched" row - is written
 * to prod_vs_uat_pbgc_geospatial.csv.
 */

// NOTE(review): lines 1-28 of this script (everything between "<?php" and the
// first "$prod->writeRow(") were lost when this listing was extracted. The
// progress echo, the is_file() guard and the Writer construction directly below
// are reconstructed by symmetry with the UAT branch - confirm them against the
// repository. The lost header must also bring Writer, CkanManager, the CKAN_*
// constants, timer() and $results_dir into scope; restore it before running.

// Column order shared by the prod.csv and uat.csv exports.
$export_columns = [
    'title',
    'title_simple',
    'name',
    'url',
    'identifier',
    'guid',
    'topics',
    'categories',
];

echo 'prod.json' . PHP_EOL;
$prod_cache = $results_dir . '/prod.json';
if (!is_file($prod_cache)) {
    $prod = new Writer($results_dir . '/prod.csv');
    $prod->writeRow($export_columns);

    $ProdCkanManager = new CkanManager(CKAN_API_URL);
    $ProdCkanManager->resultsDir = $results_dir;

    $prod_pbgc = $ProdCkanManager->exportBrief('organization:pbgc-gov AND dataset_type:dataset');
    file_put_contents($prod_cache, json_encode($prod_pbgc, JSON_PRETTY_PRINT));
    $prod->writeFromArray($prod_pbgc);
} else {
    // Decode to associative arrays: everything below reads datasets with
    // $dataset['key'] syntax, which fails on the stdClass objects that
    // json_decode() returns by default.
    $prod_pbgc = json_decode(file_get_contents($prod_cache), true);
    if (!is_array($prod_pbgc)) {
        fwrite(STDERR, 'Unable to read cached export: ' . $prod_cache . PHP_EOL);
        exit(1);
    }
}
echo PHP_EOL . 'datasets from prod: ' . count($prod_pbgc) . PHP_EOL . PHP_EOL;

echo 'uat.json' . PHP_EOL;
$uat_cache = $results_dir . '/uat.json';
if (!is_file($uat_cache)) {
    $uat = new Writer($results_dir . '/uat.csv');
    $uat->writeRow($export_columns);

    $uatCkanManager = new CkanManager(CKAN_UAT_API_URL);
    $uatCkanManager->resultsDir = $results_dir;

    // NOTE(review): "PDGC" may be a typo for "PBGC", and the multi-word source
    // title is not quoted - left untouched because the real harvest source
    // title cannot be verified from here.
    $uat_pbgc = $uatCkanManager->exportBrief(
        'organization:pbgc-gov AND extras_harvest_source_title:PDGC Data.json Source AND dataset_type:dataset',
        '',
        'http://uat-catalog-fe-data.reisys.com/dataset/'
    );
    file_put_contents($uat_cache, json_encode($uat_pbgc, JSON_PRETTY_PRINT));
    $uat->writeFromArray($uat_pbgc);
} else {
    // Same associative decoding as for the production cache above.
    $uat_pbgc = json_decode(file_get_contents($uat_cache), true);
    if (!is_array($uat_pbgc)) {
        fwrite(STDERR, 'Unable to read cached export: ' . $uat_cache . PHP_EOL);
        exit(1);
    }
}
echo PHP_EOL . 'datasets from uat: ' . count($uat_pbgc) . PHP_EOL . PHP_EOL;

// Index the UAT datasets by simplified title and by (trimmed, non-empty) GUID.
// Several datasets may share a key, so every bucket is a list.
$uat_pbgc_by_title = $uat_pbgc_by_guid = [];

foreach ($uat_pbgc as $dataset) {
    $title = $dataset['title_simple'];
    if (!isset($uat_pbgc_by_title[$title])) {
        $uat_pbgc_by_title[$title] = [];
    }
    $uat_pbgc_by_title[$title][] = $dataset;

    $guid = trim($dataset['guid']);
    if ($guid) {
        if (!isset($uat_pbgc_by_guid[$guid])) {
            $uat_pbgc_by_guid[$guid] = [];
        }
        $uat_pbgc_by_guid[$guid][] = $dataset;
    }
}

echo 'prod_vs_uat.csv' . PHP_EOL;
// NOTE(review): the "_geospatial" suffix looks inherited from the NOAA script;
// the file name is kept as-is because other tooling may depend on it.
$report_path = $results_dir . '/prod_vs_uat_pbgc_geospatial.csv';
if (is_file($report_path)) {
    // start from a clean report on every run
    unlink($report_path);
}
$csv = new Writer($report_path);
$csv->writeRow([
    'Prod Title',
    'Prod URL',
    'Prod GUID',
    'Prod Topics',
    'Prod Categories',
    'Matched',
    'UAT Title',
    'UAT URL',
    'UAT GUID',
    'URL Match',
    'Title Match',
    'GUID Match',
]);

foreach ($prod_pbgc as $prod_dataset) {
    // The first five cells of every row describe the production dataset.
    $prod_cells = [
        $prod_dataset['title'],
        $prod_dataset['url'],
        $prod_dataset['guid'],
        $prod_dataset['topics'],
        $prod_dataset['categories'],
    ];

    // 1) Match by GUID. The index keys are trimmed, so trim the lookup key too.
    $prod_guid = trim($prod_dataset['guid']);
    if (isset($uat_pbgc_by_guid[$prod_guid])) {
        foreach ($uat_pbgc_by_guid[$prod_guid] as $uat_dataset) {
            $csv->writeRow(array_merge($prod_cells, [
                true,
                $uat_dataset['title'],
                $uat_dataset['url'],
                $uat_dataset['guid'],
                (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
                (bool)($prod_dataset['title_simple']
                    && $prod_dataset['title_simple'] == $uat_dataset['title_simple']),
                true,
            ]));
        }
        continue;
    }

    // 2) Fall back to matching by simplified title.
    if (isset($uat_pbgc_by_title[$prod_dataset['title_simple']])) {
        foreach ($uat_pbgc_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
            $csv->writeRow(array_merge($prod_cells, [
                true,
                $uat_dataset['title'],
                $uat_dataset['url'],
                $uat_dataset['guid'],
                (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
                true,
                (bool)($prod_dataset['guid'] && $prod_dataset['guid'] == $uat_dataset['guid']),
            ]));
        }
        continue;
    }

    // 3) No UAT counterpart. Emit all twelve cells so the row lines up with the
    //    header: Matched, three empty UAT cells, then URL/Title/GUID match flags.
    $csv->writeRow(array_merge($prod_cells, [
        false,
        '',
        '',
        '',
        false,
        false,
        false,
    ]));
}

// show running time on finish
timer();

--------------------------------------------------------------------------------