├── results
└── .gitempty
├── data
└── .gitignore
├── aws-scripts
├── unittests
├── composer_install
└── install_dependencies
├── backup
└── readme.md
├── tests
├── bootstrap.php
└── Base.php
├── .gitignore
├── .travis.yml
├── docker-compose.yml
├── .circleci
└── config.yml
├── src
└── CKAN
│ └── Manager
│ ├── Adapters
│ ├── FilePutContentsWrapper.php
│ └── FileGetContentsWrapper.php
│ ├── Dataset.php
│ └── ExploreApi.php
├── cli
├── epa-gov_process
│ ├── 2_find_matches.php
│ ├── README.md
│ ├── 1_export_everything.php
│ ├── 3_rename_datasets.php
│ ├── 4_assign_groups_and_tags.php
│ ├── compare_qa_vs_prod_epa.php
│ └── __compare_json_vs_prod_epa.php
├── doc-gov_process
│ ├── 3_find_matches.php
│ ├── 1_export_everything.php
│ ├── 4_add_legacy_dms_and_make_private.php
│ ├── 2_compare_prod_vs_prod.php
│ ├── 0_compare_prod_vs_uat.php
│ └── 5_compare_prod_vs_qa.php
├── nrc-gov_process
│ ├── 2_find_matches.php
│ ├── 1_export_everything.php
│ └── compare_prod_vs_uat_nrc.php
├── tools
│ ├── find_matches_one_file.php
│ ├── find_matches_separate_files.php
│ ├── diff.php
│ ├── organizations_json_to_csv.php
│ └── convert_json_to_csv.php
├── harvest_stats_csv.php
├── cleanup_tags_of_datasets_by_topic.php
├── dev_test.php
├── faa-gov
│ └── export_faa.php
├── check_staging_vs_prod.php
├── active_users.php
├── mark_source_datajson_by_identifier.php
├── organization_purge.php
├── inventory
│ └── redacted_stats.php
├── orphaned_tags_seeker.php
├── organization_patch.php
├── organizations_stats.php
├── export_datasets_by_topic_with_tags.php
├── resource_create.php
├── export_private_datasets.php
├── export_resource_list.php
├── search_by_topics_csv.php
├── search_by_terms_csv.php
├── update_harvest.php
├── breakdown_by_group.php
├── export.php
├── search_by_organizations_csv.php
├── find_socrata_txt_pairs.php
├── interactive_in_catalog_resources.php
├── reorganize_datasets.php
├── export_tracking_by_org.php
├── export_orgs.php
├── export_packages_by_org_with_tagging.php
├── doj-gov
│ └── 1_export_everything.php
├── archive_dataset_list.php
├── pbgc-gov
│ ├── 1_export_everything.php
│ └── compare_uat_vs_prod_pbgc.php
├── noaa-gov
│ ├── 1_export_everything.php
│ ├── compare_qa_vs_prod_noaa.php
│ └── compare_uat_vs_prod_noaa.php
├── fix_modified_inventory.php
├── add_legacy_dms_and_make_private.php
├── mark_as_private.php
├── search_by_titles_csv.php
├── rename.php
├── tagging
│ ├── remove_groups_and_tags.php
│ ├── brother_assign.php
│ ├── generate_brothers_assign_csv.php
│ └── assign_groups_and_tags.php
├── undelete_datasets.php
├── rename_then_mark_public.php
├── update_modified_date.php
├── add_license_url.php
├── delete_datasets.php
├── add_resource_to_dataset.php
├── export_orgs_full.php
├── rename_then_delete.php
├── update_organization.php
├── rename_then_mark_private.php
├── restore_script.php
├── update_extra_fields.php
├── export_by_list.php
├── update_field.php
├── export_full_by_list.php
├── socrata_log_redirects.php
├── compare_prod_vs_prod.php
├── export_short.php
├── compare_prod_vs_uat.php
├── recheck_socrata_redirects.php
├── compare_basic.php
├── check_aapi.php
└── ntsb-gov_process
│ └── compare_uat_vs_prod_ntsb.php
├── phpunit.xml
├── appspec.yml
├── inc
├── config.sample.php
└── common.php
├── .editorconfig
├── docker
├── install-composer.sh
└── Dockerfile
├── composer.json
└── README.md
/results/.gitempty:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | *.log
3 | *.csv
--------------------------------------------------------------------------------
/aws-scripts/unittests:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Deployment hook (listed under AfterInstall in appspec.yml): run the PHPUnit
3 | # suite from the deployed application root.
4 | # Abort when the directory is missing so phpunit never runs from an unrelated cwd.
5 | cd /var/www/html || exit 1
6 | phpunit
7 |
--------------------------------------------------------------------------------
/aws-scripts/composer_install:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Deployment hook (listed under AfterInstall in appspec.yml): install the
3 | # Composer dependencies non-interactively in the deployed application root.
4 | # Abort when the directory is missing so composer never runs from an unrelated cwd.
5 | cd /var/www/html || exit 1
6 | composer install -n
7 |
--------------------------------------------------------------------------------
/backup/readme.md:
--------------------------------------------------------------------------------
1 | Place JSON backups here, one file per organization. Name each file after the organization's CKAN name (URL slug), e.g. ocsit-gsa-gov.json
--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
13 |
14 | $CkanManager->findMatches();
15 |
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
9 |
10 |
11 | ./tests/
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/appspec.yml:
--------------------------------------------------------------------------------
1 | version: 0.0
2 | os: linux
3 | files:
4 | - source: /
5 | destination: /var/www/html/
6 | hooks:
7 | BeforeInstall:
8 | - location: aws-scripts/install_dependencies
9 | timeout: 300
10 | runas: root
11 | AfterInstall:
12 | - location: aws-scripts/composer_install
13 | timeout: 300
14 | runas: codedeployuser
15 | - location: aws-scripts/unittests
16 | timeout: 3600
17 | runas: codedeployuser
18 |
--------------------------------------------------------------------------------
/cli/doc-gov_process/3_find_matches.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
13 |
14 | $CkanManager->findMatchesByAgency('doc');
15 |
--------------------------------------------------------------------------------
/cli/nrc-gov_process/2_find_matches.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
13 |
14 | $CkanManager->findMatchesByAgency('nrc');
15 |
--------------------------------------------------------------------------------
/cli/tools/find_matches_one_file.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
13 |
14 | $CkanManager->findMatchesOneFile();
15 |
--------------------------------------------------------------------------------
/cli/tools/find_matches_separate_files.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
13 |
14 | $CkanManager->findMatchesSeparateFiles();
15 |
--------------------------------------------------------------------------------
/cli/tools/diff.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
19 | $CkanManager->harvestStats();
20 |
21 | // show running time on finish
22 | timer();
23 |
--------------------------------------------------------------------------------
/inc/config.sample.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
19 |
20 | $topicTitle = 'ecosystems0617';
21 | $CkanManager->cleanUpTagsByTopic($topicTitle);
22 |
23 | // show running time on finish
24 | timer();
25 |
--------------------------------------------------------------------------------
/cli/dev_test.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
24 | $CkanManager->test_dev();
25 |
26 | // show running time on finish
27 | timer();
28 |
--------------------------------------------------------------------------------
/cli/tools/organizations_json_to_csv.php:
--------------------------------------------------------------------------------
1 | writeRow([
14 | // 'from',
15 | // 'to'
16 | // ]);
17 |
18 | foreach ($json['result'] as $organization) {
19 | $writer->writeRow([$organization['name']]);
20 | }
21 | }
22 |
23 | // show running time on finish
24 | timer();
25 |
--------------------------------------------------------------------------------
/docker/install-composer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Download the Composer installer, verify its SHA-384 signature, run it and
3 | # move the resulting composer.phar onto the PATH.
4 | # https://getcomposer.org/doc/faqs/how-to-install-composer-programmatically.md
5 |
6 | set -eu
7 |
8 | EXPECTED_SIGNATURE="$(wget -q -O - https://composer.github.io/installer.sig)"
9 | php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');"
10 | ACTUAL_SIGNATURE="$(php -r "echo hash_file('sha384', 'composer-setup.php');")"
11 |
12 | if [ "$EXPECTED_SIGNATURE" != "$ACTUAL_SIGNATURE" ]
13 | then
14 | >&2 echo 'ERROR: Invalid installer signature'
15 | # -f: the download itself may have failed, leaving nothing to remove;
16 | # a failing rm must not pre-empt the explicit exit below under `set -e`.
17 | rm -f composer-setup.php
18 | exit 1
19 | fi
20 |
21 | # Under `set -e` a failing installer would abort the script before the cleanup
22 | # below, so capture its status with `||` instead of reading $? afterwards.
23 | RESULT=0
24 | php composer-setup.php --quiet || RESULT=$?
25 | rm -f composer-setup.php
26 |
27 | if [ "$RESULT" -ne 0 ]
28 | then
29 | >&2 echo 'ERROR: Composer installer failed'
30 | exit "$RESULT"
31 | fi
32 |
33 | # next two lines are local to our docker setup
34 | mv composer.phar /usr/local/bin/composer
35 | chmod +x /usr/local/bin/composer
--------------------------------------------------------------------------------
/cli/tools/convert_json_to_csv.php:
--------------------------------------------------------------------------------
1 | writeRow([
14 | 'from',
15 | 'to'
16 | ]);
17 |
18 | foreach ($dataset_names['name'] as $name => $count) {
19 | $newName = preg_replace("/^deleted-/", '', $name);
20 | $writer->writeRow([
21 | $name,
22 | $newName
23 | ]);
24 | }
25 | }
26 |
27 | // show running time on finish
28 | timer();
29 |
--------------------------------------------------------------------------------
/cli/faa-gov/export_faa.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
20 |
21 | $brief = $CkanManager->exportShort('organization:dot-gov AND (dataset_type:dataset) AND publisher:"Federal Aviation Administration"');
22 |
23 | $headers = array_keys($brief[array_keys($brief)[0]]);
24 | $csv->writeRow($headers);
25 | $csv->writeFromArray($brief);
26 |
27 | // show running time on finish
28 | timer();
29 |
--------------------------------------------------------------------------------
/cli/check_staging_vs_prod.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
23 | $CkanManagerProduction->resultsDir = $results_dir;
24 |
25 | $groups = $CkanManagerStaging->groupsArray();
26 |
27 | foreach ($groups as $category) {
28 | $CkanManagerStaging->checkGroupAgainstProd($category, $CkanManagerProduction);
29 | }
30 |
31 | // show running time on finish
32 | timer();
33 |
--------------------------------------------------------------------------------
/cli/active_users.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
33 |
34 | $CkanManager->activeUsers();
35 |
36 | // show running time on finish
37 | timer();
38 |
--------------------------------------------------------------------------------
/cli/mark_source_datajson_by_identifier.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
24 |
25 | $CkanManager->tagByExtraField('identifier', 'source_datajson_identifier');
26 |
27 | // show running time on finish
28 | timer();
29 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "gsa/ckan-php-manager",
3 | "description": "CKAN php manager by GSA",
4 | "minimum-stability": "dev",
5 | "license": "GPL-3.0-or-later",
6 | "authors": [
7 | {
8 | "name": "Alex Perfilov",
9 | "email": "alexandr.perfilov@reisystems.com",
10 | "role": "Developer"
11 | }
12 | ],
13 | "autoload": {
14 | "psr-0": {
15 | "CKAN\\Manager\\": "src/"
16 | }
17 | },
18 | "repositories": [
19 | {
20 | "type": "git",
21 | "url": "https://github.com/GSA/ckan-php-client.git"
22 | }
23 | ],
24 | "prefer-stable": true,
25 | "require": {
26 | "php": "^7.0",
27 | "ext-json": "*",
28 | "kevinlebrun/colors.php": "~1",
29 | "gsa/ckan-php-client": "dev-master",
30 | "jwage/easy-csv": "~0"
31 | },
32 | "require-dev": {
33 | "doctrine/instantiator": "1.0.5",
34 | "phpunit/phpunit": "~6"
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/cli/organization_purge.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
35 |
36 | $CkanManager->purgeOrganization('fs-fed-us');
37 |
38 | // show running time on finish
39 | timer();
40 |
--------------------------------------------------------------------------------
/cli/inventory/redacted_stats.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
23 |
24 | $organization_list = $CkanManager->organization_list(true);
25 | //foreach ($organization_list as $organization) {
26 | // $members = $CkanManager->
27 | //}
28 |
29 | var_dump($organization_list);
30 | //
31 | //$headers = array_keys($brief[array_keys($brief)[0]]);
32 | //$csv->writeRow($headers);
33 | //$csv->writeFromArray($brief);
34 |
35 | // show running time on finish
36 | timer();
37 |
--------------------------------------------------------------------------------
/cli/orphaned_tags_seeker.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
33 |
34 | $CkanManager->orphanedTagsSeek($limit, $start);
35 |
36 | // show running time on finish
37 | timer();
38 |
--------------------------------------------------------------------------------
/tests/Base.php:
--------------------------------------------------------------------------------
1 | reflection = new \ReflectionClass($this->testClass);
12 | }
13 |
14 | public function getMethod($method)
15 | {
16 | // Resolve the named method through reflection and make it callable regardless of visibility.
17 | $reflectedMethod = $this->reflection->getMethod($method);
18 | $reflectedMethod->setAccessible(true);
19 | return $reflectedMethod;
20 | }
21 |
22 | public function getProperty($property)
23 | {
24 | // Read the named property from $this->testClass through reflection, regardless of visibility.
25 | $reflectedProperty = $this->reflection->getProperty($property);
26 | $reflectedProperty->setAccessible(true);
27 | return $reflectedProperty->getValue($this->testClass);
28 | }
29 |
30 | public function setProperty($property, $value)
31 | {
32 | // Overwrite the named property on $this->testClass through reflection,
33 | // regardless of its declared visibility. Returns nothing.
34 | $reflectedProperty = $this->reflection->getProperty($property);
35 | $reflectedProperty->setAccessible(true);
36 | $reflectedProperty->setValue($this->testClass, $value);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/cli/organization_patch.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
35 |
36 | $fields = array(
37 | 'name' => 'fs-fed-us-legacy'
38 | );
39 |
40 | $CkanManager->patchOrganization('fs-fed-us', $fields);
41 |
42 | // show running time on finish
43 | timer();
44 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base image is pinned: the php7-* packages installed below are not shipped by
2 | # Alpine releases after 3.15, so the floating "latest" tag cannot resolve them.
3 | FROM alpine:3.15
4 |
5 | RUN apk --no-cache upgrade
6 | RUN apk add --no-cache apache2 \
7 | bash \
8 | curl \
9 | git \
10 | jq \
11 | mariadb \
12 | openrc \
13 | php7 \
14 | php7-apache2 \
15 | php7-curl \
16 | php7-iconv \
17 | php7-json \
18 | php7-mbstring \
19 | php7-mysqli \
20 | php7-openssl \
21 | php7-pcntl \
22 | php7-pdo \
23 | php7-phar \
24 | php7-posix \
25 | php7-session \
26 | php7-simplexml \
27 | php7-sodium \
28 | php7-sqlite3 \
29 | php7-tokenizer \
30 | php7-xml \
31 | php7-xmlreader \
32 | php7-xmlwriter \
33 | php7-zlib \
34 | wget \
35 | zip
36 |
37 | ARG APP_DIR=/var/www/app
38 |
39 | # Install composer
40 | COPY docker/install-composer.sh /tmp/install-composer.sh
41 | RUN /tmp/install-composer.sh
42 |
43 | # Add composer-installed libs to path (follows APP_DIR so a --build-arg override stays consistent)
44 | ENV PATH=$APP_DIR/vendor/bin:$PATH
45 |
46 | # Plain local files: COPY is sufficient, ADD's archive/URL handling is not needed
47 | COPY composer.json composer.lock $APP_DIR/
48 |
49 | WORKDIR $APP_DIR
50 | RUN composer install
51 |
--------------------------------------------------------------------------------
/cli/organizations_stats.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
33 |
34 | $CkanManager->organizations_stats();
35 |
36 | if ($CkanManager->logOutput) {
37 | file_put_contents($results_dir . '/log.csv', $CkanManager->logOutput);
38 | }
39 |
40 | // show running time on finish
41 | timer();
42 |
--------------------------------------------------------------------------------
/cli/export_datasets_by_topic_with_tags.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
41 |
42 | $CkanManager->exportDatasetsWithTagsByGroup($topic);
43 |
44 | // show running time on finish
45 | timer();
46 |
--------------------------------------------------------------------------------
/cli/resource_create.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
17 |
18 | $logFile = $results_dir . '/_log.csv';
19 |
20 | $CkanManager->resourceCreate([
21 | 'package_id' => 'department-of-the-interior-enterprise-data-inventory',
22 | // 'package_id' => 'u-s-widget-manufacturing-statistics-92174',
23 | 'url' => 'http://data.doi.gov/WAF/edi.json',
24 | 'name' => 'EDI Json',
25 | 'format' => 'application/json'
26 | ]);
27 |
28 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX);
29 | //$CkanManager->logOutput = '';
30 |
31 | // show running time on finish
32 | timer();
33 |
--------------------------------------------------------------------------------
/cli/export_private_datasets.php:
--------------------------------------------------------------------------------
1 | getTreeArray();
36 |
37 | $CkanManager->resultsDir = $results_dir;
38 |
39 | $CkanManager->getPrivateList($termsArray, $start, $limit);
40 |
41 | // show running time on finish
42 | timer();
43 |
--------------------------------------------------------------------------------
/cli/export_resource_list.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
35 | $CkanManager->exportResourceList();
36 |
37 | // show running time on finish
38 | timer();
39 |
--------------------------------------------------------------------------------
/cli/search_by_topics_csv.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
42 |
43 | $CkanManager->searchByTopics($topics_list);
44 |
45 | // show running time on finish
46 | timer();
47 |
--------------------------------------------------------------------------------
/cli/search_by_terms_csv.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
42 |
43 | $CkanManager->searchByTerms($keywords_list);
44 |
45 | // show running time on finish
46 | timer();
47 |
--------------------------------------------------------------------------------
/cli/update_harvest.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
25 |
26 | $harvest_sources = file_get_contents(CKANMNGR_DATA_DIR . '/harvest_sources_automated_remainders-c.json');
27 | $harvest_sources = json_decode($harvest_sources, true);
28 |
29 | $time = time();
30 | $log_file = "$time.log";
31 |
32 | foreach ($harvest_sources['result']['results'] as $harvest_source) {
33 | $CkanManager->updateHarvest($harvest_source['name'], 'frequency', 'MANUAL');
34 | }
35 |
36 | file_put_contents($results_dir . '/' . $log_file, $CkanManager->logOutput);
37 |
38 | // show running time on finish
39 | timer();
--------------------------------------------------------------------------------
/cli/breakdown_by_group.php:
--------------------------------------------------------------------------------
1 | breakdownByGroup($csv_agencies, $csv_categories);
46 |
47 | // show running time on finish
48 | timer();
49 |
--------------------------------------------------------------------------------
/cli/export.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
46 | $CkanManager->exportDatasetsBySearch($strip_search);
47 |
48 | // show running time on finish
49 | timer();
50 |
--------------------------------------------------------------------------------
/cli/search_by_organizations_csv.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
42 |
43 | $CkanManager->searchByOrganizations($organizations_list);
44 |
45 | // show running time on finish
46 | timer();
47 |
--------------------------------------------------------------------------------
/cli/find_socrata_txt_pairs.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
26 |
27 | /**
28 | *
29 | */
30 | define('ERROR_REPORTING', E_ALL & ~E_NOTICE);
31 |
32 | // https://explore.data.gov/api/views/bxfh-jivs.json
33 | /**
34 | *
35 | */
36 | define('SOCRATA_URL', 'https://explore.data.gov/api/views/');
37 |
38 | if (!is_readable($socrata_file_path = CKANMNGR_DATA_DIR . '/socrata.txt')) {
39 | die($socrata_file_path . ' not readable');
40 | }
41 |
42 | $socrata_list = file_get_contents($socrata_file_path);
43 | $socrata_list = preg_replace('/[\\r\\n]+/', "\n", $socrata_list);
44 | $socrata_list = explode("\n", $socrata_list);
45 |
46 | $CkanManager->getSocrataPairs($socrata_list);
47 |
48 | // show running time on finish
49 | timer();
50 |
--------------------------------------------------------------------------------
/cli/interactive_in_catalog_resources.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
44 |
45 | $CkanManager->getInteractiveResources();
46 |
47 | // show running time on finish
48 | timer();
49 |
--------------------------------------------------------------------------------
/cli/reorganize_datasets.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_TAG);
24 |
25 | /**
26 | * sometimes there is no parent term (ex. Department of Labor)
27 | */
28 | if (!defined('PARENT_TERM')) {
29 | define('PARENT_TERM', '_');
30 | }
31 |
32 | /**
33 | * Create results dir for logs
34 | */
35 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_LEGACY_' . PARENT_TERM;
36 | mkdir($results_dir);
37 |
38 | $CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY);
39 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY);
40 |
41 | $CkanManager->resultsDir = $results_dir;
42 |
43 | $CkanManager->reorganizeDatasets(ORGANIZATION_TO_TAG, $termsArray, CKANMNGR_BACKUP_DIR);
44 |
45 | // show running time on finish
46 | timer();
47 |
--------------------------------------------------------------------------------
/cli/export_tracking_by_org.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
23 |
24 | /**
25 | * sometimes there is no parent term (ex. Department of Labor)
26 | */
27 | if (!defined('PARENT_TERM')) {
28 | define('PARENT_TERM', '_');
29 | }
30 |
31 | /**
32 | * Create results dir for logs and json results
33 | */
34 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_TRACKING_' . PARENT_TERM;
35 | mkdir($results_dir);
36 |
37 | /**
38 | * Search for packages by terms found
39 | */
40 |
41 | /**
42 | * Production
43 | */
44 | $CkanManager = new CkanManager(CKAN_API_URL);
45 |
46 | /**
47 | * Staging
48 | */
49 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
50 |
51 | $CkanManager->resultsDir = $results_dir;
52 |
53 | $CkanManager->exportTrackingByOrgTerms($termsArray);
54 |
55 | // show running time on finish
56 | timer();
57 |
--------------------------------------------------------------------------------
/cli/export_orgs.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
22 |
23 | /**
24 | * sometimes there is no parent term (ex. Department of Labor)
25 | */
26 | if (!defined('PARENT_TERM')) {
27 | define('PARENT_TERM', '_');
28 | }
29 |
30 | /**
31 | * Create results dir for logs and json results
32 | */
33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . PARENT_TERM;
34 | mkdir($results_dir);
35 |
36 | /**
37 | * Search for packages by terms found
38 | */
39 |
40 | /**
41 | * Production
42 | */
43 | //$CkanManager = new CkanManager(CKAN_API_URL);
44 | $CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
45 |
46 | /**
47 | * Staging
48 | */
49 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
50 |
51 | $CkanManager->resultsDir = $results_dir;
52 |
53 | $CkanManager->exportOrganizations($termsArray);
54 |
55 | // show running time on finish
56 | timer();
57 |
--------------------------------------------------------------------------------
/cli/nrc-gov_process/1_export_everything.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 | /**
22 | * sometimes there is no parent term (ex. Department of Labor)
23 | */
24 | if (!defined('PARENT_TERM')) {
25 | define('PARENT_TERM', '_');
26 | }
27 |
28 | /**
29 | * Create results dir for logs and json results
30 | */
31 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . PARENT_TERM;
32 | mkdir($results_dir);
33 |
34 | /**
35 | * Search for packages by terms found
36 | */
37 |
38 | /**
39 | * Production
40 | */
41 | $CkanManager = new CkanManager(CKAN_API_URL);
42 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
43 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
44 |
45 | /**
46 | * Staging
47 | */
48 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
49 |
50 | $CkanManager->resultsDir = $results_dir;
51 |
52 | $CkanManager->exportPackagesByOrgTerms($termsArray);
53 |
54 | // show running time on finish
55 | timer();
56 |
--------------------------------------------------------------------------------
/cli/epa-gov_process/1_export_everything.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 | /**
22 | * sometimes there is no parent term (ex. Department of Labor)
23 | */
24 | if (!defined('PARENT_TERM')) {
25 | define('PARENT_TERM', '_');
26 | }
27 |
28 | /**
29 | * Create results dir for logs and json results
30 | */
31 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . PARENT_TERM;
32 | mkdir($results_dir);
33 |
34 | /**
35 | * Search for packages by terms found
36 | */
37 |
38 | /**
39 | * Production
40 | */
41 | $CkanManager = new CkanManager(CKAN_API_URL);
42 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
43 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
44 |
45 | /**
46 | * Staging
47 | */
48 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
49 |
50 | $CkanManager->resultsDir = $results_dir;
51 |
52 | $CkanManager->exportPackagesByOrgTerms($termsArray);
53 |
54 | // show running time on finish
55 | timer();
56 |
--------------------------------------------------------------------------------
/cli/export_packages_by_org_with_tagging.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 |
22 | /**
23 | * sometimes there is no parent term (ex. Department of Labor)
24 | */
25 | if (!defined('PARENT_TERM')) {
26 | define('PARENT_TERM', '_');
27 | }
28 |
29 | /**
30 | * Create results dir for logs and json results
31 | */
32 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . PARENT_TERM;
33 | mkdir($results_dir);
34 |
35 | /**
36 | * Search for packages by terms found
37 | */
38 |
39 | /**
40 | * Production
41 | */
42 | $CkanManager = new CkanManager(CKAN_API_URL);
43 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
44 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
45 |
46 | /**
47 | * Staging
48 | */
49 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
50 |
51 | $CkanManager->resultsDir = $results_dir;
52 |
53 | $CkanManager->exportPackagesByOrgTerms($termsArray);
54 |
55 | // show running time on finish
56 | timer();
57 |
--------------------------------------------------------------------------------
/cli/doc-gov_process/1_export_everything.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 | /**
22 | * sometimes there is no parent term (ex. Department of Labor)
23 | */
24 | if (!defined('PARENT_TERM')) {
25 | define('PARENT_TERM', '_');
26 | }
27 |
28 | /**
29 | * Create results dir for logs and json results
30 | */
31 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_' . PARENT_TERM;
32 | mkdir($results_dir);
33 |
34 | $CkanManager = new CkanManager(CKAN_API_URL);
35 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
36 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
37 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
38 |
39 | $CkanManager->resultsDir = $results_dir;
40 |
41 | /**
42 | * We are skipping noaa-gov and nist-gov within current process
43 | */
44 | unset($termsArray['noaa-gov']);
45 | unset($termsArray['nist-gov']);
46 |
47 | $CkanManager->exportPackagesByOrgTerms($termsArray);
48 |
49 | // show running time on finish
50 | timer();
51 |
--------------------------------------------------------------------------------
/cli/doj-gov/1_export_everything.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 | $termsArray = ['doj-gov' => 'Department of Justice'];
22 |
23 | /**
24 | * sometimes there is no parent term (ex. Department of Labor)
25 | */
26 | if (!defined('PARENT_TERM')) {
27 | define('PARENT_TERM', '_');
28 | }
29 |
30 | /**
31 | * Create results dir for logs and json results
32 | */
33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_DOJ' . PARENT_TERM;
34 | mkdir($results_dir);
35 |
36 | /**
37 | * Search for packages by terms found
38 | */
39 |
40 | /**
41 | * Production
42 | */
43 | $CkanManager = new CkanManager(CKAN_API_URL);
44 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL);
45 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
46 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
47 |
48 | /**
49 | * Staging
50 | */
51 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
52 |
53 | $CkanManager->resultsDir = $results_dir;
54 |
55 | $CkanManager->exportPackagesByOrgTerms($termsArray);
56 |
57 | // show running time on finish
58 | timer();
59 |
--------------------------------------------------------------------------------
/cli/archive_dataset_list.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
21 | foreach (glob(CKANMNGR_DATA_DIR . '/private*.csv') as $csv_file) {
22 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
23 | echo $status;
24 |
25 | $basename = str_replace('.csv', '', basename($csv_file));
26 |
27 | // fix wrong END-OF-LINE
28 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
29 |
30 | file_put_contents($results_dir . '/' . $basename . '.log', $status, FILE_APPEND | LOCK_EX);
31 |
32 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
33 | while (true) {
34 | $row = $csv->getRow();
35 | if (!$row) {
36 | break;
37 | }
38 | // skip headers
39 | if (in_array(strtolower($row['0']), ['dataset', 'uid', 'uuid', 'name', 'url', 'data.gov url'])) {
40 | continue;
41 | }
42 |
43 | $dataset = basename($row['0']);
44 | $CkanManager->makeDatasetPrivate($dataset, $basename);
45 | }
46 | }
47 |
48 | // show running time on finish
49 | timer();
50 |
--------------------------------------------------------------------------------
/cli/pbgc-gov/1_export_everything.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 | $termsArray = ['pbgc-gov' => 'Pension Benefit Guaranty Corporation'];
22 |
23 | /**
24 | * sometimes there is no parent term (ex. Department of Labor)
25 | */
26 | if (!defined('PARENT_TERM')) {
27 | define('PARENT_TERM', '_');
28 | }
29 |
30 | /**
31 | * Create results dir for logs and json results
32 | */
33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_PBGC' . PARENT_TERM;
34 | mkdir($results_dir);
35 |
36 | /**
37 | * Search for packages by terms found
38 | */
39 |
40 | /**
41 | * Production
42 | */
43 | $CkanManager = new CkanManager(CKAN_API_URL);
44 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL);
45 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
46 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
47 |
48 | /**
49 | * Staging
50 | */
51 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
52 |
53 | $CkanManager->resultsDir = $results_dir;
54 |
55 | $CkanManager->exportPackagesByOrgTerms($termsArray);
56 |
57 | // show running time on finish
58 | timer();
59 |
--------------------------------------------------------------------------------
/cli/noaa-gov/1_export_everything.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_EXPORT);
20 |
21 | $termsArray = ['noaa-gov' => 'National Oceanic and Atmospheric Administration'];
22 |
23 | /**
24 | * sometimes there is no parent term (ex. Department of Labor)
25 | */
26 | if (!defined('PARENT_TERM')) {
27 | define('PARENT_TERM', '_');
28 | }
29 |
30 | /**
31 | * Create results dir for logs and json results
32 | */
33 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_EXPORT_NOAA' . PARENT_TERM;
34 | mkdir($results_dir);
35 |
36 | /**
37 | * Search for packages by terms found
38 | */
39 |
40 | /**
41 | * Production
42 | */
43 | $CkanManager = new CkanManager(CKAN_API_URL);
44 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL);
45 | //$CkanManager = new CkanManager(CKAN_QA_API_URL);
46 | //$CkanManager = new CkanManager(INVENTORY_CKAN_PROD_API_URL, INVENTORY_CKAN_PROD_API_KEY);
47 |
48 | /**
49 | * Staging
50 | */
51 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL);
52 |
53 | $CkanManager->resultsDir = $results_dir;
54 |
55 | $CkanManager->exportPackagesByOrgTerms($termsArray);
56 |
57 | // show running time on finish
58 | timer();
59 |
--------------------------------------------------------------------------------
/cli/fix_modified_inventory.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
18 |
19 |
20 | foreach (glob(CKANMNGR_DATA_DIR . '/metadata*.csv') as $csv_file) {
21 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
22 | echo $status;
23 |
24 | // fix wrong END-OF-LINE
25 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
26 |
27 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX);
28 |
29 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
30 | while (true) {
31 | $row = $csv->getRow();
32 | if (!$row) {
33 | break;
34 | }
35 | // skip headers
36 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
37 | continue;
38 | }
39 |
40 | // no anchors please
41 | list($dataset,) = explode('#', basename(trim($row['0'])));
42 |
43 | if (!$dataset) {
44 | continue;
45 | }
46 |
47 | $CkanManager->fixModified($dataset);
48 | file_put_contents($results_dir . '/log.csv', $CkanManager->logOutput, FILE_APPEND | LOCK_EX);
49 | $CkanManager->logOutput = '';
50 | }
51 | }
52 |
53 | // show running time on finish
54 | timer();
55 |
--------------------------------------------------------------------------------
/inc/common.php:
--------------------------------------------------------------------------------
1 | bold
54 | . $clr($minutes_spent . ' minutes ' . $seconds_spent . ' seconds ')->green->bold . PHP_EOL;
55 | }
56 |
--------------------------------------------------------------------------------
/cli/add_legacy_dms_and_make_private.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_TAG);
40 |
41 | /**
42 | * sometimes there is no parent term (ex. Department of Labor)
43 | */
44 | if (!defined('PARENT_TERM')) {
45 | die('PARENT_TERM not found');
46 | }
47 |
48 | /**
49 | * Create results dir for logs
50 | */
51 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_LEGACY_' . PARENT_TERM;
52 | mkdir($results_dir);
53 |
54 | /**
55 | * Adding Legacy dms tag
56 | */
57 | $CkanManager = new CkanManager(CKAN_API_URL, LIST_ONLY ? null : CKAN_API_KEY);
58 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY);
59 |
60 | $CkanManager->resultsDir = $results_dir;
61 | $CkanManager->tagLegacyDms($termsArray, 'metadata_from_legacy_dms');
62 |
63 | // show running time on finish
64 | timer();
65 |
--------------------------------------------------------------------------------
/cli/mark_as_private.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
27 |
28 | foreach (glob(CKANMNGR_DATA_DIR . '/private_*.csv') as $csv_file) {
29 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
30 | echo $status;
31 |
32 | // fix wrong END-OF-LINE
33 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
34 |
35 | $basename = str_replace('.csv', '', basename($csv_file));
36 | file_put_contents($results_dir . '/' . $basename . '_private.log', $status, FILE_APPEND | LOCK_EX);
37 |
38 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
39 | $i = 1;
40 | while (true) {
41 | $row = $csv->getRow();
42 | if (!$row) {
43 | break;
44 | }
45 | // skip headers
46 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
47 | continue;
48 | }
49 |
50 | $datasetName = basename($row['0']);
51 |
52 | printf('[%04d] ', $i++);
53 | $CkanManager->makeDatasetPrivate($datasetName, $basename);
54 | }
55 | }
56 |
57 | // show running time on finish
58 | timer();
59 |
--------------------------------------------------------------------------------
/cli/search_by_titles_csv.php:
--------------------------------------------------------------------------------
1 | writeRow(['url', 'exact match', 'title', 'found by title']);
41 |
42 | $i = 0;
43 | while (true) {
44 | if (!($i++ % 10)) {
45 | echo $i . PHP_EOL;
46 | }
47 | $row = $csv_source->getRow();
48 | if (!$row) {
49 | break;
50 | }
51 | // skip headers
52 | if (in_array(trim(strtolower($row[0])), ['url', 'from', 'source url'])) {
53 | continue;
54 | }
55 |
56 | $title = $row[0];
57 |
58 | /**
59 | * Search for packages by terms found
60 | */
61 | $CkanManager->searchByTitle($title, $csv_destination);
62 | }
63 |
64 | // fix wrong END-OF-LINE
65 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
66 | }
67 |
68 | // show running time on finish
69 | timer();
70 |
--------------------------------------------------------------------------------
/cli/doc-gov_process/4_add_legacy_dms_and_make_private.php:
--------------------------------------------------------------------------------
1 | getTreeArrayFor(ORGANIZATION_TO_TAG);
40 |
41 | /**
42 | * sometimes there is no parent term (ex. Department of Labor)
43 | */
44 | if (!defined('PARENT_TERM')) {
45 | die('PARENT_TERM not found');
46 | }
47 |
48 | /**
49 | * Create results dir for logs
50 | */
51 | $results_dir = CKANMNGR_RESULTS_DIR . date('/Ymd-His') . '_LEGACY_' . PARENT_TERM;
52 | mkdir($results_dir);
53 |
54 | /**
55 | * Adding Legacy dms tag
56 | */
57 | $CkanManager = new CkanManager(CKAN_API_URL, LIST_ONLY ? null : CKAN_API_KEY);
58 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY);
59 |
60 | $CkanManager->resultsDir = $results_dir;
61 |
62 | /**
63 | * We are skipping noaa-gov and nist-gov within current process
64 | */
65 | unset($termsArray['noaa-gov']);
66 | unset($termsArray['nist-gov']);
67 |
68 | $CkanManager->tagLegacyDms($termsArray, 'metadata_from_legacy_dms');
69 |
70 | // show running time on finish
71 | timer();
72 |
--------------------------------------------------------------------------------
/cli/rename.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
31 |
32 | foreach (glob(CKANMNGR_DATA_DIR . '/rename*.csv') as $csv_file) {
33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
34 | echo $status;
35 |
36 | // fix wrong END-OF-LINE
37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
38 |
39 | $basename = str_replace('.csv', '', basename($csv_file));
40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX);
41 |
42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
43 | $i = 1;
44 | while (true) {
45 | $row = $csv->getRow();
46 | if (!$row) {
47 | break;
48 | }
49 | // skip headers
50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
51 | continue;
52 | }
53 |
54 | $datasetName = trim(basename($row['0']));
55 | $newDatasetName = basename($row['1']);
56 |
57 | printf('[%04d] ', $i++);
58 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename);
59 | }
60 | }
61 |
62 | // show running time on finish
63 | timer();
64 |
--------------------------------------------------------------------------------
/cli/tagging/remove_groups_and_tags.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
23 | foreach (glob(CKANMNGR_DATA_DIR . '/remove_*.csv') as $csv_file) {
24 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
25 | echo $status;
26 |
27 | $basename = str_replace('.csv', '', basename($csv_file));
28 |
29 | // fix wrong END-OF-LINE
30 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
31 |
32 | file_put_contents($results_dir . '/' . $basename . '_remove.log', $status, FILE_APPEND | LOCK_EX);
33 |
34 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
35 | while (true) {
36 | $row = $csv->getRow();
37 | if (!$row) {
38 | break;
39 | }
40 | // skip headers
41 | if (in_array(strtolower($row['0']),
42 | ['dataset', 'uid', 'uuid', 'name', 'url', 'data.gov url', 'dataset link'])) {
43 | continue;
44 | }
45 |
46 | // no anchors please
47 | list($dataset,) = explode('#', basename(trim($row['0'])));
48 | $category = trim(isset($row['1']) ? ($row['1'] ?: '') : '');
49 | $tags = trim(isset($row['2']) ? ($row['2'] ?: '') : '');
50 | $CkanManager->removeTagsAndGroupsFromDatasets([$dataset], $category, $tags, $basename);
51 | }
52 | }
53 |
54 | // show running time on finish
55 | timer();
56 |
--------------------------------------------------------------------------------
/cli/undelete_datasets.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
35 |
36 | /**
37 | * CSV
38 | * datasetName, orgId
39 | */
40 |
41 | foreach (glob(CKANMNGR_DATA_DIR . '/undelete*.csv') as $csv_file) {
42 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
43 | echo $status;
44 |
45 | // fix wrong END-OF-LINE
46 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
47 |
48 | $basename = str_replace('.csv', '', basename($csv_file));
49 | $logFile = $results_dir . '/' . $basename . '_log.csv';
50 |
51 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
52 | $i = 1;
53 | while (true) {
54 | $row = $csv->getRow();
55 | if (!$row) {
56 | break;
57 | }
58 | // skip headers
59 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
60 | continue;
61 | }
62 |
63 | $datasetName = basename($row['0']);
64 |
65 | printf('[%04d] ', $i++);
66 | $CkanManager->undeleteDataset($datasetName);
67 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX);
68 | $CkanManager->logOutput = '';
69 | }
70 | }
71 |
72 | // show running time on finish
73 | timer();
74 |
--------------------------------------------------------------------------------
/cli/rename_then_mark_public.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
31 |
32 | foreach (glob(CKANMNGR_DATA_DIR . '/rename*.csv') as $csv_file) {
33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
34 | echo $status;
35 |
36 | // fix wrong END-OF-LINE
37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
38 |
39 | $basename = str_replace('.csv', '', basename($csv_file));
40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX);
41 |
42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
43 | $i = 1;
44 | while (true) {
45 | $row = $csv->getRow();
46 | if (!$row) {
47 | break;
48 | }
49 | // skip headers
50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
51 | continue;
52 | }
53 |
54 | $datasetName = trim(basename($row['0']));
55 | $newDatasetName = basename($row['1']);
56 |
57 | printf('[%04d] ', $i++);
58 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename);
59 | $CkanManager->makeDatasetPublic($newDatasetName, $basename);
60 | }
61 | }
62 |
63 | // show running time on finish
64 | timer();
65 |
--------------------------------------------------------------------------------
/cli/update_modified_date.php:
--------------------------------------------------------------------------------
1 | results_dir = $results_dir;
41 | //foreach (glob(DATA_DIR . '/update_modified_date.csv') as $csv_file) {
42 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
43 | // echo $status;
44 | //
45 | // $basename = str_replace('.csv', '', basename($csv_file));
46 | //
47 | // // fix wrong END-OF-LINE
48 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
49 | //
50 | // file_put_contents($results_dir . '/' . $basename . 'update_modified_date.log', $status, FILE_APPEND | LOCK_EX);
51 | //
52 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false);
53 | // while (true) {
54 | // $row = $csv->getRow();
55 | // if (!$row) {
56 | // break;
57 | // }
58 | //// skip headers
59 | // if (in_array(strtolower($row['0']), ["Name of Dataset", "Agency", "Name"])) {
60 | // continue;
61 | // }
62 | //
63 | // $package_id = $row['2'];
64 | // $CkanManager->update_dataset_update_date($package_id, $basename);
65 | // }
66 | //}
67 | //
68 | //// show running time on finish
69 | //timer();
70 |
--------------------------------------------------------------------------------
/cli/add_license_url.php:
--------------------------------------------------------------------------------
1 | results_dir = $results_dir;
40 | //foreach (glob(DATA_DIR . '/public_package.csv') as $csv_file) {
41 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
42 | // echo $status;
43 | //
44 | // $basename = str_replace('.csv', '', basename($csv_file));
45 | //
46 | // // fix wrong END-OF-LINE
47 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
48 | //
49 | // file_put_contents($results_dir . '/' . $basename . '_add_licese_url.log', $status, FILE_APPEND | LOCK_EX);
50 | //
51 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false);
52 | // while (true) {
53 | // $row = $csv->getRow();
54 | // if (!$row) {
55 | // break;
56 | // }
57 | //// skip headers
58 | // if (in_array(strtolower($row['0']), ['#', 'id', 'name', 'title', 'license_id'])) {
59 | // continue;
60 | // }
61 | //
62 | // $package_id = $row['2'];
63 | // $license_id = $row['4'];
64 | // $CkanManager->update_dataset_license($package_id, $license_id, $basename);
65 | // }
66 | //}
67 | //
68 | //// show running time on finish
69 | //timer();
70 |
--------------------------------------------------------------------------------
/cli/delete_datasets.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
35 |
36 | /**
37 | * CSV
38 | * datasetName, orgId
39 | */
40 |
41 | foreach (glob(CKANMNGR_DATA_DIR . '/delete*.csv') as $csv_file) {
42 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
43 | echo $status;
44 |
45 | // fix wrong END-OF-LINE
46 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
47 |
48 | $basename = str_replace('.csv', '', basename($csv_file));
49 | $logFile = $results_dir . '/' . $basename . '_log.csv';
50 |
51 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
52 | $i = 1;
53 | while (true) {
54 | $row = $csv->getRow();
55 | if (!$row) {
56 | break;
57 | }
58 | // skip headers
59 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
60 | continue;
61 | }
62 |
63 | $datasetName = basename($row['0']);
64 | // $organizationName = basename($row['1']);
65 |
66 | printf('[%04d] ', $i++);
67 | $CkanManager->deleteDataset($datasetName);//, $organizationName
68 | file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX);
69 | $CkanManager->logOutput = '';
70 | }
71 | }
72 |
73 | // show running time on finish
74 | timer();
75 |
--------------------------------------------------------------------------------
/cli/add_resource_to_dataset.php:
--------------------------------------------------------------------------------
1 | results_dir = $results_dir;
40 | //foreach (glob(DATA_DIR . '/webservices.csv') as $csv_file) {
41 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
42 | // echo $status;
43 | //
44 | // $basename = str_replace('.csv', '', basename($csv_file));
45 | //
46 | // // fix wrong END-OF-LINE
47 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
48 | //
49 | // file_put_contents($results_dir . '/' . $basename . '_add_resource.log', $status, FILE_APPEND | LOCK_EX);
50 | //
51 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false);
52 | // while (true) {
53 | // $row = $csv->getRow();
54 | // if (!$row) {
55 | // break;
56 | // }
57 | //// skip headers
58 | // if (in_array(strtolower($row['0']), ['#', 'id', 'package_id', 'key', 'value', 'revision_id', 'state'])) {
59 | // continue;
60 | // }
61 | //
62 | // $package_id = $row['2'];
63 | // $api_url = $row['4'];
64 | // $CkanManager->add_resource_to_dataset($package_id, $api_url, $basename);
65 | // }
66 | //}
67 | //
68 | //// show running time on finish
69 | //timer();
70 |
--------------------------------------------------------------------------------
/cli/export_orgs_full.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
18 |
19 |
/**
 * For every data/export_*.csv file: normalise its line endings, read the
 * organization from the first column of each row and run a full organization
 * export for it, then append the manager's collected log output to
 * "<results dir>/<csv basename>.log".
 */
foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;

    // fix wrong END-OF-LINE: collapse every run of CR/LF characters into a single "\n"
    file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));

    $basename = str_replace('.csv', '', basename($csv_file));
    $logFile = $results_dir . '/' . $basename . '.log';

    $csv = new EasyCSV\Reader($csv_file, 'r+', false);
    $i = 1;
    while ($row = $csv->getRow()) {
        // skip headers
        if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
            continue;
        }

        // the cell may hold a full URL; only its last path segment is used
        $organization = basename($row['0']);

        printf('[%04d] ', $i++);
        // Options available:
        // CkanManager::EXPORT_PUBLIC_ONLY
        // CkanManager::EXPORT_PRIVATE_ONLY
        // CkanManager::EXPORT_DMS_ONLY
        // CkanManager::EXPORT_DMS_ONLY | CkanManager::EXPORT_PRIVATE_ONLY
        // CkanManager::EXPORT_DMS_ONLY | CkanManager::EXPORT_PUBLIC_ONLY
        $CkanManager->fullOrganizationExport($organization, CkanManager::EXPORT_PRIVATE_ONLY);
    }

    file_put_contents($logFile, $CkanManager->logOutput, FILE_APPEND | LOCK_EX);
    // Clear the buffer once it is written so the next CSV's log does not start
    // with this file's output (assumes the export accumulates into logOutput
    // rather than overwriting it - TODO confirm in CkanManager).
    $CkanManager->logOutput = '';
}
59 |
60 | // show running time on finish
61 | timer();
62 |
--------------------------------------------------------------------------------
/cli/rename_then_delete.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
31 |
32 | foreach (glob(CKANMNGR_DATA_DIR . '/rdelete*.csv') as $csv_file) {
33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
34 | echo $status;
35 |
36 | // fix wrong END-OF-LINE
37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
38 |
39 | $basename = str_replace('.csv', '', basename($csv_file));
40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX);
41 |
42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
43 | $i = 1;
44 | while (true) {
45 | $row = $csv->getRow();
46 | if (!$row) {
47 | break;
48 | }
49 | // skip headers
50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from', 'name'])) {
51 | continue;
52 | }
53 |
54 | $datasetName = trim(basename($row['0']));
55 | $newDatasetName = substr($datasetName, 0, 70) . $i . '_delete';
56 | // $newDatasetName = $datasetName.'_del_legacy';
57 |
58 | printf('[%04d] ', $i++);
59 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename);
60 | $CkanManager->deleteDataset($newDatasetName);
61 | }
62 | }
63 |
64 | // show running time on finish
65 | timer();
66 |
--------------------------------------------------------------------------------
/cli/epa-gov_process/3_rename_datasets.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
31 |
32 | foreach (glob(CKANMNGR_DATA_DIR . '/rename*.csv') as $csv_file) {
33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
34 | echo $status;
35 |
36 | // fix wrong END-OF-LINE
37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
38 |
39 | $basename = str_replace('.csv', '', basename($csv_file));
40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX);
41 |
42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
43 | $i = 1;
44 | while (true) {
45 | $row = $csv->getRow();
46 | if (!$row) {
47 | break;
48 | }
49 | // skip headers
50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
51 | continue;
52 | }
53 |
54 | $datasetName = basename($row['0']);
55 | $newDatasetName = basename($row['1']);
56 |
57 | // if (strlen($newDatasetName) > 100) {
58 | // $suffix = substr(md5($datasetName),0,3);
59 | // $newDatasetName = substr($newDatasetName,0,85).$suffix.'_epa_deleted';
60 | // }
61 |
62 | printf('[%04d] ', $i++);
63 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename);
64 | }
65 | }
66 |
67 | // show running time on finish
68 | timer();
69 |
--------------------------------------------------------------------------------
/src/CKAN/Manager/Dataset.php:
--------------------------------------------------------------------------------
1 | dataset = $dataset;
25 | if (isset($dataset['extras'])) {
26 | foreach ($dataset['extras'] as $extra) {
27 | $this->extras[$extra['key']] = $extra['value'];
28 | }
29 | }
30 | }
31 |
/**
 * Map each of the dataset's groups to the category tags stored for it.
 *
 * Every entry of $this->dataset['groups'] with a non-blank title is looked up
 * in $this->extras under the key "__category_tag_<group id>". The stored value
 * is stripped of surrounding "[" / "]", split on the '","' separator, and each
 * piece is trimmed of double quotes and spaces. (The value looks like a
 * JSON-style list such as ["a","b"] - presumably written that way elsewhere;
 * verify against the code that stores these extras.)
 *
 * @return array trimmed group title => list of tag strings; a group without a
 *               stored tag extra maps to an empty array
 */
public function get_groups_and_tags()
{
    $groups = [];
    if (!isset($this->dataset['groups'])) {
        return $groups;
    }

    foreach ($this->dataset['groups'] as $group) {
        $title = trim($group['title']);
        if ($title === '') {
            // groups without a usable title are left out of the result
            continue;
        }

        $tags = [];
        $extraKey = '__category_tag_' . $group['id'];
        if (isset($this->extras[$extraKey])) {
            $pieces = explode('","', trim($this->extras[$extraKey], '[]'));
            // array_map instead of a by-reference foreach: no loop variable is
            // left bound to the last element of an array that is then stored
            // in the result.
            $tags = array_map(
                function ($piece) {
                    return trim($piece, '" ');
                },
                $pieces
            );
        }

        $groups[$title] = $tags;
    }

    return $groups;
}
54 |
/**
 * Reduce a title to a comparison key.
 *
 * Harvested CKAN titles for the same dataset can differ in details such as
 * doubled spaces or punctuation. To compare them, every run of non-word
 * characters (regex \W) is dropped and the remainder is lower-cased. Note that
 * the underscore counts as a word character and is therefore kept.
 *
 * Example:
 *   Input:  Tree dog  dataset , agriculture, 1997 ?????!!!
 *   Output: treedogdatasetagriculture1997
 *
 * @param $string title to simplify
 *
 * @return mixed|string the simplified, lower-cased title
 */
public static function simplifyTitle($string)
{
    return strtolower(preg_replace('/[\W]+/', '', $string));
}
74 | }
75 |
--------------------------------------------------------------------------------
/cli/update_organization.php:
--------------------------------------------------------------------------------
1 | results_dir = $results_dir;
40 | //foreach (glob(DATA_DIR . '/update_doe_datasets.csv') as $csv_file) {
41 | // $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
42 | // echo $status;
43 | //
44 | // $basename = str_replace('.csv', '', basename($csv_file));
45 | //
46 | // // fix wrong END-OF-LINE
47 | // file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
48 | //
49 | // file_put_contents($results_dir . '/' . $basename . '_update_organization.log', $status, FILE_APPEND | LOCK_EX);
50 | //
51 | // $csv = new EasyCSV\Reader($csv_file, 'r+', false);
52 | // while (true) {
53 | // $row = $csv->getRow();
54 | // if (!$row) {
55 | // break;
56 | // }
57 | //// skip headers
58 | // if (in_array(strtolower($row['0']), ['url', 'exact match', 'title', 'found by title'])) {
59 | // continue;
60 | // }
61 | //
62 | // $package_id = str_replace("https://inventory.data.gov/dataset/", "", $row[0]);
63 | // //$organization_id = "ers-usda-gov";
64 | // $package_name = "1bef2082-a4ca-45c5-b307-3d8bfce384df";
65 | // $CkanManager->update_dataset_parent($package_id, $package_name, $basename);
66 | // }
67 | //}
68 | //
69 | //// show running time on finish
70 | //timer();
71 |
--------------------------------------------------------------------------------
/cli/rename_then_mark_private.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
31 |
32 | foreach (glob(CKANMNGR_DATA_DIR . '/prename*.csv') as $csv_file) {
33 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
34 | echo $status;
35 |
36 | // fix wrong END-OF-LINE
37 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
38 |
39 | $basename = str_replace('.csv', '', basename($csv_file));
40 | file_put_contents($results_dir . '/' . $basename . '_rename.log', $status, FILE_APPEND | LOCK_EX);
41 |
42 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
43 | $i = 1;
44 | while (true) {
45 | $row = $csv->getRow();
46 | if (!$row) {
47 | break;
48 | }
49 | // skip headers
50 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url', 'old dataset url', 'from'])) {
51 | continue;
52 | }
53 |
54 | $datasetName = trim(basename($row['0']));
55 | $newDatasetName = substr($datasetName, 0, 70) . $i . '_legacy';
56 | // $newDatasetName = str_replace('_legacy_legacy','_legacy',$newDatasetName);
57 | // $newDatasetName = str_replace('_legacy_legacy','_legacy',$newDatasetName);
58 | // $newDatasetName = basename($row['1']);
59 |
60 | printf('[%04d] ', $i++);
61 |
62 | // echo $newDatasetName.PHP_EOL;
63 | // continue;
64 |
65 | $CkanManager->renameDataset($datasetName, $newDatasetName, $basename);
66 | $CkanManager->makeDatasetPrivate($newDatasetName, $basename);
67 | }
68 | }
69 |
70 | // show running time on finish
71 | timer();
72 |
--------------------------------------------------------------------------------
/cli/restore_script.php:
--------------------------------------------------------------------------------
1 | getRow();
40 | if (!$row) {
41 | break;
42 | }
43 | // skip headers
44 | if (in_array(trim(strtolower($row['0'])), ['dataset', 'url'])) {
45 | continue;
46 | }
47 |
48 | $datasetName = basename($row['0']);
49 |
50 | $StagingClient->say(str_pad($datasetName, 100, ' . '), '');
51 |
52 | try {
53 | $DatasetArray = $StagingClient->getDataset($datasetName);
54 | // no exception, cool
55 | $StagingClient->say(str_pad('Staging OK', 15, ' . '), '');
56 |
57 | $ProductionClient->diffUpdate($datasetName, $DatasetArray);
58 | // var_dump($DatasetArray);die();
59 | } catch (CKAN\NotFoundHttpException $ex) {
60 | $StagingClient->say(str_pad('Staging 404', 15, ' . '));
61 | } catch (\Exception $ex) {
62 | $StagingClient->say(str_pad('Staging Error: ' . $ex->getMessage(), 15, ' . '));
63 | }
64 |
65 | // debug
66 | // die();
67 | }
68 | }
69 |
70 | // show running time on finish
71 | timer();
72 |
--------------------------------------------------------------------------------
/cli/update_extra_fields.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
28 | foreach (glob(CKANMNGR_DATA_DIR . '/extra-*.csv') as $csv_file) {
29 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
30 | echo $status;
31 |
32 | $basename = str_replace('.csv', '', basename($csv_file));
33 |
34 | // fix wrong END-OF-LINE
35 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
36 |
37 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX);
38 |
39 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
40 | while (true) {
41 | $row = $csv->getRow();
42 | if (!$row) {
43 | break;
44 | }
45 | // skip headers
46 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
47 | continue;
48 | }
49 |
50 | // no anchors please
51 | list($dataset,) = explode('#', basename(trim($row['0'])));
52 |
53 | if (!$dataset) {
54 | continue;
55 | }
56 |
57 | // double trouble check
58 | if (strpos($row['0'], '://')) {
59 | if (!strpos($row['0'], '/dataset/')) {
60 | file_put_contents(
61 | $results_dir . '/' . $basename . '_tags.log.csv',
62 | $row['0'] . ',WRONG URL' . PHP_EOL,
63 | FILE_APPEND | LOCK_EX
64 | );
65 | continue;
66 | }
67 | }
68 |
69 | $CkanManager->updateExtraFields(
70 | [$dataset],
71 | $row['1'],
72 | $row['2'],
73 | $row['3'],
74 | $basename
75 | );
76 | }
77 | }
78 |
79 | // show running time on finish
80 | timer();
81 |
--------------------------------------------------------------------------------
/cli/export_by_list.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
27 | foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) { // one pass per data/export_*.csv list
28 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
29 | echo $status;
30 | // file name without ".csv"; prefixes the per-file log written below
31 | $basename = str_replace('.csv', '', basename($csv_file));
32 |
33 | // normalize line endings in place: every run of CR/LF characters becomes a single "\n"
34 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
35 |
36 |
37 | $csv = new Reader($csv_file, 'r+', false);
38 | while (true) {
39 | $row = $csv->getRow();
40 | if (!$row) {
41 | break;
42 | }
43 |
44 | // skip header rows: the first cell holds a known column title
45 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
46 | continue;
47 | }
48 | // $start is set outside this excerpt; while it is positive, data rows are skipped and counted down
49 | if ($start > 0) {
50 | $start--;
51 | continue;
52 | }
53 |
54 | // keep the last path segment of the first cell and cut off any "#anchor"
55 | list($dataset,) = explode('#', basename(trim($row['0'])));
56 |
57 | // echo $dataset.PHP_EOL;
58 |
59 | if (!$dataset) {
60 | continue;
61 | }
62 |
63 | // a cell containing "://" must also contain "/dataset/"; otherwise it is logged as WRONG URL and skipped
64 | if (strpos($row['0'], '://')) {
65 | if (!strpos($row['0'], '/dataset/')) {
66 | file_put_contents(
67 | $results_dir . '/' . $basename . '_export.log.csv',
68 | $row['0'] . ',WRONG URL' . PHP_EOL,
69 | FILE_APPEND | LOCK_EX
70 | );
71 | continue;
72 | }
73 | }
74 | // NOTE(review): $tags_csv is created outside this excerpt - presumably the output CSV writer; confirm
75 | $lines = $CkanManager->exportPackage($dataset);
76 |
77 | foreach ($lines as $line) {
78 | $tags_csv->writeRow($line);
79 | }
80 | }
81 | }
82 |
83 |
84 | //$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)');
85 | //$csv->writeFromArray($brief);
86 |
87 | // show running time on finish
88 | timer();
89 |
--------------------------------------------------------------------------------
/cli/update_field.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
28 | foreach (glob(CKANMNGR_DATA_DIR . '/license_update*.csv') as $csv_file) { // one pass per data/license_update*.csv file
29 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
30 | echo $status;
31 | // file name without ".csv"; prefixes the per-file log written below
32 | $basename = str_replace('.csv', '', basename($csv_file));
33 |
34 | // normalize line endings in place: every run of CR/LF characters becomes a single "\n"
35 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
36 |
37 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX);
38 |
39 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
40 | while (true) {
41 | $row = $csv->getRow();
42 | if (!$row) {
43 | break;
44 | }
45 | // skip header rows: the first cell holds a known column title
46 | if (in_array(trim(strtolower($row['0'])), ['title','name','url','identifier','topics','categories'])) {
47 | continue;
48 | }
49 |
50 | // keep the last path segment of the first cell and cut off any "#anchor"
51 | list($dataset,) = explode('#', basename(trim($row['0'])));
52 | // NOTE(review): $dataset only gates the row; the update below is keyed by column 1, not by $dataset - confirm intent
53 | if (!$dataset) {
54 | continue;
55 | }
56 |
57 | // a cell containing "://" must also contain "/dataset/"; otherwise it is logged as WRONG URL and skipped
58 | if (strpos($row['0'], '://')) {
59 | if (!strpos($row['0'], '/dataset/')) {
60 | file_put_contents(
61 | $results_dir . '/' . $basename . '_tags.log.csv',
62 | $row['0'] . ',WRONG URL' . PHP_EOL,
63 | FILE_APPEND | LOCK_EX
64 | );
65 | continue;
66 | }
67 | }
68 | $package_id = $row['1']; // the second CSV cell is passed on as the package id
69 | $license_id = "cc-zero"; // every listed package is given this same license id
70 | $CkanManager->updateLicenseId($package_id, $license_id);
71 | }
72 | }
73 |
74 | // show running time on finish
75 | timer();
76 |
--------------------------------------------------------------------------------
/cli/export_full_by_list.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
24 | // For each data/export_*.csv list: look up every listed dataset and save the ones found as <list name>.json.
25 | foreach (glob(CKANMNGR_DATA_DIR . '/export_*.csv') as $csv_file) {
26 |     $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
27 |     echo $status;
28 |     // file name without ".csv"; names both the log and the JSON result
29 |     $basename = str_replace('.csv', '', basename($csv_file));
30 |
31 |     // fix wrong END-OF-LINE: every run of CR/LF characters becomes a single "\n"
32 |     file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
33 |
34 |     $csv = new Reader($csv_file, 'r+', false);
35 |     $return = [];
36 |     while (true) {
37 |         $row = $csv->getRow();
38 |         if (!$row) {
39 |             break;
40 |         }
41 |
42 |         // skip headers
43 |         if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
44 |             continue;
45 |         }
46 |
47 |         // $start is set outside this excerpt; while it is positive, data rows are skipped and counted down
48 |         if ($start > 0) {
49 |             $start--;
50 |             continue;
51 |         }
52 |
53 |         // no anchors please
54 |         list($dataset_name,) = explode('#', basename(trim($row['0'])));
55 |         if (!$dataset_name) {
56 |             continue;
57 |         }
58 |
59 |         // double trouble check: a cell containing "://" must also contain "/dataset/"
60 |         if (strpos($row['0'], '://')) {
61 |             if (!strpos($row['0'], '/dataset/')) {
62 |                 file_put_contents(
63 |                     $results_dir . '/' . $basename . '_export.log.csv',
64 |                     $row['0'] . ',WRONG URL' . PHP_EOL,
65 |                     FILE_APPEND | LOCK_EX
66 |                 );
67 |                 continue;
68 |             }
69 |         }
70 |
71 |         printf('%50s', $dataset_name);
72 |         $dataset = $CkanManager->tryPackageShow($dataset_name);
73 |         if ($dataset) {
74 |             printf('%10s', 'OK');
75 |             $return[] = $dataset;
76 |         } else {
77 |             printf('%10s', 'FAIL');
78 |         }
79 |         echo PHP_EOL;
80 |     }
81 |
82 |     // Overwrite instead of appending: a second JSON document appended to the file would make it unparseable.
83 |     file_put_contents(
84 |         $results_dir . '/' . $basename . '.json',
85 |         json_encode($return, JSON_PRETTY_PRINT),
86 |         LOCK_EX
87 |     );
88 | }
88 |
89 |
90 | //$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)');
91 | //$csv->writeFromArray($brief);
92 |
93 | // show running time on finish
94 | timer();
95 |
--------------------------------------------------------------------------------
/cli/epa-gov_process/4_assign_groups_and_tags.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
31 | // For every data row of each data/assign*.csv file, hand the dataset (first cell), the trimmed second cell
32 | // and the ";"-separated categories of the third cell to assignGroupsAndCategoriesToDatasets().
33 | foreach (glob(CKANMNGR_DATA_DIR . '/assign*.csv') as $path) {
34 |     $file_name = basename($path);
35 |     echo PHP_EOL . PHP_EOL . $file_name . PHP_EOL . PHP_EOL;
36 |
37 |     // names the per-file log and is passed along with every assignment
38 |     $log_prefix = str_replace('.csv', '', $file_name);
39 |
40 |     // rewrite the file in place so every run of CR/LF characters becomes a single "\n"
41 |     file_put_contents($path, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($path)));
42 |
43 |     $reader = new EasyCSV\Reader($path, 'r+', false);
44 |     // the loop ends at the first falsy row
45 |     while ($fields = $reader->getRow()) {
46 |         $link = $fields[0];
47 |
48 |         // header rows carry a column title in the first cell
49 |         if (in_array(trim(strtolower($link)), ['link', 'dataset', 'url', 'data.gov url'])) {
50 |             continue;
51 |         }
52 |
53 |         // $start is set outside this excerpt; while it is positive, data rows are skipped and counted down
54 |         if ($start > 0) {
55 |             $start--;
56 |             continue;
57 |         }
58 |
59 |         // an absent or empty third cell means "no categories"
60 |         $categories = empty($fields[2]) ? [] : array_map('trim', explode(';', trim($fields[2])));
61 |
62 |         // last path segment of the link, with any "#anchor" cut off
63 |         $slug = explode('#', basename(trim($link)))[0];
64 |         if (!$slug) {
65 |             continue;
66 |         }
67 |
68 |         // a cell containing "://" must also contain "/dataset/"; otherwise log it and move on
69 |         if (strpos($link, '://') && !strpos($link, '/dataset/')) {
70 |             file_put_contents(
71 |                 $results_dir . '/' . $log_prefix . '_tags.log.csv',
72 |                 $link . ',WRONG URL' . PHP_EOL,
73 |                 FILE_APPEND | LOCK_EX
74 |             );
75 |             continue;
76 |         }
77 |
78 |         $CkanManager->assignGroupsAndCategoriesToDatasets([$slug], trim($fields[1]), $log_prefix, $categories);
79 |     }
80 | }
94 |
95 | // show running time on finish
96 | timer();
97 |
--------------------------------------------------------------------------------
/cli/socrata_log_redirects.php:
--------------------------------------------------------------------------------
1 | fail
37 | curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5);
38 | // We don't want the header (use curl_getinfo())
39 | curl_setopt($curl_ch, CURLOPT_HEADER, false);
40 | // Track the handle's request string
41 | curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
42 | // Attempt to retrieve the modification date of the remote document.
43 | curl_setopt($curl_ch, CURLOPT_FILETIME, true);
44 | // Initialize cURL headers
45 |
46 | // For each data/socrata_*.csv: write <name>_long.csv with a short-URL row and, if resolvable, a long-URL row.
47 | foreach (glob(CKANMNGR_DATA_DIR . '/socrata_*.csv') as $csv_file) {
48 |     $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
49 |     echo $status;
50 |     // file name without ".csv"; names the output file
51 |     $basename = str_replace('.csv', '', basename($csv_file));
52 |
53 |     // fix wrong END-OF-LINE: every run of CR/LF characters becomes a single "\n"
54 |     file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
55 |
56 |     $csv_source = new Reader($csv_file, 'r+', false);
57 |     $csv_destination = new Writer($results_dir . '/' . $basename . '_long.csv');
58 |
59 |     $i = 0;
60 |     while (true) {
61 |         if (!($i++ % 50)) { // progress output on the 1st, 51st, 101st ... read
62 |             echo $i . PHP_EOL;
63 |         }
64 |         $row = $csv_source->getRow();
65 |         if (!$row) {
66 |             break;
67 |         }
68 |         // header rows are copied to the output as they are
69 |         if (in_array(trim(strtolower($row[0])), ['socrata code', 'from'])) {
70 |             $csv_destination->writeRow($row);
71 |             continue;
72 |         }
73 |         $socrata_id = $row[0];
74 |         $ckan_url = $row[1];
75 |
76 |         // writing short redirect
77 |         $socrata_short_url = 'https://explore.data.gov/d/' . $socrata_id;
78 |         $csv_destination->writeRow([$socrata_short_url, $ckan_url]);
79 |
80 |         $socrata_long_url = get_long_socrata_url($curl_ch, $socrata_short_url);
81 |         if (!$socrata_long_url) {
82 |             echo 'No result: ' . $socrata_short_url . PHP_EOL;
83 |             continue; // a row with an empty source URL would be a useless redirect entry
84 |         }
85 |         $csv_destination->writeRow([$socrata_long_url, $ckan_url]);
86 |     }
87 | }
87 |
88 | /**
89 |  * Requests $url on the given cURL handle and reports where the response redirects to.
90 |  *
91 |  * The handle's URL and request method are overwritten; the response body is not used.
92 |  *
93 |  * @param mixed  $curl_ch initialized cURL handle
94 |  * @param string $url     address to request
95 |  *
96 |  * @return string|bool the redirect target, or false when the request fails or reports no redirect
97 |  */
98 | function get_long_socrata_url($curl_ch, $url)
99 | {
100 |     curl_setopt($curl_ch, CURLOPT_URL, $url);
101 |     curl_setopt($curl_ch, CURLOPT_CUSTOMREQUEST, 'GET');
102 |
103 |     // a failed transfer has no redirect information worth reading
104 |     if (false === curl_exec($curl_ch)) {
105 |         return false;
106 |     }
107 |
108 |     // redirect_url may be present but empty; treat that the same as "no redirect"
109 |     $info = curl_getinfo($curl_ch);
110 |     return empty($info['redirect_url']) ? false : $info['redirect_url'];
111 | }
112 |
113 | // show running time on finish
114 | timer();
115 |
--------------------------------------------------------------------------------
/cli/nrc-gov_process/compare_prod_vs_uat_nrc.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'topics',
35 | 'categories',
36 | ]);
37 |
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_nuclear = $ProdCkanManager->exportBrief('organization:(nrc-gov)' .
42 | ' AND -metadata_type:geospatial AND dataset_type:dataset');
43 | $prod->writeFromArray($prod_nuclear);
44 | } else {
45 | $prod = new Reader($results_dir . '/prod.csv');
46 | $prod_nuclear = $prod->getAll();
47 | }
48 |
49 | // Load the UAT export: reuse uat.csv when it already exists, otherwise export it through CkanManager.
50 | echo 'uat.csv' . PHP_EOL;
51 | $uat_csv_path = $results_dir . '/uat.csv';
52 | if (is_file($uat_csv_path)) {
53 |     $uat = new Reader($uat_csv_path);
54 |     $uat_nuclear = $uat->getAll();
55 | } else {
56 |     $uat = new Writer($uat_csv_path);
57 |     $uat->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
58 |
59 |     $UatCkanManager = new CkanManager(CKAN_UAT_API_URL);
60 |     $UatCkanManager->resultsDir = $results_dir;
61 |
62 |     $uat_nuclear = $UatCkanManager->exportBrief(
63 |         'extras_harvest_source_title:NRC data.json',
64 |         '',
65 |         'http://uat-catalog-fe-data.reisys.com/dataset/'
66 |     );
67 |     $uat->writeFromArray($uat_nuclear);
68 | }
69 |
70 | // Group the UAT rows by simplified title; one title may map to several datasets.
71 | $uat_nuclear_by_title = [];
72 | foreach ($uat_nuclear as $uat_row) {
73 |     $uat_nuclear_by_title[$uat_row['title_simple']][] = $uat_row;
74 | }
75 |
76 | // The report is rebuilt from scratch on every run.
77 | echo 'prod_vs_uat.csv' . PHP_EOL;
78 | $report_path = $results_dir . '/prod_vs_uat_nuclear_geospatial.csv';
79 | if (is_file($report_path)) {
80 |     unlink($report_path);
81 | }
82 | $csv = new Writer($report_path);
83 | $csv->writeRow(['Prod Title', 'Prod URL', 'Prod Topics', 'Prod Categories', 'Matched', 'UAT Title', 'UAT URL']);
84 |
85 | // One row per prod/UAT pair sharing a simplified title; unmatched prod rows get empty UAT cells.
86 | foreach ($prod_nuclear as $prod_dataset) {
87 |     $prod_cells = [
88 |         $prod_dataset['title'],
89 |         $prod_dataset['url'],
90 |         $prod_dataset['topics'],
91 |         $prod_dataset['categories'],
92 |     ];
93 |
94 |     if (!isset($uat_nuclear_by_title[$prod_dataset['title_simple']])) {
95 |         $csv->writeRow(array_merge($prod_cells, [false, '', '']));
96 |         continue;
97 |     }
98 |
99 |     foreach ($uat_nuclear_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
100 |         $csv->writeRow(array_merge($prod_cells, [true, $uat_dataset['title'], $uat_dataset['url']]));
101 |     }
102 | }
122 |
123 | // show running time on finish
124 | timer();
125 |
--------------------------------------------------------------------------------
/cli/compare_prod_vs_prod.php:
--------------------------------------------------------------------------------
1 | writeRow([
38 | 'title',
39 | 'title_simple',
40 | 'name',
41 | 'url',
42 | 'topics',
43 | 'categories',
44 | ]);
45 |
46 | $Prod1CkanManager = new CkanManager(CKAN_API_URL);
47 | $Prod1CkanManager->resultsDir = $results_dir;
48 |
49 | $prod1_data = $Prod1CkanManager->exportBrief('organization:(' . $prod1_org . ') AND dataset_type:dataset');
50 | $prod1->writeFromArray($prod1_data);
51 | } else {
52 | $prod1 = new Reader($prod1_csv_path);
53 | $prod1_data = $prod1->getAll();
54 | }
55 |
56 | // Load the second organization's export: reuse its CSV when present, otherwise export it through CkanManager.
57 | echo $prod2_org . '.csv' . PHP_EOL;
58 | if (is_file($prod2_csv_path)) {
59 |     $prod2 = new Reader($prod2_csv_path);
60 |     $prod2_data = $prod2->getAll();
61 | } else {
62 |     $prod2 = new Writer($prod2_csv_path);
63 |     $prod2->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
64 |
65 |     $Prod2CkanManager = new CkanManager(CKAN_API_URL);
66 |     $Prod2CkanManager->resultsDir = $results_dir;
67 |
68 |     $prod2_data = $Prod2CkanManager->exportBrief('organization:(' . $prod2_org . ') AND dataset_type:dataset');
69 |     $prod2->writeFromArray($prod2_data);
70 | }
71 |
72 | // Group the second organization's rows by simplified title; one title may map to several datasets.
73 | $prod2_by_title = [];
74 | foreach ($prod2_data as $prod2_row) {
75 |     $prod2_by_title[$prod2_row['title_simple']][] = $prod2_row;
76 | }
77 |
78 | // The comparison file is rebuilt from scratch on every run.
79 | echo $prod1_org . '_VS_' . $prod2_org . '.csv' . PHP_EOL;
80 | if (is_file($comparison_csv_path)) {
81 |     unlink($comparison_csv_path);
82 | }
83 | $csv = new Writer($comparison_csv_path);
84 | $csv->writeRow([
85 |     $prod1_org . ' Title',
86 |     $prod1_org . ' URL',
87 |     $prod1_org . ' Topics',
88 |     $prod1_org . ' Categories',
89 |     'Matched',
90 |     $prod2_org . ' Title',
91 |     $prod2_org . ' URL',
92 |     'URL Match',
93 | ]);
94 |
95 | // One row per pair sharing a simplified title; the last cell always repeats the Matched flag.
96 | foreach ($prod1_data as $prod1_dataset) {
97 |     $prod1_cells = [
98 |         $prod1_dataset['title'],
99 |         $prod1_dataset['url'],
100 |         $prod1_dataset['topics'],
101 |         $prod1_dataset['categories'],
102 |     ];
103 |
104 |     if (!isset($prod2_by_title[$prod1_dataset['title_simple']])) {
105 |         $csv->writeRow(array_merge($prod1_cells, [false, '', '', false]));
106 |         continue;
107 |     }
108 |
109 |     foreach ($prod2_by_title[$prod1_dataset['title_simple']] as $prod2_dataset) {
110 |         $csv->writeRow(array_merge($prod1_cells, [true, $prod2_dataset['title'], $prod2_dataset['url'], true]));
111 |     }
112 | }
132 |
133 | // show running time on finish
134 | timer();
135 |
--------------------------------------------------------------------------------
/cli/doc-gov_process/2_compare_prod_vs_prod.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'topics',
35 | 'categories',
36 | ]);
37 |
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' .
42 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' .
43 | ' AND -metadata_type:geospatial AND dataset_type:dataset AND -harvest_source_id:[\'\' TO *]');
44 | $prod->writeFromArray($prod_commerce);
45 | } else {
46 | $prod = new Reader($results_dir . '/prod.csv');
47 | $prod_commerce = $prod->getAll();
48 | }
49 |
50 | // Load the "new" export: reuse new.csv when it already exists, otherwise export it through CkanManager.
51 | echo 'new.csv' . PHP_EOL;
52 | $new_csv_path = $results_dir . '/new.csv';
53 | if (is_file($new_csv_path)) {
54 |     $new = new Reader($new_csv_path);
55 |     $new_commerce = $new->getAll();
56 | } else {
57 |     $new = new Writer($new_csv_path);
58 |     $new->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
59 |
60 |     // despite its name this manager is built with CKAN_API_URL, the same constant the prod manager uses
61 |     $UatCkanManager = new CkanManager(CKAN_API_URL);
62 |     $UatCkanManager->resultsDir = $results_dir;
63 |
64 |     $new_commerce = $UatCkanManager->exportBrief(
65 |         'extras_harvest_source_title:Commerce Non Spatial Data.json Harvest Source'
66 |     );
67 |     $new->writeFromArray($new_commerce);
68 | }
69 |
70 | // Group the new rows by simplified title; one title may map to several datasets.
71 | $new_commerce_by_title = [];
72 | foreach ($new_commerce as $new_row) {
73 |     $new_commerce_by_title[$new_row['title_simple']][] = $new_row;
74 | }
75 |
76 | // The report is rebuilt from scratch on every run.
77 | echo 'prod_vs_new.csv' . PHP_EOL;
78 | $report_path = $results_dir . '/prod_vs_prod_commerce.csv';
79 | if (is_file($report_path)) {
80 |     unlink($report_path);
81 | }
82 | $csv = new Writer($report_path);
83 | $csv->writeRow([
84 |     'Prod Title',
85 |     'Prod URL',
86 |     'Prod Topics',
87 |     'Prod Categories',
88 |     'Matched',
89 |     'NEW Title',
90 |     'NEW URL',
91 |     'URL Match',
92 | ]);
93 |
94 | // One row per pair sharing a simplified title; the last cell always repeats the Matched flag.
95 | foreach ($prod_commerce as $prod_dataset) {
96 |     $prod_cells = [
97 |         $prod_dataset['title'],
98 |         $prod_dataset['url'],
99 |         $prod_dataset['topics'],
100 |         $prod_dataset['categories'],
101 |     ];
102 |
103 |     if (!isset($new_commerce_by_title[$prod_dataset['title_simple']])) {
104 |         $csv->writeRow(array_merge($prod_cells, [false, '', '', false]));
105 |         continue;
106 |     }
107 |
108 |     foreach ($new_commerce_by_title[$prod_dataset['title_simple']] as $new_dataset) {
109 |         $csv->writeRow(array_merge($prod_cells, [true, $new_dataset['title'], $new_dataset['url'], true]));
110 |     }
111 | }
125 |
126 | // show running time on finish
127 | timer();
128 |
--------------------------------------------------------------------------------
/cli/doc-gov_process/0_compare_prod_vs_uat.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'topics',
35 | 'categories',
36 | ]);
37 |
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' .
42 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' .
43 | ' AND -metadata_type:geospatial AND dataset_type:dataset AND -harvest_source_id:[\'\' TO *]');
44 | $prod->writeFromArray($prod_commerce);
45 | } else {
46 | $prod = new Reader($results_dir . '/prod.csv');
47 | $prod_commerce = $prod->getAll();
48 | }
49 |
50 | // Load the UAT export: reuse uat.csv when it already exists, otherwise export it through CkanManager.
51 | echo 'uat.csv' . PHP_EOL;
52 | $uat_csv_path = $results_dir . '/uat.csv';
53 | if (is_file($uat_csv_path)) {
54 |     $uat = new Reader($uat_csv_path);
55 |     $uat_commerce = $uat->getAll();
56 | } else {
57 |     $uat = new Writer($uat_csv_path);
58 |     $uat->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
59 |
60 |     $UatCkanManager = new CkanManager(CKAN_UAT_API_URL);
61 |     $UatCkanManager->resultsDir = $results_dir;
62 |
63 |     $uat_commerce = $UatCkanManager->exportBrief(
64 |         'extras_harvest_source_title:Commerce JSON',
65 |         '',
66 |         'http://uat-catalog-fe-data.reisys.com/dataset/'
67 |     );
68 |     $uat->writeFromArray($uat_commerce);
69 | }
70 |
71 | // Group the UAT rows by simplified title; one title may map to several datasets.
72 | $uat_commerce_by_title = [];
73 | foreach ($uat_commerce as $uat_row) {
74 |     $uat_commerce_by_title[$uat_row['title_simple']][] = $uat_row;
75 | }
76 |
77 | // The report is rebuilt from scratch on every run.
78 | echo 'prod_vs_uat.csv' . PHP_EOL;
79 | $report_path = $results_dir . '/prod_vs_uat_commerce.csv';
80 | if (is_file($report_path)) {
81 |     unlink($report_path);
82 | }
83 | $csv = new Writer($report_path);
84 | $csv->writeRow([
85 |     'Prod Title',
86 |     'Prod URL',
87 |     'Prod Topics',
88 |     'Prod Categories',
89 |     'Matched',
90 |     'UAT Title',
91 |     'UAT URL',
92 |     'URL Match',
93 | ]);
94 |
95 | // One row per pair sharing a simplified title; the last cell always repeats the Matched flag.
96 | foreach ($prod_commerce as $prod_dataset) {
97 |     $prod_cells = [
98 |         $prod_dataset['title'],
99 |         $prod_dataset['url'],
100 |         $prod_dataset['topics'],
101 |         $prod_dataset['categories'],
102 |     ];
103 |
104 |     if (!isset($uat_commerce_by_title[$prod_dataset['title_simple']])) {
105 |         $csv->writeRow(array_merge($prod_cells, [false, '', '', false]));
106 |         continue;
107 |     }
108 |
109 |     foreach ($uat_commerce_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
110 |         $csv->writeRow(array_merge($prod_cells, [true, $uat_dataset['title'], $uat_dataset['url'], true]));
111 |     }
112 | }
126 |
127 | // show running time on finish
128 | timer();
129 |
--------------------------------------------------------------------------------
/cli/doc-gov_process/5_compare_prod_vs_qa.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'topics',
35 | 'categories',
36 | ]);
37 |
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' .
42 | ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' .
43 | ' AND -metadata_type:geospatial AND dataset_type:dataset');
44 | $prod->writeFromArray($prod_commerce);
45 | } else {
46 | $prod = new Reader($results_dir . '/prod.csv');
47 | $prod_commerce = $prod->getAll();
48 | }
49 |
50 | // Load the QA export: reuse qa.csv when it already exists, otherwise export it through CkanManager.
51 | echo 'qa.csv' . PHP_EOL;
52 | $qa_csv_path = $results_dir . '/qa.csv';
53 | if (is_file($qa_csv_path)) {
54 |     $qa = new Reader($qa_csv_path);
55 |     $qa_commerce = $qa->getAll();
56 | } else {
57 |     $qa = new Writer($qa_csv_path);
58 |     $qa->writeRow(['title', 'title_simple', 'name', 'url', 'topics', 'categories']);
59 |
60 |     // despite its name this manager talks to CKAN_QA_API_URL
61 |     $UatCkanManager = new CkanManager(CKAN_QA_API_URL);
62 |     $UatCkanManager->resultsDir = $results_dir;
63 |
64 |     $qa_query = 'organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' .
65 |         ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' .
66 |         ' AND -metadata_type:geospatial AND dataset_type:dataset';
67 |     $qa_commerce = $UatCkanManager->exportBrief($qa_query, '', 'http://qa-catalog-fe-data.reisys.com/dataset/');
68 |     $qa->writeFromArray($qa_commerce);
69 | }
70 |
71 | // Group the QA rows by simplified title; one title may map to several datasets.
72 | $qa_commerce_by_title = [];
73 | foreach ($qa_commerce as $qa_row) {
74 |     $qa_commerce_by_title[$qa_row['title_simple']][] = $qa_row;
75 | }
76 |
77 | // The report is rebuilt from scratch on every run.
78 | echo 'prod_vs_qa.csv' . PHP_EOL;
79 | $report_path = $results_dir . '/prod_vs_qa_commerce.csv';
80 | if (is_file($report_path)) {
81 |     unlink($report_path);
82 | }
83 | $csv = new Writer($report_path);
84 | $csv->writeRow([
85 |     'Prod Title',
86 |     'Prod URL',
87 |     'Prod Topics',
88 |     'Prod Categories',
89 |     'Matched',
90 |     'QA Title',
91 |     'QA URL',
92 |     'URL Match',
93 | ]);
94 |
95 | // One row per pair sharing a simplified title; the last cell always repeats the Matched flag.
96 | foreach ($prod_commerce as $prod_dataset) {
97 |     $prod_cells = [
98 |         $prod_dataset['title'],
99 |         $prod_dataset['url'],
100 |         $prod_dataset['topics'],
101 |         $prod_dataset['categories'],
102 |     ];
103 |
104 |     if (!isset($qa_commerce_by_title[$prod_dataset['title_simple']])) {
105 |         $csv->writeRow(array_merge($prod_cells, [false, '', '', false]));
106 |         continue;
107 |     }
108 |
109 |     foreach ($qa_commerce_by_title[$prod_dataset['title_simple']] as $qa_dataset) {
110 |         $csv->writeRow(array_merge($prod_cells, [true, $qa_dataset['title'], $qa_dataset['url'], true]));
111 |     }
112 | }
128 |
129 | // show running time on finish
130 | timer();
131 |
--------------------------------------------------------------------------------
/cli/tagging/brother_assign.php:
--------------------------------------------------------------------------------
1 | getRow();
29 | if (!$row) {
30 | break;
31 | }
32 | if (1 == sizeof($row)) {
33 | continue;
34 | }
35 | $original = get_dataset_basename(array_shift($row));
36 | $brothers[$original] = $row;
37 | }
38 | }
39 |
40 | //var_dump($brothers);
41 | //die();
42 |
43 | $CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY);
44 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY);
45 | //$CkanManager = new CkanManager(CKAN_DEV_API_URL, CKAN_DEV_API_KEY);
46 | //$CkanManager = new CkanManager(CKAN_UAT_API_URL, CKAN_UAT_API_KEY);
47 | //$CkanManager = new CkanManager(CKAN_QA_API_URL, CKAN_QA_API_KEY);
48 |
49 | /**
50 | * Sample csv
51 | * dataset,group,categories
52 | * https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment"
53 | * download-crossing-inventory-data-highway-rail-crossing,Agriculture, "Natural Resources and Environment;Plants and Plant Systems Agriculture"
54 | */
55 |
56 | // For every data row of each data/assign*.csv file, apply the row's second cell and its ";"-separated
57 | // categories to each dataset listed in $brothers under the row's dataset name.
58 | $CkanManager->resultsDir = $results_dir;
59 | foreach (glob(CKANMNGR_DATA_DIR . '/assign*.csv') as $path) {
60 |     $file_name = basename($path);
61 |     echo $CkanManager->color->green(PHP_EOL . PHP_EOL . $file_name . PHP_EOL . PHP_EOL);
62 |
63 |     // passed along with every assignment
64 |     $log_prefix = str_replace('.csv', '', $file_name);
65 |
66 |     // rewrite the file in place so every run of CR/LF characters becomes a single "\n"
67 |     file_put_contents($path, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($path)));
68 |
69 |     $reader = new EasyCSV\Reader($path, 'r+', false);
70 |     // the loop ends at the first falsy row
71 |     while ($fields = $reader->getRow()) {
72 |         // header rows carry a column title in the first cell
73 |         if (in_array(trim(strtolower($fields[0])), ['link', 'dataset', 'url', 'data.gov url'])) {
74 |             continue;
75 |         }
76 |
77 |         // $start is set outside this excerpt; while it is positive, data rows are skipped and counted down
78 |         if ($start > 0) {
79 |             $start--;
80 |             continue;
81 |         }
82 |
83 |         // an absent or empty third cell means "no categories"
84 |         $categories = empty($fields[2]) ? [] : array_map('trim', explode(';', trim($fields[2])));
85 |
86 |         $original = get_dataset_basename($fields[0]);
87 |         if (!$original) {
88 |             continue;
89 |         }
90 |         echo "\tOriginal: " . $original . PHP_EOL;
91 |
92 |         // only the brothers are updated; the original dataset itself is left untouched
93 |         if (!isset($brothers[$original])) {
94 |             continue;
95 |         }
96 |
97 |         foreach ($brothers[$original] as $candidate) {
98 |             // blank cells and cells that yield no dataset name are ignored
99 |             if ('' === trim($candidate)) {
100 |                 continue;
101 |             }
102 |             $sibling = get_dataset_basename($candidate);
103 |             if (!$sibling) {
104 |                 continue;
105 |             }
106 |
107 |             echo "\tUat (s):" . PHP_EOL;
108 |             $CkanManager->assignGroupsAndCategoriesToDatasets(
109 |                 [$sibling],
110 |                 trim($fields[1]),
111 |                 $log_prefix,
112 |                 $categories
113 |             );
114 |         }
115 |     }
116 | }
127 |
128 | // show running time on finish
129 | timer();
130 |
--------------------------------------------------------------------------------
/cli/export_short.php:
--------------------------------------------------------------------------------
1 | writeRow([
27 | // 'ckan id',
28 | // 'title',
29 | // 'name',
30 | // 'url',
31 | // 'identifier',
32 | // 'org title',
33 | // 'org name',
34 | // 'topics',
35 | // 'categories',
36 | //]);
37 |
38 | $CkanManager->resultsDir = $results_dir;
39 |
40 | //$brief = $CkanManager->exportShort('extras_license:"https\://creativecommons.org/publicdomain/zero/1.0/" AND (dataset_type:dataset)');
41 | //$brief = $CkanManager->exportShort('','((collection_package_id:* OR *:*) AND license_id:"cc-by-sa" AND license:"https\://creativecommons.org/publicdomain/zero/1.0/") AND (dataset_type:dataset)');
42 | //$brief = $CkanManager->exportShort('%28%28collection_package_id:*%20OR%20*:*%29+AND+license_id:"cc-by-sa"+AND+license:"https://creativecommons.org/publicdomain/zero/1.0/"%29');
43 | //$brief = $CkanManager->exportShort('organization:wake-county AND (dataset_type:dataset)');
44 | //$brief = $CkanManager->exportShort('organization:gsa-gov AND harvest_source_title:Open* AND (dataset_type:dataset)',
45 | //$brief = $CkanManager->exportShort('organization:doe-gov AND (dataset_type:dataset)');
46 | //$brief = $CkanManager->exportShort('organization:dhs-gov AND (harvest_source_title:DHS*) AND (dataset_type:dataset)');
47 | //$brief = $CkanManager->exportShort('organization:epa-gov AND (harvest_source_title:*Gateway) AND (dataset_type:dataset)');
48 | //$brief = $CkanManager->exportShort('organization:epa-gov AND (metadata_type:geospatial) AND (dataset_type:dataset)');
49 | //$brief = $CkanManager->exportShort('organization:nasa-gov AND (harvest_source_title:NASA*) AND (dataset_type:dataset)');
50 | //$brief = $CkanManager->exportShort('organization:ntsb-gov AND (dataset_type:dataset)');
51 | //$brief = $CkanManager->exportShort('organization:noaa-gov AND metadata_type:geospatial AND (dataset_type:dataset) AND groups:*');
52 | //$brief = $CkanManager->exportShort('metadata-source:dms AND (dataset_type:dataset)');
53 | //$brief = $CkanManager->exportShort('organization:doj-gov AND (dataset_type:dataset)');
54 | // 'http://uat-catalog-fe-data.reisys.com/dataset/');
55 | //$brief = $CkanManager->exportShort('(extra_harvest_source_title:Open+*) AND (dataset_type:dataset)');
56 | //$brief = $CkanManager->exportShort('organization:gsa-gov AND (dataset_type:dataset)');
57 | //$brief = $CkanManager->exportShort('extras_harvest_source_title:Test ISO WAF AND (dataset_type:dataset)');
58 | //$brief = $CkanManager->exportShort('organization:doe-gov AND (harvest_source_title:Energy*) AND (dataset_type:dataset)');
59 | //$brief = $CkanManager->exportShort('organization:state-of-oklahoma AND (dataset_type:dataset)');
60 | //$brief = $CkanManager->exportShort('organization:state-of-oklahoma AND -metadata_modified:[2016-02-24T23:59:59.999Z TO 2016-02-27T00:00:00Z] AND (dataset_type:dataset)');
61 | //$brief = $CkanManager->exportShort('organization:noaa-gov AND metadata-source:dms AND (dataset_type:dataset)');
62 | //$brief = $CkanManager->exportShort('organization:dot-gov AND (dataset_type:dataset) AND publisher:"Federal Aviation Administration"');
63 | //$brief = $CkanManager->exportShort('organization:nd-gov AND (dataset_type:dataset)');
64 | //$brief = $CkanManager->exportShort('organization:opm-gov AND (dataset_type:dataset)');
65 | //$brief = $CkanManager->exportShort('organization:fs-fed-us AND (dataset_type:dataset)');
66 | //$brief = $CkanManager->exportShort('metadata-source:dms AND (dataset_type:dataset)');
67 | $brief = $CkanManager->exportShort('organization:usa-net AND (dataset_type:dataset)');
68 | // The header row is taken from the keys of the first exported row, so an empty export writes nothing.
69 | if ($brief) {
70 |     $csv->writeRow(array_keys(reset($brief)));
71 |     $csv->writeFromArray($brief);
72 | }
73 | // show running time on finish
74 | timer();
75 |
--------------------------------------------------------------------------------
/cli/compare_prod_vs_uat.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'topics',
35 | 'categories',
36 | ]);
37 |
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | // $prod_commerce = $ProdCkanManager->exportBrief('organization:(doc-gov OR bis-doc-gov OR mbda-doc-gov OR trade-gov OR census-gov ' .
42 | // ' OR eda-doc-gov OR ntia-doc-gov OR ntis-gov OR nws-doc-gov OR bea-gov OR uspto-gov)' .
43 | // ' AND -metadata_type:geospatial AND dataset_type:dataset AND -harvest_source_id:[\'\' TO *]');
44 |
45 |
46 | // https://catalog.data.gov/organization/nd-gov?harvest_source_title=North+Dakota+GIS+Hub+Data+Portal
47 | $prod_commerce = $ProdCkanManager->exportBrief('organization:nd-gov AND dataset_type:dataset' .
48 | ' AND harvest_source_title:North*');
49 | $prod->writeFromArray($prod_commerce);
50 | } else {
51 | $prod = new Reader($results_dir . '/prod.csv');
52 | $prod_commerce = $prod->getAll();
53 | }
54 |
55 | echo 'uat.csv' . PHP_EOL;
56 | if (!is_file($results_dir . '/uat.csv')) {
57 | $uat = new Writer($results_dir . '/uat.csv');
58 |
59 | $uat->writeRow([
60 | 'title',
61 | 'title_simple',
62 | 'name',
63 | 'url',
64 | 'topics',
65 | 'categories',
66 | ]);
67 |
68 | $UatCkanManager = new CkanManager(CKAN_UAT_API_URL);
69 | $UatCkanManager->resultsDir = $results_dir;
70 |
71 | // $uat_commerce = $UatCkanManager->exportBrief('extras_harvest_source_title:Commerce JSON', '',
72 | // 'http://uat-catalog-fe-data.reisys.com/dataset/');
73 |
74 | // http://uat-catalog-fe-data.reisys.com/organization/test-org-082615?harvest_source_title=ND.gov+New+Data.json+HS
75 |
76 | $uat_commerce = $UatCkanManager->exportBrief('organization:test-org-082615 AND harvest_source_title:ND*', '',
77 | 'http://uat-catalog-fe-data.reisys.com/dataset/');
78 | $uat->writeFromArray($uat_commerce);
79 |
80 | } else {
81 | $uat = new Reader($results_dir . '/uat.csv');
82 | $uat_commerce = $uat->getAll();
83 | }
84 |
85 | $uat_commerce_by_title = [];
86 |
87 | foreach ($uat_commerce as $name => $dataset) {
88 | $title = $dataset['title_simple'];
89 |
90 | $uat_commerce_by_title[$title] = isset($uat_commerce_by_title[$title]) ? $uat_commerce_by_title[$title] : [];
91 | $uat_commerce_by_title[$title][] = $dataset;
92 | }
93 |
94 | echo 'prod_vs_uat.csv' . PHP_EOL;
95 | is_file($results_dir . '/prod_vs_uat_commerce.csv') && unlink($results_dir . '/prod_vs_uat_commerce.csv');
96 | $csv = new Writer($results_dir . '/prod_vs_uat_commerce.csv');
97 | $csv->writeRow([
98 | 'Prod Title',
99 | 'Prod URL',
100 | 'Prod Topics',
101 | 'Prod Categories',
102 | 'Matched',
103 | 'UAT Title',
104 | 'UAT URL',
105 | 'URL Match',
106 | ]);
107 |
108 | foreach ($prod_commerce as $name => $prod_dataset) {
109 | if (isset($uat_commerce_by_title[$prod_dataset['title_simple']])) {
110 | foreach ($uat_commerce_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
111 | $csv->writeRow([
112 | $prod_dataset['title'],
113 | $prod_dataset['url'],
114 | $prod_dataset['topics'],
115 | $prod_dataset['categories'],
116 | true,
117 | $uat_dataset['title'],
118 | $uat_dataset['url'],
119 | true,
120 | ]);
121 | }
122 | continue;
123 | }
124 |
125 | $csv->writeRow([
126 | $prod_dataset['title'],
127 | $prod_dataset['url'],
128 | $prod_dataset['topics'],
129 | $prod_dataset['categories'],
130 | false,
131 | '',
132 | '',
133 | false,
134 | ]);
135 | }
136 |
137 | // show running time on finish
138 | timer();
139 |
--------------------------------------------------------------------------------
/cli/recheck_socrata_redirects.php:
--------------------------------------------------------------------------------
1 | fail
33 | curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5);
34 | // We don't want the header (use curl_getinfo())
35 | curl_setopt($curl_ch, CURLOPT_HEADER, false);
36 | // Track the handle's request string
37 | curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
38 | // Attempt to retrieve the modification date of the remote document.
39 | curl_setopt($curl_ch, CURLOPT_FILETIME, true);
40 | // Initialize cURL headers
41 |
42 |
// Re-check every redirect listed in the data dir's redirects_*.csv files and
// log the outcome of each row to "<basename>_log.csv" inside $results_dir.
foreach (glob(CKANMNGR_DATA_DIR . '/redirects_*.csv') as $csv_file) {
    $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
    echo $status;

    $basename = str_replace('.csv', '', basename($csv_file));

    // fix wrong END-OF-LINE
    file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));

    $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
    $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv');

    $csv_destination->writeRow(['from', 'to', 'status', 'real_redirect']);

    $i = 0;
    while (true) {
        // progress marker, printed once per 100 rows
        if (!($i++ % 100)) {
            echo $i . PHP_EOL;
        }
        $row = $csv_source->getRow();
        if (!$row) {
            break;
        }
        // skip headers
        if (in_array(trim(strtolower($row[0])), ['socrata code', 'from', 'source url'], true)) {
            continue;
        }

        $socrata_url = $row[0];
        // A row may lack the second column; treat that as "no expected target"
        // instead of reading an undefined offset.
        $redirect_url = isset($row[1]) ? $row[1] : '';

        $redirect = try_get_redirect($curl_ch, $socrata_url);
        if (!$redirect) {
            echo 'No redirect: ' . $socrata_url . PHP_EOL;
            $csv_destination->writeRow([$socrata_url, $redirect_url, 'no redirect', '']);
            continue;
        }

        if (url_compare($redirect, $redirect_url)) {
            $csv_destination->writeRow([$socrata_url, $redirect_url, 'correct', '']);
            continue;
        }

        echo 'Wrong redirect: ' . $socrata_url . PHP_EOL;
        $csv_destination->writeRow([$socrata_url, $redirect_url, 'wrong redirect', '' . $redirect]);
    }
}
91 |
/**
 * Tell whether two URLs are the same once the http/https scheme, surrounding
 * whitespace and leading/trailing slashes are ignored.
 *
 * Only a scheme at the very start of a URL is dropped (case-insensitively);
 * an "http:" or "https:" that appears later, e.g. inside a query string,
 * stays part of the comparison.
 *
 * @param string $url1
 * @param string $url2
 *
 * @return bool true when both URLs normalise to the same string
 */
function url_compare($url1, $url2)
{
    $normalize = function ($url) {
        // Drop padding first so that the scheme really sits at offset 0.
        $url = trim((string)$url);
        // Anchored and case-insensitive: handles "HTTP://..." and leaves any
        // scheme-like text further inside the URL untouched.
        $url = preg_replace('/^https?:/i', '', $url);

        return trim($url, '/ ');
    };

    return $normalize($url1) === $normalize($url2);
}
105 |
/**
 * Request $url with GET and report where it redirects to.
 *
 * Two kinds of redirect are recognised: the `redirect_url` entry reported by
 * curl_getinfo() after the request, and a zero-delay HTML meta refresh
 * (`content="0;URL=http..."`) found in the response body.
 *
 * @param resource $curl_ch cURL handle to reuse; the meta-refresh check needs
 *                          curl_exec() to return the body as a string
 * @param string   $url     URL to request
 *
 * @return string|bool the redirect target, or false when none was detected
 */
function try_get_redirect($curl_ch, $url)
{
    curl_setopt($curl_ch, CURLOPT_URL, $url);
    curl_setopt($curl_ch, CURLOPT_CUSTOMREQUEST, 'GET');

    // Execute the request, then read the transfer details.
    $response = curl_exec($curl_ch);
    $info = curl_getinfo($curl_ch);

    if (!empty($info['redirect_url'])) {
        return $info['redirect_url'];
    }

    // curl_exec() yields a non-string on failure. Compare stripos() against
    // false explicitly: a match at offset 0 is a valid (but falsy) result.
    if (!is_string($response) || false === stripos($response, 'http-equiv="refresh"')) {
        return false;
    }

    // Stop at the closing quote ([^"\s]) so that later attributes on the same
    // line are not swallowed; accept "url=" in any case and "0; URL=".
    if (preg_match('/content="0;\s*URL=(http[^"\s]+)"/i', $response, $matches)) {
        return $matches[1];
    }

    return false;
}
137 |
138 | // show running time on finish
139 | timer();
140 |
--------------------------------------------------------------------------------
/cli/tagging/generate_brothers_assign_csv.php:
--------------------------------------------------------------------------------
1 | getRow();
29 | if (!$row) {
30 | break;
31 | }
32 | if (1 == sizeof($row)) {
33 | continue;
34 | }
35 | $original = get_dataset_basename(array_shift($row));
36 | $brothers[$original] = $row;
37 | }
38 | }
39 |
40 | //var_dump($brothers);
41 | //die();
42 |
43 | //$CkanManager = new CkanManager(CKAN_API_URL, CKAN_API_KEY);
44 | //$CkanManager = new CkanManager(CKAN_STAGING_API_URL, CKAN_STAGING_API_KEY);
45 | //$CkanManager = new CkanManager(CKAN_DEV_API_URL, CKAN_DEV_API_KEY);
46 | $CkanManager = new CkanManager(CKAN_UAT_API_URL, CKAN_UAT_API_KEY);
47 | //$CkanManager = new CkanManager(CKAN_QA_API_URL, CKAN_QA_API_KEY);
48 |
49 | /**
50 | * Sample csv
51 | * dataset,group,categories
52 | * https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment"
53 | * download-crossing-inventory-data-highway-rail-crossing,Agriculture, "Natural Resources and Environment;Plants and Plant Systems Agriculture"
54 | */
55 |
56 | $CkanManager->resultsDir = $results_dir;
57 | foreach (glob(CKANMNGR_DATA_DIR . '/assign*.csv') as $csv_file) {
58 | $csv_source = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
59 | echo $CkanManager->color->green($csv_source);
60 |
61 | $basename = str_replace('.csv', '', basename($csv_file));
62 |
63 | // fix wrong END-OF-LINE
64 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
65 |
66 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX);
67 |
68 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
69 | $output = new EasyCSV\Writer($results_dir.'/'.$basename.'_clones.csv');
70 | while (true) {
71 | $row = $csv->getRow();
72 | if (!$row) {
73 | break;
74 | }
75 |
76 | // skip headers
77 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
78 | continue;
79 | }
80 |
81 | if ($start > 0) {
82 | $start--;
83 | continue;
84 | }
85 |
86 | // format group tags
87 | $categories = isset($row['2'])?trim($row['2']):'';
88 | // if (isset($row['2']) && $row['2']) {
89 | // $categories = explode(';', trim($row['2']));
90 | // $categories = array_map('trim', $categories);
91 | // }
92 |
93 | // no anchors please
94 | $dataset = get_dataset_basename($row['0']);
95 |
96 | if (!$dataset) {
97 | continue;
98 | }
99 |
100 | // echo "\tOriginal: ".$dataset . PHP_EOL;
101 | // $CkanManager->assignGroupsAndCategoriesToDatasets(
102 | // [$dataset],
103 | // trim($row['1']),
104 | // $categories,
105 | // $basename
106 | // );
107 | $output->writeRow([$dataset,trim($row['1']),$categories]);
108 | echo join(' , ',[$dataset,trim($row['1']),$categories]).PHP_EOL;
109 |
110 |
111 | if (isset($brothers[$dataset])) {
112 | foreach ($brothers[$dataset] as $brother) {
113 | if (!strlen(trim($brother))) {
114 | continue;
115 | }
116 | $brother = get_dataset_basename($brother);
117 | if (!$brother) {
118 | continue;
119 | }
120 | $output->writeRow([$brother,trim($row['1']),$categories]);
121 | echo join(' , ',[$brother,trim($row['1']),$categories]).PHP_EOL;
122 | // echo "\tUat (s):" . PHP_EOL;
123 | // $CkanManager->assignGroupsAndCategoriesToDatasets(
124 | // [$brother],
125 | // trim($row['1']),
126 | // $categories,
127 | // $basename
128 | // );
129 | }
130 | }
131 | }
132 | }
133 |
134 | // show running time on finish
135 | timer();
136 |
--------------------------------------------------------------------------------
/cli/epa-gov_process/compare_qa_vs_prod_epa.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'guid',
35 | 'topics',
36 | 'categories',
37 | ]);
38 |
39 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
40 | $ProdCkanManager->resultsDir = $results_dir;
41 |
42 | $prod_epa = $ProdCkanManager->exportBrief('organization:epa-gov');
43 | $prod->writeFromArray($prod_epa);
44 | } else {
45 | $prod = new Reader($results_dir . '/prod.csv');
46 | $prod_epa = $prod->getAll();
47 | }
48 |
49 | echo 'qa.csv' . PHP_EOL;
50 | if (!is_file($results_dir . '/qa.csv')) {
51 | $qa = new Writer($results_dir . '/qa.csv');
52 |
53 | $qa->writeRow([
54 | 'title',
55 | 'title_simple',
56 | 'name',
57 | 'url',
58 | 'guid',
59 | 'topics',
60 | 'categories',
61 | ]);
62 |
63 | $QaCkanManager = new CkanManager(CKAN_QA_API_URL);
64 | $QaCkanManager->resultsDir = $results_dir;
65 |
66 | $qa_epa = $QaCkanManager->exportBrief('organization:epa-gov', '', 'http://qa-catalog-fe-data.reisys.com/dataset/');
67 | $qa->writeFromArray($qa_epa);
68 |
69 | } else {
70 | $qa = new Reader($results_dir . '/qa.csv');
71 | $qa_epa = $qa->getAll();
72 | }
73 |
74 | $qa_epa_by_title = $qa_epa_by_guid = [];
75 |
76 | foreach ($qa_epa as $name => $dataset) {
77 | $title = $dataset['title_simple'];
78 |
79 | $qa_epa_by_title[$title] = isset($qa_epa_by_title[$title]) ? $qa_epa_by_title[$title] : [];
80 | $qa_epa_by_title[$title][] = $dataset;
81 |
82 | $guid = trim($dataset['guid']);
83 | if ($guid) {
84 | $qa_epa_by_guid[$guid] = isset($qa_epa_by_guid[$guid]) ? $qa_epa_by_guid[$guid] : [];
85 | $qa_epa_by_guid[$guid][] = $dataset;
86 | }
87 | }
88 |
89 | echo 'prod_vs_qa.csv' . PHP_EOL;
90 | is_file($results_dir . '/prod_vs_qa_epa.csv') && unlink($results_dir . '/prod_vs_qa_epa.csv');
91 | $csv = new Writer($results_dir . '/prod_vs_qa_epa.csv');
92 | $csv->writeRow([
93 | 'Prod Title',
94 | 'Prod URL',
95 | 'Prod GUID',
96 | 'Prod Topics',
97 | 'Prod Categories',
98 | 'Matched',
99 | 'QA Title',
100 | 'QA URL',
101 | 'QA GUID',
102 | 'URL Match',
103 | 'GUID Match',
104 | ]);
105 |
106 | foreach ($prod_epa as $name => $prod_dataset) {
107 | if (isset($qa_epa_by_guid[$prod_dataset['guid']])) {
108 | foreach ($qa_epa_by_guid[$prod_dataset['guid']] as $qa_dataset) {
109 | $csv->writeRow([
110 | $prod_dataset['title'],
111 | $prod_dataset['url'],
112 | $prod_dataset['guid'],
113 | $prod_dataset['topics'],
114 | $prod_dataset['categories'],
115 | true,
116 | $qa_dataset['title'],
117 | $qa_dataset['url'],
118 | $qa_dataset['guid'],
119 | (bool)($prod_dataset['name'] == $qa_dataset['name']),
120 | true,
121 | ]);
122 | }
123 | continue;
124 | }
125 |
126 | if (isset($qa_epa_by_title[$prod_dataset['title_simple']])) {
127 | foreach ($qa_epa_by_title[$prod_dataset['title_simple']] as $qa_dataset) {
128 | $csv->writeRow([
129 | $prod_dataset['title'],
130 | $prod_dataset['url'],
131 | $prod_dataset['guid'],
132 | $prod_dataset['topics'],
133 | $prod_dataset['categories'],
134 | true,
135 | $qa_dataset['title'],
136 | $qa_dataset['url'],
137 | $qa_dataset['guid'],
138 | true,
139 | (bool)($prod_dataset['guid'] == $qa_dataset['guid']),
140 | ]);
141 | }
142 | continue;
143 | }
144 |
145 | $csv->writeRow([
146 | $prod_dataset['title'],
147 | $prod_dataset['url'],
148 | $prod_dataset['guid'],
149 | $prod_dataset['topics'],
150 | $prod_dataset['categories'],
151 | false,
152 | '',
153 | '',
154 | '',
155 | false,
156 | false,
157 | ]);
158 | }
159 |
160 | // show running time on finish
161 | timer();
162 |
--------------------------------------------------------------------------------
/cli/epa-gov_process/__compare_json_vs_prod_epa.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'guid',
35 | 'topics',
36 | 'categories',
37 | ]);
38 |
39 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
40 | $ProdCkanManager->resultsDir = $results_dir;
41 |
42 | $json_backup_epa = $ProdCkanManager->exportBrief('organization:epa-gov AND metadata_type:geospatial');
43 | $json->writeFromArray($json_backup_epa);
44 | } else {
45 | $json = new Reader($results_dir . '/json.csv');
46 | $json_backup_epa = $json->getAll();
47 | }
48 |
49 | echo 'prod.csv' . PHP_EOL;
50 | if (!is_file($results_dir . '/prod.csv')) {
51 | $prod = new Writer($results_dir . '/prod.csv');
52 |
53 | $prod->writeRow([
54 | 'title',
55 | 'title_simple',
56 | 'name',
57 | 'url',
58 | 'guid',
59 | 'topics',
60 | 'categories',
61 | ]);
62 |
63 | $QaCkanManager = new CkanManager(CKAN_UAT_API_URL);
64 | $QaCkanManager->resultsDir = $results_dir;
65 |
66 | $prod_epa = $QaCkanManager->exportBrief('organization:epa-gov AND metadata_type:geospatial');
67 | $prod->writeFromArray($prod_epa);
68 |
69 | } else {
70 | $prod = new Reader($results_dir . '/prod.csv');
71 | $prod_epa = $prod->getAll();
72 | }
73 |
74 | $prod_epa_by_title = $prod_epa_by_guid = [];
75 |
76 | foreach ($prod_epa as $name => $dataset) {
77 | $title = $dataset['title_simple'];
78 |
79 | $prod_epa_by_title[$title] = isset($prod_epa_by_title[$title]) ? $prod_epa_by_title[$title] : [];
80 | $prod_epa_by_title[$title][] = $dataset;
81 |
82 | $guid = trim($dataset['guid']);
83 | if ($guid) {
84 | $prod_epa_by_guid[$guid] = isset($prod_epa_by_guid[$guid]) ? $prod_epa_by_guid[$guid] : [];
85 | $prod_epa_by_guid[$guid][] = $dataset;
86 | }
87 | }
88 |
89 | echo 'json_vs_prod.csv' . PHP_EOL;
90 | is_file($results_dir . '/json_vs_prod_epa.csv') && unlink($results_dir . '/json_vs_prod_epa.csv');
91 | $csv = new Writer($results_dir . '/json_vs_prod_epa.csv');
92 | $csv->writeRow([
93 | 'Backup Title',
94 | 'Backup URL',
95 | 'Backup GUID',
96 | 'Backup Topics',
97 | 'Backup Categories',
98 | 'Matched',
99 | 'Prod Title',
100 | 'Prod URL',
101 | 'Prod GUID',
102 | 'URL Match',
103 | 'GUID Match',
104 | ]);
105 |
106 | foreach ($json_backup_epa as $name => $backup_dataset) {
107 | if (isset($prod_epa_by_guid[$backup_dataset['guid']])) {
108 | foreach ($prod_epa_by_guid[$backup_dataset['guid']] as $prod_dataset) {
109 | $csv->writeRow([
110 | $backup_dataset['title'],
111 | $backup_dataset['url'],
112 | $backup_dataset['guid'],
113 | $backup_dataset['topics'],
114 | $backup_dataset['categories'],
115 | true,
116 | $prod_dataset['title'],
117 | $prod_dataset['url'],
118 | $prod_dataset['guid'],
119 | (bool)($backup_dataset['name'] == $prod_dataset['name']),
120 | true,
121 | ]);
122 | }
123 | continue;
124 | }
125 |
126 | if (isset($prod_epa_by_title[$backup_dataset['title_simple']])) {
127 | foreach ($prod_epa_by_title[$backup_dataset['title_simple']] as $prod_dataset) {
128 | $csv->writeRow([
129 | $backup_dataset['title'],
130 | $backup_dataset['url'],
131 | $backup_dataset['guid'],
132 | $backup_dataset['topics'],
133 | $backup_dataset['categories'],
134 | true,
135 | $prod_dataset['title'],
136 | $prod_dataset['url'],
137 | $prod_dataset['guid'],
138 | true,
139 | (bool)($backup_dataset['guid'] == $prod_dataset['guid']),
140 | ]);
141 | }
142 | continue;
143 | }
144 |
145 | $csv->writeRow([
146 | $backup_dataset['title'],
147 | $backup_dataset['url'],
148 | $backup_dataset['guid'],
149 | $backup_dataset['topics'],
150 | $backup_dataset['categories'],
151 | false,
152 | '',
153 | '',
154 | '',
155 | false,
156 | false,
157 | ]);
158 | }
159 |
160 | // show running time on finish
161 | timer();
162 |
--------------------------------------------------------------------------------
/src/CKAN/Manager/ExploreApi.php:
--------------------------------------------------------------------------------
1 | api_url = $api_url;
35 |
36 | // Create cURL object.
37 | $this->curl_handler = curl_init();
38 | // Follow any Location: headers that the server sends.
39 | curl_setopt($this->curl_handler, CURLOPT_FOLLOWLOCATION, true);
40 | // However, don't follow more than five Location: headers.
41 | curl_setopt($this->curl_handler, CURLOPT_MAXREDIRS, 5);
42 | // Automatically set the Referrer: field in requests
43 | // following a Location: redirect.
44 | curl_setopt($this->curl_handler, CURLOPT_AUTOREFERER, true);
45 | // Return the transfer as a string instead of dumping to screen.
46 | curl_setopt($this->curl_handler, CURLOPT_RETURNTRANSFER, true);
47 | // If it takes more than 5 minutes => fail
48 | curl_setopt($this->curl_handler, CURLOPT_TIMEOUT, 60 * 5);
49 | // We don't want the header (use curl_getinfo())
50 | curl_setopt($this->curl_handler, CURLOPT_HEADER, false);
51 | // Track the handle's request string
52 | curl_setopt($this->curl_handler, CURLINFO_HEADER_OUT, true);
53 | // Attempt to retrieve the modification date of the remote document.
54 | curl_setopt($this->curl_handler, CURLOPT_FILETIME, true);
55 | // Initialize cURL headers
56 | $this->set_headers();
57 | }
58 |
/**
 * Builds the custom HTTP headers that are attached to each request and
 * stores them in $this->ch_headers.
 *
 * @access private
 * @return void
 * @since Version 0.1.0
 */
private function set_headers()
{
    // Spell out 'now': passing null as the time string is deprecated since PHP 8.1.
    $date = new \DateTime('now', new \DateTimeZone('UTC'));
    // NOTE(review): gzip is requested below, but no CURLOPT_ENCODING call is
    // visible in this class - confirm compressed responses get decoded.
    $this->ch_headers = [
        'Date: ' . $date->format('D, d M Y H:i:s') . ' GMT', // RFC 1123
        'Accept: application/json',
        'Accept-Charset: utf-8',
        'Accept-Encoding: gzip',
    ];
}
75 |
/**
 * Fetch the view identified by $json_id.
 *
 * Issues a GET request for "views/<json_id>.json" through make_request().
 *
 * @param $json_id
 *
 * @return mixed whatever make_request() returns for that endpoint
 */
public function get_json($json_id)
{
    $endpoint = 'views/' . $json_id . '.json';

    return $this->make_request('GET', $endpoint);
}
88 |
/**
 * Send an HTTP request to the API and hand back the raw curl_exec() result.
 *
 * A POST without a body is downgraded to GET.
 *
 * @param string $method // HTTP method (GET, POST)
 * @param string $uri // URI fragment, appended to the API URL
 * @param string $data // Optional. String in JSON-format that will be in request body
 *
 * @return mixed // curl_exec() result. For HTTP code 0 the transfer info is
 *               // dumped and the (failed) result is returned as is.
 * @throws \Exception for an unsupported method, or any HTTP code other than 200 and 0
 */
private function make_request($method, $uri, $data = null)
{
    $method = strtoupper($method);
    if (!in_array($method, ['GET', 'POST'], true)) {
        throw new \Exception('Method ' . $method . ' is not supported');
    }

    // Set cURL URI.
    $url = $this->api_url . $uri;
    curl_setopt($this->curl_handler, CURLOPT_URL, $url);

    if ('POST' === $method) {
        if ($data) {
            curl_setopt($this->curl_handler, CURLOPT_POSTFIELDS, urlencode($data));
        } else {
            // nothing to post
            $method = 'GET';
        }
    }

    // Set cURL method and headers.
    curl_setopt($this->curl_handler, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($this->curl_handler, CURLOPT_HTTPHEADER, $this->ch_headers);

    // Execute request and read the transfer details.
    $response = curl_exec($this->curl_handler);
    $info = curl_getinfo($this->curl_handler);
    $http_code = $info['http_code'];

    if (200 === $http_code) {
        return $response;
    }

    if (0 === $http_code) {
        // No HTTP status was received: dump the transfer info for diagnosis
        // and let the caller deal with the curl_exec() result.
        var_dump($info);

        return $response;
    }

    // Name the failing request. $data alone is null for body-less requests,
    // which used to leave the exception without any message.
    $context = $method . ' ' . $url;
    if ($data) {
        $context .= PHP_EOL . $data;
    }

    throw new \Exception($http_code . ': ' . PHP_EOL . $context . PHP_EOL);
}
139 |
/**
 * Last chance to release the cURL handle: close and drop it if it is still set.
 */
public function __destruct()
{
    if (!$this->curl_handler) {
        return;
    }

    curl_close($this->curl_handler);
    unset($this->curl_handler);
}
150 | }
151 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ckan-php-manager
2 | ================
3 |
4 | [Build Status](https://travis-ci.org/GSA/ckan-php-manager)
5 | [Codacy](https://www.codacy.com/app/alexandr-perfilov/ckan-php-manager)
6 | [Gitter chat](https://gitter.im/GSA/ckan-php-manager?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
7 |
8 | A bunch of scripts to perform tasks using CKAN API and https://github.com/GSA/ckan-php-client
9 |
10 | ## Requirements
11 |
12 | * PHP 7.0+
13 |
14 | ## Installation
15 |
16 | ### Clone repository
17 | $ git clone https://github.com/GSA/ckan-php-manager.git
18 |
19 | ### Composer
20 | Use [composer](#composer) to install/update dependencies
21 |
22 | If you don't have Composer, [install](https://getcomposer.org/download/) it:
23 |
24 | $ curl -sS https://getcomposer.org/installer | php
25 | $ mv composer.phar /usr/local/bin/composer
26 |
27 | #### Install dependencies:
28 |
29 | $ composer install
30 |
31 | ### Configuration
32 | Copy config.sample.php to config.php. Update it with your custom values, if needed.
33 |
34 | $ cp inc/config.sample.php inc/config.php
35 |
36 | ## Usage
37 |
38 | ### Export all packages by Agency name, including all Sub Agencies
39 |
40 | * Update `cli/export_packages_by_org.php`, editing the title of exported organization ORGANIZATION_TO_EXPORT
41 | * Run the export script using php
42 |
43 | ```
44 | $ php cli/export_packages_by_org.php
45 | ```
46 |
47 | The script takes all terms, including sub-agencies, from http://www.data.gov/app/themes/roots-nextdatagov/assets/Json/fed_agency.json and makes CKAN requests,
48 | looking for packages that belong to the organizations in this list.
49 |
50 | Results can be found in the /results/{timestamp} dir after the script has finished, including `_{term}.log` with package counts for each agency.
51 |
52 | ### DMS legacy tag
53 |
54 | To add tag `add_legacy_dms_and_make_private` to all datasets of some group:
55 |
56 | * Update ORGANIZATION_TO_TAG in the `cli/add_legacy_dms_and_make_private.php`
57 | * Double check CKAN_URL and CKAN_API_KEY for editing datasets
58 | * Run script
59 |
60 | ```
61 | $ php cli/add_legacy_dms_and_make_private.php
62 | ```
63 |
64 | ### Assign groups and category tags to datasets
65 |
66 | * Put csv files into the /data dir, named `assign_<name>.csv` (the `assign_` prefix is required)
67 | The format of these files must be:
68 | `dataset, group, categories`
69 |
70 | First line is caption, leave the first line in each file:
71 | `dataset,group,categories`
72 |
73 | Then put one dataset per line.
74 |
75 | 1. Dataset can be:
76 | * Dataset url, ex. https://catalog.data.gov/dataset/food-access-research-atlas
77 | * Dataset name, ex. download-crossing-inventory-data-highway-rail-crossing
78 | * Dataset id
79 |
80 | 2. Group
81 | just one group per line. If you need to add multiple groups, you must create another row in csv with same dataset and another group,
82 | because all the categories are tagged by current row group. Make sure your group exists in your CKAN instance (to list all
83 | existing groups, go to http://catalog.data.gov/api/3/action/group_list?all_fields=true , replacing `catalog.data.gov` with your
84 | CKAN domain)
85 |
86 | 3. Categories
87 | one or multiple categories for the current row's group, separated by semicolons `;`
88 |
89 | Example csv file:
90 |
91 | ```
92 | dataset, group, categories
93 | https://catalog.data.gov/dataset/food-access-research-atlas,Agriculture,"Natural Resources and Environment"
94 | aerial-image-of-alaskas-arctic-coastal-plain-1955,Climate,"Arctic; Arctic Ocean, Sea Ice and Coasts; Permafrost and Arctic Landscapes"
95 | 28d30c1f-75a5-4042-b0fc-de26cc7d70f2,Climate,Arctic; Arctic Development and Transport
96 | ```
97 | * Double check CKAN_URL and CKAN_API_KEY for editing datasets, defined in `inc/config.php`
98 | * Run script
99 |
100 | ```
101 | $ php cli/tagging/assign_groups_and_tags.php
102 | ```
103 | * Detailed logs and results are stored in folder `results/[time-stamp]_ASSIGN_GROUPS`
104 |
105 | ### Remove groups and category tags from datasets (revert previous script changes)
106 |
107 | * Prepare the same kind of csv files as for the previous script and put them into the /data dir, named `remove_<name>.csv`
108 |
109 | ```
110 | $ php cli/tagging/remove_groups_and_tags.php
111 | ```
112 | * This command will remove listed categories from the dataset of the row. If an empty list of categories is provided, this command will remove the group and all categories from the dataset.
113 |
114 | ## CKAN API DOCs
115 |
116 | http://docs.ckan.org/en/latest/api/index.html
117 |
118 |
119 | ## Docker setup
120 |
121 | To minimize requirements on a system, we've added a minimal setup with
122 | docker-compose. This should replace the above usage instructions as the default
123 | workflow.
124 |
125 | $ docker-compose build
126 | $ docker-compose run --rm app php cli/harvest_stats_csv.php
127 |
128 | Run the tests.
129 |
130 | $ docker-compose run --rm app phpunit
131 |
--------------------------------------------------------------------------------
/cli/compare_basic.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'identifier',
35 | 'guid',
36 | 'topics',
37 | 'categories',
38 | ]);
39 |
40 | $CkanManager = new CkanManager(CKAN_API_URL);
41 | $CkanManager->resultsDir = $results_dir;
42 |
43 | $cmp1 = $CkanManager->exportBrief('organization:((eop-gov) OR (omb-eop-gov) OR (ondcp-eop-gov) OR (ceq-eop-gov) ' .
44 | 'OR (ostp-eop-gov) OR (ustr-eop-gov) OR (wh-eop-gov)) DMS AND dataset_type:dataset');
45 | $cmp1_csv->writeFromArray($cmp1);
46 | } else {
47 | $cmp1_csv = new Reader($results_dir . '/cmp1.csv');
48 | $cmp1_csv->getHeaders();
49 | $cmp1 = $cmp1_csv->getAll();
50 | }
51 |
52 | echo 'cmp2.csv' . PHP_EOL;
53 | if (!is_file($results_dir . '/cmp2.csv')) {
54 | $cmp2_csv = new Writer($results_dir . '/cmp2.csv');
55 |
56 | $cmp2_csv->writeRow([
57 | 'title',
58 | 'title_simple',
59 | 'name',
60 | 'url',
61 | 'identifier',
62 | 'guid',
63 | 'topics',
64 | 'categories',
65 | ]);
66 |
67 | $CkanManager = new CkanManager(CKAN_API_URL);
68 | $CkanManager->resultsDir = $results_dir;
69 |
70 | $cmp2 = $CkanManager->exportBrief('organization:((eop-gov) OR (omb-eop-gov) OR (ondcp-eop-gov) OR (ceq-eop-gov) ' .
71 | 'OR (ostp-eop-gov) OR (ustr-eop-gov) OR (wh-eop-gov)) -DMS AND dataset_type:dataset');
72 | $cmp2_csv->writeFromArray($cmp2);
73 |
74 | } else {
75 | $cmp2_csv = new Reader($results_dir . '/cmp2.csv');
76 | $cmp2 = $cmp2_csv->getAll();
77 | }
78 |
79 | $cmp2_by_title = $cmp2_by_guid = [];
80 |
81 | foreach ($cmp2 as $name => $dataset) {
82 | $title = $dataset['title_simple'];
83 |
84 | $cmp2_by_title[$title] = isset($cmp2_by_title[$title]) ? $cmp2_by_title[$title] : [];
85 | $cmp2_by_title[$title][] = $dataset;
86 |
87 | $guid = trim($dataset['guid']);
88 | if ($guid) {
89 | $cmp2_by_guid[$guid] = isset($cmp2_by_guid[$guid]) ? $cmp2_by_guid[$guid] : [];
90 | $cmp2_by_guid[$guid][] = $dataset;
91 | }
92 | }
93 |
94 | echo 'comparison.csv' . PHP_EOL;
95 | is_file($results_dir . '/comparison.csv') && unlink($results_dir . '/comparison.csv');
96 | $csv = new Writer($results_dir . '/comparison.csv');
97 | $cmp1_header = "DMS";
98 | $cmp2_header = "NON-DMS";
99 | $csv->writeRow([
100 | $cmp1_header . ' Title',
101 | $cmp1_header . ' URL',
102 | $cmp1_header . ' GUID',
103 | $cmp1_header . ' Topics',
104 | $cmp1_header . ' Categories',
105 | 'Matched',
106 | $cmp2_header . ' Title',
107 | $cmp2_header . ' URL',
108 | $cmp2_header . ' GUID',
109 | 'URL Match',
110 | 'GUID Match',
111 | ]);
112 |
113 | foreach ($cmp1 as $name => $cmp1_dataset) {
114 | if (isset($cmp2_by_guid[$cmp1_dataset['guid']])) {
115 | foreach ($cmp2_by_guid[$cmp1_dataset['guid']] as $cmp2_dataset) {
116 | $csv->writeRow([
117 | $cmp1_dataset['title'],
118 | $cmp1_dataset['url'],
119 | $cmp1_dataset['guid'],
120 | $cmp1_dataset['topics'],
121 | $cmp1_dataset['categories'],
122 | true,
123 | $cmp2_dataset['title'],
124 | $cmp2_dataset['url'],
125 | $cmp2_dataset['guid'],
126 | (bool)($cmp1_dataset['name'] && $cmp1_dataset['name'] == $cmp2_dataset['name']),
127 | true,
128 | ]);
129 | }
130 | continue;
131 | }
132 |
133 | if (isset($cmp2_by_title[$cmp1_dataset['title_simple']])) {
134 | foreach ($cmp2_by_title[$cmp1_dataset['title_simple']] as $cmp2_dataset) {
135 | $csv->writeRow([
136 | $cmp1_dataset['title'],
137 | $cmp1_dataset['url'],
138 | $cmp1_dataset['guid'],
139 | $cmp1_dataset['topics'],
140 | $cmp1_dataset['categories'],
141 | true,
142 | $cmp2_dataset['title'],
143 | $cmp2_dataset['url'],
144 | $cmp2_dataset['guid'],
145 | true,
146 | (bool)($cmp1_dataset['guid'] && $cmp1_dataset['guid'] == $cmp2_dataset['guid']),
147 | ]);
148 | }
149 | continue;
150 | }
151 |
152 | $csv->writeRow([
153 | $cmp1_dataset['title'],
154 | $cmp1_dataset['url'],
155 | $cmp1_dataset['guid'],
156 | $cmp1_dataset['topics'],
157 | $cmp1_dataset['categories'],
158 | false,
159 | '',
160 | '',
161 | '',
162 | false,
163 | false,
164 | ]);
165 | }
166 |
167 | // show running time on finish
168 | timer();
169 |
--------------------------------------------------------------------------------
/cli/noaa-gov/compare_qa_vs_prod_noaa.php:
--------------------------------------------------------------------------------
1 | writeRow([
29 | 'title',
30 | 'title_simple',
31 | 'name',
32 | 'url',
33 | 'identifier',
34 | 'guid',
35 | 'topics',
36 | 'categories',
37 | ]);
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_noaa = $ProdCkanManager->exportBrief('organization:noaa-gov AND metadata_type:geospatial AND dataset_type:dataset');
42 | $prod->writeFromArray($prod_noaa);
43 | file_put_contents($results_dir . '/prod.json', json_encode($prod_noaa, JSON_PRETTY_PRINT));
44 | } else {
45 | $prod_noaa = json_decode(file_get_contents($results_dir . '/prod.json'), true); // decode to arrays (not stdClass): datasets are read as $prod_dataset['key'] below
46 | }
47 |
48 | echo 'qa.json' . PHP_EOL;
49 | if (!is_file($results_dir . '/qa.json')) {
50 |     $qa = new Writer($results_dir . '/qa.csv');
51 |     // No cached qa.json yet: export the datasets from QA and store them as both CSV and JSON.
52 |     $qa->writeRow([
53 |         'title',
54 |         'title_simple',
55 |         'name',
56 |         'url',
57 |         'identifier',
58 |         'guid',
59 |         'topics',
60 |         'categories',
61 |     ]);
62 |     $QaCkanManager = new CkanManager(CKAN_QA_API_URL);
63 |     $QaCkanManager->resultsDir = $results_dir;
64 |
65 |     $qa_noaa = $QaCkanManager->exportBrief('organization:noaa-gov', '',
66 |         'http://qa-catalog-fe-data.reisys.com/dataset/');
67 |     $qa->writeFromArray($qa_noaa);
68 |     file_put_contents($results_dir . '/qa.json', json_encode($qa_noaa, JSON_PRETTY_PRINT));
69 | } else {
70 |     $qa_noaa = json_decode(file_get_contents($results_dir . '/qa.json'), true); // cached run: decode to arrays (not stdClass), rows are read as $dataset['key'] below
71 | }
72 |
73 | // Index the QA datasets twice: by simplified title and by trimmed, non-empty GUID.
74 | $qa_noaa_by_title = [];
75 | $qa_noaa_by_guid = [];
76 | foreach ($qa_noaa as $name => $qa_item) {
77 |     // Appending with [] creates the bucket on first use, so no explicit initialisation is needed.
78 |     $qa_noaa_by_title[$qa_item['title_simple']][] = $qa_item;
79 |
80 |     $guid_key = trim($qa_item['guid']);
81 |     if (!$guid_key) {
82 |         // Datasets without a GUID are only reachable through the title index.
83 |         continue;
84 |     }
85 |     $qa_noaa_by_guid[$guid_key][] = $qa_item;
86 | }
87 |
88 | echo 'prod_vs_qa.csv' . PHP_EOL;
89 | is_file($results_dir . '/prod_vs_qa_noaa_geospatial.csv') && unlink($results_dir . '/prod_vs_qa_noaa_geospatial.csv');
90 | $csv = new Writer($results_dir . '/prod_vs_qa_noaa_geospatial.csv');
91 | $csv->writeRow([
92 |     'Prod Title',
93 |     'Prod URL',
94 |     'Prod GUID',
95 |     'Prod Topics',
96 |     'Prod Categories',
97 |     'Matched',
98 |     'QA Title',
99 |     'QA URL',
100 |     'QA GUID',
101 |     'URL Match',
102 |     'Title Match',
103 |     'GUID Match',
104 | ]);
105 | // One row per (prod, QA) pair: match by GUID first, then by simplified title. Every row has 12 columns.
106 | foreach ($prod_noaa as $name => $prod_dataset) {
107 |     if (isset($qa_noaa_by_guid[$prod_dataset['guid']])) {
108 |         foreach ($qa_noaa_by_guid[$prod_dataset['guid']] as $qa_dataset) {
109 |             $csv->writeRow([
110 |                 $prod_dataset['title'],
111 |                 $prod_dataset['url'],
112 |                 $prod_dataset['guid'],
113 |                 $prod_dataset['topics'],
114 |                 $prod_dataset['categories'],
115 |                 true,
116 |                 $qa_dataset['title'],
117 |                 $qa_dataset['url'],
118 |                 $qa_dataset['guid'],
119 |                 (bool)($prod_dataset['name'] && $prod_dataset['name'] == $qa_dataset['name']),
120 |                 (bool)($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $qa_dataset['title_simple']),
121 |                 true,
122 |             ]);
123 |         }
124 |         continue;
125 |     }
126 |     // No GUID hit: fall back to datasets sharing the same simplified title.
127 |     if (isset($qa_noaa_by_title[$prod_dataset['title_simple']])) {
128 |         foreach ($qa_noaa_by_title[$prod_dataset['title_simple']] as $qa_dataset) {
129 |             $csv->writeRow([
130 |                 $prod_dataset['title'],
131 |                 $prod_dataset['url'],
132 |                 $prod_dataset['guid'],
133 |                 $prod_dataset['topics'],
134 |                 $prod_dataset['categories'],
135 |                 true,
136 |                 $qa_dataset['title'],
137 |                 $qa_dataset['url'],
138 |                 $qa_dataset['guid'],
139 |                 (bool)($prod_dataset['name'] && $prod_dataset['name'] == $qa_dataset['name']),
140 |                 true,
141 |                 (bool)($prod_dataset['guid'] && $prod_dataset['guid'] == $qa_dataset['guid']), // two empty GUIDs are not a match
142 |             ]);
143 |         }
144 |         continue;
145 |     }
146 |     $csv->writeRow([ // no QA counterpart found by GUID or by title
147 |         $prod_dataset['title'],
148 |         $prod_dataset['url'],
149 |         $prod_dataset['guid'],
150 |         $prod_dataset['topics'],
151 |         $prod_dataset['categories'],
152 |         false, // Matched
153 |         '',
154 |         '',
155 |         '',
156 |         false, // URL Match
157 |         false, // Title Match (this column was missing, leaving the row one cell short of the header)
158 |         false, // GUID Match
159 |     ]);
160 | }
161 |
162 | // show running time on finish
163 | timer();
164 |
--------------------------------------------------------------------------------
/cli/tagging/assign_groups_and_tags.php:
--------------------------------------------------------------------------------
1 | resultsDir = $results_dir;
32 | foreach (glob(CKANMNGR_DATA_DIR . '/assign_*.csv') as $csv_file) { // process every assign_*.csv file in the data directory
33 | $csv_source = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
34 | echo $CkanManager->color->green($csv_source);
35 | // File name without ".csv": passed on to the assign call and used as the log file prefix.
36 | $basename = str_replace('.csv', '', basename($csv_file));
37 |
38 | // Normalise line endings in place: every run of CR/LF characters becomes one "\n" (empty lines disappear too).
39 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
40 |
41 | // file_put_contents($resultsDir . '/' . $basename . '_tags.log', $status, FILE_APPEND | LOCK_EX);
42 | // Rows are addressed by numeric column index below; NOTE(review): third argument false presumably means "no header row" - confirm.
43 | $csv = new EasyCSV\Reader($csv_file, 'r+', false);
44 | while (true) {
45 | $row = $csv->getRow();
46 | if (!$row) {
47 | break;
48 | }
49 |
50 | // Skip header rows: first column (trimmed, lower-cased) equals one of the known header labels.
51 | if (in_array(trim(strtolower($row['0'])), ['link', 'dataset', 'url', 'data.gov url'])) {
52 | continue;
53 | }
54 | // Skip the first $start data rows ($start is defined outside the visible part of this script).
55 | if ($start > 0) {
56 | $start--;
57 | continue;
58 | }
59 |
60 | // Column 2 (optional): semicolon-separated list of categories, each entry trimmed.
61 | $categories = [];
62 | if (isset($row['2']) && $row['2']) {
63 | $categories = explode(';', trim($row['2']));
64 | $categories = array_map('trim', $categories);
65 |
66 | }
67 |
68 | // Dataset name = last path segment of column 0, with any "#anchor" suffix removed.
69 | list($dataset,) = explode('#', basename(trim($row['0'])));
70 |
71 | if (!$dataset) {
72 | continue;
73 | }
74 |
75 | // Column 0 is a full URL but not a /dataset/ page: only search URLs ("dataset?q=") are handled, anything else is skipped.
76 | if (strpos($row['0'], '://')) {
77 | if (!strpos($row['0'], '/dataset/')) {
78 | if (strpos($row['0'], 'dataset?q=')) {
79 | parse_str(parse_url($row['0'], PHP_URL_QUERY), $query_array);
80 | if (isset($query_array['q'])) {
81 | $query = $query_array['q'];
82 | if (isset($query_array['organization'])) {
83 | $org = $query_array['organization'];
84 | $organizationList = new OrganizationList();
85 | $org = $organizationList->getTreeArrayFor($organizationList->getNameFor($org));
86 | if (!is_array($org) || !sizeof($org)) {
87 | continue; // organization could not be resolved: skip this row
88 | }
89 | $org = join(' OR ', array_keys($org));
90 | // var_dump($organizationList->getTreeArrayFor($organizationList->getNameFor($org)));
91 | // continue;
92 | $query = "$query AND organization:($org)";
93 |
94 | // The keys of the resolved organization array are OR-ed together as an extra search filter.
95 | // echo $query.PHP_EOL;
96 | }
97 | $packages = $CkanManager->tryPackageSearch($query, '', 200);
98 | $CkanManager->say(sizeof($packages) . " found searching: $query,API SEARCH");
99 | file_put_contents(
100 | $results_dir . '/' . $basename . '_tags.log.csv',
101 | sizeof($packages) . " found searching: $query,API SEARCH" . PHP_EOL,
102 | FILE_APPEND | LOCK_EX
103 | );
104 | // print $query_array['q'];
105 | if (!sizeof($packages)) {
106 | continue;
107 | }
108 | // Apply column 1 and the categories to every package returned by the search.
109 | foreach ($packages as $package) {
110 | $CkanManager->assignGroupsAndCategoriesToDatasets(
111 | [$package['name']],
112 | trim($row['1']),
113 | $basename,
114 | $categories
115 | );
116 | continue; // last statement of the loop body, so this has no effect
117 | }
118 | }
119 | continue; // search URL handled (or it had no "q" parameter): next row
120 | }
121 |
122 | // Any other non-dataset URL is ignored.
123 | continue;
124 | }
125 | }
126 | // Regular case: column 0 names a single dataset.
127 | $CkanManager->assignGroupsAndCategoriesToDatasets(
128 | [$dataset],
129 | trim($row['1']),
130 | $basename,
131 | $categories
132 | );
133 | }
134 | }
135 |
136 | // show running time on finish
137 | timer();
138 |
--------------------------------------------------------------------------------
/cli/check_aapi.php:
--------------------------------------------------------------------------------
1 | fail
32 | curl_setopt($curl_ch, CURLOPT_TIMEOUT, 60 * 5); // 60 * 5 = 300 seconds (5 minutes) per request
33 | // Do not include the response header in the output (use curl_getinfo() for metadata instead)
34 | curl_setopt($curl_ch, CURLOPT_HEADER, false);
35 | // Track the handle's request string
36 | curl_setopt($curl_ch, CURLINFO_HEADER_OUT, true);
37 | // Attempt to retrieve the modification date of the remote document.
38 | curl_setopt($curl_ch, CURLOPT_FILETIME, true);
39 | // Initialize cURL headers
40 |
41 | foreach (glob(CKANMNGR_DATA_DIR . '/check_*.csv') as $csv_file) { // process every check_*.csv file in the data directory
42 | $status = PHP_EOL . PHP_EOL . basename($csv_file) . PHP_EOL . PHP_EOL;
43 | echo $status;
44 | // File name without ".csv", used as the prefix of the result log.
45 | $basename = str_replace('.csv', '', basename($csv_file));
46 |
47 | // Normalise line endings in place: every run of CR/LF characters becomes one "\n" (empty lines disappear too).
48 | file_put_contents($csv_file, preg_replace('/[\\r\\n]+/', "\n", file_get_contents($csv_file)));
49 | // Input rows are read by numeric column index; results go to <basename>_log.csv in the results directory.
50 | $csv_source = new EasyCSV\Reader($csv_file, 'r+', false);
51 | $csv_destination = new EasyCSV\Writer($results_dir . '/' . $basename . '_log.csv');
52 |
53 | $csv_destination->writeRow(['dataset', 'status', 'aapi found']);
54 |
55 | $i = 0;
56 | while (true) {
57 | if (!($i++ % 100)) { // progress output once per 100 iterations (prints 1, 101, 201, ...)
58 | echo $i . PHP_EOL;
59 | }
60 | $row = $csv_source->getRow();
61 | if (!$row) {
62 | break;
63 | }
64 | // Skip the header row (first column is "data.gov url", compared trimmed and lower-cased)
65 | if (in_array(trim(strtolower($row[0])), ['data.gov url'])) {
66 | continue;
67 | }
68 |
69 | $url = strtolower($row[0]);
70 | // Only URLs containing "/dataset/" are checked; everything else is logged as 'not a dataset'.
71 | if (!strpos($url, '/dataset/')) {
72 | $csv_destination->writeRow([$url, 'not a dataset', '0']);
73 | continue;
74 | }
75 | // Ask the REST API for the dataset by rewriting "/dataset/" to "/api/rest/dataset/" in the URL.
76 | $dataset = try_get_dataset($curl_ch, str_replace('/dataset/', '/api/rest/dataset/', $url));
77 |
78 | if (200 !== $dataset['info']['http_code']) {
79 | // Redirect check: the API returned 404 but the page itself answers 200 with a meta-refresh tag
80 | $dataset2 = try_get_dataset($curl_ch, $url);
81 | if ((404 == $dataset['info']['http_code']) && (200 == $dataset2['info']['http_code'])) {
82 | $response = $dataset2['response'];
83 | if (stripos($response, 'http-equiv="refresh"')) {
84 | $pattern = '/content="0;URL=(http[\S\/\-\.]+)"/';
85 | preg_match($pattern, $response, $matches, PREG_OFFSET_CAPTURE, 3); // with PREG_OFFSET_CAPTURE each match is [text, offset]
86 | if ($matches && isset($matches[1]) && isset($matches[1][0])) {
87 | $url2 = $matches[1][0]; // captured redirect target
88 | // Retry the API lookup against the redirect target.
89 | $dataset3 = try_get_dataset($curl_ch, str_replace('/dataset/', '/api/rest/dataset/', $url2));
90 | if (200 == $dataset3['info']['http_code']) {
91 | $aapi_found = strpos($dataset3['response'], 'aapi0916'); // looks for the 'aapi0916' marker in the API response
92 | $csv_destination->writeRow([$url, 'ok (redirect)', ($aapi_found ? '1' : '0')]);
93 | continue;
94 | }
95 | }
96 | }
97 | }
98 | $csv_destination->writeRow([$url, $dataset['info']['http_code'], '0']); // no usable redirect: log the HTTP code of the first API call
99 | continue;
100 | } else {
101 | if (!strpos($dataset['response'], '"type": "dataset",')) { // API answered 200, but the response is not of type "dataset"
102 | $csv_destination->writeRow([$url, 'not a dataset', '0']);
103 | continue;
104 | }
105 | $aapi_found = strpos($dataset['response'], 'aapi0916');
106 | $csv_destination->writeRow([$url, 'ok', ($aapi_found ? '1' : '0')]);
107 | continue;
108 | }
109 | }
110 | }
111 |
112 | /**
113 |  * Compares two URLs after removing every "http:" / "https:" occurrence and trimming slashes and spaces from both ends.
114 |  * @param string $url1
115 |  * @param string $url2
116 |  * @return bool true when both URLs are identical after that normalisation
117 |  */
118 | function url_compare($url1, $url2)
119 | {
120 |     $normalized = array_map(function ($url) {
121 |         return trim(str_replace(['http:', 'https:'], '', $url), '/ ');
122 |     }, [$url1, $url2]);
123 |     return $normalized[0] === $normalized[1];
124 | }
125 |
126 | /**
127 |  * Sends a GET request for $url over an existing cURL handle.
128 |  *
129 |  * @param resource|\CurlHandle $curl_ch cURL handle; its URL and request-method options are overwritten
130 |  * @param string $url address to request
131 |  * @return array 'response' => curl_exec() result, 'info' => curl_getinfo() array for that transfer
132 |  */
133 | function try_get_dataset($curl_ch, $url)
134 | {
135 |     // Point the shared handle at the new URL and force the GET method.
136 |     curl_setopt_array($curl_ch, [
137 |         CURLOPT_URL           => $url,
138 |         CURLOPT_CUSTOMREQUEST => 'GET',
139 |     ]);
140 |
141 |     // curl_exec() has to run first: curl_getinfo() then describes the transfer that just happened.
142 |     $body = curl_exec($curl_ch);
143 |     $transfer_info = curl_getinfo($curl_ch);
144 |
145 |     return [
146 |         'response' => $body,
147 |         'info'     => $transfer_info,
148 |     ];
149 | }
150 |
151 | // show running time on finish
152 | timer();
153 |
--------------------------------------------------------------------------------
/cli/ntsb-gov_process/compare_uat_vs_prod_ntsb.php:
--------------------------------------------------------------------------------
1 | writeRow([
30 | 'title',
31 | 'title_simple',
32 | 'name',
33 | 'url',
34 | 'guid',
35 | 'topics',
36 | 'categories',
37 | ]);
38 |
39 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
40 | $ProdCkanManager->resultsDir = $results_dir;
41 |
42 | $prod_ntsb = $ProdCkanManager->exportBrief('organization:ntsb-gov AND dataset_type:dataset');
43 | $prod->writeFromArray($prod_ntsb);
44 | } else {
45 | $prod = new Reader($results_dir . '/prod.csv');
46 | $prod_ntsb = $prod->getAll();
47 | }
48 |
49 | echo 'uat.csv' . PHP_EOL;
50 | // Reuse uat.csv when an earlier run already exported it; otherwise export the datasets from UAT now.
51 | if (is_file($results_dir . '/uat.csv')) {
52 |     $uat = new Reader($results_dir . '/uat.csv');
53 |     $uat_ntsb = $uat->getAll();
54 | } else {
55 |     $uat = new Writer($results_dir . '/uat.csv');
56 |     $uat->writeRow([
57 |         'title',
58 |         'title_simple',
59 |         'name',
60 |         'url',
61 |         'guid',
62 |         'topics',
63 |         'categories',
64 |     ]);
65 |     $UatCkanManager = new CkanManager(CKAN_UAT_API_URL);
66 |     $UatCkanManager->resultsDir = $results_dir;
67 |     $uat_ntsb = $UatCkanManager->exportBrief(
68 |         'organization:ntsb-gov AND (harvest_source_title:NTSB*) AND dataset_type:dataset',
69 |         '',
70 |         'http://uat-catalog-fe-data.reisys.com/dataset/'
71 |     );
72 |     $uat->writeFromArray($uat_ntsb);
73 | }
74 |
75 | // Index the UAT datasets twice: by simplified title and by trimmed, non-empty GUID.
76 | $uat_ntsb_by_title = [];
77 | $uat_ntsb_by_guid = [];
78 | foreach ($uat_ntsb as $name => $uat_item) {
79 |     // Appending with [] creates the bucket on first use, so no explicit initialisation is needed.
80 |     $uat_ntsb_by_title[$uat_item['title_simple']][] = $uat_item;
81 |
82 |     $guid_key = trim($uat_item['guid']);
83 |     if (!$guid_key) {
84 |         // Datasets without a GUID are only reachable through the title index.
85 |         continue;
86 |     }
87 |     $uat_ntsb_by_guid[$guid_key][] = $uat_item;
88 | }
89 |
90 | echo 'prod_vs_uat.csv' . PHP_EOL;
91 | is_file($results_dir . '/prod_vs_uat_ntsb.csv') && unlink($results_dir . '/prod_vs_uat_ntsb.csv');
92 | $csv = new Writer($results_dir . '/prod_vs_uat_ntsb.csv');
93 | $csv->writeRow([
94 |     'Prod Title',
95 |     'Prod URL',
96 |     'Prod GUID',
97 |     'Prod Topics',
98 |     'Prod Categories',
99 |     'Matched',
100 |     'UAT Title',
101 |     'UAT URL',
102 |     'UAT GUID',
103 |     'URL Match',
104 |     'GUID Match',
105 | ]);
106 | // Set of simplified UAT titles already reported, stored as array keys for exact, constant-time lookups.
107 | $matched = [];
108 | // Pass 1: one row per (prod, UAT) pair, matched by GUID first and by simplified title otherwise.
109 | foreach ($prod_ntsb as $name => $prod_dataset) {
110 |     if (isset($uat_ntsb_by_guid[$prod_dataset['guid']])) {
111 |         foreach ($uat_ntsb_by_guid[$prod_dataset['guid']] as $uat_dataset) {
112 |             $csv->writeRow([
113 |                 $prod_dataset['title'],
114 |                 $prod_dataset['url'],
115 |                 $prod_dataset['guid'],
116 |                 $prod_dataset['topics'],
117 |                 $prod_dataset['categories'],
118 |                 true,
119 |                 $uat_dataset['title'],
120 |                 $uat_dataset['url'],
121 |                 $uat_dataset['guid'],
122 |                 (bool)($prod_dataset['name'] == $uat_dataset['name']),
123 |                 true,
124 |             ]);
125 |             $matched[$uat_dataset['title_simple']] = true;
126 |         }
127 |         continue;
128 |     }
129 |
130 |     if (isset($uat_ntsb_by_title[$prod_dataset['title_simple']])) {
131 |         foreach ($uat_ntsb_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
132 |             $csv->writeRow([
133 |                 $prod_dataset['title'],
134 |                 $prod_dataset['url'],
135 |                 $prod_dataset['guid'],
136 |                 $prod_dataset['topics'],
137 |                 $prod_dataset['categories'],
138 |                 true,
139 |                 $uat_dataset['title'],
140 |                 $uat_dataset['url'],
141 |                 $uat_dataset['guid'],
142 |                 true,
143 |                 (bool)($prod_dataset['guid'] && $prod_dataset['guid'] == $uat_dataset['guid']), // two empty GUIDs are not a match
144 |             ]);
145 |             $matched[$uat_dataset['title_simple']] = true;
146 |         }
147 |         continue;
148 |     }
149 |
150 |     $csv->writeRow([ // no UAT counterpart found by GUID or by title
151 |         $prod_dataset['title'],
152 |         $prod_dataset['url'],
153 |         $prod_dataset['guid'],
154 |         $prod_dataset['topics'],
155 |         $prod_dataset['categories'],
156 |         false,
157 |         '',
158 |         '',
159 |         '',
160 |         false,
161 |         false,
162 |     ]);
163 | }
164 | // Pass 2: UAT datasets whose simplified title was never reported above get a row with empty prod columns.
165 | foreach ($uat_ntsb as $name => $uat_dataset) {
166 |     if (!isset($matched[$uat_dataset['title_simple']])) {
167 |         $csv->writeRow([
168 |             '',
169 |             '',
170 |             '',
171 |             '',
172 |             '',
173 |             false,
174 |             $uat_dataset['title'],
175 |             $uat_dataset['url'],
176 |             $uat_dataset['guid'],
177 |             false,
178 |             false,
179 |         ]);
180 |     }
181 | }
182 |
183 | // show running time on finish
184 | timer();
185 |
--------------------------------------------------------------------------------
/cli/noaa-gov/compare_uat_vs_prod_noaa.php:
--------------------------------------------------------------------------------
1 | writeRow([
29 | 'title',
30 | 'title_simple',
31 | 'name',
32 | 'url',
33 | 'identifier',
34 | 'guid',
35 | 'topics',
36 | 'categories',
37 | ]);
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_noaa = $ProdCkanManager->exportBrief('organization:noaa-gov AND metadata_type:geospatial AND dataset_type:dataset');
42 | file_put_contents($results_dir . '/prod.json', json_encode($prod_noaa, JSON_PRETTY_PRINT));
43 | $prod->writeFromArray($prod_noaa);
44 | echo PHP_EOL.'datasets from prod: '.sizeof($prod_noaa).PHP_EOL.PHP_EOL;
45 | } else {
46 | $prod_noaa = json_decode(file_get_contents($results_dir . '/prod.json'), true); // decode to arrays (not stdClass): sizeof() and $prod_dataset['key'] below need arrays
47 | echo PHP_EOL.'datasets from prod: '.sizeof($prod_noaa).PHP_EOL.PHP_EOL;
48 | }
49 |
50 | echo 'uat.json' . PHP_EOL;
51 | if (!is_file($results_dir . '/uat.json')) {
52 |     $uat = new Writer($results_dir . '/uat.csv');
53 |     // No cached uat.json yet: export the datasets from UAT and store them as both JSON and CSV.
54 |     $uat->writeRow([
55 |         'title',
56 |         'title_simple',
57 |         'name',
58 |         'url',
59 |         'identifier',
60 |         'guid',
61 |         'topics',
62 |         'categories',
63 |     ]);
64 |     $uatCkanManager = new CkanManager(CKAN_UAT_API_URL);
65 |     $uatCkanManager->resultsDir = $results_dir;
66 |
67 |     $uat_noaa = $uatCkanManager->exportBrief('organization:noaa-gov AND extras_harvest_source_title:NOAA New CSW AND dataset_type:dataset',
68 |         '', 'http://uat-catalog-fe-data.reisys.com/dataset/');
69 |     file_put_contents($results_dir . '/uat.json', json_encode($uat_noaa, JSON_PRETTY_PRINT));
70 |     $uat->writeFromArray($uat_noaa);
71 |     echo PHP_EOL.'datasets from uat: '.sizeof($uat_noaa).PHP_EOL.PHP_EOL;
72 | } else {
73 |     $uat_noaa = json_decode(file_get_contents($results_dir . '/uat.json'), true); // cached run: decode to arrays (not stdClass) so sizeof() and $dataset['key'] work
74 |     echo PHP_EOL.'datasets from uat: '.sizeof($uat_noaa).PHP_EOL.PHP_EOL;
75 | }
76 |
77 | // Index the UAT datasets twice: by simplified title and by trimmed, non-empty GUID.
78 | $uat_noaa_by_title = [];
79 | $uat_noaa_by_guid = [];
80 | foreach ($uat_noaa as $name => $uat_item) {
81 |     // Appending with [] creates the bucket on first use, so no explicit initialisation is needed.
82 |     $uat_noaa_by_title[$uat_item['title_simple']][] = $uat_item;
83 |
84 |     $guid_key = trim($uat_item['guid']);
85 |     if (!$guid_key) {
86 |         // Datasets without a GUID are only reachable through the title index.
87 |         continue;
88 |     }
89 |     $uat_noaa_by_guid[$guid_key][] = $uat_item;
90 | }
91 |
92 | echo 'prod_vs_uat.csv' . PHP_EOL;
93 | is_file($results_dir . '/prod_vs_uat_noaa_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_noaa_geospatial.csv');
94 | $csv = new Writer($results_dir . '/prod_vs_uat_noaa_geospatial.csv');
95 | $csv->writeRow([
96 |     'Prod Title',
97 |     'Prod URL',
98 |     'Prod GUID',
99 |     'Prod Topics',
100 |     'Prod Categories',
101 |     'Matched',
102 |     'UAT Title',
103 |     'UAT URL',
104 |     'UAT GUID',
105 |     'URL Match',
106 |     'Title Match',
107 |     'GUID Match',
108 | ]);
109 | // One row per (prod, UAT) pair: match by GUID first, then by simplified title. Every row has 12 columns.
110 | foreach ($prod_noaa as $name => $prod_dataset) {
111 |     if (isset($uat_noaa_by_guid[$prod_dataset['guid']])) {
112 |         foreach ($uat_noaa_by_guid[$prod_dataset['guid']] as $uat_dataset) {
113 |             $csv->writeRow([
114 |                 $prod_dataset['title'],
115 |                 $prod_dataset['url'],
116 |                 $prod_dataset['guid'],
117 |                 $prod_dataset['topics'],
118 |                 $prod_dataset['categories'],
119 |                 true,
120 |                 $uat_dataset['title'],
121 |                 $uat_dataset['url'],
122 |                 $uat_dataset['guid'],
123 |                 (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
124 |                 (bool)($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $uat_dataset['title_simple']),
125 |                 true,
126 |             ]);
127 |         }
128 |         continue;
129 |     }
130 |     // No GUID hit: fall back to datasets sharing the same simplified title.
131 |     if (isset($uat_noaa_by_title[$prod_dataset['title_simple']])) {
132 |         foreach ($uat_noaa_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
133 |             $csv->writeRow([
134 |                 $prod_dataset['title'],
135 |                 $prod_dataset['url'],
136 |                 $prod_dataset['guid'],
137 |                 $prod_dataset['topics'],
138 |                 $prod_dataset['categories'],
139 |                 true,
140 |                 $uat_dataset['title'],
141 |                 $uat_dataset['url'],
142 |                 $uat_dataset['guid'],
143 |                 (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
144 |                 true,
145 |                 (bool)($prod_dataset['guid'] && $prod_dataset['guid'] == $uat_dataset['guid']), // two empty GUIDs are not a match
146 |             ]);
147 |         }
148 |         continue;
149 |     }
150 |     $csv->writeRow([ // no UAT counterpart found by GUID or by title
151 |         $prod_dataset['title'],
152 |         $prod_dataset['url'],
153 |         $prod_dataset['guid'],
154 |         $prod_dataset['topics'],
155 |         $prod_dataset['categories'],
156 |         false, // Matched
157 |         '',
158 |         '',
159 |         '',
160 |         false, // URL Match
161 |         false, // Title Match (this column was missing, leaving the row one cell short of the header)
162 |         false, // GUID Match
163 |     ]);
164 | }
165 |
166 | // show running time on finish
167 | timer();
168 |
--------------------------------------------------------------------------------
/cli/pbgc-gov/compare_uat_vs_prod_pbgc.php:
--------------------------------------------------------------------------------
1 | writeRow([
29 | 'title',
30 | 'title_simple',
31 | 'name',
32 | 'url',
33 | 'identifier',
34 | 'guid',
35 | 'topics',
36 | 'categories',
37 | ]);
38 | $ProdCkanManager = new CkanManager(CKAN_API_URL);
39 | $ProdCkanManager->resultsDir = $results_dir;
40 |
41 | $prod_pbgc = $ProdCkanManager->exportBrief('organization:pbgc-gov AND dataset_type:dataset');
42 | file_put_contents($results_dir . '/prod.json', json_encode($prod_pbgc, JSON_PRETTY_PRINT));
43 | $prod->writeFromArray($prod_pbgc);
44 | echo PHP_EOL.'datasets from prod: '.sizeof($prod_pbgc).PHP_EOL.PHP_EOL;
45 | } else {
46 | $prod_pbgc = json_decode(file_get_contents($results_dir . '/prod.json'), true); // decode to arrays (not stdClass): sizeof() and $prod_dataset['key'] below need arrays
47 | echo PHP_EOL.'datasets from prod: '.sizeof($prod_pbgc).PHP_EOL.PHP_EOL;
48 | }
49 |
50 | echo 'uat.json' . PHP_EOL;
51 | if (!is_file($results_dir . '/uat.json')) {
52 |     $uat = new Writer($results_dir . '/uat.csv');
53 |     // No cached uat.json yet: export the datasets from UAT and store them as both JSON and CSV.
54 |     $uat->writeRow([
55 |         'title',
56 |         'title_simple',
57 |         'name',
58 |         'url',
59 |         'identifier',
60 |         'guid',
61 |         'topics',
62 |         'categories',
63 |     ]);
64 |     $uatCkanManager = new CkanManager(CKAN_UAT_API_URL);
65 |     $uatCkanManager->resultsDir = $results_dir;
66 |
67 |     $uat_pbgc = $uatCkanManager->exportBrief('organization:pbgc-gov AND extras_harvest_source_title:PDGC Data.json Source AND dataset_type:dataset',
68 |         '', 'http://uat-catalog-fe-data.reisys.com/dataset/');
69 |     file_put_contents($results_dir . '/uat.json', json_encode($uat_pbgc, JSON_PRETTY_PRINT));
70 |     $uat->writeFromArray($uat_pbgc);
71 |     echo PHP_EOL.'datasets from uat: '.sizeof($uat_pbgc).PHP_EOL.PHP_EOL;
72 | } else {
73 |     $uat_pbgc = json_decode(file_get_contents($results_dir . '/uat.json'), true); // cached run: decode to arrays (not stdClass) so sizeof() and $dataset['key'] work
74 |     echo PHP_EOL.'datasets from uat: '.sizeof($uat_pbgc).PHP_EOL.PHP_EOL;
75 | }
76 |
77 | // Index the UAT datasets twice: by simplified title and by trimmed, non-empty GUID.
78 | $uat_pbgc_by_title = [];
79 | $uat_pbgc_by_guid = [];
80 | foreach ($uat_pbgc as $name => $uat_item) {
81 |     // Appending with [] creates the bucket on first use, so no explicit initialisation is needed.
82 |     $uat_pbgc_by_title[$uat_item['title_simple']][] = $uat_item;
83 |
84 |     $guid_key = trim($uat_item['guid']);
85 |     if (!$guid_key) {
86 |         // Datasets without a GUID are only reachable through the title index.
87 |         continue;
88 |     }
89 |     $uat_pbgc_by_guid[$guid_key][] = $uat_item;
90 | }
91 |
92 | echo 'prod_vs_uat.csv' . PHP_EOL;
93 | is_file($results_dir . '/prod_vs_uat_pbgc_geospatial.csv') && unlink($results_dir . '/prod_vs_uat_pbgc_geospatial.csv');
94 | $csv = new Writer($results_dir . '/prod_vs_uat_pbgc_geospatial.csv');
95 | $csv->writeRow([
96 |     'Prod Title',
97 |     'Prod URL',
98 |     'Prod GUID',
99 |     'Prod Topics',
100 |     'Prod Categories',
101 |     'Matched',
102 |     'UAT Title',
103 |     'UAT URL',
104 |     'UAT GUID',
105 |     'URL Match',
106 |     'Title Match',
107 |     'GUID Match',
108 | ]);
109 | // One row per (prod, UAT) pair: match by GUID first, then by simplified title. Every row has 12 columns.
110 | foreach ($prod_pbgc as $name => $prod_dataset) {
111 |     if (isset($uat_pbgc_by_guid[$prod_dataset['guid']])) {
112 |         foreach ($uat_pbgc_by_guid[$prod_dataset['guid']] as $uat_dataset) {
113 |             $csv->writeRow([
114 |                 $prod_dataset['title'],
115 |                 $prod_dataset['url'],
116 |                 $prod_dataset['guid'],
117 |                 $prod_dataset['topics'],
118 |                 $prod_dataset['categories'],
119 |                 true,
120 |                 $uat_dataset['title'],
121 |                 $uat_dataset['url'],
122 |                 $uat_dataset['guid'],
123 |                 (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
124 |                 (bool)($prod_dataset['title_simple'] && $prod_dataset['title_simple'] == $uat_dataset['title_simple']),
125 |                 true,
126 |             ]);
127 |         }
128 |         continue;
129 |     }
130 |     // No GUID hit: fall back to datasets sharing the same simplified title.
131 |     if (isset($uat_pbgc_by_title[$prod_dataset['title_simple']])) {
132 |         foreach ($uat_pbgc_by_title[$prod_dataset['title_simple']] as $uat_dataset) {
133 |             $csv->writeRow([
134 |                 $prod_dataset['title'],
135 |                 $prod_dataset['url'],
136 |                 $prod_dataset['guid'],
137 |                 $prod_dataset['topics'],
138 |                 $prod_dataset['categories'],
139 |                 true,
140 |                 $uat_dataset['title'],
141 |                 $uat_dataset['url'],
142 |                 $uat_dataset['guid'],
143 |                 (bool)($prod_dataset['name'] && $prod_dataset['name'] == $uat_dataset['name']),
144 |                 true,
145 |                 (bool)($prod_dataset['guid'] && $prod_dataset['guid'] == $uat_dataset['guid']),
146 |             ]);
147 |         }
148 |         continue;
149 |     }
150 |     $csv->writeRow([ // no UAT counterpart found by GUID or by title
151 |         $prod_dataset['title'],
152 |         $prod_dataset['url'],
153 |         $prod_dataset['guid'],
154 |         $prod_dataset['topics'],
155 |         $prod_dataset['categories'],
156 |         false, // Matched
157 |         '',
158 |         '',
159 |         '',
160 |         false, // URL Match
161 |         false, // Title Match (this column was missing, leaving the row one cell short of the header)
162 |         false, // GUID Match
163 |     ]);
164 | }
165 |
166 | // show running time on finish
167 | timer();
168 |
--------------------------------------------------------------------------------