├── tests ├── __init__.py ├── data │ ├── invalid.bin │ ├── empty.csv │ ├── excel.xlsx │ ├── hourly_aug_days.csv │ ├── spss.sav │ ├── excel97.xls │ ├── stata114.dta │ ├── stata118.dta │ ├── basic_aug.csv │ ├── parquet.parquet │ ├── agg_aug.csv │ ├── daily_aug.csv │ ├── agg.csv │ ├── addresses.csv │ ├── admins.csv │ ├── hourly_aug.csv │ ├── years_pivoted.csv │ ├── dates_pivoted.csv │ ├── lat_longs.csv │ ├── lazo_aug.csv │ ├── geo_aug.csv │ ├── daily_aug_hours.csv │ ├── spatiotemporal_aug.csv │ ├── years_pivoted.converted.csv │ ├── basic.csv │ ├── daily.csv │ ├── basic.d3m.csv │ ├── dates_pivoted.converted.csv │ ├── lazo.csv │ ├── annotated.csv │ ├── temporal.py │ ├── hourly.csv │ ├── spatiotemporal.py │ └── geo.py ├── ci.env ├── __main__.py └── test_common.py ├── apiserver ├── apiserver │ ├── __init__.py │ ├── __main__.py │ ├── search │ │ └── base.py │ ├── enhance_metadata.py │ └── graceful_shutdown.py └── setup.py ├── coordinator ├── coordinator │ ├── __init__.py │ ├── __main__.py │ └── templates │ │ ├── errors.html │ │ ├── login.html │ │ └── base.html └── setup.py ├── snapshotter ├── snapshotter │ ├── __init__.py │ └── __main__.py └── setup.py ├── cache_cleaner ├── cache_cleaner │ ├── __init__.py │ └── __main__.py └── setup.py ├── discovery ├── noaa │ ├── noaa_discovery │ │ └── __init__.py │ └── setup.py ├── ckan │ └── setup.py ├── zenodo │ └── setup.py ├── isi │ └── setup.py ├── socrata │ └── setup.py ├── worldbank │ └── setup.py └── uaz_indicators │ └── setup.py ├── frontend ├── .eslintignore ├── src │ ├── react-app-env.d.ts │ ├── components │ │ ├── SearchResults │ │ │ ├── SearchState.ts │ │ │ ├── SearchResults.css │ │ │ ├── SimpleBar.tsx │ │ │ └── DatasetSample.css │ │ ├── RelatedFileFilter │ │ │ └── RelatedFileColumnsSelector.css │ │ ├── ui │ │ │ ├── Button │ │ │ │ ├── Button.css │ │ │ │ └── Button.tsx │ │ │ ├── Tabs │ │ │ │ ├── Tabs.css │ │ │ │ └── Tabs.tsx │ │ │ └── DropdownMenu │ │ │ │ └── DropdownMenu.tsx │ │ ├── visus │ │ │ ├── Loading │ │ │ │ ├── Loading.tsx │ │ │ │ └── Spinner.tsx │ │ │ ├── Card │ │ │ │ ├── card.css │ │ │ │ └── Card.tsx │ │ │ └── PersistentComponent │ │ │ │ └── PersistentComponent.tsx │ │ ├── AdvancedSearchBar │ │ │ └── AdvancedSearchBar.css │ │ ├── MainMenu │ │ │ ├── MainMenu.css │ │ │ └── MainMenu.tsx │ │ ├── SearchBar │ │ │ ├── SearchBar.css │ │ │ └── SearchBar.tsx │ │ ├── Logo │ │ │ ├── Logo.css │ │ │ ├── Logo.tsx │ │ │ └── auctus-logo.min.svg │ │ ├── Badges │ │ │ ├── Badges.css │ │ │ └── IconAbc.tsx │ │ ├── Upload │ │ │ └── Upload.css │ │ ├── FilterContainer │ │ │ └── FilterContainer.tsx │ │ ├── GeoSpatialCoverageMap │ │ │ └── GeoSpatialCoverageMap.css │ │ ├── Chip │ │ │ ├── Chip.tsx │ │ │ └── Chip.css │ │ ├── DateFilter │ │ │ └── DateFilter.css │ │ └── JoinColumnsSelector │ │ │ └── FunctionBin.tsx │ ├── setupTests.ts │ ├── index.tsx │ ├── App.test.tsx │ ├── index.css │ ├── spatial-utils.ts │ ├── App.tsx │ └── config.ts ├── .prettierrc.js ├── public │ ├── favicon.ico │ ├── robots.txt │ ├── manifest.json │ └── index.html ├── .gitignore ├── nginx.conf ├── tsconfig.json ├── Dockerfile ├── .eslintrc.json ├── README.md └── package.json ├── lib_fslock ├── MANIFEST.in ├── datamart_fslock │ └── __init__.py ├── LICENSE.txt └── setup.py ├── docker ├── ckan.json ├── zenodo.json ├── coveragerc ├── redis.conf ├── socrata.json ├── etc_rabbitmq │ ├── enabled_plugins │ └── rabbitmq.conf ├── haproxy.dockerfile ├── rabbitmq.dockerfile ├── nominatim.dockerfile ├── haproxy.conf ├── prometheus.yml ├── grafana.dockerfile └── install_deps.py ├── lib_core ├── MANIFEST.in ├── 
NOTICE.txt ├── datamart_core │ ├── __init__.py │ ├── prom.py │ ├── types.py │ └── augment.py └── setup.py ├── docs ├── amqp.png ├── architecture.png ├── screenshots │ ├── join.png │ ├── menu.png │ ├── union.png │ ├── results.png │ ├── search.png │ ├── upload.png │ ├── statistics.png │ ├── column-view.png │ ├── filter-source.png │ ├── filter-spatial.png │ ├── filter-temporal.png │ └── filter-related-file.png ├── python │ ├── index.rst │ ├── datamart-augmentation.rst │ ├── datamart-rest.rst │ └── datamart-profiler.rst ├── redoc │ └── index.html ├── schemas.rst ├── Makefile ├── index.rst ├── make.bat └── conf.py ├── lib_augmentation ├── MANIFEST.in ├── NOTICE.txt ├── datamart_augmentation │ └── __init__.py ├── README.rst └── setup.py ├── lib_materialize ├── MANIFEST.in ├── NOTICE.txt ├── datamart_materialize │ ├── spss.py │ ├── stata.py │ ├── tsv.py │ ├── parquet.py │ ├── common.py │ ├── excel.py │ ├── excel97.py │ ├── types.py │ ├── pivot.py │ └── utils.py └── README.rst ├── lib_profiler ├── MANIFEST.in ├── NOTICE.txt ├── datamart_profiler │ ├── __init__.py │ ├── warning_tools.py │ └── types.py ├── README.rst └── setup.py ├── .gitmodules ├── NOTICE.txt ├── scripts ├── load_env.sh ├── docker_clear_caches.sh ├── docker_save_uploads.sh ├── run_frontend_tests.sh ├── docker_purge_source.sh ├── upload_dataset.sh ├── docker_export_all.sh ├── setup.sh ├── docker_import_all.sh ├── docker_build_push.sh ├── docker_import_snapshot.sh ├── delete_dataset.py ├── report-uploads.sh ├── list_big_datasets.py ├── list_sources.py ├── clear_caches.py ├── canonicalize_yaml.py ├── purge_source.py ├── migrate-source-url.py ├── migrate-point-format.py ├── README.md ├── reprocess_all.py ├── export_all.py ├── migrate-types-and-attributes.py └── migrate-temporal-coverage.py ├── contrib └── k8s │ ├── Makefile │ ├── README.md │ ├── secrets.jsonnet │ ├── snapshotter.libsonnet │ ├── auctus.libsonnet │ └── discovery │ ├── worldbank.libsonnet │ └── uaz-indicators.libsonnet ├── .gitignore ├── .dockerignore ├── env.default ├── profiler └── setup.py └── pyproject.toml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/invalid.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apiserver/apiserver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /coordinator/coordinator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /snapshotter/snapshotter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cache_cleaner/cache_cleaner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /discovery/noaa/noaa_discovery/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/.eslintignore: 
-------------------------------------------------------------------------------- 1 | **/node_modules 2 | build/ -------------------------------------------------------------------------------- /lib_fslock/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | -------------------------------------------------------------------------------- /tests/data/empty.csv: -------------------------------------------------------------------------------- 1 | important features,not here 2 | -------------------------------------------------------------------------------- /docker/ckan.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"url": "data.humdata.org"} 3 | ] 4 | -------------------------------------------------------------------------------- /docker/zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "keyword_query": "covid" 3 | } 4 | -------------------------------------------------------------------------------- /lib_core/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include NOTICE.txt 3 | -------------------------------------------------------------------------------- /docker/coveragerc: -------------------------------------------------------------------------------- 1 | [paths] 2 | source = 3 | . 4 | /usr/src/app 5 | -------------------------------------------------------------------------------- /docker/redis.conf: -------------------------------------------------------------------------------- 1 | maxmemory 500mb 2 | maxmemory-policy allkeys-lru 3 | -------------------------------------------------------------------------------- /docs/amqp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/amqp.png -------------------------------------------------------------------------------- /frontend/src/react-app-env.d.ts: -------------------------------------------------------------------------------- 1 | /// <reference types="react-app" /> 2 | -------------------------------------------------------------------------------- /frontend/.prettierrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | ...require('gts/.prettierrc.json') 3 | } -------------------------------------------------------------------------------- /docs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/architecture.png -------------------------------------------------------------------------------- /tests/data/excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/tests/data/excel.xlsx -------------------------------------------------------------------------------- /tests/data/hourly_aug_days.csv: -------------------------------------------------------------------------------- 1 | orig_date,color 2 | 2019-06-12,pink 3 | 2019-06-13,grey 4 | -------------------------------------------------------------------------------- /tests/data/spss.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/tests/data/spss.sav --------------------------------------------------------------------------------
/lib_augmentation/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include NOTICE.txt 3 | include README.rst 4 | -------------------------------------------------------------------------------- /lib_materialize/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include NOTICE.txt 3 | include README.rst 4 | -------------------------------------------------------------------------------- /lib_profiler/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include NOTICE.txt 3 | include README.rst 4 | -------------------------------------------------------------------------------- /tests/data/excel97.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/tests/data/excel97.xls -------------------------------------------------------------------------------- /tests/data/stata114.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/tests/data/stata114.dta -------------------------------------------------------------------------------- /tests/data/stata118.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/tests/data/stata118.dta -------------------------------------------------------------------------------- /docs/screenshots/join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/join.png -------------------------------------------------------------------------------- /docs/screenshots/menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/menu.png -------------------------------------------------------------------------------- /docs/screenshots/union.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/union.png -------------------------------------------------------------------------------- /tests/data/basic_aug.csv: -------------------------------------------------------------------------------- 1 | number,desk_faces 2 | 5,west 3 | 4,south 4 | 7,west 5 | 6,east 6 | 11, 7 | -------------------------------------------------------------------------------- /tests/data/parquet.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/tests/data/parquet.parquet -------------------------------------------------------------------------------- /docs/screenshots/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/results.png -------------------------------------------------------------------------------- /docs/screenshots/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/search.png -------------------------------------------------------------------------------- /docs/screenshots/upload.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/upload.png -------------------------------------------------------------------------------- /frontend/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/frontend/public/favicon.ico -------------------------------------------------------------------------------- /frontend/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: / 4 | -------------------------------------------------------------------------------- /docker/socrata.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"url": "data.cityofnewyork.us"}, 3 | {"url": "finances.worldbank.org"} 4 | ] 5 | -------------------------------------------------------------------------------- /docs/screenshots/statistics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/statistics.png -------------------------------------------------------------------------------- /tests/data/agg_aug.csv: -------------------------------------------------------------------------------- 1 | id,location 2 | 40,brazil 3 | 30,south korea 4 | 70,usa 5 | 80,canada 6 | 100,france 7 | -------------------------------------------------------------------------------- /docs/screenshots/column-view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/column-view.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib_geo"] 2 | path = lib_geo 3 | url = https://gitlab.com/ViDA-NYU/auctus/datamart-geo.git 4 | -------------------------------------------------------------------------------- /apiserver/apiserver/__main__.py: -------------------------------------------------------------------------------- 1 | from apiserver.main import main 2 | 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /docs/screenshots/filter-source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/filter-source.png -------------------------------------------------------------------------------- /docs/screenshots/filter-spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/filter-spatial.png -------------------------------------------------------------------------------- /docs/screenshots/filter-temporal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/filter-temporal.png -------------------------------------------------------------------------------- /coordinator/coordinator/__main__.py: -------------------------------------------------------------------------------- 1 | from coordinator.web import main 2 | 3 | 4 | if __name__ == 
'__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /docker/etc_rabbitmq/enabled_plugins: -------------------------------------------------------------------------------- 1 | [rabbitmq_management,rabbitmq_prometheus,rabbitmq_shovel,rabbitmq_shovel_management]. 2 | -------------------------------------------------------------------------------- /docs/screenshots/filter-related-file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VIDA-NYU/auctus/HEAD/docs/screenshots/filter-related-file.png -------------------------------------------------------------------------------- /snapshotter/snapshotter/__main__.py: -------------------------------------------------------------------------------- 1 | from snapshotter.snapshot import main 2 | 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /cache_cleaner/cache_cleaner/__main__.py: -------------------------------------------------------------------------------- 1 | from cache_cleaner.cache import main 2 | 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /tests/data/daily_aug.csv: -------------------------------------------------------------------------------- 1 | orig_date,n_people 2 | 2019-04-28,3 3 | 2019-04-29,5 4 | 2019-04-30,0 5 | 2019-05-01,1 6 | 2019-05-02,3 7 | 2019-05-03,2 8 | -------------------------------------------------------------------------------- /tests/data/agg.csv: -------------------------------------------------------------------------------- 1 | id,work,salary 2 | 40,false, 3 | 30,true,200 4 | 70,true, 5 | 80,true,200 6 | 100,false,300 7 | 100,true,200 8 | 30,false,100 9 | 70,false,600 -------------------------------------------------------------------------------- /docker/haproxy.dockerfile: -------------------------------------------------------------------------------- 1 | FROM haproxy:2.4 2 | 3 | USER root 4 | RUN apt-get update && apt-get install -y curl && \ 5 | rm -rf /var/lib/apt/lists/* 6 | USER haproxy 7 | -------------------------------------------------------------------------------- /lib_fslock/datamart_fslock/__init__.py: -------------------------------------------------------------------------------- 1 | from .unix import FSLockExclusive, FSLockShared 2 | 3 | 4 | __all__ = ['FSLockExclusive', 'FSLockShared'] 5 | __version__ = '2.1' 6 | -------------------------------------------------------------------------------- /tests/data/addresses.csv: -------------------------------------------------------------------------------- 1 | place,loc 2 | Bobst,"70 Washington Square S, New York, NY 10012" 3 | Tandon,"6 MetroTech, Brooklyn, NY 11201" 4 | WWH,"251 Mercer St, New York, NY 10012" 5 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Datamart (codename Auctus) 2 | 3 | Copyright 2018 New York University 4 | 5 | This product includes software developed at 6 | New York University 7 | https://www.nyu.edu/ 8 | -------------------------------------------------------------------------------- /tests/data/admins.csv: -------------------------------------------------------------------------------- 1 | zero,one,mixed 2 | italy,Brittany,france 3 | italy,Normandie,Normandie 4 | spain,region occitanie,germany 5 | 
germany,Bavaria,Bavaria 6 | germany,Hamburg,Brittany 7 | -------------------------------------------------------------------------------- /frontend/src/components/SearchResults/SearchState.ts: -------------------------------------------------------------------------------- 1 | enum SearchState { 2 | CLEAN, 3 | SEARCH_REQUESTING, 4 | SEARCH_SUCCESS, 5 | SEARCH_FAILED, 6 | } 7 | 8 | export {SearchState}; 9 | -------------------------------------------------------------------------------- /lib_core/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Datamart core library 2 | 3 | Copyright 2018 New York University 4 | 5 | This product includes software developed at 6 | New York University 7 | https://www.nyu.edu/ 8 | -------------------------------------------------------------------------------- /tests/data/hourly_aug.csv: -------------------------------------------------------------------------------- 1 | orig_date,color 2 | 2019-06-13T01:00:00,yellow 3 | 2019-06-13T02:00:00,yellow 4 | 2019-06-13T03:00:00,brown 5 | 2019-06-13T04:00:00,brown 6 | 2019-06-13T05:00:00,yellow 7 | -------------------------------------------------------------------------------- /lib_profiler/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Datamart profiler library 2 | 3 | Copyright 2018 New York University 4 | 5 | This product includes software developed at 6 | New York University 7 | https://www.nyu.edu/ 8 | -------------------------------------------------------------------------------- /tests/data/years_pivoted.csv: -------------------------------------------------------------------------------- 1 | color,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017 2 | green,yes,no,no,yes,no,yes,yes,yes,yes,no,no,yes 3 | red,no,yes,yes,yes,no,no,no,yes,no,yes,yes,no 4 | -------------------------------------------------------------------------------- /lib_augmentation/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Datamart augmentation library 2 | 3 | Copyright 2018 New York University 4 | 5 | This product includes software developed at 6 | New York University 7 | https://www.nyu.edu/ 8 | -------------------------------------------------------------------------------- /lib_augmentation/datamart_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | from .augmentation import AugmentationError, join, union 2 | 3 | 4 | __version__ = '0.10' 5 | 6 | 7 | __all__ = ['AugmentationError', 'join', 'union'] 8 | -------------------------------------------------------------------------------- /lib_materialize/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Datamart materialization library 2 | 3 | Copyright 2018 New York University 4 | 5 | This product includes software developed at 6 | New York University 7 | https://www.nyu.edu/ 8 | -------------------------------------------------------------------------------- /docker/rabbitmq.dockerfile: -------------------------------------------------------------------------------- 1 | FROM rabbitmq:3.8.11-management 2 | 3 | COPY --chown=999:999 etc_rabbitmq/rabbitmq.conf /etc/rabbitmq/rabbitmq.conf 4 | COPY etc_rabbitmq/enabled_plugins /etc/rabbitmq/enabled_plugins 5 | -------------------------------------------------------------------------------- /lib_profiler/datamart_profiler/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .core import count_rows_to_skip, process_dataset 2 | from .temporal import parse_date 3 | 4 | 5 | __version__ = '0.11' 6 | 7 | 8 | __all__ = ['count_rows_to_skip', 'process_dataset', 'parse_date'] 9 | -------------------------------------------------------------------------------- /docs/python/index.rst: -------------------------------------------------------------------------------- 1 | Python libraries 2 | ================ 3 | 4 | Some components 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | datamart-rest 10 | datamart-profiler 11 | datamart-materialize 12 | datamart-augmentation 13 | -------------------------------------------------------------------------------- /frontend/src/components/RelatedFileFilter/RelatedFileColumnsSelector.css: -------------------------------------------------------------------------------- 1 | .label-button { 2 | cursor: pointer; 3 | text-decoration: underline; 4 | background: transparent; 5 | border: 0; 6 | } 7 | .danger { 8 | color: #f44336; 9 | } 10 | -------------------------------------------------------------------------------- /scripts/load_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while read line; do 4 | if [ "$line" != "" -a "${line:0:1}" != "#" ]; then 5 | export "$line" 6 | fi 7 | done <.env 8 | export DATAMART_VERSION=$(git describe) 9 | export DATAMART_GEO_DATA="$(pwd)/lib_geo/data" 10 | -------------------------------------------------------------------------------- /tests/data/dates_pivoted.csv: -------------------------------------------------------------------------------- 1 | color,2012-01-01,2012-02-01,2012-03-01,2012-04-01,2012-05-01,2012-06-01,2012-07-01,2012-08-01,2012-09-01,2012-10-01,2012-11-01,2012-12-01 2 | green,yes,no,no,yes,no,yes,yes,yes,yes,no,no,yes 3 | red,no,yes,yes,yes,no,no,no,yes,no,yes,yes,no 4 | -------------------------------------------------------------------------------- /scripts/docker_clear_caches.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | cd "$(dirname "$(dirname "$0")")" 4 | PROJ="$(basename "$(pwd)")" 5 | docker run -ti --rm --network ${PROJ}_default -v $PWD/scripts:/scripts -v $PWD/volumes/cache:/cache auctus python /scripts/clear_caches.py "$@" 6 | -------------------------------------------------------------------------------- /scripts/docker_save_uploads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | cd "$(dirname "$(dirname "$0")")" 4 | PROJ="$(basename "$(pwd)")" 5 | docker run -ti --rm --network ${PROJ}_default -v $PWD/volumes/datasets:/datasets auctus sh -c 'tar zc /datasets/datamart.upload.*' >uploads.tar.gz 6 | -------------------------------------------------------------------------------- /contrib/k8s/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean generate 2 | 3 | all: clean generate 4 | 5 | clean: 6 | rm -rf yaml 7 | 8 | generate: *.jsonnet *.libsonnet discovery/*.libsonnet 9 | mkdir -p yaml 10 | jsonnet -S -m yaml deployment.jsonnet 11 | jsonnet -S -m yaml secrets.jsonnet 12 | -------------------------------------------------------------------------------- /scripts/run_frontend_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$(dirname "$0")")" 4 | 5 | set -eu 
6 | 7 | # Run frontend tests 8 | docker build -t auctus_frontend_npm -f frontend/Dockerfile --target build . 9 | docker run -ti --name auctus_npm_test --rm auctus_frontend_npm sh -c "CI=true npm run test" 10 | -------------------------------------------------------------------------------- /frontend/src/components/ui/Button/Button.css: -------------------------------------------------------------------------------- 1 | .button-group > * { 2 | display: inline-flex; 3 | margin-right: 0.25rem; 4 | margin-bottom: 0.075rem; 5 | margin-top: 0.075rem; 6 | } 7 | 8 | .button-group > *:last-child { 9 | margin-right: 0px; 10 | } 11 | 12 | .btn > .feather:first-child { 13 | margin-right: 0.2rem; 14 | } 15 | -------------------------------------------------------------------------------- /tests/data/lat_longs.csv: -------------------------------------------------------------------------------- 1 | from latitude,to long,to lat,from longitude,unpaired lat 2 | 40.734746,-74.000077,40.728026,-73.998869,40.728278 3 | 40.726640,-73.993186,40.732466,-74.004689,40.722948 4 | 40.735108,-73.996996,40.727577,-74.002853,40.730824 5 | 40.729115,-74.001726,40.734259,-73.996833,40.723674 6 | 40.728896,-73.998542,40.728711,-74.002426,40.733272 7 | -------------------------------------------------------------------------------- /tests/data/lazo_aug.csv: -------------------------------------------------------------------------------- 1 | favorite 2 | Peanut Butter 3 | Ice cream 4 | flan 5 | orange 6 | kiwi 7 | coconut 8 | liquorICE 9 | MACaron 10 | pear 11 | CANDY 12 | pudding 13 | doughnut 14 | marzipan 15 | tart 16 | pecan pie 17 | souffle 18 | Pastry 19 | banana 20 | caramel 21 | milkshake 22 | Chocolate 23 | tiramisu 24 | tres leches 25 | calisson 26 | taffy 27 | lemon 28 | -------------------------------------------------------------------------------- /contrib/k8s/README.md: -------------------------------------------------------------------------------- 1 | # How to configure 2 | 3 | We use [Jsonnet](https://jsonnet.org/) to automate the generation of the YAML config files for Kubernetes. 
4 | 5 | You should only need to update `deployment.jsonnet` and `secrets.jsonnet`, then you can generate the YAML files using: 6 | ``` 7 | mkdir yaml 8 | jsonnet -S -m yaml/ deployment.jsonnet 9 | jsonnet -S -m yaml/ secrets.jsonnet 10 | ``` 11 | -------------------------------------------------------------------------------- /lib_core/datamart_core/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import prometheus_client 3 | 4 | from .discovery import Discoverer, AsyncDiscoverer 5 | 6 | 7 | __all__ = ['Discoverer', 'AsyncDiscoverer'] 8 | 9 | 10 | PROM_VERSION = prometheus_client.Gauge('version', "Datamart version", 11 | ['version']) 12 | PROM_VERSION.labels(os.environ['DATAMART_VERSION']).set(1) 13 | -------------------------------------------------------------------------------- /scripts/docker_purge_source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | cd "$(dirname "$(dirname "$0")")" 4 | PROJ="$(basename "$(pwd)")" 5 | docker run -ti --rm --network ${PROJ}_default -v $PWD/scripts:/scripts -v $PWD/volumes/cache:/cache -e ELASTICSEARCH_HOSTS=elasticsearch:9200 -e ELASTICSEARCH_PREFIX=${ELASTICSEARCH_PREFIX} -e LAZO_SERVER_HOST=lazo -e LAZO_SERVER_PORT=50051 auctus python /scripts/purge_source.py "$1" 6 | -------------------------------------------------------------------------------- /tests/data/geo_aug.csv: -------------------------------------------------------------------------------- 1 | lat,long,id,letter 2 | 40.731191,-74.002677,place100,a 3 | 40.728870,-73.999367,place101,b 4 | 40.737170,-73.999883,place102,c 5 | 40.729107,-73.996659,place103,d 6 | 40.730194,-74.004258,place104,e 7 | 40.734620,-74.001027,place105,f 8 | 40.727132,-73.994823,place106,g 9 | 40.728087,-73.992458,place107,h 10 | 40.730429,-74.003744,place108,i 11 | 40.728873,-73.996520,place109,j 12 | -------------------------------------------------------------------------------- /tests/data/daily_aug_hours.csv: -------------------------------------------------------------------------------- 1 | orig_date,n_people 2 | 2019-04-25T21:00:00Z,3 3 | 2019-04-26T01:00:00Z,5 4 | 2019-04-26T05:00:00Z,6 5 | 2019-04-26T09:00:00Z,7 6 | 2019-04-26T13:00:00Z,6 7 | 2019-04-26T17:00:00Z,8 8 | 2019-04-26T21:00:00Z,7 9 | 2019-04-27T01:00:00Z,0 10 | 2019-04-27T05:00:00Z,1 11 | 2019-04-27T09:00:00Z,0 12 | 2019-04-27T13:00:00Z,3 13 | 2019-04-27T17:00:00Z,0 14 | 2019-04-27T13:00:00Z,0 15 | -------------------------------------------------------------------------------- /scripts/upload_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # This script profiles a dataset and adds it to the index 4 | 5 | set -eu 6 | 7 | if [ "$#" != 3 ]; then 8 | echo "Usage: upload_dataset.sh \"\" \"\"" >&2 9 | exit 2 10 | fi 11 | exec curl -X POST \ 12 | -F "file=@$1;filename=$(basename "$1")" -F "name=$2" -F "description=$3" \ 13 | http://localhost:8002/api/v1/upload 14 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | /build 13 | 14 | # misc 15 | .DS_Store 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 | -------------------------------------------------------------------------------- /frontend/src/components/ui/Tabs/Tabs.css: -------------------------------------------------------------------------------- 1 | .nav-tabs .nav-item button { 2 | cursor: pointer; 3 | background-color: transparent; 4 | color: #63508b; 5 | } 6 | 7 | .nav-tabs .nav-item button { 8 | cursor: pointer; 9 | background-color: transparent; 10 | color: #63508b; 11 | } 12 | 13 | .nav-tabs .nav-item button:focus { 14 | outline: none; 15 | } 16 | 17 | .nav-tabs .nav-item .nav-link.active { 18 | color: #212529; 19 | } -------------------------------------------------------------------------------- /frontend/src/components/visus/Loading/Loading.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import {Spinner} from './Spinner'; 3 | 4 | interface LoadingProps { 5 | message?: string; 6 | } 7 | 8 | class Loading extends React.PureComponent<LoadingProps> { 9 | render() { 10 | const msg = this.props.message || 'Loading...'; 11 | return ( 12 | <div> 13 | <Spinner /> {msg} 14 | </div> 15 | ); 16 | } 17 | } 18 | 19 | export {Loading}; 20 | -------------------------------------------------------------------------------- /scripts/docker_export_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | cd "$(dirname "$(dirname "$0")")" 4 | PROJ="$(basename "$(pwd)")" 5 | if [ -z "$1" ]; then 6 | echo "Missing argument" >&2 7 | exit 1 8 | fi 9 | mkdir "$1" 10 | chown 998 "$1" 11 | docker run --rm --network ${PROJ}_default -v $PWD/scripts:/scripts -v "$1:/index" -w /index -e ELASTICSEARCH_HOSTS=elasticsearch:9200 -e ELASTICSEARCH_PREFIX=${ELASTICSEARCH_PREFIX} auctus python /scripts/export_all.py 12 | -------------------------------------------------------------------------------- /frontend/src/setupTests.ts: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import '@testing-library/jest-dom/extend-expect'; 6 | 7 | // Setup Jest canvas mock.
This is required to test components that use canvas 8 | // (e.g., Open Layers library requires this) 9 | import 'jest-canvas-mock'; 10 | -------------------------------------------------------------------------------- /frontend/src/components/SearchResults/SearchResults.css: -------------------------------------------------------------------------------- 1 | .column-infobox { 2 | width: 60%; 3 | padding-right: .25rem; 4 | padding-left: .5rem; 5 | } 6 | 7 | .column-infobox > * { 8 | overflow: auto; 9 | } 10 | 11 | .column-search-hits { 12 | width: 40%; 13 | min-width: 200px; 14 | padding-left: .5rem; 15 | padding-right: .4rem; 16 | } 17 | 18 | .search-hits-group > * { 19 | margin-bottom: .5rem; 20 | } 21 | 22 | .search-hits-group > :last-child { 23 | margin-bottom: 0; 24 | } -------------------------------------------------------------------------------- /tests/data/spatiotemporal_aug.csv: -------------------------------------------------------------------------------- 1 | date,latitude,longitude 2 | 2006-06-20T06:00:00,43.237,6.072 3 | 2006-06-20T06:00:00,43.238,6.072 4 | 2006-06-20T06:00:00,43.237,6.073 5 | 2006-06-20T06:00:00,43.238,6.073 6 | 2006-06-20T07:00:00,43.237,6.072 7 | 2006-06-20T07:00:00,43.238,6.072 8 | 2006-06-20T07:00:00,43.237,6.073 9 | 2006-06-20T07:00:00,43.238,6.073 10 | 2006-06-20T08:00:00,43.237,6.072 11 | 2006-06-20T08:00:00,43.238,6.072 12 | 2006-06-20T08:00:00,43.237,6.073 13 | 2006-06-20T08:00:00,43.238,6.073 14 | -------------------------------------------------------------------------------- /tests/data/years_pivoted.converted.csv: -------------------------------------------------------------------------------- 1 | color,year,value 2 | green,2006,yes 3 | green,2007,no 4 | green,2008,no 5 | green,2009,yes 6 | green,2010,no 7 | green,2011,yes 8 | green,2012,yes 9 | green,2013,yes 10 | green,2014,yes 11 | green,2015,no 12 | green,2016,no 13 | green,2017,yes 14 | red,2006,no 15 | red,2007,yes 16 | red,2008,yes 17 | red,2009,yes 18 | red,2010,no 19 | red,2011,no 20 | red,2012,no 21 | red,2013,yes 22 | red,2014,no 23 | red,2015,yes 24 | red,2016,yes 25 | red,2017,no 26 | -------------------------------------------------------------------------------- /tests/data/basic.csv: -------------------------------------------------------------------------------- 1 | name,color,number,what 2 | james,green,5,false 3 | john,blue,4,false 4 | robert,blue,6,false 5 | michael,blue,7,true 6 | william,blue,7,true 7 | david,green,5,false 8 | richard,green,7,true 9 | joseph,blue,6,true 10 | thomas,blue,6,false 11 | charles,blue,7,false 12 | christopher,green,11,true 13 | daniel,blue,5,false 14 | matthew,green,7,true 15 | anthony,green,7,true 16 | donald,blue,6,true 17 | mark,blue,4,false 18 | paul,blue,4,false 19 | steven,blue,6,false 20 | andrew,green,6,false 21 | kenneth,green,7,true 22 | -------------------------------------------------------------------------------- /frontend/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | # HTTP redirects to HTTPS 3 | listen 80; 4 | listen [::]:80; 5 | 6 | server_name _; 7 | 8 | root /var/www/html; 9 | 10 | charset utf-8; 11 | 12 | location /static/ { 13 | root /var/www/html; 14 | autoindex off; 15 | if ($query_string) { 16 | expires max; 17 | } 18 | } 19 | 20 | location /.well-known/ { 21 | try_files $uri =404; 22 | } 23 | 24 | location / { 25 | try_files $uri /index.html; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- 
/tests/data/daily.csv: -------------------------------------------------------------------------------- 1 | aug_date,rain 2 | 20190423,no 3 | 20190424,no 4 | 20190425,yes 5 | 20190426,no 6 | 20190427,yes 7 | 20190428,yes 8 | 20190429,yes 9 | 20190430,yes 10 | 20190501,no 11 | 20190502,no 12 | 20190503,yes 13 | 20190504,no 14 | 20190505,yes 15 | 20190506,yes 16 | 20190507,no 17 | 20190508,yes 18 | 20190509,yes 19 | 20190510,no 20 | 20190511,no 21 | 20190512,yes 22 | 20190513,no 23 | 20190514,no 24 | 20190515,no 25 | 20190516,no 26 | 20190517,yes 27 | 20190518,no 28 | 20190519,yes 29 | 20190520,no 30 | 20190521,no 31 | 20190522,yes 32 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/spss.py: -------------------------------------------------------------------------------- 1 | import pyreadstat 2 | 3 | from datamart_materialize.utils import SimpleConverter 4 | 5 | 6 | def spss_to_csv(source_filename, dest_fileobj): 7 | df, meta = pyreadstat.read_sav(source_filename) 8 | df.to_csv( 9 | dest_fileobj, 10 | float_format='%g', 11 | index=False, 12 | line_terminator='\r\n', 13 | ) 14 | 15 | 16 | class SpssConverter(SimpleConverter): 17 | """Adapter converting an SPSS file to CSV. 18 | """ 19 | transform = staticmethod(spss_to_csv) 20 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eux 4 | 5 | sudo(){ 6 | if [ $(id -u) = 0 ]; then 7 | "$@" 8 | else 9 | command sudo "$@" 10 | fi 11 | } 12 | 13 | # Set up volume permissions 14 | mkdir -p volumes/datasets && sudo chown -R 998 volumes/datasets 15 | mkdir -p volumes/cache && sudo chown -R 998 volumes/cache 16 | mkdir -p volumes/prometheus && sudo chown -R 65534:65534 volumes/prometheus 17 | mkdir -p volumes/elasticsearch && sudo chown -R 1000:0 volumes/elasticsearch 18 | mkdir -p volumes/grafana && sudo chown -R 472:472 volumes/grafana 19 | -------------------------------------------------------------------------------- /frontend/src/index.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './index.css'; 4 | import './bootstrap4-nyu-d3m.min.css'; 5 | 6 | import {App} from './App'; 7 | import * as serviceWorker from './serviceWorker'; 8 | 9 | ReactDOM.render(<App />, document.getElementById('root')); 10 | 11 | // If you want your app to work offline and load faster, you can change 12 | // unregister() to register() below. Note this comes with some pitfalls.
13 | // Learn more about service workers: https://bit.ly/CRA-PWA 14 | serviceWorker.unregister(); 15 | -------------------------------------------------------------------------------- /scripts/docker_import_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | cd "$(dirname "$(dirname "$0")")" 4 | PROJ="$(basename "$(pwd)")" 5 | if [ -z "$1" ]; then 6 | echo "Missing argument" >&2 7 | exit 1 8 | fi 9 | docker run -ti --rm --network ${PROJ}_default -v $PWD/scripts:/scripts -v "$1:/index" -e ELASTICSEARCH_HOSTS=elasticsearch:9200 -e ELASTICSEARCH_PREFIX=${ELASTICSEARCH_PREFIX} -e AMQP_HOST=rabbitmq -e AMQP_PORT=5672 -e AMQP_USER=$AMQP_USER -e AMQP_PASSWORD=$AMQP_PASSWORD -e LAZO_SERVER_HOST=lazo -e LAZO_SERVER_PORT=50051 auctus python /scripts/import_all.py /index 10 | -------------------------------------------------------------------------------- /frontend/src/components/AdvancedSearchBar/AdvancedSearchBar.css: -------------------------------------------------------------------------------- 1 | .AdvancedSearchBar { 2 | max-width: 1000px; 3 | margin: 0.375rem auto 0 auto!important; 4 | } 5 | 6 | .AdvancedSearchBar-title { 7 | vertical-align: middle; 8 | padding: .375rem 0; 9 | font-size: 1rem; 10 | line-height: 1.5; 11 | } 12 | 13 | .AdvancedSearchBar-item { 14 | margin-left: .25rem; 15 | padding-right: 0; 16 | color: #707070; 17 | font-size: .9rem; 18 | cursor: pointer; 19 | } 20 | 21 | .AdvancedSearchBar-item span { 22 | padding-left: .25em; 23 | padding-right: .25em; 24 | } -------------------------------------------------------------------------------- /scripts/docker_build_push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eux 3 | 4 | cd "$(dirname "$0")/.." 5 | 6 | VERSION=$(git describe) 7 | 8 | # Build 9 | docker build -t auctus --build-arg version=$VERSION . 10 | docker build -t auctus_frontend -f frontend/Dockerfile . 
11 | 12 | # Push 13 | docker tag auctus registry.gitlab.com/vida-nyu/auctus/auctus:$VERSION 14 | docker push registry.gitlab.com/vida-nyu/auctus/auctus:$VERSION 15 | docker tag auctus_frontend registry.gitlab.com/vida-nyu/auctus/auctus/frontend:$VERSION 16 | docker push registry.gitlab.com/vida-nyu/auctus/auctus/frontend:$VERSION 17 | -------------------------------------------------------------------------------- /docker/etc_rabbitmq/rabbitmq.conf: -------------------------------------------------------------------------------- 1 | loopback_users.guest = false 2 | listeners.tcp.default = 5672 3 | management.tcp.port = 15672 4 | 5 | management.sample_retention_policies.global.minute = 5 6 | management.sample_retention_policies.global.hour = 60 7 | management.sample_retention_policies.global.day = 1200 8 | 9 | management.sample_retention_policies.basic.minute = 5 10 | management.sample_retention_policies.basic.hour = 60 11 | management.sample_retention_policies.basic.day = 1200 12 | 13 | management.sample_retention_policies.detailed.10 = 5 14 | 15 | prometheus.return_per_object_metrics = true 16 | -------------------------------------------------------------------------------- /frontend/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "Auctus", 3 | "name": "Auctus", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /docs/redoc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Auctus REST Documentation 5 | 6 | 7 | 8 | 9 | 12 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /frontend/src/components/visus/Card/card.css: -------------------------------------------------------------------------------- 1 | .card-title { 2 | margin-bottom: .75rem; 3 | margin-top: -34px; 4 | background: white; 5 | padding: 3px 10px 3px 10px; 6 | width: fit-content; 7 | font-size: 1rem; 8 | border-radius: 3px; 9 | margin-left: -10px; 10 | } 11 | 12 | .card-attributes { 13 | display: flex; 14 | width: 100%; 15 | } 16 | 17 | .card-attributes .card-attr-field { 18 | font-weight: bold; 19 | text-align: right; 20 | max-width: 130px; 21 | } 22 | 23 | .card-hover { 24 | background-color: #FFFFFF; 25 | } 26 | 27 | .card-hover:hover { 28 | background-color: #f8f8f8; 29 | } -------------------------------------------------------------------------------- /tests/data/basic.d3m.csv: -------------------------------------------------------------------------------- 1 | d3mIndex,name,color,number,what 2 | 0,james,green,5,false 3 | 1,john,blue,4,false 4 | 2,robert,blue,6,false 5 | 3,michael,blue,7,true 6 | 4,william,blue,7,true 7 | 5,david,green,5,false 8 | 6,richard,green,7,true 9 | 7,joseph,blue,6,true 10 | 8,thomas,blue,6,false 11 | 9,charles,blue,7,false 12 | 10,christopher,green,11,true 13 | 11,daniel,blue,5,false 14 | 12,matthew,green,7,true 15 | 13,anthony,green,7,true 16 | 14,donald,blue,6,true 17 | 15,mark,blue,4,false 18 | 16,paul,blue,4,false 19 | 
17,steven,blue,6,false 20 | 18,andrew,green,6,false 21 | 19,kenneth,green,7,true 22 | -------------------------------------------------------------------------------- /scripts/docker_import_snapshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | cd "$(dirname "$(dirname "$0")")" 4 | PROJ="$(basename "$(pwd)")" 5 | docker run -ti --rm --network ${PROJ}_default -v $PWD/scripts:/scripts -e ELASTICSEARCH_HOSTS=elasticsearch:9200 -e ELASTICSEARCH_PREFIX=${ELASTICSEARCH_PREFIX} -e AMQP_HOST=rabbitmq -e AMQP_PORT=5672 -e AMQP_USER=$AMQP_USER -e AMQP_PASSWORD=$AMQP_PASSWORD -w /tmp auctus sh -c 'curl -LO https://auctus.vida-nyu.org/snapshot/index.tar.gz && if [ -e index.snapshot ]; then rm -rf index.snapshot; fi && mkdir index.snapshot && tar xfC index.tar.gz index.snapshot && python /scripts/import_all.py index.snapshot; rm -rf index.snapshot' 6 | -------------------------------------------------------------------------------- /tests/data/dates_pivoted.converted.csv: -------------------------------------------------------------------------------- 1 | color,date,value 2 | green,2012-01-01,yes 3 | green,2012-02-01,no 4 | green,2012-03-01,no 5 | green,2012-04-01,yes 6 | green,2012-05-01,no 7 | green,2012-06-01,yes 8 | green,2012-07-01,yes 9 | green,2012-08-01,yes 10 | green,2012-09-01,yes 11 | green,2012-10-01,no 12 | green,2012-11-01,no 13 | green,2012-12-01,yes 14 | red,2012-01-01,no 15 | red,2012-02-01,yes 16 | red,2012-03-01,yes 17 | red,2012-04-01,yes 18 | red,2012-05-01,no 19 | red,2012-06-01,no 20 | red,2012-07-01,no 21 | red,2012-08-01,yes 22 | red,2012-09-01,no 23 | red,2012-10-01,yes 24 | red,2012-11-01,yes 25 | red,2012-12-01,no 26 | -------------------------------------------------------------------------------- /frontend/src/components/MainMenu/MainMenu.css: -------------------------------------------------------------------------------- 1 | .main-menu { 2 | float: right; 3 | color: #707070; 4 | font-size: 0.9rem; 5 | position: absolute; 6 | z-index: 1000; 7 | right: 10px; 8 | top: 7px; 9 | /* bottom: 0; */ 10 | } 11 | 12 | .card-menu { 13 | border: 1px solid #ced4da; 14 | border-radius: 3px; 15 | background-color: #fff; 16 | padding-top: 2px; 17 | padding-bottom: 2px; 18 | } 19 | 20 | .menu-link { 21 | background-color: transparent; 22 | padding: 0.8rem; 23 | cursor: pointer; 24 | } 25 | 26 | .menu-link:hover { 27 | background-color: #f0f0f0; 28 | } 29 | 30 | .menu-link a:hover { 31 | text-decoration: none; 32 | } 33 | -------------------------------------------------------------------------------- /frontend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./node_modules/gts/tsconfig-google.json", 3 | "compilerOptions": { 4 | "target": "es5", 5 | "lib": [ 6 | "dom", 7 | "dom.iterable", 8 | "esnext" 9 | ], 10 | "allowJs": true, 11 | "skipLibCheck": true, 12 | "esModuleInterop": true, 13 | "allowSyntheticDefaultImports": true, 14 | "strict": true, 15 | "forceConsistentCasingInFileNames": true, 16 | "module": "esnext", 17 | "moduleResolution": "node", 18 | "resolveJsonModule": true, 19 | "isolatedModules": true, 20 | "noEmit": true, 21 | "jsx": "react" 22 | }, 23 | "include": [ 24 | "src" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /docs/schemas.rst: -------------------------------------------------------------------------------- 1 | JSON Schemas 2 | ============ 3 | 4 | .. 
_schema-query: 5 | 6 | Query 7 | ----- 8 | 9 | JSON objects expected by `the search endpoint <../rest/#operation/search>`__. 10 | 11 | .. literalinclude:: schemas/query_input_schema.json 12 | :language: json 13 | :linenos: 14 | 15 | .. _schema-result: 16 | 17 | Result schema 18 | ------------- 19 | 20 | Description of a dataset, such as a search result. `The search endpoint <../rest/#operation/search>`__ returns an array of those. They are also what you give the :func:`datamart_materialize.download`. 21 | 22 | .. literalinclude:: schemas/query_result_schema.json 23 | :language: json 24 | :linenos: 25 | -------------------------------------------------------------------------------- /docs/python/datamart-augmentation.rst: -------------------------------------------------------------------------------- 1 | Augmentation library 2 | ==================== 3 | 4 | This library performs data augmentation between datasets from Auctus. You can use it to augment a dataset with a search result directly on your side without relying on the server. It is also used internally by Auctus to perform augmentations (the ``/augment`` endpoint downloads the dataset using this library, performs augmentation, then sends the result to you). 5 | 6 | Installing datamart-augmentation 7 | -------------------------------- 8 | 9 | You can get it directly from the Python Package Index using PIP:: 10 | 11 | pip install datamart-augmentation 12 | 13 | API 14 | --- 15 | 16 | TODO 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | /volumes 3 | /docs/_build 4 | 5 | # Python 6 | *.py[co] 7 | .ipynb_checkpoints 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Eclipse PyDev 33 | .project 34 | .pydevproject 35 | 36 | # PyCharm 37 | .idea 38 | 39 | # ViM 40 | .*.swp 41 | 42 | # Emacs 43 | \#*# 44 | 45 | # OS files 46 | .DS_Store 47 | desktop.ini 48 | 49 | # Archives 50 | *.tar 51 | *.tar.gz 52 | *.tar.bz2 53 | *.zip 54 | *.whl 55 | 56 | # Vagrant 57 | .vagrant 58 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/data/lazo.csv: -------------------------------------------------------------------------------- 1 | here's a header but the profiler will throw it out 2 | 3 | dessert year 4 | pie 1991 5 | cake 1991 6 | candy 1990 7 | cookie 1990 8 | doughnut 1990 9 | ice cream 1990 10 | pastry 1990 11 | tart 1990 12 | pudding 1990 13 | jello 1990 14 | apple 1990 15 | pear 1990 16 | banana 1990 17 | fruitcake 1990 18 | orange 1990 19 | petit four 1990 20 | pop tart 1990 21 | tiramisu 1990 22 | tres leches 1990 23 | calisson 1990 24 | chocolate 1990 25 | 1990 26 | liquorice 1990 27 | nougat 1990 28 | coconut 1990 29 | marzipan 1990 30 | taffy 1990 31 | lemon 1990 32 | macaron 1990 33 | gingerbread 1990 34 | peanut butter 1990 35 | eclair 1990 36 | french toast 1990 37 | profiterole 1990 38 | caramel 1991 39 | milkshake 1991 40 | -------------------------------------------------------------------------------- /apiserver/apiserver/search/base.py: -------------------------------------------------------------------------------- 1 | TOP_K_SIZE = 50 2 | 3 | 4 | class ClientError(ValueError): 5 | """Error in query sent by client. 6 | """ 7 | 8 | 9 | def get_column_identifiers(es, column_names, dataset_id=None, data_profile=None): 10 | column_indices = [-1 for _ in column_names] 11 | if not data_profile: 12 | columns = es.get('datasets', dataset_id, _source='columns.name') 13 | columns = columns['_source']['columns'] 14 | else: 15 | columns = data_profile['columns'] 16 | for i in range(len(columns)): 17 | for j in range(len(column_names)): 18 | if columns[i]['name'] == column_names[j]: 19 | column_indices[j] = i 20 | return column_indices 21 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/stata.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | from datamart_materialize.utils import SimpleConverter 4 | 5 | 6 | def stata_to_csv(source_filename, dest_fileobj): 7 | for i, chunk in enumerate( 8 | pandas.read_stata(source_filename, iterator=True, chunksize=1) 9 | ): 10 | chunk.to_csv( 11 | dest_fileobj, 12 | header=(i == 0), 13 | float_format='%g', 14 | date_format='%Y-%m-%dT%H:%M:%S', 15 | index=False, 16 | line_terminator='\r\n', 17 | ) 18 | 19 | 20 | class StataConverter(SimpleConverter): 21 | """Adapter converting a Stata file to CSV. 
22 | """ 23 | transform = staticmethod(stata_to_csv) 24 | -------------------------------------------------------------------------------- /scripts/delete_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import lazo_index_service 4 | import logging 5 | import os 6 | import sys 7 | 8 | from datamart_core.common import PrefixedElasticsearch, \ 9 | delete_dataset_from_index 10 | 11 | 12 | SIZE = 10000 13 | 14 | 15 | def delete(datasets): 16 | es = PrefixedElasticsearch() 17 | lazo_client = lazo_index_service.LazoIndexClient( 18 | host=os.environ['LAZO_SERVER_HOST'], 19 | port=int(os.environ['LAZO_SERVER_PORT']) 20 | ) 21 | for dataset in datasets: 22 | delete_dataset_from_index(es, dataset, lazo_client) 23 | 24 | 25 | if __name__ == '__main__': 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | delete(sys.argv[1:]) 29 | -------------------------------------------------------------------------------- /docker/nominatim.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mediagis/nominatim:3.3 2 | 3 | # Increase memory limit 4 | RUN find /etc -name php.ini -exec sed -i '/^memory_limit *= *[0-9]/ c memory_limit = 1024M' {} ';' 5 | RUN find /app -name \*.php\* -exec sed -i "s/ini_set('memory_limit', *'[0-9]\+M');/ini_set('memory_limit', '1024M');/" {} ';' 6 | 7 | # Enable batch mode 8 | RUN bash -c "cd /app/src/build && echo $'--- settings/settings.php\\n\ 9 | +++ settings/settings.php\\n\ 10 | @@ -106 +106 @@\\n\ 11 | -@define(\\'CONST_Search_BatchMode\\', false);\\n\ 12 | +@define(\\'CONST_Search_BatchMode\\', true);\\n\ 13 | ' | patch -p0" 14 | 15 | # Print apache log as well as postgres log 16 | RUN sed -i '/tail -f/ a tail -f /var/log/apache2/error.log &' start.sh 17 | -------------------------------------------------------------------------------- /scripts/report-uploads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eu 4 | 5 | # Query elasticsearch 6 | RECORD="$(curl -s -H content-type:application/json -d '{"query":{"bool":{"should":[{"term":{"materialize.identifier":"datamart.url"}},{"term":{"materialize.identifier":"datamart.upload"}}]}}, "_source":["date", "name"]}' http://localhost:8020/_search?size=1000 \ 7 | | jq -r '.hits.hits | sort_by(._source.date)[] | ._source.date + ": " + ._id + " (" + ._source.name + ")"' \ 8 | | tail -n 1)" 9 | LASTRECORD="$(cat $HOME/report-uploads.last)" 10 | if [ "$RECORD" != "$LASTRECORD" ]; then 11 | echo "Check https://coordinator.auctus.vida-nyu.org/" \ 12 | | mail -s "New uploaded datasets" root 13 | echo "$RECORD" >$HOME/report-uploads.last 14 | fi 15 | -------------------------------------------------------------------------------- /coordinator/coordinator/templates/errors.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% set active_page = "index" %} 3 | 4 | {% block contents %} 5 |

<h1>Datasets with error {{ error_type }}</h1> 6 | <ul> 7 | {% if datasets %} 8 | {% for dataset in datasets %} 9 | <li> 10 | {{ dataset.metadata.name }} ({{ dataset.id }}) 11 | <div> 12 | {{ dataset.error_details.exception }} 13 | <pre>{{ dataset.error_details.traceback }}</pre> 14 | </div> 15 | </li> 16 | {% endfor %} 17 | {% else %} 18 | <li>No dataset with that error</li> 19 | {% endif %} 20 | </ul>
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /tests/data/annotated.csv: -------------------------------------------------------------------------------- 1 | id,lt_coord,lg_coord,height,stmo 2 | place00,40.734746,-74.000077,85.772569,10 3 | place01,40.728026,-73.998869,58.730197,10 4 | place02,40.728278,-74.005837,51.929949,11 5 | place03,40.726640,-73.993186,12.730146,9 6 | place04,40.732466,-74.004689,44.452236,5 7 | place05,40.722948,-74.001501,42.904820,12 8 | place06,40.735108,-73.996996,48.345170,1 9 | place07,40.727577,-74.002853,37.459986,2 10 | place08,40.730824,-74.002225,49.123637,4 11 | place09,40.729115,-74.001726,40.455639,6 12 | place10,40.734259,-73.996833,23.722705,6 13 | place11,40.723674,-73.991001,67.692448,7 14 | place12,40.728896,-73.998542,67.626361,8 15 | place13,40.728711,-74.002426,84.191461,12 16 | place14,40.733272,-73.996875,51.000673,12 17 | place15,40.726559,-74.000678,41.906452,11 18 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:14-buster AS build 2 | 3 | RUN mkdir /src 4 | RUN chown -R node /src 5 | USER node 6 | WORKDIR /src 7 | 8 | COPY frontend/package.json frontend/package-lock.json /src/ 9 | RUN lock_hash="$(shasum -a 256 package-lock.json)" && \ 10 | npm install && \ 11 | echo "$lock_hash" | shasum -c 12 | COPY frontend /src/ 13 | RUN npm run build 14 | 15 | 16 | FROM nginx:1.21 17 | 18 | COPY --from=build --chown=0:0 /src/build /var/www/html 19 | COPY frontend/nginx.conf /etc/nginx/conf.d/default.conf 20 | 21 | # nginx default CMD is ["nginx", "-g", "daemon off;"] 22 | CMD ["sh", "-c", "sed -i 's|]\\+>||' /var/www/html/index.html && exec nginx -g \"daemon off; worker_shutdown_timeout 2s;\""] 23 | -------------------------------------------------------------------------------- /frontend/src/components/SearchBar/SearchBar.css: -------------------------------------------------------------------------------- 1 | .SearchBar { 2 | max-width: 1000px; 3 | margin: 0 auto; 4 | } 5 | 6 | .SearchBar-input { 7 | height: 46px; 8 | border-right-width: 0; 9 | } 10 | 11 | .SearchBar-icon { 12 | width: 46px; 13 | margin: 0 auto; 14 | background-color: transparent; 15 | border-left-width: 0; 16 | } 17 | 18 | .SearchBar-icon-active { 19 | background-color: #63508b; 20 | } 21 | 22 | .SearchBar-icon-active .feather { 23 | color: #ffffff!important; 24 | } 25 | 26 | .SearchBar-icon .feather { 27 | margin: 0 auto; 28 | color: #707070; 29 | width: 24px; 30 | height: 24px; 31 | } 32 | 33 | .SearchBar input:focus { 34 | box-shadow: none; 35 | border-color: #ced4da!important; 36 | } 37 | 38 | .SearchBar .input-group-append { 39 | cursor: pointer; 40 | } 41 | -------------------------------------------------------------------------------- /tests/ci.env: -------------------------------------------------------------------------------- 1 | AUCTUS_DEBUG=yes 2 | ELASTICSEARCH_HOSTS=elasticsearch:9200 3 | ELASTICSEARCH_PREFIX=auctus_ 4 | LAZO_SERVER_HOST=lazo 5 | LAZO_SERVER_PORT=50051 6 | AMQP_HOST=rabbitmq 7 | AMQP_PORT=5672 8 | AMQP_USER=auctus 9 | AMQP_PASSWORD=auctus 10 | ADMIN_PASSWORD=auctus 11 | S3_KEY=devkey 12 | S3_SECRET=devpassword 13 | S3_URL=http://minio:9000 14 | S3_CLIENT_URL=http://minio:9000 15 | S3_BUCKET_PREFIX=dev- 16 | AUCTUS_REQUEST_WHITELIST=test-discoverer 17 | AUCTUS_REQUEST_BLACKLIST= 18 | FRONTEND_URL=http://frontend 19 | 
API_URL=http://apilb:8002/api/v1 20 | MAX_CACHE_BYTES=100000000000 21 | NOMINATIM_URL= 22 | NOAA_TOKEN= 23 | CUSTOM_FIELDS={"specialId": {"label": "Special ID", "type": "integer"}, "dept": {"label": "Department", "type": "keyword", "required": true}} 24 | SENTRY_DSN= 25 | SENTRY_ENVIRONMENT= 26 | -------------------------------------------------------------------------------- /scripts/list_big_datasets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script lists datasets with a big size. 4 | """ 5 | 6 | from datamart_core.common import PrefixedElasticsearch 7 | 8 | 9 | SIZE = 10000 10 | 11 | 12 | def search(): 13 | es = PrefixedElasticsearch() 14 | hits = es.scan( 15 | index='datasets', 16 | query={ 17 | 'query': { 18 | 'range': { 19 | "size": { 20 | "gt": 10000000000, # 10 GB 21 | }, 22 | }, 23 | }, 24 | }, 25 | _source='size', 26 | size=SIZE, 27 | ) 28 | for h in hits: 29 | print("%s %.1f GB" % (h['_id'], h['_source']['size'] / 1000000000.0)) 30 | 31 | 32 | if __name__ == '__main__': 33 | search() 34 | -------------------------------------------------------------------------------- /frontend/src/components/Logo/Logo.css: -------------------------------------------------------------------------------- 1 | .logo-vertical { 2 | padding-top: 30px; 3 | padding-bottom: 30px; 4 | } 5 | 6 | .logo-vertical img { 7 | width: 190px; 8 | margin: 0 auto; 9 | } 10 | 11 | .logo-vertical span { 12 | font-size: 60px; 13 | line-height: 1; 14 | margin-top: -10px; 15 | } 16 | 17 | .logo-horizontal img { 18 | width: 56px; 19 | margin: 0 auto; 20 | } 21 | 22 | .logo-horizontal span { 23 | font-size: 25px; 24 | line-height: 1; 25 | margin-top: -10px; 26 | } 27 | 28 | .logo-centered-horizontal { 29 | text-align: center; 30 | padding-top: 30px; 31 | padding-bottom: 30px; 32 | } 33 | 34 | /* don't decorate the logo if inside a link */ 35 | a .logo-horizontal, a .logo-vertical, a:hover .logo-horizontal, a:hover .logo-vertical { 36 | text-decoration: none; 37 | color: #212529; 38 | } 39 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/tsv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from datamart_materialize.utils import SimpleConverter 4 | 5 | 6 | def tsv_to_csv(source_filename, dest_fileobj, separator='\t'): 7 | with open(source_filename, 'r') as src_fp: 8 | src = csv.reader(src_fp, delimiter=separator) 9 | dst = csv.writer(dest_fileobj) 10 | for line in src: 11 | dst.writerow(line) 12 | 13 | 14 | class TsvConverter(SimpleConverter): 15 | """Adapter converting a TSV or other separated file to CSV. 
16 | """ 17 | def __init__(self, writer, separator='\t'): 18 | self.separator = separator 19 | super(TsvConverter, self).__init__(writer) 20 | 21 | def transform(self, source_filename, dest_fileobj): 22 | tsv_to_csv(source_filename, dest_fileobj, separator=self.separator) 23 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/parquet.py: -------------------------------------------------------------------------------- 1 | import fastparquet 2 | 3 | from datamart_materialize.utils import SimpleConverter 4 | 5 | 6 | def parquet_to_csv(source_filename, dest_fileobj): 7 | src = fastparquet.ParquetFile(source_filename) 8 | for i, chunk in enumerate(src.iter_row_groups()): 9 | chunk.to_csv( 10 | dest_fileobj, 11 | header=(i == 0), 12 | float_format='%g', 13 | date_format='%Y-%m-%dT%H:%M:%S', 14 | index=False, 15 | line_terminator='\r\n', 16 | ) 17 | 18 | 19 | class ParquetConverter(SimpleConverter): 20 | """Adapter converting a Parquet file to CSV. 21 | """ 22 | def transform(self, source_filename, dest_fileobj): 23 | parquet_to_csv( 24 | source_filename, 25 | dest_fileobj, 26 | ) 27 | -------------------------------------------------------------------------------- /docker/haproxy.conf: -------------------------------------------------------------------------------- 1 | global 2 | master-worker no-exit-on-failure 3 | 4 | defaults 5 | mode http 6 | balance roundrobin 7 | option httplog 8 | timeout connect 5000 9 | timeout client 5000 10 | timeout server 5000 11 | timeout http-request 900s 12 | timeout server 900s 13 | 14 | frontend stats 15 | bind :8004 16 | http-request use-service prometheus-exporter if { path /metrics } 17 | stats enable 18 | stats uri / 19 | 20 | resolvers systemdns 21 | parse-resolv-conf 22 | hold timeout 120s 23 | hold refused 120s 24 | 25 | frontend api-in 26 | bind *:8002 27 | default_backend api-servers 28 | 29 | backend api-servers 30 | option httpchk GET /health 31 | server-template apiserver 20 apiserver:8002 maxconn 1 check inter 1000 rise 2 fall 1 resolvers systemdns resolve-opts prevent-dup-ip 32 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Auctus's documentation! 2 | ================================== 3 | 4 | Auctus is a dataset search engine and data augmentation platform developed at New York University. It can be used to index the content of datasets from a variety of sources, which can later be queried to find data that can be joined or appended to a user's data. 5 | 6 | The system can be found at this address: https://auctus.vida-nyu.org/ 7 | 8 | You can find the source code on GitLab: https://gitlab.com/ViDA-NYU/auctus/auctus 9 | 10 | ..
toctree:: 11 | :maxdepth: 2 12 | :caption: Contents: 13 | 14 | webui 15 | Using the REST API 16 | schemas 17 | python/index 18 | internals 19 | 20 | Indices and tables 21 | ================== 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | * :ref:`search` 26 | -------------------------------------------------------------------------------- /frontend/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["./node_modules/gts", "plugin:react/recommended"], 3 | "env": { 4 | "browser": true, 5 | "node": false, 6 | "jest": true 7 | }, 8 | "rules": { 9 | "node/no-unsupported-features/node-builtins": ["off"], 10 | "node/no-extraneous-import": ["off"], 11 | "node/no-unpublished-import": ["off"], 12 | "react/prop-types": ["off"], 13 | "prefer-const": [ 14 | "error", 15 | { 16 | "destructuring": "all", 17 | "ignoreReadBeforeAssign": false 18 | } 19 | ], 20 | "@typescript-eslint/ban-types": [ 21 | "error", 22 | { 23 | "extendDefaults": true, 24 | "types": { 25 | "{}": false 26 | } 27 | } 28 | ] 29 | }, 30 | "settings": { 31 | "react": { 32 | "version": "detect" 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /docs/python/datamart-rest.rst: -------------------------------------------------------------------------------- 1 | API client 2 | ========== 3 | 4 | A client library for `the REST API <../rest>`__ is available for convenience. It supports searching, downloading, and augmenting datasets. 5 | 6 | It can perform some operations both on the client-side (for speed, the server has limited capacity; also saves time by not uploading the data) and on the server-side in "proxy mode" (working around the need to install and configure some dependencies on the client, and taking advantage of cached results on the server). 7 | 8 | Installing datamart-rest 9 | ------------------------ 10 | 11 | You can get it directly from the Python Package Index using PIP:: 12 | 13 | pip install datamart-rest 14 | 15 | API 16 | --- 17 | 18 | The REST client is currently maintained as part of the D3M project, with `documentation available here `__. 
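For example, a search/download round trip with the client could look like this (a minimal sketch: the ``RESTDatamart`` name and the cursor/result methods below follow the D3M Datamart API definition and are assumptions to be checked against that documentation, not code from this repository)::

    import datamart_rest

    # URL of the Auctus API (example value)
    client = datamart_rest.RESTDatamart('https://auctus.vida-nyu.org/api/v1')

    # Search for datasets matching some keywords; results come back in pages
    cursor = client.search(query={'keywords': ['taxi', 'demand']})
    results = cursor.get_next_page()

    # Materialize the first result as a D3M dataset
    results[0].download(supplied_data=None)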
19 | -------------------------------------------------------------------------------- /tests/__main__.py: -------------------------------------------------------------------------------- 1 | import locale 2 | import os 3 | import sys 4 | import unittest 5 | 6 | 7 | top_level = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 8 | start_dir = os.path.join(top_level, 'tests') 9 | if top_level not in sys.path: 10 | sys.path.insert(0, top_level) 11 | 12 | 13 | sys.path.append(start_dir) 14 | 15 | 16 | class Program(unittest.TestProgram): 17 | def createTests(self): 18 | if self.testNames is None: 19 | self.test = self.testLoader.discover( 20 | start_dir=start_dir, 21 | top_level_dir=top_level, 22 | pattern='test_*.py') 23 | else: 24 | self.test = self.testLoader.loadTestsFromNames(self.testNames) 25 | 26 | 27 | if __name__ == '__main__': 28 | # Locale 29 | locale.setlocale(locale.LC_ALL, '') 30 | 31 | prog = Program(argv=['tests'] + sys.argv[1:]) 32 | -------------------------------------------------------------------------------- /frontend/src/components/Badges/Badges.css: -------------------------------------------------------------------------------- 1 | .badge-group > * { 2 | display: inline-flex; 3 | margin-right: 0.25rem; 4 | } 5 | 6 | .badge-group > *:last-child { 7 | margin-right: 0px; 8 | } 9 | 10 | .badge-column, .badge-numerical, .badge-textual { 11 | background-color: #f0f0f0; 12 | font-size: .7rem; 13 | } 14 | 15 | .badge-numerical svg { 16 | color: #1ab082; 17 | } 18 | 19 | .badge-textual svg { 20 | color: #4d96b2; 21 | } 22 | 23 | .badge-categorical { 24 | color: #fff; 25 | background-color: #4d96b2; 26 | } 27 | 28 | .badge-number { 29 | color: #fff; 30 | background-color: #1ab082; 31 | } 32 | 33 | .badge-corner-button{ 34 | width: 12px; 35 | height: 12px; 36 | margin-top: -18px; 37 | margin-left: 1px; 38 | padding: 0px; 39 | margin-right: -6px; 40 | } 41 | 42 | .badge-corner-button svg{ 43 | color:#fdfdfe; 44 | fill: #757575; 45 | stroke-width: 2; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /scripts/list_sources.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script gives a summary of the dataset sources. 
4 | """ 5 | 6 | from datamart_core.common import PrefixedElasticsearch 7 | 8 | 9 | SIZE = 10000 10 | 11 | 12 | def count(): 13 | es = PrefixedElasticsearch() 14 | sources = {} 15 | hits = es.scan( 16 | index='datasets', 17 | query={ 18 | 'query': { 19 | 'match_all': {}, 20 | }, 21 | }, 22 | _source='source', 23 | size=SIZE, 24 | ) 25 | for h in hits: 26 | source = h['_source']['source'] 27 | 28 | try: 29 | sources[source] += 1 30 | except KeyError: 31 | sources[source] = 1 32 | 33 | for identifier, count in sorted(sources.items(), key=lambda p: -p[1]): 34 | print('{: 6d} {}'.format(count, identifier)) 35 | 36 | 37 | if __name__ == '__main__': 38 | count() 39 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/.env 2 | volumes 3 | docs 4 | frontend/node_modules 5 | frontend/build 6 | lib_geo/data 7 | 8 | # Git 9 | **/.git 10 | **/.gitignore 11 | 12 | # Python 13 | **/__pycache__ 14 | **/*.pyc 15 | **/.ipynb_checkpoints 16 | 17 | # Packages 18 | **/*.egg 19 | **/*.egg-info 20 | **/dist 21 | **/build 22 | **/eggs 23 | **/parts 24 | **/bin 25 | **/var 26 | **/sdist 27 | **/develop-eggs 28 | **/.installed.cfg 29 | **/lib 30 | **/lib64 31 | 32 | # Installer logs 33 | **/pip-log.txt 34 | 35 | # Unit test / coverage reports 36 | **/.coverage 37 | **/.tox 38 | **/nosetests.xml 39 | 40 | # Eclipse PyDev 41 | **/.project 42 | **/.pydevproject 43 | 44 | # PyCharm 45 | **/.idea 46 | 47 | # ViM 48 | **/.*.swp 49 | 50 | # Emacs 51 | **/#*# 52 | 53 | # OS files 54 | **/.DS_Store 55 | **/desktop.ini 56 | 57 | # Archives 58 | **/*.tar 59 | **/*.tar.gz 60 | **/*.tar.bz2 61 | **/*.zip 62 | **/*.whl 63 | 64 | # Vagrant 65 | **/.vagrant 66 | -------------------------------------------------------------------------------- /frontend/src/App.test.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import * as ReactDOM from 'react-dom'; 3 | import * as api from './api/rest'; 4 | import {render} from '@testing-library/react'; 5 | import {App} from './App'; 6 | import 'jest-canvas-mock'; 7 | 8 | beforeEach(() => { 9 | jest.spyOn(api, 'status').mockImplementation(() => 10 | Promise.resolve({ 11 | recent_discoveries: [], 12 | sources_counts: { 13 | remi: 23, 14 | fernando: 37, 15 | }, 16 | }) 17 | ); 18 | }); 19 | 20 | afterEach(() => jest.restoreAllMocks()); 21 | 22 | test('renders main app', () => { 23 | const {getByText} = render(); 24 | const linkElement = getByText(/Auctus/i); 25 | expect(linkElement).toBeInTheDocument(); 26 | }); 27 | 28 | test('renders without crashing', () => { 29 | const div = document.createElement('div'); 30 | ReactDOM.render(, div); 31 | ReactDOM.unmountComponentAtNode(div); 32 | }); 33 | -------------------------------------------------------------------------------- /coordinator/coordinator/templates/login.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% set active_page = "login" %} 3 | 4 | {% block contents %} 5 |
<form method="post"> 6 | {{ xsrf_form_html() }} 7 | {% if error %} 8 | <div class="alert alert-danger" role="alert"> 9 | {{ error }} 10 | </div> 11 | {% endif %} 12 | 13 | <div class="form-group"> 14 | <input type="password" class="form-control" name="password" placeholder="Password" /> 15 | </div> 16 | 17 | <div class="form-group"> 18 | <input type="submit" class="btn btn-primary" value="Log in" /> 19 | </div> 20 | 21 | </form> 22 | 23 | 24 |
25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /frontend/src/components/Upload/Upload.css: -------------------------------------------------------------------------------- 1 | .dropdown { 2 | position: relative; 3 | display: inline-block; 4 | } 5 | 6 | .dropdown-content { 7 | display: none; 8 | position: absolute; 9 | background-color: #f1f1f1; 10 | min-width: 110px; 11 | box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); 12 | z-index: 4; 13 | max-height: 16vh; 14 | overflow-y: auto; 15 | margin-left: -4px; 16 | } 17 | 18 | .dropdown-content div { 19 | color: black; 20 | padding: 4px 4px; 21 | text-decoration: none; 22 | display: block; 23 | } 24 | 25 | .dropdown:hover .dropdown-content { 26 | display: inline-block; 27 | } 28 | 29 | .badge-button{ 30 | width: 12px; 31 | height: 12px; 32 | margin-top: -18px; 33 | margin-left: 1px; 34 | padding: 0px; 35 | } 36 | 37 | .dropdown-content div:hover {background-color: #ddd;} 38 | 39 | .dropdown:hover .dropdown-content {display: block;} 40 | 41 | .dropdown:hover .dropbtn {background-color: #3e8e41;} 42 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /frontend/src/components/FilterContainer/FilterContainer.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import * as Icon from 'react-feather'; 3 | 4 | class FilterContainer extends React.PureComponent<{ 5 | title: string; 6 | onClose: () => void; 7 | }> { 8 | render() { 9 | return ( 10 |
<div className="card d-inline-block"> 11 | <div className="card-header"> 12 | <div className="d-inline">{this.props.title}</div> 13 | <span 14 | onClick={() => this.props.onClose()} 15 | className="d-inline text-muted ml-1" 16 | style={{cursor: 'pointer'}} 17 | title="Remove this filter" 18 | > 19 | <Icon.XCircle 20 | size={13} 21 | className="feather" 22 | /> 23 | </span> 24 | </div> 25 | <div className="card-body">{this.props.children}</div> 26 | </div>
27 | ); 28 | } 29 | } 30 | 31 | export {FilterContainer}; 32 | -------------------------------------------------------------------------------- /lib_materialize/README.rst: -------------------------------------------------------------------------------- 1 | Datamart materialization library 2 | ================================ 3 | 4 | This library can materialize datasets from Auctus, NYU's dataset search engine. You can use it to materialize search results directly on your side without relying on the server. It is also used internally by the service to materialize datasets (the ``/download`` endpoint downloads the dataset using this library then sends it to you). 5 | 6 | See also: 7 | 8 | * `The datamart-rest library for search/augmentation `__ 9 | * `The datamart-profiler library, used to profile datasets for search `__ 10 | * `The datamart-augmentation library, used to performs data augmentation with a dataset from Auctus `__ 11 | * `Auctus, NYU's dataset search engine `__ 12 | * `Our project on GitLab `__ 13 | -------------------------------------------------------------------------------- /lib_augmentation/README.rst: -------------------------------------------------------------------------------- 1 | Datamart augmentation library 2 | ============================= 3 | 4 | This library performs data augmentation between datasets from Auctus, NYU's dataset search engine. You can use it to augment a dataset with a search result directly on your side without relying on the server. It is also used internally by the service to perform augmentations (the ``/augment`` endpoint downloads the dataset using this library, performs augmentation, then sends the result to you). 5 | 6 | See also: 7 | 8 | * `The datamart-rest library for search/augmentation `__ 9 | * `The datamart-profiler library, used to profile datasets for search `__ 10 | * `The datamart-materialize library, used to materialize dataset from search results `__ 11 | * `Auctus, NYU's dataset search engine `__ 12 | * `Our project on GitLab `__ 13 | -------------------------------------------------------------------------------- /frontend/src/index.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Icon sizes 3 | */ 4 | 5 | .feather { 6 | width: 15px; 7 | height: 17px; 8 | vertical-align: text-bottom; 9 | } 10 | 11 | .feather-xs { 12 | width: 13px; 13 | height: 13px; 14 | vertical-align: text-bottom; 15 | } 16 | 17 | .feather-xs-w { 18 | width: 17px; 19 | height: 13px; 20 | vertical-align: text-bottom; 21 | } 22 | 23 | .feather-lg { 24 | width: 20px; 25 | height: 20px; 26 | vertical-align: text-bottom; 27 | } 28 | 29 | .btn .feather { 30 | width: 18px; 31 | height: 20px; 32 | } 33 | 34 | .btn-sm .feather { 35 | width: 16px; 36 | height: 21px; 37 | } 38 | 39 | /* to be applied to buttons that should look like a simple link */ 40 | .btn-link { 41 | border: none; 42 | background: none; 43 | text-decoration: underline; 44 | cursor: pointer; 45 | padding: 0px; 46 | } 47 | 48 | html, body, body #root, .container-vh-full { 49 | overflow: hidden; 50 | height: 100%; 51 | } 52 | 53 | .container-vh-scroll { 54 | overflow: auto; 55 | height: 100%; 56 | } 57 | -------------------------------------------------------------------------------- /lib_profiler/README.rst: -------------------------------------------------------------------------------- 1 | Datamart profiling library 2 | ========================== 3 | 4 | This library can profile datasets for use with Auctus, NYU's dataset search engine. 
You can use it to profile datasets on your side and send that to the server for search, instead of uploading the whole dataset. It is also used internally by the service to process search-by-example queries (when sending a file to the ``/search`` endpoint) and to add datasets to the index (to be queried against later). 5 | 6 | See also: 7 | 8 | * `The datamart-rest library for search/augmentation `__ 9 | * `The datamart-materialize library, used to materialize dataset from search results `__ 10 | * `The datamart-augmentation library, used to performs data augmentation with a dataset from Auctus `__ 11 | * `Auctus, NYU's dataset search engine `__ 12 | * `Our project on GitLab `__ 13 | -------------------------------------------------------------------------------- /env.default: -------------------------------------------------------------------------------- 1 | # Fill that in and rename to .env 2 | 3 | AUCTUS_DEBUG=no 4 | ELASTICSEARCH_HOSTS=127.0.0.1:8020 5 | ELASTICSEARCH_PREFIX=auctus_ 6 | LAZO_SERVER_HOST=127.0.0.1 7 | LAZO_SERVER_PORT=8030 8 | AMQP_HOST=127.0.0.1 9 | AMQP_PORT=8011 10 | AMQP_USER=auctus 11 | AMQP_PASSWORD=auctus 12 | ADMIN_PASSWORD=auctus 13 | S3_KEY=devkey 14 | S3_SECRET=devpassword 15 | S3_URL=http://minio:9000 16 | S3_CLIENT_URL=http://127.0.0.1:8050 17 | S3_BUCKET_PREFIX=dev- 18 | GCS_PROJECT= 19 | GCS_CREDS= 20 | GCS_BUCKET_PREFIX=dev- 21 | AUCTUS_REQUEST_WHITELIST=test-discoverer 22 | AUCTUS_REQUEST_BLACKLIST= 23 | FRONTEND_URL=http://127.0.0.1:8001 24 | API_URL=http://127.0.0.1:8002/api/v1 25 | MAX_CACHE_BYTES=100000000000 26 | # Set to an empty string to disable address resolution 27 | NOMINATIM_URL=http://nominatim 28 | NOAA_TOKEN= 29 | ISI_DATAMART_URL=https://datamart:datamart-api-789@dsbox02.isi.edu/datamart-api 30 | CUSTOM_FIELDS={"specialId": {"label": "Special ID", "type": "integer"}, "dept": {"label": "Department", "type": "keyword", "required": true}} 31 | SENTRY_DSN= 32 | SENTRY_ENVIRONMENT=test 33 | -------------------------------------------------------------------------------- /frontend/src/components/SearchResults/SimpleBar.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import {SearchFacet} from '../../api/types'; 3 | 4 | interface SimpleBarProps { 5 | facetBuckets: SearchFacet; 6 | keyname: string; 7 | totalResults: number; 8 | } 9 | class SimpleBar extends React.PureComponent { 10 | render() { 11 | const {facetBuckets, keyname, totalResults} = this.props; 12 | const rectangleWidth = 200; 13 | return ( 14 | 15 | 29 | 30 | {facetBuckets.buckets[keyname]} 31 | 32 | 33 | ); 34 | } 35 | } 36 | 37 | export {SimpleBar}; 38 | -------------------------------------------------------------------------------- /tests/data/temporal.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | import random 4 | 5 | 6 | def main(): 7 | data_dir = os.path.dirname(__file__) 8 | 9 | with open(os.path.join(data_dir, 'daily.csv'), 'w') as f_daily: 10 | print('aug_date,rain', file=f_daily) 11 | date = datetime(2019, 4, 23) 12 | rand = random.Random(1) 13 | for _ in range(30): 14 | time = date.date().strftime('%Y%m%d') 15 | boolean = ['no', 'yes'][rand.randint(0, 1)] 16 | print('%s,%s' % (time, boolean), file=f_daily) 17 | date += timedelta(days=1) 18 | 19 | with open(os.path.join(data_dir, 'hourly.csv'), 'w') as f_hourly: 20 | print('aug_date,rain', file=f_hourly) 21 | date = datetime(2019, 6, 12) 22 | 
rand = random.Random(2) 23 | for _ in range(52): 24 | time = date.isoformat() 25 | boolean = ['no', 'yes'][rand.randint(0, 1)] 26 | print('%s,%s' % (time, boolean), file=f_hourly) 27 | date += timedelta(hours=1) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /lib_fslock/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Remi Rampin 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /frontend/src/components/ui/Button/Button.tsx: -------------------------------------------------------------------------------- 1 | import {Tooltip} from '@material-ui/core'; 2 | import React from 'react'; 3 | import {Spinner} from '../../visus/Loading/Spinner'; 4 | import './Button.css'; 5 | 6 | const SubmitButton = (props: {label: string; loading: boolean}) => ( 7 | 15 | ); 16 | 17 | function ButtonGroup(props: React.PropsWithChildren<{}>) { 18 | return
<div className="btn-group" role="group">{props.children}</div>
; 19 | } 20 | 21 | function LinkButton( 22 | props: React.PropsWithChildren<{href: string; message?: string}> 23 | ) { 24 | return ( 25 | 30 | 31 | {props.children} 32 | 33 | 34 | ); 35 | } 36 | 37 | export {SubmitButton, ButtonGroup, LinkButton}; 38 | -------------------------------------------------------------------------------- /frontend/src/components/GeoSpatialCoverageMap/GeoSpatialCoverageMap.css: -------------------------------------------------------------------------------- 1 | .map { 2 | height: 400px; 3 | width: 100%; 4 | } 5 | 6 | .ol-popup { 7 | position: absolute; 8 | background-color: white; 9 | -webkit-filter: drop-shadow(0 1px 4px rgba(0,0,0,0.2)); 10 | filter: drop-shadow(0 1px 4px rgba(0,0,0,0.2)); 11 | padding: 15px; 12 | border-radius: 10px; 13 | border: 1px solid #cccccc; 14 | bottom: 12px; 15 | left: -50px; 16 | min-width: max-content; 17 | width: max-content; 18 | } 19 | 20 | .ol-popup:after, .ol-popup:before { 21 | top: 100%; 22 | border: solid transparent; 23 | content: " "; 24 | height: 0; 25 | width: 0; 26 | position: absolute; 27 | pointer-events: none; 28 | } 29 | 30 | .ol-popup:after { 31 | border-top-color: white; 32 | border-width: 10px; 33 | left: 48px; 34 | margin-left: -10px; 35 | } 36 | 37 | .ol-popup:before { 38 | border-top-color: #cccccc; 39 | border-width: 11px; 40 | left: 48px; 41 | margin-left: -11px; 42 | } 43 | 44 | .legend{ 45 | position:relative; 46 | left: 10px; 47 | top:-84px; 48 | z-index:10000; 49 | height: 54px; 50 | background-color:#fdfcfb; 51 | border-radius: 6px; 52 | } 53 | -------------------------------------------------------------------------------- /lib_core/datamart_core/prom.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | from prometheus_async.aio import time as prom_async_time 3 | 4 | 5 | class PromMeasureRequest(object): 6 | def __init__(self, count, time): 7 | self.count = count 8 | self.time = time 9 | 10 | def _wrap(self, *labels, timer): 11 | if labels: 12 | counter = self.count.labels(*labels) 13 | else: 14 | counter = self.count 15 | if labels: 16 | timer = timer(self.time.labels(*labels)) 17 | else: 18 | timer = timer(self.time) 19 | 20 | # Initialize count 21 | counter.inc(0) 22 | 23 | def decorator(func): 24 | @contextlib.wraps(func) 25 | def wrapper(*args, **kwargs): 26 | # Count requests 27 | counter.inc() 28 | return func(*args, **kwargs) 29 | 30 | return timer(wrapper) 31 | 32 | return decorator 33 | 34 | def sync(self, *labels): 35 | return self._wrap(*labels, timer=lambda metric: metric.time()) 36 | 37 | def async_(self, *labels): 38 | return self._wrap(*labels, timer=lambda metric: prom_async_time(metric)) 39 | -------------------------------------------------------------------------------- /lib_profiler/datamart_profiler/warning_tools.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import warnings 3 | 4 | 5 | @contextlib.contextmanager 6 | def ignore_warnings(*categories): 7 | """Context manager to ignore specific warning categories. 
8 | """ 9 | orig_showarning = warnings.showwarning 10 | 11 | def record(message, category, filename, lineno, file=None, line=None): 12 | if not any(issubclass(category, c) for c in categories): 13 | orig_showarning(message, category, filename, lineno, file, line) 14 | 15 | try: 16 | warnings.showwarning = record 17 | yield 18 | finally: 19 | warnings.showwarning = orig_showarning 20 | 21 | 22 | @contextlib.contextmanager 23 | def raise_warnings(*categories): 24 | orig_showarning = warnings.showwarning 25 | 26 | def record(message, category, filename, lineno, file=None, line=None): 27 | if any(issubclass(category, c) for c in categories): 28 | raise category(message) 29 | orig_showarning(message, category, filename, lineno, file, line) 30 | 31 | try: 32 | warnings.showwarning = record 33 | yield 34 | finally: 35 | warnings.showwarning = orig_showarning 36 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/common.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from datamart_materialize.utils import SimpleConverter 4 | 5 | 6 | class UnsupportedConversion(ValueError): 7 | """This conversion cannot work.""" 8 | 9 | 10 | def skip_rows(source_filename, dest_fileobj, nb_rows): 11 | with open(source_filename, 'r') as src_fp: 12 | src = iter(csv.reader(src_fp)) 13 | dst = csv.writer(dest_fileobj) 14 | 15 | # Skip rows 16 | for i in range(nb_rows): 17 | try: 18 | next(src) 19 | except StopIteration: 20 | raise ValueError( 21 | "Can't skip %d rows, table only has %d" % (nb_rows, i), 22 | ) 23 | 24 | # Copy rest 25 | for row in src: 26 | dst.writerow(row) 27 | 28 | 29 | class SkipRowsConverter(SimpleConverter): 30 | """Adapter skipping a given number of rows from a CSV file. 31 | """ 32 | def __init__(self, writer, *, nb_rows): 33 | super(SkipRowsConverter, self).__init__(writer) 34 | self.nb_rows = nb_rows 35 | 36 | def transform(self, source_filename, dest_fileobj): 37 | skip_rows(source_filename, dest_fileobj, self.nb_rows) 38 | -------------------------------------------------------------------------------- /docs/python/datamart-profiler.rst: -------------------------------------------------------------------------------- 1 | Profiling library 2 | ================= 3 | 4 | This library can be used to profile datasets standalone. You can use it to profile datasets on your side and send that to Auctus for search, instead of uploading the whole dataset. It is also used internally by Auctus to process search-by-example queries (when sending a file to the ``/search`` endpoint) and to add datasets to the index (to be queried against later). 5 | 6 | Installing datamart-profiler 7 | ---------------------------- 8 | 9 | You can get it directly from the Python Package Index using PIP:: 10 | 11 | pip install datamart-profiler 12 | 13 | API 14 | --- 15 | 16 | The :py:func:`datamart_profiler.process_dataset` function is the entrypoint for the library. It returns a dict following Auctus's JSON result schema. 17 | 18 | .. autofunction:: datamart_profiler.core.process_dataset 19 | 20 | .. autofunction:: datamart_profiler.temporal.parse_date 21 | 22 | .. autofunction:: datamart_profiler.core.count_rows_to_skip 23 | 24 | Command-line usage 25 | ------------------ 26 | 27 | You can also use datamart-profiler from the command-line like so:: 28 | 29 | $ python -m datamart_profiler 30 | 31 | It will output the extracted metadata as JSON. 
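For example, profiling a local file from Python (a minimal sketch; ``data.csv`` is a placeholder file name)::

    import json

    import datamart_profiler

    # Profile the file; the result is a plain dict following the
    # result schema (column names, detected types, coverage, ...)
    metadata = datamart_profiler.process_dataset('data.csv')

    print(json.dumps(metadata, indent=2))

The equivalent command-line invocation is ``python -m datamart_profiler data.csv``.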
32 | -------------------------------------------------------------------------------- /frontend/src/components/SearchResults/DatasetSample.css: -------------------------------------------------------------------------------- 1 | /* https://www.colorbox.io/#steps=7#hue_start=209#hue_end=172#hue_curve=easeInQuad#sat_start=12#sat_end=90#sat_curve=easeOutCubic#sat_rate=130#lum_start=84#lum_end=53#lum_curve=easeOutQuad#minor_steps_map=0 */ 2 | .badge.semtype { 3 | background-color: #b1c4d5; 4 | color: white; 5 | } 6 | 7 | .badge.semtype.semtype-text { 8 | background-color: #aec2d4; 9 | } 10 | 11 | .badge.semtype.semtype-boolean { 12 | background-color: #a1bbce; 13 | } 14 | 15 | .badge.semtype.semtype-enumeration { 16 | background-color: #89b1c4; 17 | } 18 | 19 | .badge.semtype.semtype-identifier { 20 | background-color: #5fa4b5; 21 | } 22 | 23 | .badge.semtype.semtype-latitude, .badge.semtype.semtype-longitude { 24 | background-color: #249ca0; 25 | } 26 | 27 | .badge.semtype.semtype-datetime { 28 | background-color: #008775; 29 | } 30 | 31 | #vg-tooltip-element { 32 | z-index: 2000; 33 | } 34 | 35 | .chip-btn-download { 36 | margin-left: 0; 37 | margin-bottom: 0.1rem!important; 38 | margin-top: -3px; 39 | color: rgb(0, 0, 0, 0.5); 40 | cursor: pointer; 41 | user-select: none; 42 | -webkit-tap-highlight-color: transparent; 43 | } 44 | 45 | .chip-btn-download:hover { 46 | color: rgb(0, 0, 0, 1.0); 47 | } 48 | -------------------------------------------------------------------------------- /scripts/clear_caches.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script clears the cache folders safely. 4 | 5 | This should not result in any data being lost or affect any running process. 6 | """ 7 | 8 | import logging 9 | import os 10 | import sys 11 | 12 | from datamart_fslock.cache import clear_cache 13 | 14 | 15 | if __name__ == '__main__': 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | if sys.argv[1:] == []: 19 | only_if_possible = False 20 | elif sys.argv[1:] == ['--if-possible']: 21 | only_if_possible = True 22 | else: 23 | print("Usage: clear_caches.py [--if-possible]", file=sys.stderr) 24 | sys.exit(2) 25 | 26 | if ( 27 | not os.path.isdir('/cache/datasets') or 28 | not os.path.isdir('/cache/aug') or 29 | not os.path.isdir('/cache/user_data') 30 | ): 31 | print( 32 | "Cache directories don't exist; are you not running this script " 33 | "inside Docker?", 34 | file=sys.stderr, 35 | ) 36 | sys.exit(1) 37 | clear_cache('/cache/datasets', only_if_possible=only_if_possible) 38 | clear_cache('/cache/aug', only_if_possible=only_if_possible) 39 | clear_cache('/cache/user_data', only_if_possible=only_if_possible) 40 | -------------------------------------------------------------------------------- /scripts/canonicalize_yaml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script sorts YAML documents and objects to allow diffing. 4 | 5 | It loads multiple YAML files, orders the documents by 'metadata/kind' and 6 | 'metadata/name', sorts the keys of each objects alphabetically, and dumps it 7 | all to stdout. 8 | 9 | In addition, it also sorts the 'env:' list/map. 
10 | 11 | Usage: 12 | find yaml -type f -print0 | xargs -0 python canonicalize_yaml.py 13 | """ 14 | 15 | import sys 16 | import yaml 17 | 18 | 19 | def sort_env(obj): 20 | if isinstance(obj, list): 21 | return [sort_env(i) for i in obj] 22 | elif isinstance(obj, dict): 23 | return { 24 | k: ( 25 | sorted(v, key=lambda i: i['name']) if k == 'env' 26 | else sort_env(v) 27 | ) 28 | for k, v in obj.items() 29 | } 30 | else: 31 | return obj 32 | 33 | 34 | if __name__ == '__main__': 35 | objs = [] 36 | for filename in sys.argv[1:]: 37 | with open(filename, 'r') as fp_in: 38 | objs.extend(yaml.safe_load_all(fp_in)) 39 | 40 | objs = [sort_env(o) for o in objs] 41 | objs = sorted(objs, key=lambda o: (o['kind'], o['metadata']['name'])) 42 | 43 | yaml.safe_dump_all(objs, sys.stdout, sort_keys=True) 44 | -------------------------------------------------------------------------------- /lib_fslock/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'prometheus_client', 10 | ] 11 | setup(name='datamart-fslock', 12 | version='2.1', 13 | packages=['datamart_fslock'], 14 | install_requires=req, 15 | description="Filesystem locking library for Auctus", 16 | author="Remi Rampin", 17 | author_email='remi.rampin@nyu.edu', 18 | maintainer="Remi Rampin", 19 | maintainer_email='remi.rampin@nyu.edu', 20 | url='https://gitlab.com/remram44/python-fslock', 21 | project_urls={ 22 | 'Homepage': 'https://gitlab.com/remram44/python-fslock', 23 | 'Source': 'https://gitlab.com/remram44/python-fslock', 24 | 'Tracker': 'https://gitlab.com/remram44/python-fslock/issues', 25 | }, 26 | long_description="Filesystem locking library for Auctus", 27 | license='MIT', 28 | keywords=['lock', 'flock', 'file lock', 'locking', 'filesystem'], 29 | classifiers=[ 30 | 'Development Status :: 5 - Production/Stable', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: MIT License', 33 | 'Operating System :: POSIX', 34 | 'Programming Language :: Python :: 3 :: Only']) 35 | -------------------------------------------------------------------------------- /frontend/src/components/Badges/IconAbc.tsx: -------------------------------------------------------------------------------- 1 | import React, {SVGAttributes} from 'react'; 2 | 3 | interface Props extends SVGAttributes { 4 | color: string; 5 | size: string | number; 6 | } 7 | 8 | const IconAbc = (props: Props) => { 9 | const {color, size, ...otherProps} = props; 10 | return ( 11 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | ); 30 | }; 31 | 32 | IconAbc.defaultProps = { 33 | color: 'currentColor', 34 | size: '24', 35 | }; 36 | 37 | export {IconAbc}; 38 | -------------------------------------------------------------------------------- /frontend/src/components/Logo/Logo.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | // 3 | // auctus-logo.min.svg is a minified file generated from auctus-logo.svg 4 | // After updating source file, it can be with re-minified with: 5 | // npx svgo auctus-logo.svg -o auctus-logo.min.svg 6 | // 7 | import logo from './auctus-logo.min.svg'; 8 | import './Logo.css'; 9 | 10 | function VerticalLogo() { 11 | return ( 12 |
<div className="logo-vertical"> 13 | <img src={logo} alt="Auctus Logo" /> 14 | <span>Auctus Dataset Search</span> 15 | </div> 16 | ); 17 | } 18 | 19 | function HorizontalLogo(props: {onClick?: () => void}) { 20 | const style = props.onClick ? {cursor: 'pointer'} : undefined; 21 | return ( 22 | <div 23 | className="logo-horizontal" 24 | style={style} 25 | onClick={props.onClick} 26 | > 27 | <img src={logo} alt="Auctus Logo" /> 28 | <span>Auctus</span> 29 | </div> 30 | ); 31 | } 32 | 33 | function CenteredHorizontalLogo(props: {onClick?: () => void}) { 34 | return ( 35 | <div className="logo-centered-horizontal"> 36 | <HorizontalLogo onClick={props.onClick} /> 37 | </div>
38 | ); 39 | } 40 | 41 | export {VerticalLogo, HorizontalLogo, CenteredHorizontalLogo}; 42 | -------------------------------------------------------------------------------- /frontend/src/components/Chip/Chip.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import * as Icon from 'react-feather'; 3 | import './Chip.css'; 4 | 5 | interface ChipProps { 6 | label: string; 7 | onClose?: () => void; 8 | onEdit?: () => void; 9 | icon?: Icon.Icon; 10 | } 11 | 12 | function Chip(props: ChipProps) { 13 | let classes = 'chip chip-outline'; 14 | // chip-primary 15 | // chip-clickable 16 | if (props.onClose) { 17 | classes += ' chip-closeable'; 18 | } 19 | return ( 20 |
<div className={classes}> 21 | {props.icon && ( 22 | <div className="chip-icon"> 23 | <props.icon className="feather" /> 24 | </div> 25 | )} 26 | <span className="chip-label"> 27 | {props.label} 28 | &nbsp; 29 | {props.onEdit ? ( 30 | <Icon.Edit2 31 | className="feather feather-xs" 32 | onClick={props.onEdit} /> 33 | ) : ( 34 | '' 35 | )} 36 | </span> 37 | {props.onClose && ( 38 | <span className="chip-close" onClick={props.onClose}> 39 | <Icon.XCircle className="feather feather-xs" /> 40 | </span> 41 | )} 42 | </div> 43 | ); 44 | } 45 | 46 | function ChipGroup(props: React.PropsWithChildren<{}>) { 47 | return <div className="chip-group">{props.children}</div>
; 48 | } 49 | 50 | export {Chip, ChipGroup}; 51 | -------------------------------------------------------------------------------- /discovery/ckan/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'elasticsearch~=7.0', 10 | 'requests', 11 | 'datamart-core', 12 | ] 13 | setup(name='datamart-ckan-discovery-service', 14 | version='0.0', 15 | py_modules=['ckan_discovery'], 16 | install_requires=req, 17 | description="CKAN discovery service for Auctus", 18 | author="Remi Rampin", 19 | author_email='remi.rampin@nyu.edu', 20 | maintainer="Remi Rampin", 21 | maintainer_email='remi.rampin@nyu.edu', 22 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 23 | project_urls={ 24 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 26 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 27 | }, 28 | long_description="CKAN discovery service for Auctus", 29 | license='Apache-2.0', 30 | keywords=['auctus', 'datamart'], 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Science/Research', 34 | 'License :: OSI Approved :: Apache Software License', 35 | 'Operating System :: Unix', 36 | 'Programming Language :: Python :: 3 :: Only', 37 | 'Topic :: Scientific/Engineering :: Information Analysis']) 38 | -------------------------------------------------------------------------------- /discovery/zenodo/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'elasticsearch~=7.0', 10 | 'requests', 11 | 'datamart-core', 12 | ] 13 | setup(name='datamart-zenodo-discovery-service', 14 | version='0.0', 15 | py_modules=['zenodo_discovery'], 16 | install_requires=req, 17 | description="Zenodo discovery service for Auctus", 18 | author="Remi Rampin", 19 | author_email='remi.rampin@nyu.edu', 20 | maintainer="Remi Rampin", 21 | maintainer_email='remi.rampin@nyu.edu', 22 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 23 | project_urls={ 24 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 26 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 27 | }, 28 | long_description="Zenodo discovery service for Auctus", 29 | license='Apache-2.0', 30 | keywords=['auctus', 'datamart'], 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Science/Research', 34 | 'License :: OSI Approved :: Apache Software License', 35 | 'Operating System :: Unix', 36 | 'Programming Language :: Python :: 3 :: Only', 37 | 'Topic :: Scientific/Engineering :: Information Analysis']) 38 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/excel.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import csv 3 | from datetime import datetime 4 | import openpyxl 5 | 6 | from .common import UnsupportedConversion 7 | from .utils import SimpleConverter 8 | 9 | 10 | def xlsx_to_csv(source_filename, dest_fileobj): 11 | with contextlib.ExitStack() as stack: 12 | fp = stack.enter_context(open(source_filename, 'rb')) 13 | workbook = stack.enter_context(contextlib.closing( 14 | 
openpyxl.load_workbook(fp, read_only=True) 15 | )) 16 | 17 | sheets = workbook.worksheets 18 | if len(sheets) != 1: 19 | raise UnsupportedConversion( 20 | "Excel workbook has %d sheets" % len(sheets) 21 | ) 22 | sheet, = sheets 23 | 24 | writer = csv.writer(dest_fileobj) 25 | for values in sheet.iter_rows(values_only=True): 26 | values = [ 27 | # Avoid forced decimal point on integers 28 | '{0:g}'.format(v) if isinstance(v, float) 29 | # Decode dates into ISO-8601 strings 30 | else v.isoformat() if isinstance(v, datetime) 31 | else v 32 | for v in values 33 | ] 34 | 35 | writer.writerow(values) 36 | 37 | 38 | class ExcelConverter(SimpleConverter): 39 | """Adapter converting Excel files to CSV. 40 | """ 41 | transform = staticmethod(xlsx_to_csv) 42 | -------------------------------------------------------------------------------- /discovery/isi/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'elasticsearch~=7.0', 10 | 'requests', 11 | 'datamart-core', 12 | ] 13 | setup(name='datamart-isi-discovery-service', 14 | version='0.0', 15 | py_modules=['isi_discovery'], 16 | install_requires=req, 17 | description="ISI Datamart discovery service for Auctus", 18 | author="Remi Rampin", 19 | author_email='remi.rampin@nyu.edu', 20 | maintainer="Remi Rampin", 21 | maintainer_email='remi.rampin@nyu.edu', 22 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 23 | project_urls={ 24 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 26 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 27 | }, 28 | long_description="ISI Datamart discovery service for Auctus", 29 | license='Apache-2.0', 30 | keywords=['auctus', 'datamart'], 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Science/Research', 34 | 'License :: OSI Approved :: Apache Software License', 35 | 'Operating System :: Unix', 36 | 'Programming Language :: Python :: 3 :: Only', 37 | 'Topic :: Scientific/Engineering :: Information Analysis']) 38 | -------------------------------------------------------------------------------- /discovery/socrata/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'elasticsearch~=7.0', 10 | 'sodapy', 11 | 'datamart-core', 12 | ] 13 | setup(name='datamart-socrata-discovery-service', 14 | version='0.0', 15 | py_modules=['socrata_discovery'], 16 | install_requires=req, 17 | description="Socrata discovery service for Auctus", 18 | author="Remi Rampin", 19 | author_email='remi.rampin@nyu.edu', 20 | maintainer="Remi Rampin", 21 | maintainer_email='remi.rampin@nyu.edu', 22 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 23 | project_urls={ 24 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 26 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 27 | }, 28 | long_description="Socrata discovery service for Auctus", 29 | license='Apache-2.0', 30 | keywords=['auctus', 'datamart'], 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Science/Research', 34 | 'License :: OSI Approved :: Apache Software License', 35 | 'Operating System :: Unix', 36 | 
'Programming Language :: Python :: 3 :: Only', 37 | 'Topic :: Scientific/Engineering :: Information Analysis']) 38 | -------------------------------------------------------------------------------- /frontend/src/components/ui/DropdownMenu/DropdownMenu.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | 3 | interface Props { 4 | children: (api: {onClick: () => void; active: boolean}) => JSX.Element; 5 | } 6 | 7 | interface State { 8 | active: boolean; 9 | } 10 | 11 | class DropdownMenu extends React.Component<Props, State> { 12 | ref: HTMLDivElement | null = null; 13 | 14 | constructor(props: Props) { 15 | super(props); 16 | this.state = {active: false}; 17 | this.toggleState = this.toggleState.bind(this); 18 | this.handleClickOutside = this.handleClickOutside.bind(this); 19 | } 20 | 21 | toggleState() { 22 | this.setState({active: !this.state.active}); 23 | } 24 | 25 | handleClickOutside(e: MouseEvent) { 26 | if (this.ref && !this.ref.contains(e.target as Node)) { 27 | if (this.state.active) { 28 | this.toggleState(); 29 | } 30 | } 31 | } 32 | 33 | componentDidMount() { 34 | document.addEventListener('mousedown', this.handleClickOutside, false); 35 | } 36 | 37 | componentWillUnmount() { 38 | document.removeEventListener('mousedown', this.handleClickOutside, false); 39 | } 40 | 41 | render() { 42 | return ( 43 |
<div ref={node => (this.ref = node)}> 44 | {this.props.children({ 45 | onClick: this.toggleState, 46 | active: this.state.active, 47 | })} 48 | </div>
49 | ); 50 | } 51 | } 52 | 53 | export {DropdownMenu}; 54 | -------------------------------------------------------------------------------- /frontend/src/components/ui/Tabs/Tabs.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import './Tabs.css'; 3 | 4 | class Tabs extends React.PureComponent { 5 | render() { 6 | return
<ul className="nav nav-tabs">{this.props.children}</ul>; 7 | } 8 | } 9 | 10 | interface TabProps { 11 | onClick: ((event: React.MouseEvent) => void) | undefined; 12 | selected: boolean; 13 | } 14 | 15 | class Tab extends React.PureComponent<TabProps> { 16 | render() { 17 | const tabClassName = this.props.selected ? 'nav-link active' : 'nav-link'; 18 | return ( 19 | <li className="nav-item"> 20 | <button className={tabClassName} onClick={this.props.onClick}> 21 | {this.props.children} 22 | </button> 23 | </li> 24 | ); 25 | } 26 | } 27 | 28 | class TabContent extends React.PureComponent { 29 | render() { 30 | return <div className="tab-content">{this.props.children}</div>; 31 | } 32 | } 33 | 34 | interface TabPaneProps { 35 | id: string; 36 | active: boolean; 37 | } 38 | 39 | class TabPane extends React.PureComponent<TabPaneProps> { 40 | render() { 41 | const tabPaneClassName = this.props.active 42 | ? 'tab-pane fade show active' 43 | : 'tab-pane fade'; 44 | return ( 45 | <div className={tabPaneClassName} id={this.props.id}> 46 | {this.props.children} 47 | </div>
    48 | ); 49 | } 50 | } 51 | 52 | export {Tabs, Tab, TabContent, TabPane}; 53 | -------------------------------------------------------------------------------- /contrib/k8s/secrets.jsonnet: -------------------------------------------------------------------------------- 1 | // Set 'private_app: true' in the config to password-protect frontend & API 2 | // You can create this file using the htpasswd tool 3 | local private_app_password = ||| 4 | auctus:$apr1$ECD/OaHB$CMBSkoEdcA/2uX8gPZM3y1 5 | |||; 6 | 7 | local amqp_user = 'auctususer'; 8 | local amqp_password = 'auctuspassword'; 9 | local admin_password = 'auctuspassword'; 10 | local s3_key = 'devkey'; 11 | local s3_secret = 'devpassword'; 12 | local gcs_creds = std.base64(''); 13 | 14 | { 15 | 'secrets.yml': std.manifestYamlStream([ 16 | { 17 | apiVersion: 'v1', 18 | kind: 'Secret', 19 | type: 'Opaque', 20 | metadata: { 21 | name: 'secrets', 22 | }, 23 | local data = { 24 | 'amqp.user': amqp_user, 25 | 'amqp.password': amqp_password, 26 | 'admin.password': admin_password, 27 | 's3.key': s3_key, 28 | 's3.secret': s3_secret, 29 | 'gcs.creds': gcs_creds, 30 | 'smtp.user': 'auctususer', 31 | 'smtp.password': 'auctuspassword', 32 | }, 33 | data: { 34 | [k]: std.base64(data[k]) 35 | for k in std.objectFields(data) 36 | }, 37 | }, 38 | { 39 | apiVersion: 'v1', 40 | kind: 'Secret', 41 | type: 'Opaque', 42 | metadata: { 43 | name: 'basic-auth', 44 | }, 45 | data: { 46 | auth: std.base64(private_app_password), 47 | }, 48 | }, 49 | ]), 50 | } 51 | -------------------------------------------------------------------------------- /apiserver/apiserver/enhance_metadata.py: -------------------------------------------------------------------------------- 1 | from datamart_materialize.d3m import d3m_metadata 2 | 3 | 4 | def enhance_metadata(result): 5 | """Add more metadata (e.g. D3M) from the original metadata. 
6 | 7 | :param result: A dict with 'id' and 'metadata' keys 8 | :type result: dict 9 | :return: A dict with the 'metadata' key and additional keys such as 10 | 'd3m-metadata' 11 | """ 12 | # Generate metadata in D3M format 13 | result = dict( 14 | result, 15 | d3m_dataset_description=d3m_metadata(result['id'], result['metadata']), 16 | ) 17 | 18 | # Add temporal coverage information to columns for compatibility 19 | if result['metadata'].get('temporal_coverage'): 20 | columns = list(result['metadata']['columns']) 21 | for temporal in result['metadata']['temporal_coverage']: 22 | # Only works for temporal coverage extracted from a single column 23 | if len(temporal['column_indexes']) == 1: 24 | idx = temporal['column_indexes'][0] 25 | columns[idx] = dict( 26 | columns[idx], 27 | coverage=temporal['ranges'], 28 | ) 29 | if 'temporal_resolution' in temporal: 30 | columns[idx]['temporal_resolution'] = \ 31 | temporal['temporal_resolution'] 32 | 33 | result['metadata'] = dict(result['metadata'], columns=columns) 34 | 35 | return result 36 | -------------------------------------------------------------------------------- /tests/data/hourly.csv: -------------------------------------------------------------------------------- 1 | aug_date,rain 2 | 2019-06-12T00:00:00,no 3 | 2019-06-12T01:00:00,no 4 | 2019-06-12T02:00:00,no 5 | 2019-06-12T03:00:00,yes 6 | 2019-06-12T04:00:00,no 7 | 2019-06-12T05:00:00,yes 8 | 2019-06-12T06:00:00,yes 9 | 2019-06-12T07:00:00,no 10 | 2019-06-12T08:00:00,no 11 | 2019-06-12T09:00:00,no 12 | 2019-06-12T10:00:00,yes 13 | 2019-06-12T11:00:00,yes 14 | 2019-06-12T12:00:00,yes 15 | 2019-06-12T13:00:00,yes 16 | 2019-06-12T14:00:00,yes 17 | 2019-06-12T15:00:00,no 18 | 2019-06-12T16:00:00,no 19 | 2019-06-12T17:00:00,yes 20 | 2019-06-12T18:00:00,yes 21 | 2019-06-12T19:00:00,yes 22 | 2019-06-12T20:00:00,yes 23 | 2019-06-12T21:00:00,yes 24 | 2019-06-12T22:00:00,no 25 | 2019-06-12T23:00:00,no 26 | 2019-06-13T00:00:00,no 27 | 2019-06-13T01:00:00,no 28 | 2019-06-13T02:00:00,no 29 | 2019-06-13T03:00:00,no 30 | 2019-06-13T04:00:00,yes 31 | 2019-06-13T05:00:00,no 32 | 2019-06-13T06:00:00,no 33 | 2019-06-13T07:00:00,yes 34 | 2019-06-13T08:00:00,no 35 | 2019-06-13T09:00:00,yes 36 | 2019-06-13T10:00:00,yes 37 | 2019-06-13T11:00:00,yes 38 | 2019-06-13T12:00:00,yes 39 | 2019-06-13T13:00:00,yes 40 | 2019-06-13T14:00:00,yes 41 | 2019-06-13T15:00:00,no 42 | 2019-06-13T16:00:00,yes 43 | 2019-06-13T17:00:00,yes 44 | 2019-06-13T18:00:00,no 45 | 2019-06-13T19:00:00,yes 46 | 2019-06-13T20:00:00,yes 47 | 2019-06-13T21:00:00,yes 48 | 2019-06-13T22:00:00,yes 49 | 2019-06-13T23:00:00,yes 50 | 2019-06-14T00:00:00,yes 51 | 2019-06-14T01:00:00,yes 52 | 2019-06-14T02:00:00,yes 53 | 2019-06-14T03:00:00,yes 54 | -------------------------------------------------------------------------------- /discovery/noaa/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'elasticsearch~=7.0', 10 | 'requests', 11 | 'datamart-core', 12 | ] 13 | setup(name='datamart-noaa-discovery-service', 14 | version='0.0', 15 | packages=['noaa_discovery'], 16 | package_data={'noaa_discovery': [ 17 | 'noaa_city_stations.csv', 18 | ]}, 19 | install_requires=req, 20 | description="NOAA discovery service for Auctus", 21 | author="Remi Rampin", 22 | author_email='remi.rampin@nyu.edu', 23 | maintainer="Remi Rampin", 24 | maintainer_email='remi.rampin@nyu.edu', 25 | 
url='https://gitlab.com/ViDA-NYU/auctus/auctus', 26 | project_urls={ 27 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 28 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 29 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 30 | }, 31 | long_description="NOAA discovery service for Auctus", 32 | license='Apache-2.0', 33 | keywords=['auctus', 'datamart'], 34 | classifiers=[ 35 | 'Development Status :: 4 - Beta', 36 | 'Intended Audience :: Science/Research', 37 | 'License :: OSI Approved :: Apache Software License', 38 | 'Operating System :: Unix', 39 | 'Programming Language :: Python :: 3 :: Only', 40 | 'Topic :: Scientific/Engineering :: Information Analysis']) 41 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/excel97.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import xlrd 3 | import xlrd.sheet 4 | 5 | from .common import UnsupportedConversion 6 | from .utils import SimpleConverter 7 | 8 | 9 | def xls_to_csv(source_filename, dest_fileobj): 10 | with xlrd.open_workbook(source_filename) as workbook: 11 | datemode = workbook.datemode 12 | sheets = workbook.sheets() 13 | if len(sheets) != 1: 14 | raise UnsupportedConversion( 15 | "Excel workbook has %d sheets" % len(sheets) 16 | ) 17 | sheet, = sheets 18 | 19 | writer = csv.writer(dest_fileobj) 20 | for row_num in range(sheet.nrows): 21 | values = sheet.row_values(row_num) 22 | 23 | for col_num, cell_type in enumerate(sheet.row_types(row_num)): 24 | if cell_type == xlrd.sheet.XL_CELL_DATE: 25 | # Decode dates into ISO-8601 strings 26 | values[col_num] = xlrd.xldate_as_datetime( 27 | values[col_num], 28 | datemode, 29 | ).isoformat() 30 | elif cell_type == xlrd.sheet.XL_CELL_NUMBER: 31 | # Avoid forced decimal point on integers 32 | values[col_num] = '{0:g}'.format(values[col_num]) 33 | 34 | writer.writerow(values) 35 | 36 | 37 | class Excel97Converter(SimpleConverter): 38 | """Adapter converting Excel files to CSV. 
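The single sheet is written out as CSV, with date cells decoded to ISO-8601 strings and numeric cells formatted without a forced decimal point; workbooks with more than one sheet raise UnsupportedConversion.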
39 | """ 40 | transform = staticmethod(xls_to_csv) 41 | -------------------------------------------------------------------------------- /discovery/worldbank/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'elasticsearch~=7.0', 10 | 'beautifulsoup4[html5lib]', 11 | 'pandas', 12 | 'datamart-core', 13 | 'datamart-profiler', 14 | ] 15 | setup(name='datamart-worldbank-discovery-service', 16 | version='0.0', 17 | py_modules=['worldbank_discovery'], 18 | install_requires=req, 19 | description="World Bank indicator discovery service for Auctus", 20 | author="Remi Rampin", 21 | author_email='remi.rampin@nyu.edu', 22 | maintainer="Remi Rampin", 23 | maintainer_email='remi.rampin@nyu.edu', 24 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | project_urls={ 26 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 27 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 28 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 29 | }, 30 | long_description="World Bank indicator discovery service for Auctus", 31 | license='Apache-2.0', 32 | keywords=['auctus', 'datamart'], 33 | classifiers=[ 34 | 'Development Status :: 4 - Beta', 35 | 'Intended Audience :: Science/Research', 36 | 'License :: OSI Approved :: Apache Software License', 37 | 'Operating System :: Unix', 38 | 'Programming Language :: Python :: 3 :: Only', 39 | 'Topic :: Scientific/Engineering :: Information Analysis']) 40 | -------------------------------------------------------------------------------- /discovery/uaz_indicators/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'requests', 10 | 'datamart-core', 11 | ] 12 | setup(name='datamart-uaz-indicators-service', 13 | version='0.0', 14 | py_modules=['uaz_indicators'], 15 | install_requires=req, 16 | description="Auctus discovery service for indicators from the " + 17 | "University of Arizona", 18 | author="Remi Rampin", 19 | author_email='remi.rampin@nyu.edu', 20 | maintainer="Remi Rampin", 21 | maintainer_email='remi.rampin@nyu.edu', 22 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 23 | project_urls={ 24 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 26 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 27 | }, 28 | long_description="Auctus discovery service for indicators from the " + 29 | "University of Arizona", 30 | license='Apache-2.0', 31 | keywords=['auctus', 'datamart'], 32 | classifiers=[ 33 | 'Development Status :: 4 - Beta', 34 | 'Intended Audience :: Science/Research', 35 | 'License :: OSI Approved :: Apache Software License', 36 | 'Operating System :: Unix', 37 | 'Programming Language :: Python :: 3 :: Only', 38 | 'Topic :: Scientific/Engineering :: Information Analysis']) 39 | -------------------------------------------------------------------------------- /lib_augmentation/setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | from setuptools import setup 4 | 5 | 6 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 7 | 8 | 9 | req = [ 10 | 'pandas', 11 | 'numpy', 12 | 'datamart-materialize==0.11', 13 | 'datamart-profiler==0.11', 14 | 
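    # exact '==' pins: this package consumes the output formats of both, which may change between releases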
] 15 | with io.open('README.rst', encoding='utf-8') as fp: 16 | description = fp.read() 17 | setup(name='datamart-augmentation', 18 | version='0.10', 19 | packages=['datamart_augmentation'], 20 | install_requires=req, 21 | description="Data augmentation functions for Auctus", 22 | author="Remi Rampin", 23 | author_email='remi.rampin@nyu.edu', 24 | maintainer="Remi Rampin", 25 | maintainer_email='remi.rampin@nyu.edu', 26 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 27 | project_urls={ 28 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 29 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 30 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 31 | }, 32 | long_description=description, 33 | license='Apache-2.0', 34 | keywords=['auctus', 'datamart'], 35 | classifiers=[ 36 | 'Development Status :: 4 - Beta', 37 | 'Intended Audience :: Science/Research', 38 | 'License :: OSI Approved :: Apache Software License', 39 | 'Operating System :: OS Independent', 40 | 'Programming Language :: Python :: 3 :: Only', 41 | 'Topic :: Scientific/Engineering :: Information Analysis']) 42 | -------------------------------------------------------------------------------- /frontend/src/components/DateFilter/DateFilter.css: -------------------------------------------------------------------------------- 1 | .react-datepicker__input-container input { 2 | border: 1px solid #ced4da; 3 | padding: 6px 10px; 4 | } 5 | 6 | 7 | .react-datepicker__day--in-range:hover, 8 | .react-datepicker__day--in-selecting-range:hover, 9 | .react-datepicker__day--selected:hover, 10 | .react-datepicker__month-text--in-range:hover, 11 | .react-datepicker__month-text--in-selecting-range:hover, 12 | .react-datepicker__month-text--selected:hover, 13 | .react-datepicker__quarter-text--in-range:hover, 14 | .react-datepicker__quarter-text--in-selecting-range:hover, 15 | .react-datepicker__quarter-text--selected:hover { 16 | background-color: #2e1b59; 17 | } 18 | 19 | .react-datepicker__day--keyboard-selected:hover, 20 | .react-datepicker__month-text--keyboard-selected:hover, 21 | .react-datepicker__quarter-text--keyboard-selected:hover { 22 | background-color: #2e1b59; 23 | } 24 | 25 | .react-datepicker__day--in-range, 26 | .react-datepicker__day--in-selecting-range, 27 | .react-datepicker__day--selected, 28 | .react-datepicker__month-text--in-range, 29 | .react-datepicker__month-text--in-selecting-range, 30 | .react-datepicker__month-text--selected, 31 | .react-datepicker__quarter-text--in-range, 32 | .react-datepicker__quarter-text--in-selecting-range, 33 | .react-datepicker__quarter-text--selected { 34 | background-color: #63508b; 35 | } 36 | 37 | .react-datepicker__day--keyboard-selected, 38 | .react-datepicker__month-text--keyboard-selected, 39 | .react-datepicker__quarter-text--keyboard-selected { 40 | background-color: #63508b; 41 | } 42 | -------------------------------------------------------------------------------- /lib_core/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'aio-pika', 10 | 'elasticsearch~=7.0', 11 | 'lazo-index-service==0.7.0', 12 | 's3fs', 13 | 'gcsfs', 14 | 'advocate>=1.0,<2', 15 | 'prometheus_client', 16 | 'prometheus-async', 17 | 'sentry-sdk', 18 | ] 19 | setup(name='datamart-core', 20 | version='0.0', 21 | packages=['datamart_core'], 22 | install_requires=req, 23 | description="Core library for Auctus 
services", 24 | author="Remi Rampin", 25 | author_email='remi.rampin@nyu.edu', 26 | maintainer="Remi Rampin", 27 | maintainer_email='remi.rampin@nyu.edu', 28 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 29 | project_urls={ 30 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 31 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 32 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 33 | }, 34 | long_description="Core library for Auctus services", 35 | license='Apache-2.0', 36 | keywords=['auctus', 'datamart'], 37 | classifiers=[ 38 | 'Development Status :: 4 - Beta', 39 | 'Intended Audience :: Science/Research', 40 | 'License :: OSI Approved :: Apache Software License', 41 | 'Operating System :: OS Independent', 42 | 'Programming Language :: Python :: 3 :: Only', 43 | 'Topic :: Scientific/Engineering :: Information Analysis']) 44 | -------------------------------------------------------------------------------- /snapshotter/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'prometheus_client', 10 | 'datamart-core', 11 | ] 12 | setup(name='datamart-snapshotter-service', 13 | version='0.0', 14 | packages=['snapshotter'], 15 | entry_points={ 16 | 'console_scripts': [ 17 | 'snapshotter = snapshotter.snapshot:main']}, 18 | install_requires=req, 19 | description="Snapshotter service for Auctus", 20 | author="Remi Rampin", 21 | author_email='remi.rampin@nyu.edu', 22 | maintainer="Remi Rampin", 23 | maintainer_email='remi.rampin@nyu.edu', 24 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | project_urls={ 26 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 27 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 28 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 29 | }, 30 | long_description="Snapshotter service for Auctus", 31 | license='Apache-2.0', 32 | keywords=['auctus', 'datamart'], 33 | classifiers=[ 34 | 'Development Status :: 4 - Beta', 35 | 'Intended Audience :: Science/Research', 36 | 'License :: OSI Approved :: Apache Software License', 37 | 'Operating System :: Unix', 38 | 'Programming Language :: JavaScript', 39 | 'Programming Language :: Python :: 3 :: Only', 40 | 'Topic :: Scientific/Engineering :: Information Analysis']) 41 | -------------------------------------------------------------------------------- /cache_cleaner/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'prometheus_client', 10 | 'datamart-core', 11 | ] 12 | setup(name='datamart-cache-cleaner-service', 13 | version='0.0', 14 | packages=['cache_cleaner'], 15 | entry_points={ 16 | 'console_scripts': [ 17 | 'cache_cleaner = cache_cleaner.cache:main']}, 18 | install_requires=req, 19 | description="Cache Cleaner service for Auctus", 20 | author="Remi Rampin", 21 | author_email='remi.rampin@nyu.edu', 22 | maintainer="Remi Rampin", 23 | maintainer_email='remi.rampin@nyu.edu', 24 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 25 | project_urls={ 26 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 27 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 28 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 29 | }, 30 | long_description="Cache Cleaner service for Auctus", 
31 | license='Apache-2.0', 32 | keywords=['auctus', 'datamart'], 33 | classifiers=[ 34 | 'Development Status :: 4 - Beta', 35 | 'Intended Audience :: Science/Research', 36 | 'License :: OSI Approved :: Apache Software License', 37 | 'Operating System :: Unix', 38 | 'Programming Language :: JavaScript', 39 | 'Programming Language :: Python :: 3 :: Only', 40 | 'Topic :: Scientific/Engineering :: Information Analysis']) 41 | -------------------------------------------------------------------------------- /scripts/purge_source.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script deletes all the datasets in the index from a specific source. 4 | """ 5 | 6 | import lazo_index_service 7 | import logging 8 | import os 9 | import sys 10 | 11 | from datamart_core.common import PrefixedElasticsearch, \ 12 | delete_dataset_from_index 13 | 14 | 15 | SIZE = 10000 16 | 17 | 18 | def clear(source): 19 | es = PrefixedElasticsearch() 20 | lazo_client = lazo_index_service.LazoIndexClient( 21 | host=os.environ['LAZO_SERVER_HOST'], 22 | port=int(os.environ['LAZO_SERVER_PORT']) 23 | ) 24 | hits = es.scan( 25 | index='datasets,pending', 26 | query={ 27 | 'query': { 28 | 'bool': { 29 | 'should': [ 30 | { 31 | 'term': { 32 | 'materialize.identifier': source, 33 | }, 34 | }, 35 | { 36 | 'term': { 37 | 'source': source, 38 | }, 39 | }, 40 | ], 41 | 'minimum_should_match': 1, 42 | }, 43 | }, 44 | }, 45 | _source=False, 46 | size=SIZE, 47 | ) 48 | for h in hits: 49 | delete_dataset_from_index(es, h['_id'], lazo_client) 50 | 51 | 52 | if __name__ == '__main__': 53 | logging.basicConfig(level=logging.INFO) 54 | 55 | clear(sys.argv[1]) 56 | -------------------------------------------------------------------------------- /docker/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | 4 | scrape_configs: 5 | - job_name: prometheus 6 | static_configs: 7 | - targets: ["localhost:9090"] 8 | - job_name: elasticsearch 9 | scrape_interval: 30s 10 | scrape_timeout: 10s 11 | static_configs: 12 | - targets: ["elasticsearch-exporter:9114"] 13 | - job_name: rabbitmq 14 | scrape_timeout: 5s 15 | metrics_path: /metrics 16 | static_configs: 17 | - targets: ["rabbitmq:15692"] 18 | - job_name: haproxy 19 | scrape_timeout: 5s 20 | metrics_path: /metrics 21 | static_configs: 22 | - targets: ['apilb:8000'] 23 | - job_name: apiserver 24 | dns_sd_configs: 25 | - names: 26 | - apiserver 27 | type: "A" 28 | port: 8000 29 | refresh_interval: 60s 30 | - job_name: coordinator 31 | static_configs: 32 | - targets: ["coordinator:8000"] 33 | - job_name: cache-cleaner 34 | dns_sd_configs: 35 | - names: 36 | - cache-cleaner 37 | type: "A" 38 | port: 8000 39 | refresh_interval: 60s 40 | - job_name: profiler 41 | dns_sd_configs: 42 | - names: 43 | - profiler 44 | type: "A" 45 | port: 8000 46 | refresh_interval: 60s 47 | - job_name: lazo 48 | dns_sd_configs: 49 | - names: 50 | - lazo 51 | type: "A" 52 | port: 8000 53 | refresh_interval: 60s 54 | - job_name: nominatim 55 | scrape_timeout: 5s 56 | metrics_path: /metrics 57 | static_configs: 58 | - targets: ["nominatim"] 59 | -------------------------------------------------------------------------------- /lib_profiler/setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | from setuptools import setup 4 | 5 | 6 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 7 | 8 | 9 | req = 
[ 10 | 'numpy', 11 | 'opentelemetry-api', 12 | 'pandas', 13 | 'prometheus_client', 14 | 'python-dateutil', 15 | 'scikit-learn', 16 | 'regex', 17 | 'requests', 18 | 'datamart-geo>=0.2.3,<0.4', 19 | ] 20 | with io.open('README.rst', encoding='utf-8') as fp: 21 | description = fp.read() 22 | setup(name='datamart-profiler', 23 | version='0.11', 24 | packages=['datamart_profiler'], 25 | install_requires=req, 26 | description="Data profiling library for Auctus", 27 | author="Remi Rampin", 28 | author_email='remi.rampin@nyu.edu', 29 | maintainer="Remi Rampin", 30 | maintainer_email='remi.rampin@nyu.edu', 31 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 32 | project_urls={ 33 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 34 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 35 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 36 | }, 37 | long_description=description, 38 | license='Apache-2.0', 39 | keywords=['auctus', 'datamart'], 40 | classifiers=[ 41 | 'Development Status :: 4 - Beta', 42 | 'Intended Audience :: Science/Research', 43 | 'License :: OSI Approved :: Apache Software License', 44 | 'Operating System :: OS Independent', 45 | 'Programming Language :: Python :: 3 :: Only', 46 | 'Topic :: Scientific/Engineering :: Information Analysis']) 47 | -------------------------------------------------------------------------------- /scripts/migrate-source-url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script adds the source_url for Socrata datasets. 4 | """ 5 | 6 | import json 7 | import os 8 | import shutil 9 | import sys 10 | 11 | 12 | def migrate(from_folder, to_folder): 13 | assert os.listdir(from_folder) 14 | assert not os.listdir(to_folder) 15 | 16 | datasets = [] 17 | lazo = [] 18 | for f in os.listdir(from_folder): 19 | if f.startswith('lazo.'): 20 | lazo.append(f) 21 | else: 22 | datasets.append(f) 23 | 24 | for i, dataset in enumerate(datasets): 25 | if i % 100 == 0: 26 | print("% 5d / %5d datasets processed" % (i, len(datasets))) 27 | 28 | with open(os.path.join(from_folder, dataset)) as fp: 29 | obj = json.load(fp) 30 | 31 | if obj['materialize']['identifier'] == 'datamart.socrata': 32 | if 'source_url' not in obj: 33 | obj['source_url'] = 'https://%s/_/_/%s' % ( 34 | obj['materialize']['socrata_domain'], 35 | obj['materialize']['socrata_id'], 36 | ) 37 | 38 | with open(os.path.join(to_folder, dataset), 'w') as fp: 39 | json.dump(obj, fp, sort_keys=True, indent=2) 40 | 41 | print("Copying lazo data...") 42 | for i, f in enumerate(lazo): 43 | if i % 1000 == 0: 44 | print("% 5d / %5d files copied" % (i, len(lazo))) 45 | shutil.copy2( 46 | os.path.join(from_folder, f), 47 | os.path.join(to_folder, f), 48 | ) 49 | 50 | 51 | if __name__ == '__main__': 52 | migrate(sys.argv[1], sys.argv[2]) 53 | -------------------------------------------------------------------------------- /scripts/migrate-point-format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script updates the index for !166. 4 | 5 | It adds the column "point_format" information (the default one, "long,lat").
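Only columns whose structural_type is GeoCoordinates and that do not already carry the key are updated; everything else is copied through unchanged.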
6 | """ 7 | 8 | import json 9 | import os 10 | import shutil 11 | import sys 12 | 13 | 14 | def migrate(from_folder, to_folder): 15 | assert os.listdir(from_folder) 16 | assert not os.listdir(to_folder) 17 | 18 | datasets = [] 19 | lazo = [] 20 | for f in os.listdir(from_folder): 21 | if f.startswith('lazo.'): 22 | lazo.append(f) 23 | else: 24 | datasets.append(f) 25 | 26 | for i, dataset in enumerate(datasets): 27 | if i % 100 == 0: 28 | print("% 5d / %5d datasets processed" % (i, len(datasets))) 29 | 30 | with open(os.path.join(from_folder, dataset)) as fp: 31 | obj = json.load(fp) 32 | 33 | for column in obj['columns']: 34 | if ( 35 | column['structural_type'] == 'http://schema.org/GeoCoordinates' 36 | and 'point_format' not in column 37 | ): 38 | column['point_format'] = 'long,lat' 39 | 40 | with open(os.path.join(to_folder, dataset), 'w') as fp: 41 | json.dump(obj, fp, sort_keys=True, indent=2) 42 | 43 | print("Copying lazo data...") 44 | for i, f in enumerate(lazo): 45 | if i % 1000 == 0: 46 | print("% 5d / %5d files copied" % (i, len(lazo))) 47 | shutil.copy2( 48 | os.path.join(from_folder, f), 49 | os.path.join(to_folder, f), 50 | ) 51 | 52 | 53 | if __name__ == '__main__': 54 | migrate(sys.argv[1], sys.argv[2]) 55 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from datamart_core import common 4 | 5 | 6 | class TestDatasetIdEncoding(unittest.TestCase): 7 | def test_encode(self): 8 | """Test encoding a dataset ID to a file name""" 9 | self.assertEqual( 10 | common.encode_dataset_id('datamart_contrived/dataset#id;'), 11 | 'datamart__contrived_2Fdataset_23id_3B', 12 | ) 13 | 14 | def test_decode(self): 15 | """Test decoding a file name to a dataset ID""" 16 | self.assertEqual( 17 | common.decode_dataset_id('datamart__contrived_2Fdataset_23id_3B'), 18 | 'datamart_contrived/dataset#id;', 19 | ) 20 | 21 | 22 | class TestStripHtml(unittest.TestCase): 23 | def test_strip(self): 24 | """Strip HTML from text""" 25 | self.assertEqual( 26 | common.strip_html( 27 | "
<p>" 28 | "Text &amp; <b>tags</b> &amp; HTML</p>
    ", 29 | ), 30 | "Text & tags & HTML", 31 | ) 32 | 33 | def test_link(self): 34 | """Keep link targets""" 35 | self.assertEqual( 36 | common.strip_html( 37 | "Some links here: " 38 | + "google.com", 39 | ), 40 | "Some links (https://google.com/) here: google.com", 41 | ) 42 | 43 | def test_unknown(self): 44 | """Unknown tags should be preserved""" 45 | self.assertEqual( 46 | common.strip_html( 47 | "Run python ", 48 | ), 49 | "Run python ", 50 | ) 51 | -------------------------------------------------------------------------------- /contrib/k8s/snapshotter.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import 'utils.libsonnet'; 2 | 3 | function( 4 | config, 5 | schedule='20 0 * * 5', 6 | ) { 7 | 'snapshotter-cronjob': config.kube('batch/v1beta1', 'CronJob', { 8 | file:: 'snapshotter.yml', 9 | metadata: { 10 | name: 'snapshotter', 11 | labels: { 12 | app: 'auctus', 13 | what: 'snapshotter', 14 | }, 15 | }, 16 | spec: { 17 | schedule: schedule, 18 | jobTemplate: { 19 | metadata: { 20 | labels: { 21 | app: 'auctus', 22 | what: 'snapshotter', 23 | }, 24 | }, 25 | spec: { 26 | template: { 27 | metadata: { 28 | labels: { 29 | app: 'auctus', 30 | what: 'snapshotter', 31 | }, 32 | }, 33 | spec: { 34 | restartPolicy: 'Never', 35 | securityContext: { 36 | runAsNonRoot: true, 37 | }, 38 | containers: [ 39 | { 40 | name: 'snapshotter', 41 | image: config.image, 42 | imagePullPolicy: 'IfNotPresent', 43 | args: ['snapshotter'], 44 | env: utils.env( 45 | { 46 | LOG_FORMAT: config.log_format, 47 | ELASTICSEARCH_HOSTS: 'elasticsearch:9200', 48 | ELASTICSEARCH_PREFIX: config.elasticsearch.prefix, 49 | } 50 | + utils.object_store_env(config.object_store) 51 | ), 52 | }, 53 | ], 54 | }, 55 | }, 56 | }, 57 | }, 58 | }, 59 | }), 60 | } 61 | -------------------------------------------------------------------------------- /contrib/k8s/auctus.libsonnet: -------------------------------------------------------------------------------- 1 | local app = import 'app.libsonnet'; 2 | local ckan = import 'discovery/ckan.libsonnet'; 3 | local socrata = import 'discovery/socrata.libsonnet'; 4 | local test_discoverer = import 'discovery/test.libsonnet'; 5 | local uaz_indicators = import 'discovery/uaz-indicators.libsonnet'; 6 | local worldbank = import 'discovery/worldbank.libsonnet'; 7 | local zenodo = import 'discovery/zenodo.libsonnet'; 8 | local elasticsearch = import 'elasticsearch.libsonnet'; 9 | local ingress = import 'ingress.libsonnet'; 10 | local jaeger = import 'jaeger.libsonnet'; 11 | local minio = import 'minio.libsonnet'; 12 | local monitoring = import 'monitoring.libsonnet'; 13 | local nominatim = import 'nominatim.libsonnet'; 14 | local rabbitmq = import 'rabbitmq.libsonnet'; 15 | local redis = import 'redis.libsonnet'; 16 | local snapshotter = import 'snapshotter.libsonnet'; 17 | local volumes_local = import 'volumes-local.libsonnet'; 18 | local volumes = import 'volumes.libsonnet'; 19 | 20 | function(config) ( 21 | local data = ( 22 | {} 23 | + redis(config) 24 | + elasticsearch(config) 25 | + rabbitmq(config) 26 | + nominatim(config) 27 | + app(config) 28 | + snapshotter(config) 29 | + ingress(config) 30 | + minio(config) 31 | + monitoring(config) 32 | + jaeger(config) 33 | + ckan(config) 34 | + socrata(config) 35 | + uaz_indicators(config) 36 | + worldbank(config) 37 | + zenodo(config) 38 | //+ test_discoverer(config) 39 | ); 40 | 41 | local files = std.set([data[k].file for k in std.objectFields(data)]); 42 | 43 | { 44 | [file]: 
[file]: std.manifestYamlStream([ 45 | data[k] 46 | for k in std.objectFields(data) 47 | if data[k] != null && data[k].file == file 48 | ]) 49 | for file in files 50 | } 51 | ) 52 | -------------------------------------------------------------------------------- /frontend/src/components/JoinColumnsSelector/FunctionBin.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import {useDrop} from 'react-dnd'; 3 | import * as Icon from 'react-feather'; 4 | 5 | const ItemType = 'badge'; 6 | 7 | const functionBinStyle = (background: string): React.CSSProperties => ({ 8 | border: '1px solid #c0c0c0', 9 | padding: '.5rem', 10 | margin: '0.25rem', 11 | minHeight: '100px', 12 | minWidth: '100px', 13 | verticalAlign: 'middle', 14 | backgroundColor: background, 15 | }); 16 | 17 | interface FunctionBinProps { 18 | fn: string; 19 | label?: string; 20 | } 21 | 22 | const FunctionBin: React.FC<FunctionBinProps> = ({fn, label}) => { 23 | const [{canDrop, isOver}, drop] = useDrop({ 24 | accept: ItemType, 25 | drop: () => ({name: fn}), 26 | collect: monitor => ({ 27 | isOver: monitor.isOver(), 28 | canDrop: monitor.canDrop(), 29 | }), 30 | }); 31 | const isActive = canDrop && isOver; 32 | return ( 33 |
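/* useDrop registers the subtree below as a drop target for 'badge' items; the drop handler returns {name: fn} so the drag source can tell which function bin received it. */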
<div ref={drop}> 34 | <div style={functionBinStyle(isActive ? '#e6ffe6' : '#ffffff')}>
<span> 44 | {isActive ? ( 45 | 'Release!' 46 | ) : ( 47 | <> 48 | {label ? ( 49 | <span>{label}</span> 50 | ) : ( 51 | <> 52 | <b>{fn.toUpperCase()}</b>( 53 | <Icon.Columns size={13} />) 54 | </> 55 | )} 56 | </> 57 | )} 58 | </span> 59 | </div>
60 | </div>
    61 | ); 62 | }; 63 | 64 | export {FunctionBin}; 65 | -------------------------------------------------------------------------------- /tests/data/spatiotemporal.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import random 3 | import os 4 | 5 | 6 | GRID_CELL_SIZE = 0.001 7 | COLORS = ['red', 'green', 'blue', 'yellow', 'orange'] 8 | 9 | 10 | def main(): 11 | lat = 43.237597 12 | lon = 6.072545 13 | 14 | data_dir = os.path.dirname(__file__) 15 | 16 | with open(os.path.join(data_dir, 'spatiotemporal.csv'), 'w') as f_data: 17 | print('date,latitude,longitude,color', file=f_data) 18 | rand = random.Random(1) 19 | for t in range(20): 20 | time = datetime(2006, 6, 20) 21 | time += timedelta(minutes=t * 30) 22 | for _ in range(10): 23 | print( 24 | '%s,%.3f,%.3f,%s' % ( 25 | time.isoformat(), 26 | lat + GRID_CELL_SIZE * (rand.random() * 6 - 3), 27 | lon + GRID_CELL_SIZE * (rand.random() * 6 - 3), 28 | rand.choice(COLORS), 29 | ), 30 | file=f_data, 31 | ) 32 | 33 | with open(os.path.join(data_dir, 'spatiotemporal_aug.csv'), 'w') as f_data: 34 | print('date,latitude,longitude', file=f_data) 35 | for t in range(3): 36 | time = datetime(2006, 6, 20, 6) 37 | time += timedelta(hours=t) 38 | for x in range(-1, 1): 39 | for y in range(-1, 1): 40 | print( 41 | '%s,%.3f,%.3f' % ( 42 | time.isoformat(), 43 | lat + GRID_CELL_SIZE * y, 44 | lon + GRID_CELL_SIZE * x, 45 | ), 46 | file=f_data, 47 | ) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /frontend/src/spatial-utils.ts: -------------------------------------------------------------------------------- 1 | import MapBrowserEvent from 'ol/MapBrowserEvent'; 2 | import Geometry from 'ol/geom/Geometry'; 3 | import {FeatureLike} from 'ol/Feature'; 4 | import {Map} from 'ol'; 5 | import {Extent} from 'ol/extent'; 6 | 7 | // 8 | // Following types are a temporary workaround to a bug in typings from the 9 | // OpenLayers library (package @types/ol) This may be removed after upgrading 10 | // this library to a newer version 11 | // 12 | export interface MyMapBrowserEvent extends MapBrowserEvent { 13 | pointerEvent: PointerEvent; 14 | } 15 | 16 | interface MyGeometry extends Geometry { 17 | getCoordinates(): number[][][]; 18 | } 19 | 20 | export function transformCoordinates(feature: FeatureLike) { 21 | const geometry = feature.getGeometry() as MyGeometry; 22 | const transformedGeometry = geometry 23 | .clone() 24 | .transform('EPSG:3857', 'EPSG:4326') as MyGeometry; 25 | const coordinates = transformedGeometry.getCoordinates()[0]; 26 | return { 27 | topLeftLat: coordinates[0][1], 28 | topLeftLon: coordinates[0][0], 29 | bottomRightLat: coordinates[2][1], 30 | bottomRightLon: coordinates[2][0], 31 | }; 32 | } 33 | 34 | export function centralizeMapToExtent(map: Map, extent: Extent) { 35 | map.getView().fit(extent); 36 | map.updateSize(); 37 | } 38 | 39 | export function centralizeMapToFeature(map: Map, feature: FeatureLike) { 40 | const extent = feature.getGeometry()?.getExtent(); 41 | if (extent) { 42 | centralizeMapToExtent(map, extent); 43 | } 44 | } 45 | 46 | export function wrapLongitude(x: number) { 47 | if (-180 <= x && x <= 180) { 48 | return x; 49 | } 50 | console.log(`> ${x} ...`); 51 | x += 180; 52 | x = x % 360; 53 | x = (x + 360) % 360; // second pass for negative remainders 54 | x -= 180; 55 | console.log(`<< ${x}`); 56 | return x; 57 | } 58 | 
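A note on the double-pass modulo in wrapLongitude above: JavaScript's `%` keeps the sign of the dividend, so a single remainder would leave far-out-of-range negative values un-normalized. A minimal sketch of the edge cases (the inputs are arbitrary illustrative values, not taken from the codebase):

```ts
// Sanity checks for wrapLongitude; inputs are illustrative.
import {wrapLongitude} from './spatial-utils';

console.assert(wrapLongitude(170) === 170);  // already in [-180, 180]: returned as-is
console.assert(wrapLongitude(190) === -170); // wraps across the antimeridian
console.assert(wrapLongitude(360) === 0);    // a full turn comes back to 0
console.assert(wrapLongitude(-541) === 179); // (-541 + 180) % 360 is negative; the second % fixes it
```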
-------------------------------------------------------------------------------- /frontend/src/components/visus/Loading/Spinner.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import styled, {keyframes} from 'styled-components'; 3 | 4 | export const keyFrameInfiniteSpin = keyframes` 5 | from {transform: rotate(0deg)} 6 | to {transform: rotate(360deg)} 7 | `; 8 | 9 | export const SpinningSvg = styled.svg` 10 | animation-name: ${keyFrameInfiniteSpin}; 11 | transition-property: transform; 12 | animation-iteration-count: infinite; 13 | animation-timing-function: linear; 14 | `; 15 | 16 | interface SpinnerProps { 17 | color?: string; 18 | speed?: string; 19 | gap?: number; 20 | thickness?: number; 21 | size?: string; 22 | } 23 | 24 | class Spinner extends React.PureComponent<SpinnerProps> { 25 | static defaultProps = { 26 | color: 'rgba(0,0,0,0.4)', 27 | gap: 4, 28 | thickness: 4, 29 | size: '1.0em', 30 | }; 31 | 32 | speedSwitch(speed?: string) { 33 | if (speed === 'fast') { 34 | return 600; 35 | } 36 | if (speed === 'slow') { 37 | return 900; 38 | } 39 | return 750; 40 | } 41 | 42 | render() { 43 | return ( 44 | <SpinningSvg height={this.props.size} width={this.props.size} viewBox="0 0 32 32" role="img" style={{animationDuration: `${this.speedSwitch(this.props.speed)}ms`}} > 54 | <circle role="presentation" cx={16} cy={16} r={14 - this.props.thickness! / 2} stroke={this.props.color} fill="none" strokeWidth={this.props.thickness} strokeDasharray={Math.PI * 2 * (11 - this.props.gap!)} strokeLinecap="round" /> 65 | </SpinningSvg> 66 | ); 67 | } 68 | } 69 | 70 | export {Spinner}; 71 | -------------------------------------------------------------------------------- /lib_core/datamart_core/types.py: -------------------------------------------------------------------------------- 1 | # Column types 2 | 3 | MISSING_DATA = 'https://metadata.datadrivendiscovery.org/types/MissingData' 4 | """No data (whole column is missing)""" 5 | 6 | INTEGER = 'http://schema.org/Integer' 7 | """Integer (numbers without a decimal point)""" 8 | 9 | FLOAT = 'http://schema.org/Float' 10 | """Floating-point numbers""" 11 | 12 | TEXT = 'http://schema.org/Text' 13 | """Text, better represented as strings""" 14 | 15 | BOOLEAN = 'http://schema.org/Boolean' 16 | """Booleans, e.g. only the two values \"true\" and \"false\"""" 17 | 18 | LATITUDE = 'http://schema.org/latitude' 19 | """Numerical values representing latitude coordinates""" 20 | 21 | LONGITUDE = 'http://schema.org/longitude' 22 | """Numerical values representing longitude coordinates""" 23 | 24 | DATE_TIME = 'http://schema.org/DateTime' 25 | """A specific instant in time (not partial ones such as "July 4" or "12am")""" 26 | 27 | ADDRESS = 'http://schema.org/address' 28 | """The street address of a location""" 29 | 30 | ADMIN = 'http://schema.org/AdministrativeArea' 31 | """A named administrative area, such as a country, state, or city""" 32 | 33 | URL = 'http://schema.org/URL' 34 | """A URL""" 35 | 36 | FILE_PATH = 'https://metadata.datadrivendiscovery.org/types/FileName' 37 | """A filename""" 38 | 39 | ID = 'http://schema.org/identifier' 40 | """An identifier""" 41 | 42 | CATEGORICAL = 'http://schema.org/Enumeration' 43 | """Categorical values, i.e.
drawn from a limited number of options""" 44 | 45 | GEO_POINT = 'http://schema.org/GeoCoordinates' 46 | """A geographic location (latitude+longitude coordinates)""" 47 | 48 | GEO_POLYGON = 'http://schema.org/GeoShape' 49 | """A geographic shape described by its coordinates""" 50 | 51 | 52 | # Dataset types 53 | 54 | DATASET_NUMERICAL = 'numerical' 55 | DATASET_CATEGORICAL = 'categorical' 56 | DATASET_SPATIAL = 'spatial' 57 | DATASET_TEMPORAL = 'temporal' 58 | -------------------------------------------------------------------------------- /lib_profiler/datamart_profiler/types.py: -------------------------------------------------------------------------------- 1 | # Column types 2 | 3 | MISSING_DATA = 'https://metadata.datadrivendiscovery.org/types/MissingData' 4 | """No data (whole column is missing)""" 5 | 6 | INTEGER = 'http://schema.org/Integer' 7 | """Integer (numbers without a decimal point)""" 8 | 9 | FLOAT = 'http://schema.org/Float' 10 | """Floating-point numbers""" 11 | 12 | TEXT = 'http://schema.org/Text' 13 | """Text, better represented as strings""" 14 | 15 | BOOLEAN = 'http://schema.org/Boolean' 16 | """Booleans, e.g. only the two values \"true\" and \"false\"""" 17 | 18 | LATITUDE = 'http://schema.org/latitude' 19 | """Numerical values representing latitude coordinates""" 20 | 21 | LONGITUDE = 'http://schema.org/longitude' 22 | """Numerical values representing longitude coordinates""" 23 | 24 | DATE_TIME = 'http://schema.org/DateTime' 25 | """A specific instant in time (not partial ones such as "July 4" or "12am")""" 26 | 27 | ADDRESS = 'http://schema.org/address' 28 | """The street address of a location""" 29 | 30 | ADMIN = 'http://schema.org/AdministrativeArea' 31 | """A named administrative area, such as a country, state, or city""" 32 | 33 | URL = 'http://schema.org/URL' 34 | """A URL""" 35 | 36 | FILE_PATH = 'https://metadata.datadrivendiscovery.org/types/FileName' 37 | """A filename""" 38 | 39 | ID = 'http://schema.org/identifier' 40 | """An identifier""" 41 | 42 | CATEGORICAL = 'http://schema.org/Enumeration' 43 | """Categorical values, i.e. drawn from a limited number of options""" 44 | 45 | GEO_POINT = 'http://schema.org/GeoCoordinates' 46 | """A geographic location (latitude+longitude coordinates)""" 47 | 48 | GEO_POLYGON = 'http://schema.org/GeoShape' 49 | """A geographic shape described by its coordinates""" 50 | 51 | 52 | # Dataset types 53 | 54 | DATASET_NUMERICAL = 'numerical' 55 | DATASET_CATEGORICAL = 'categorical' 56 | DATASET_SPATIAL = 'spatial' 57 | DATASET_TEMPORAL = 'temporal' 58 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | import os 8 | import subprocess 9 | 10 | 11 | # -- Project information ----------------------------------------------------- 12 | 13 | project = 'Auctus' 14 | copyright = '2019, New York University' 15 | author = 'Remi Rampin' 16 | 17 | # The full version, including alpha/beta/rc tags 18 | release = subprocess.check_output(['git', 'describe'], encoding='ascii') 19 | os.environ['DATAMART_VERSION'] = release 20 | 21 | 22 | # -- General configuration --------------------------------------------------- 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be 25 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 26 | # ones. 27 | extensions = [ 28 | 'sphinx.ext.autodoc', 29 | ] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # List of patterns, relative to source directory, that match files and 35 | # directories to ignore when looking for source files. 36 | # This pattern also affects html_static_path and html_extra_path. 37 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 38 | 39 | 40 | # -- Options for HTML output ------------------------------------------------- 41 | 42 | # The theme to use for HTML and HTML Help pages. See the documentation for 43 | # a list of builtin themes. 44 | # 45 | html_theme = 'sphinx_rtd_theme' 46 | 47 | # Add any paths that contain custom static files (such as style sheets) here, 48 | # relative to this directory. They are copied after the builtin static files, 49 | # so a file named "default.css" will overwrite the builtin "default.css". 50 | html_static_path = ['_static'] 51 | -------------------------------------------------------------------------------- /profiler/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'aio-pika', 10 | 'elasticsearch~=7.0', 11 | 'lazo-index-service==0.7.0', 12 | 'opentelemetry-api', 13 | 'opentelemetry-distro', 14 | 'opentelemetry-instrumentation-elasticsearch', 15 | 'opentelemetry-instrumentation-grpc', 16 | 'prometheus_client', 17 | 'xlrd', 18 | 'defusedxml', 19 | 'datamart-core', 20 | 'datamart-materialize', 21 | 'datamart-profiler', 22 | ] 23 | setup(name='datamart-profiler-service', 24 | version='0.0', 25 | py_modules=['profiler'], 26 | entry_points={ 27 | 'console_scripts': [ 28 | 'profiler = profiler:main']}, 29 | install_requires=req, 30 | description="Data profiling service of Auctus", 31 | author="Remi Rampin", 32 | author_email='remi.rampin@nyu.edu', 33 | maintainer="Remi Rampin", 34 | maintainer_email='remi.rampin@nyu.edu', 35 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 36 | project_urls={ 37 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 38 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 39 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 40 | }, 41 | long_description="Data profiling service of Auctus", 42 | license='Apache-2.0', 43 | keywords=['auctus', 'datamart'], 44 | classifiers=[ 45 | 'Development Status :: 3 - Alpha', 46 | 'Intended Audience :: Science/Research', 47 | 'License :: OSI Approved :: Apache Software License', 48 | 'Natural Language :: English', 49 | 'Operating System :: OS Independent', 50 | 'Programming Language :: Python :: 3 :: 
Only', 51 | 'Topic :: Scientific/Engineering :: Information Analysis']) 52 | -------------------------------------------------------------------------------- /coordinator/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'aio-pika', 10 | 'elasticsearch~=7.0', 11 | 'prometheus_client', 12 | 'PyYaml', 13 | 'jinja2', 14 | 'tornado>=5.0', 15 | 'datamart-core', 16 | ] 17 | setup(name='datamart-coordinator-service', 18 | version='0.0', 19 | packages=['coordinator'], 20 | package_data={'coordinator': [ 21 | 'static/css/*.css', 'static/css/*.css.map', 22 | 'static/js/*.js', 'static/js/*.js.map', 23 | 'templates/*.html', 24 | 'elasticsearch.yml', 25 | ]}, 26 | entry_points={ 27 | 'console_scripts': [ 28 | 'coordinator = coordinator.web:main']}, 29 | install_requires=req, 30 | description="Coordinator service for Auctus", 31 | author="Remi Rampin", 32 | author_email='remi.rampin@nyu.edu', 33 | maintainer="Remi Rampin", 34 | maintainer_email='remi.rampin@nyu.edu', 35 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 36 | project_urls={ 37 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 38 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 39 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 40 | }, 41 | long_description="Coordinator service for Auctus", 42 | license='Apache-2.0', 43 | keywords=['auctus', 'datamart'], 44 | classifiers=[ 45 | 'Development Status :: 4 - Beta', 46 | 'Intended Audience :: Science/Research', 47 | 'License :: OSI Approved :: Apache Software License', 48 | 'Operating System :: Unix', 49 | 'Programming Language :: JavaScript', 50 | 'Programming Language :: Python :: 3 :: Only', 51 | 'Topic :: Scientific/Engineering :: Information Analysis']) 52 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/types.py: -------------------------------------------------------------------------------- 1 | # Column types 2 | 3 | MISSING_DATA = 'https://metadata.datadrivendiscovery.org/types/MissingData' 4 | """No data (whole column is missing)""" 5 | 6 | INTEGER = 'http://schema.org/Integer' 7 | """Integer (numbers without a decimal point)""" 8 | 9 | FLOAT = 'http://schema.org/Float' 10 | """Floating-point numbers""" 11 | 12 | TEXT = 'http://schema.org/Text' 13 | """Text, better represented as strings""" 14 | 15 | BOOLEAN = 'http://schema.org/Boolean' 16 | """Booleans, e.g. only the two values \"true\" and \"false\"""" 17 | 18 | LATITUDE = 'http://schema.org/latitude' 19 | """Numerical values representing latitude coordinates""" 20 | 21 | LONGITUDE = 'http://schema.org/longitude' 22 | """Numerical values representing longitude coordinates""" 23 | 24 | DATE_TIME = 'http://schema.org/DateTime' 25 | """A specific instant in time (not partial ones such as "July 4" or "12am")""" 26 | 27 | ADDRESS = 'http://schema.org/address' 28 | """The street address of a location""" 29 | 30 | ADMIN = 'http://schema.org/AdministrativeArea' 31 | """A named administrative area, such as a country, state, or city""" 32 | 33 | URL = 'http://schema.org/URL' 34 | """A URL""" 35 | 36 | FILE_PATH = 'https://metadata.datadrivendiscovery.org/types/FileName' 37 | """A filename""" 38 | 39 | ID = 'http://schema.org/identifier' 40 | """An identifier""" 41 | 42 | CATEGORICAL = 'http://schema.org/Enumeration' 43 | """Categorical values, i.e. 
drawn from a limited number of options""" 44 | 45 | GEO_POINT = 'http://schema.org/GeoCoordinates' 46 | """A geographic location (latitude+longitude coordinates)""" 47 | 48 | GEO_POLYGON = 'http://schema.org/GeoShape' 49 | """A geographic shape described by its coordinates""" 50 | 51 | 52 | # Dataset types 53 | 54 | DATASET_NUMERICAL = 'numerical' 55 | DATASET_CATEGORICAL = 'categorical' 56 | DATASET_SPATIAL = 'spatial' 57 | DATASET_TEMPORAL = 'temporal' 58 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/pivot.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from datamart_materialize.utils import SimpleConverter 4 | 5 | 6 | VALUE_COLUMN_LABEL = 'value' 7 | 8 | 9 | def pivot_table( 10 | source_filename, dest_fileobj, except_columns, date_label='date', 11 | ): 12 | with open(source_filename, 'r') as src_fp: 13 | src = iter(csv.reader(src_fp)) 14 | dst = csv.writer(dest_fileobj) 15 | 16 | # Read original columns, some are carried over 17 | try: 18 | orig_columns = next(src) 19 | except StopIteration: 20 | raise ValueError("Empty table") 21 | carried_columns = [orig_columns[i] for i in except_columns] 22 | 23 | # Generate new header 24 | dst.writerow(carried_columns + [date_label, VALUE_COLUMN_LABEL]) 25 | 26 | # Indexes of date columns 27 | date_indexes = [ 28 | i for i in range(len(orig_columns)) 29 | if i not in except_columns 30 | ] 31 | dates = [ 32 | name for i, name in enumerate(orig_columns) 33 | if i not in except_columns 34 | ] 35 | 36 | for row in src: 37 | carried_values = [row[i] for i in except_columns] 38 | for date, date_idx in zip(dates, date_indexes): 39 | dst.writerow(carried_values + [date, row[date_idx]]) 40 | 41 | 42 | class PivotConverter(SimpleConverter): 43 | """Adapter pivoting a table. 
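The table is unpivoted: the columns listed in except_columns are carried over unchanged, and every other (header, cell) pair becomes one output row under the date_label and value columns; e.g. a row ('NY', '10', '12') with headers ('city', '2019', '2020') yields ('NY', '2019', '10') and ('NY', '2020', '12').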
44 | """ 45 | def __init__(self, writer, *, except_columns, date_label='date'): 46 | super(PivotConverter, self).__init__(writer) 47 | self.except_columns = except_columns 48 | self.date_label = date_label 49 | 50 | def transform(self, source_filename, dest_fileobj): 51 | pivot_table( 52 | source_filename, 53 | dest_fileobj, 54 | self.except_columns, 55 | self.date_label, 56 | ) 57 | -------------------------------------------------------------------------------- /apiserver/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | os.chdir(os.path.abspath(os.path.dirname(__file__))) 6 | 7 | 8 | req = [ 9 | 'advocate>=1.0,<2', 10 | 'aio-pika', 11 | 'elasticsearch~=7.0', 12 | 'redis~=3.4', 13 | 'lazo-index-service==0.7.0', 14 | 'opentelemetry-distro', 15 | 'opentelemetry-instrumentation-elasticsearch', 16 | 'opentelemetry-instrumentation-grpc', 17 | 'opentelemetry-instrumentation-tornado', 18 | 'prometheus_client', 19 | 'tornado>=5.0', 20 | 'datamart-augmentation', 21 | 'datamart-core', 22 | 'datamart-materialize', 23 | 'datamart-profiler', 24 | ] 25 | setup(name='datamart-api-service', 26 | version='0.0', 27 | packages=['apiserver'], 28 | entry_points={ 29 | 'console_scripts': [ 30 | 'datamart-apiserver = apiserver.main:main']}, 31 | install_requires=req, 32 | description="API service of Auctus", 33 | author="Remi Rampin", 34 | author_email='remi.rampin@nyu.edu', 35 | maintainer="Remi Rampin", 36 | maintainer_email='remi.rampin@nyu.edu', 37 | url='https://gitlab.com/ViDA-NYU/auctus/auctus', 38 | project_urls={ 39 | 'Homepage': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 40 | 'Source': 'https://gitlab.com/ViDA-NYU/auctus/auctus', 41 | 'Tracker': 'https://gitlab.com/ViDA-NYU/auctus/auctus/-/issues', 42 | }, 43 | long_description="API service of Auctus", 44 | license='Apache-2.0', 45 | keywords=['auctus', 'datamart'], 46 | classifiers=[ 47 | 'Development Status :: 4 - Beta', 48 | 'Intended Audience :: Science/Research', 49 | 'License :: OSI Approved :: Apache Software License', 50 | 'Operating System :: Unix', 51 | 'Programming Language :: Python :: 3 :: Only', 52 | 'Topic :: Scientific/Engineering :: Information Analysis']) 53 | -------------------------------------------------------------------------------- /frontend/src/components/SearchBar/SearchBar.tsx: -------------------------------------------------------------------------------- 1 | import './SearchBar.css'; 2 | import React from 'react'; 3 | import * as Icon from 'react-feather'; 4 | 5 | interface SearchBarProps { 6 | active: boolean; 7 | placeholder?: string; 8 | value: string; 9 | onQueryChange: (query: string) => void; 10 | onSubmitQuery: () => void; 11 | } 12 | 13 | class SearchBar extends React.PureComponent { 14 | constructor(props: SearchBarProps) { 15 | super(props); 16 | this.handleChange = this.handleChange.bind(this); 17 | this.handleSubmit = this.handleSubmit.bind(this); 18 | } 19 | 20 | isActive() { 21 | return this.props.active; 22 | } 23 | 24 | handleChange(event: React.ChangeEvent) { 25 | const query = event.target.value; 26 | this.props.onQueryChange(query); 27 | } 28 | 29 | handleSubmit(e: React.FormEvent) { 30 | e.preventDefault(); 31 | this.props.onSubmitQuery(); 32 | } 33 | 34 | render() { 35 | return ( 36 |
<form onSubmit={this.handleSubmit}> 37 | <div className="input-group">
38 | <input type="text" className="form-control" placeholder={this.props.placeholder} value={this.props.value} onChange={this.handleChange} /> 46 | <div className="input-group-append">
<button 48 | type="submit" className="btn btn-primary" onClick={() => this.props.onSubmitQuery()} 49 | > 50 | <Icon.Search className="feather" /> 55 | </button> 56 | </div> 57 | </div>
58 | </form>
    59 |
    60 | ); 61 | } 62 | } 63 | 64 | export {SearchBar}; 65 | -------------------------------------------------------------------------------- /frontend/src/components/Chip/Chip.css: -------------------------------------------------------------------------------- 1 | .chip { 2 | /* color: rgba(0, 0, 0, 0.87); */ 3 | border: none; 4 | cursor: default; 5 | height: 32px; 6 | display: inline-flex; 7 | outline: 0; 8 | padding: 0 0.5rem; 9 | font-size: 0.8125rem; 10 | box-sizing: border-box; 11 | transition: background-color 300ms cubic-bezier(0.4, 0, 0.2, 1) 0ms, 12 | box-shadow 300ms cubic-bezier(0.4, 0, 0.2, 1) 0ms; 13 | align-items: center; 14 | /* font-family: "Roboto", "Helvetica", "Arial", sans-serif; */ 15 | font-family: "Oswald"; 16 | white-space: nowrap; 17 | border-radius: 16px; 18 | vertical-align: middle; 19 | justify-content: center; 20 | text-decoration: none; 21 | background-color: #e0e0e0; 22 | } 23 | 24 | .chip-clickable { 25 | cursor: pointer; 26 | user-select: none; 27 | -webkit-tap-highlight-color: transparent; 28 | } 29 | 30 | .chip-clickable:hover { 31 | background-color: #f0f0f0; 32 | } 33 | 34 | .chip-btn-close { 35 | margin-left: .25rem!important; 36 | margin-bottom: 0.1rem!important; 37 | } 38 | 39 | .chip-icon { 40 | margin-right: .3rem!important; 41 | margin-bottom: 0.1rem!important; 42 | } 43 | 44 | .chip .chip-btn-close { 45 | color: rgb(0, 0, 0, 0.5); 46 | cursor: pointer; 47 | user-select: none; 48 | -webkit-tap-highlight-color: transparent; 49 | } 50 | .chip .chip-btn-close:hover { 51 | color: rgb(0, 0, 0, 1.0); 52 | } 53 | 54 | .chip-outline { 55 | border: 1px solid #ced4da; 56 | background-color: transparent; 57 | } 58 | .chip-outline .chip-icon { 59 | color: var(--primary); 60 | } 61 | 62 | 63 | .chip-primary { 64 | border-color: var(--primary); 65 | } 66 | .chip-primary .chip-icon { 67 | color: var(--primary); 68 | } 69 | .chip-primary .chip-btn-close { 70 | color: rgb(99, 80, 139, 0.7); 71 | } 72 | .chip-primary .chip-btn-close:hover { 73 | color: rgb(99, 80, 139); 74 | } 75 | 76 | 77 | .chip-group > * { 78 | display: inline-flex; 79 | margin-right: 0.25rem; 80 | } 81 | 82 | .chip-group > *:last-child { 83 | margin-left: 0px; 84 | } 85 | -------------------------------------------------------------------------------- /frontend/src/App.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import {CenteredHorizontalLogo} from './components/Logo/Logo'; 3 | import {MainMenu} from './components/MainMenu/MainMenu'; 4 | import {BrowserRouter as Router, Link, Switch, Route} from 'react-router-dom'; 5 | import {Upload} from './components/Upload/Upload'; 6 | import {Statistics} from './components/Statistics/Statistics'; 7 | import {SearchApp} from './components/SearchApp/SearchApp'; 8 | 9 | class App extends React.Component { 10 | render() { 11 | return ( 12 |
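/* The upload and statistics pages render the centered logo and main menu above their content; any other path falls through to the search application. */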
<div> 13 | <Router> 14 | <Switch> 15 | <Route 16 | path="/upload" 17 | render={() => ( 18 | <div>
19 | <Link to="/"> 20 | <CenteredHorizontalLogo /> 21 | </Link> 22 | <MainMenu /> 23 | <Upload /> 24 | </div>
25 | )} 26 | /> 27 | <Route 28 | path="/statistics" 29 | render={() => ( 30 | <div>
31 | <Link to="/"> 32 | <CenteredHorizontalLogo /> 33 | </Link> 34 | <MainMenu /> 35 | <Statistics /> 36 | </div>
37 | )} 38 | /> 39 | <Route 40 | render={() => ( 42 | <> 43 | <MainMenu /> 44 | <SearchApp /> 49 | </> 50 | )} 51 | /> 52 | </Switch>
53 | </Router>
54 | </div>
55 | ); 56 | } 57 | } 58 | 59 | export {App}; 60 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | Scripts 2 | ======= 3 | 4 | This folder contains scripts that make it easy to perform certain tasks. Those scripts are meant to be run inside the Auctus environment; the ones prefixed with `docker_` will run the corresponding script inside an Auctus container (provided the images have been built using the default docker-compose names). 5 | 6 | * setup.sh: Run this once to set up your local checkout. This sets up the permissions on the volumes for docker-compose. 7 | * docker_import_snapshot.sh: This downloads a dump of Elasticsearch from https://auctus.vida-nyu.org/snapshot/ and imports it using import_all.py 8 | * docker_import_all.sh / import_all.py: This can be used to load a dump of Elasticsearch as JSON files. Useful to restore a backup 9 | * import.py: Import a single dataset from a JSON file 10 | * reprocess_all.py: This loads a dump of Elasticsearch as JSON files, but reprocesses the datasets 11 | * freshen_old_index.py: This reprocesses datasets that were profiled by old versions 12 | * docker_export_all.sh / export_all.py: This can be used to do a backup of the index. It creates a dump of Elasticsearch as JSON files 13 | * docker_save_uploads.sh: This can be used to save the datasets that have been manually uploaded into Auctus (the data itself, not the indexed JSON documents) 14 | * delete_dataset.py: Removes a single dataset from the index 15 | * list_big_datasets.py: Lists the big datasets that have been indexed (by looking for the 'size' property above 50 MB) 16 | * list_sources.py: This lists the number of datasets in the index per source (this is now shown on the index page of the coordinator as well) 17 | * docker_purge_source.sh / purge_source.py: This removes all datasets from a given source 18 | * clear_caches.py / docker_clear_caches.sh: This safely clears the caches 19 | * upload_dataset.sh: This profiles and adds a dataset to the index 20 | * report-uploads.sh: Alerts when datasets are uploaded to the system 21 | * dataset_to_sup_index.py: This creates the supplementary column indices after 5507ab47 22 | -------------------------------------------------------------------------------- /docker/grafana.dockerfile: -------------------------------------------------------------------------------- 1 | ARG GRAFANA_VERSION="latest" 2 | 3 | FROM python:3.8 AS tini 4 | 5 | ENV TINI_VERSION v0.18.0 6 | RUN curl -Lo /tini https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini && \ 7 | chmod +x /tini 8 | 9 | FROM grafana/grafana:${GRAFANA_VERSION} 10 | 11 | USER root 12 | 13 | ARG GF_INSTALL_IMAGE_RENDERER_PLUGIN="false" 14 | 15 | ENV GF_PATHS_PLUGINS="/var/lib/grafana-plugins" 16 | 17 | RUN mkdir -p "$GF_PATHS_PLUGINS" && \ 18 | chown -R grafana:root "$GF_PATHS_PLUGINS" 19 | 20 | RUN if [ $GF_INSTALL_IMAGE_RENDERER_PLUGIN != "false" ]; then \ 21 | echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \ 22 | echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \ 23 | echo "http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories && \ 24 | apk --no-cache upgrade && \ 25 | apk add --no-cache udev ttf-opensans chromium && \ 26 | rm -rf /tmp/* && \ 27 | rm -rf /usr/share/grafana/tools/phantomjs; \ 28 | fi 29 | 30 | USER grafana 31 | 32 |
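# Point the image renderer at the system Chromium installed above; both spellings of the variable are set because plugin releases have used different names.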
ENV GF_RENDERER_PLUGIN_CHROME_BIN="/usr/bin/chromium-browser" 33 | ENV GF_PLUGIN_RENDERING_CHROME_BIN="/usr/bin/chromium-browser" 34 | 35 | RUN if [ $GF_INSTALL_IMAGE_RENDERER_PLUGIN != "false" ]; then \ 36 | grafana-cli \ 37 | --pluginsDir "$GF_PATHS_PLUGINS" \ 38 | --pluginUrl https://github.com/grafana/grafana-image-renderer/releases/download/$GF_INSTALL_IMAGE_RENDERER_PLUGIN/plugin-linux-x64-glibc-no-chromium.zip \ 39 | plugins install grafana-image-renderer; \ 40 | fi 41 | 42 | ARG GF_INSTALL_PLUGINS="" 43 | 44 | RUN if [ ! -z "${GF_INSTALL_PLUGINS}" ]; then \ 45 | OLDIFS=$IFS; \ 46 | IFS=','; \ 47 | for plugin in ${GF_INSTALL_PLUGINS}; do \ 48 | IFS=$OLDIFS; \ 49 | grafana-cli --pluginsDir "$GF_PATHS_PLUGINS" plugins install ${plugin}; \ 50 | done; \ 51 | fi 52 | 53 | # Use tini so Chrome processes get reaped 54 | # https://github.com/grafana/grafana-image-renderer/issues/179 55 | COPY --from=tini /tini /tini 56 | ENTRYPOINT ["/tini", "--", "/run.sh"] 57 | -------------------------------------------------------------------------------- /scripts/reprocess_all.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script imports an exported index, but reprocesses the datasets. 4 | 5 | The name, description, and date are loaded from the old JSON, but the rest will 6 | be reprofiled instead. 7 | """ 8 | 9 | import aio_pika 10 | import asyncio 11 | import json 12 | import logging 13 | import os 14 | import sys 15 | 16 | from datamart_core.common import json2msg, decode_dataset_id 17 | 18 | 19 | async def import_all(folder): 20 | amqp_conn = await aio_pika.connect_robust( 21 | host=os.environ['AMQP_HOST'], 22 | port=int(os.environ['AMQP_PORT']), 23 | login=os.environ['AMQP_USER'], 24 | password=os.environ['AMQP_PASSWORD'], 25 | ) 26 | amqp_chan = await amqp_conn.channel() 27 | amqp_profile_exchange = await amqp_chan.declare_exchange( 28 | 'profile', 29 | aio_pika.ExchangeType.FANOUT, 30 | ) 31 | 32 | for name in os.listdir(folder): 33 | if not name.startswith('lazo.'): 34 | dataset_id = decode_dataset_id(name) 35 | path = os.path.join(folder, name) 36 | with open(path, 'r') as fp: 37 | obj = json.load(fp) 38 | metadata = dict(name=obj['name'], 39 | materialize=obj['materialize'], 40 | source=obj.get('source', 'unknown')) 41 | if obj.get('description'): 42 | metadata['description'] = obj['description'] 43 | if obj.get('date'): 44 | metadata['date'] = obj['date'] 45 | if obj.get('manual_annotations'): 46 | metadata['manual_annotations'] = obj['manual_annotations'] 47 | await amqp_profile_exchange.publish( 48 | json2msg(dict(id=dataset_id, metadata=metadata)), 49 | '', 50 | ) 51 | print('.', end='', flush=True) 52 | 53 | 54 | if __name__ == '__main__': 55 | logging.basicConfig(level=logging.INFO) 56 | 57 | loop = asyncio.get_event_loop() 58 | loop.run_until_complete(loop.create_task( 59 | import_all(sys.argv[1]) 60 | )) 61 | -------------------------------------------------------------------------------- /frontend/src/components/MainMenu/MainMenu.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import * as Icon from 'react-feather'; 3 | import './MainMenu.css'; 4 | import {DropdownMenu} from '../ui/DropdownMenu/DropdownMenu'; 5 | import {Link as RouterLink} from 'react-router-dom'; 6 | 7 | function Link(props: {path: string; label: string; icon: Icon.Icon}) { 8 | const content = ( 9 | 10 | {props.label} 11 | 12 | ); 13 | return ( 14 |
    15 | {props.path.startsWith('http:') || props.path.startsWith('https:') ? ( 16 | {content} 17 | ) : ( 18 | {content} 19 | )} 20 |
    21 | ); 22 | } 23 | 24 | class MainMenu extends React.PureComponent { 25 | render() { 26 | return ( 27 | 28 | {({active, onClick}) => ( 29 | <> 30 |
    31 |
    32 | 33 | 34 | 35 |
    36 | {active && ( 37 |
    38 | 43 | 48 | 53 |
    54 | )} 55 |
    56 | 57 | )} 58 |
    59 | ); 60 | } 61 | } 62 | 63 | export {MainMenu}; 64 | -------------------------------------------------------------------------------- /docker/install_deps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Install dependencies from poetry.lock 4 | 5 | This script is used as part of the Docker build to install all dependencies as 6 | an initial step before building the images. This makes caching efficient, 7 | allowing for faster builds that work offline. 8 | 9 | It means all images have all dependencies installed, but thanks to 10 | de-duplication, this generally uses less space if all images exist on the same 11 | machine. 12 | """ 13 | 14 | import subprocess 15 | import sys 16 | import toml 17 | 18 | 19 | def main(args): 20 | devel = False 21 | if args[0] == '--dev': 22 | devel = True 23 | args = args[1:] 24 | 25 | with open(args[0]) as fp: 26 | lockfile = toml.load(fp) 27 | 28 | packages = [] 29 | 30 | for package in lockfile['package']: 31 | if package['category'] == 'dev': 32 | if not devel: 33 | continue 34 | elif package['category'] != 'main': 35 | raise ValueError( 36 | "Unknown package category %s" % package['category'] 37 | ) 38 | 39 | if 'source' in package: 40 | if package['source']['type'] == 'git': 41 | packages.append('git+%s@%s' % ( 42 | package['source']['url'], 43 | package['source']['reference'], 44 | )) 45 | elif package['source']['type'] == 'url': 46 | packages.append(package['source']['url']) 47 | elif package['source']['type'] != 'directory': 48 | raise ValueError( 49 | "Unknown package source %s" % package['source']['type'] 50 | ) 51 | # Ignore 'directory' dependencies 52 | else: 53 | packages.append('%s==%s' % (package['name'], package['version'])) 54 | 55 | subprocess.check_call( 56 | [ 57 | 'pip3', 58 | '--disable-pip-version-check', 59 | '--no-cache-dir', 60 | 'install', 61 | ] + packages, 62 | ) 63 | 64 | 65 | if __name__ == '__main__': 66 | main(sys.argv[1:]) 67 | -------------------------------------------------------------------------------- /apiserver/apiserver/graceful_shutdown.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import signal 4 | import tornado.ioloop 5 | import tornado.web 6 | 7 | from datamart_core.common import log_future 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class GracefulApplication(tornado.web.Application): 14 | """Application that exits on SIGTERM once no GracefulHandlers are running. 
 15 | """ 16 | def __init__(self, *args, **kwargs): 17 | super(GracefulApplication, self).__init__(*args, **kwargs) 18 | 19 | self.is_closing = False 20 | self.nb_requests = 0 21 | self.close_condition = asyncio.Condition() 22 | 23 | signal.signal(signal.SIGTERM, self.signal_handler) 24 | 25 | def signal_handler(self, signum, frame): 26 | logger.warning("Got signal %s, exiting...", signum) 27 | self.is_closing = True 28 | tornado.ioloop.IOLoop.current().add_callback_from_signal(self.try_exit) 29 | 30 | def try_exit(self): 31 | async def do_exit(): 32 | async with self.close_condition: 33 | while self.nb_requests > 0: 34 | logger.info("%d requests in progress, waiting...", 35 | self.nb_requests) 36 | await self.close_condition.wait() 37 | logger.warning("Closing gracefully") 38 | tornado.ioloop.IOLoop.current().stop() 39 | 40 | log_future(asyncio.get_event_loop().create_task(do_exit()), logger) 41 | 42 | 43 | class GracefulHandler(tornado.web.RequestHandler): 44 | """Handlers that will prevent the application from exiting until they're done. 45 | """ 46 | def prepare(self): 47 | super(GracefulHandler, self).prepare() 48 | self.application.nb_requests += 1 49 | 50 | def on_finish(self): 51 | super(GracefulHandler, self).on_finish() 52 | 53 | app = self.application 54 | 55 | async def do_decrease(): 56 | async with app.close_condition: 57 | app.nb_requests -= 1 58 | app.close_condition.notify_all() 59 | 60 | log_future(asyncio.get_event_loop().create_task(do_decrease()), logger) 61 | -------------------------------------------------------------------------------- /frontend/src/components/Logo/auctus-logo.min.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * This function loads variables from HTML tags. It allows us to 3 | * dynamically configure the base path for the application. It can be useful 4 | * when configuring the system to run behind a proxy under a non-root path 5 | * such as: http://example.com/auctus/{all-paths}. 6 | * 7 | * This can be configured by adding an HTML meta-tag to the static HTML. 8 | * 9 | * <meta name="base_url" content="https://example.com/auctus" /> 10 | */ 11 | function loadVariableFromHTML(name: string): string { 12 | const meta = document.getElementsByName(name)[0]; 13 | let value: string | null = meta ? meta.getAttribute('content') : null; 14 | if (!value) { 15 | value = ''; 16 | } else if (value.endsWith('/')) { 17 | value = value.substring(0, value.length - 1); 18 | } 19 | return value; 20 | } 21 | 22 | /* 23 | * During web development, 24 | * - the web server is started via "npm start" and it runs at localhost:3000; 25 | * - the API server should be running at the address configured in the "proxy" 26 | * key from the package.json file in the project's root directory 27 | * 28 | * In the client code, we always send the API requests to the address where the 29 | * page is being served from. In 'development' mode, the create-react-app dev 30 | * server will proxy the requests to the appropriate backend running the REST 31 | * API. In production, the app is already served from the same server where 32 | * the REST API is served, so the requests will work seamlessly. 
33 | */ 34 | 35 | let baseUrl = `//${window.location.host}`; 36 | let apiUrl = baseUrl; 37 | 38 | const isDev = process.env.NODE_ENV === 'development'; 39 | if (isDev && process.env.REACT_APP_BASE_URL) { 40 | baseUrl = process.env.REACT_APP_BASE_URL; 41 | } 42 | if (isDev && process.env.REACT_APP_API_URL) { 43 | apiUrl = process.env.REACT_APP_API_URL; 44 | } 45 | 46 | const BASE_URL: string = loadVariableFromHTML('base_url') || baseUrl; 47 | const API_URL = loadVariableFromHTML('api_url') || apiUrl; 48 | 49 | console.log('BASE_URL', BASE_URL); 50 | console.log('API_URL', API_URL); 51 | 52 | export {BASE_URL, API_URL}; 53 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app). 2 | 3 | ## Available Scripts 4 | 5 | In the project directory, you can run: 6 | 7 | ### `npm start` 8 | 9 | Runs the app in the development mode.
    10 | Open [http://localhost:3000](http://localhost:3000) to view it in the browser. 11 | 12 | The page will reload if you make edits.
    13 | You will also see any lint errors in the console. 14 | 15 | ### `npm test` 16 | 17 | Launches the test runner in the interactive watch mode.
    18 | See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information. 19 | 20 | ### `npm run build` 21 | 22 | Builds the app for production to the `build` folder.
    23 | It correctly bundles React in production mode and optimizes the build for the best performance. 24 | 25 | The build is minified and the filenames include the hashes.
    26 | Your app is ready to be deployed! 27 | 28 | See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information. 29 | 30 | ### `npm run eject` 31 | 32 | **Note: this is a one-way operation. Once you `eject`, you can’t go back!** 33 | 34 | If you aren’t satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project. 35 | 36 | Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you’re on your own. 37 | 38 | You don’t have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn’t feel obligated to use this feature. However we understand that this tool wouldn’t be useful if you couldn’t customize it when you are ready for it. 39 | 40 | ## Learn More 41 | 42 | You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started). 43 | 44 | To learn React, check out the [React documentation](https://reactjs.org/). 45 | -------------------------------------------------------------------------------- /frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 16 | 17 | 18 | 19 | 23 | 24 | 25 | 34 | Auctus 35 | 36 | 37 | 38 |
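Stepping back to apiserver/graceful_shutdown.py shown earlier: a minimal sketch of how the two classes might be wired together (the handler, route, and port here are hypothetical, not taken from the repository):

```python
import tornado.ioloop

from apiserver.graceful_shutdown import GracefulApplication, GracefulHandler


class HealthHandler(GracefulHandler):  # hypothetical handler
    def get(self):
        # prepare()/on_finish() count this request, so a SIGTERM received
        # while it is in flight will wait for it before stopping the loop.
        self.write('ok')


app = GracefulApplication([('/health', HealthHandler)])
app.listen(8002)  # hypothetical port
tornado.ioloop.IOLoop.current().start()
```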
    39 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /scripts/export_all.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script exports the index to JSON files. 4 | 5 | It is useful as backup, and to provide snapshots to users so they don't have to 6 | profile everything to get a system going. 7 | 8 | The exported folder can be loaded in using `import_all.py` (which will simply 9 | load the JSON files) or `reprocess_all.py` (which will only read some fields, 10 | and get the metadata by reprocessing the datasets). 11 | """ 12 | 13 | import logging 14 | import json 15 | 16 | from datamart_core.common import PrefixedElasticsearch, encode_dataset_id 17 | 18 | 19 | SIZE = 10000 20 | 21 | 22 | _unique_filenames = {} 23 | 24 | 25 | def unique_filename(pattern): 26 | """Return a file name with an incrementing number to make it unique. 27 | """ 28 | number = _unique_filenames.get(pattern, 0) + 1 29 | _unique_filenames[pattern] = number 30 | return pattern.format(number) 31 | 32 | 33 | def export(): 34 | es = PrefixedElasticsearch() 35 | 36 | print("Dumping datasets") 37 | hits = es.scan( 38 | index='datasets', 39 | query={ 40 | 'query': { 41 | 'match_all': {}, 42 | }, 43 | }, 44 | size=SIZE, 45 | ) 46 | for h in hits: 47 | # Use dataset ID as file name 48 | with open(encode_dataset_id(h['_id']), 'w') as fp: 49 | json.dump(h['_source'], fp, sort_keys=True, indent=2) 50 | 51 | print("Dumping Lazo data") 52 | hits = es.scan( 53 | index='lazo', 54 | query={ 55 | 'query': { 56 | 'match_all': {}, 57 | }, 58 | }, 59 | size=SIZE, 60 | ) 61 | for h in hits: 62 | # Use "lazo." dataset_id ".NB" as file name 63 | dataset_id = h['_id'].split('__.__')[0] 64 | fname = unique_filename( 65 | 'lazo.{0}.{{0}}'.format(encode_dataset_id(dataset_id)) 66 | ) 67 | with open(fname, 'w') as fp: 68 | json.dump( 69 | dict(h['_source'], _id=h['_id']), 70 | fp, 71 | sort_keys=True, 72 | indent=2, 73 | ) 74 | 75 | 76 | if __name__ == '__main__': 77 | logging.basicConfig(level=logging.INFO) 78 | 79 | export() 80 | -------------------------------------------------------------------------------- /tests/data/geo.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import numpy.random 3 | import os 4 | import string 5 | 6 | 7 | SIZE = 50 8 | 9 | 10 | def main(): 11 | lat1, long1 = 40.7298648, -73.9986808 12 | lat1m, long1m = 40.73287, -74.002031 13 | lat2, long2 = 40.692725, -73.9865644 14 | lat2m, long2m = 40.694316, -73.988495 15 | 16 | random = numpy.random.RandomState(1) 17 | latitudes = numpy.concatenate([ 18 | random.normal(lat1, abs(lat1 - lat1m), SIZE), 19 | random.normal(lat2, abs(lat2 - lat2m), SIZE), 20 | ]) 21 | random = numpy.random.RandomState(2) 22 | longitudes = numpy.concatenate([ 23 | random.normal(long1, abs(long1 - long1m), SIZE), 24 | random.normal(long2, abs(long2 - long2m), SIZE), 25 | ]) 26 | random = numpy.random.RandomState(3) 27 | heights = random.normal(50.0, 20.0, 2 * SIZE) 28 | 29 | data_dir = os.path.dirname(__file__) 30 | with open(os.path.join(data_dir, 'geo.csv'), 'w') as f_data: 31 | print("id,lat,long,height", file=f_data) 32 | for i, (lat, long, h) in enumerate(zip(latitudes, longitudes, heights)): 33 | if i == 42: 34 | i = '' 35 | else: 36 | i = 'place%02d' % i 37 | print("%s,%f,%f,%f" % (i, lat, long, h), file=f_data) 38 | 39 | with open(os.path.join(data_dir, 'geo_wkt.csv'), 'w') as f_data: 40 | 
print("id,coords,height", file=f_data) 41 | for i, (lat, long, h) in enumerate(zip(latitudes, longitudes, heights)): 42 | if i == 42: 43 | i = '' 44 | else: 45 | i = 'place%02d' % i 46 | print("%s,POINT (%f %f),%f" % (i, long, lat, h), file=f_data) 47 | 48 | random = numpy.random.RandomState(5) 49 | aug_latitudes = random.normal(lat1, abs(lat1 - lat1m), 10) 50 | aug_longitudes = random.normal(long1, abs(long1 - long1m), 10) 51 | 52 | with open(os.path.join(data_dir, 'geo_aug.csv'), 'w') as f_data: 53 | print("lat,long,id,letter", file=f_data) 54 | for i, (lat, long, letter) in enumerate( 55 | zip(aug_latitudes, aug_longitudes, string.ascii_letters), 56 | 100, 57 | ): 58 | print("%f,%f,place%d,%s" % (lat, long, i, letter), file=f_data) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /coordinator/coordinator/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | {% set fluid = False %} 3 | 4 | 5 | 6 | 7 | 8 | {% block title %}NYU Auctus{% endblock %} 9 | 13 | 14 | 15 | 33 | 34 |
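A quick sanity check for the test data generated by tests/data/geo.py above (a standard-library-only sketch; run it from tests/data after running geo.py):

```python
import csv
import statistics

# geo.py draws two Gaussian clusters of SIZE=50 points each, centered near
# (40.7299, -73.9987) and (40.6927, -73.9866); the sample mean of each half
# of geo.csv should land close to those centers.
with open('geo.csv') as fp:
    rows = list(csv.DictReader(fp))

print(statistics.mean(float(r['lat']) for r in rows[:50]))  # ~40.7299
print(statistics.mean(float(r['lat']) for r in rows[50:]))  # ~40.6927
```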
    35 | {% block contents %}{% endblock %} 36 | 37 | 43 |
 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /lib_materialize/datamart_materialize/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | 5 | class SimpleConverterProxy(object): 6 | def __init__(self, writer, transform, name, temp_file, fp): 7 | self._writer = writer 8 | self._transform = transform 9 | self._name = name 10 | self._temp_file = temp_file 11 | self._fp = fp 12 | 13 | def close(self): 14 | self._fp.close() 15 | self._convert() 16 | 17 | def _convert(self): 18 | # Read back the file we wrote, and transform it to the final file 19 | with self._writer.open_file('w', self._name) as dst: 20 | self._transform(self._temp_file, dst) 21 | 22 | # These methods forward to the actual file object 23 | 24 | def write(self, buffer): 25 | return self._fp.write(buffer) 26 | 27 | def flush(self): 28 | self._fp.flush() 29 | 30 | def __enter__(self): 31 | self._fp.__enter__() 32 | return self 33 | 34 | def __exit__(self, exc, value, tb): 35 | self._fp.__exit__(exc, value, tb) 36 | if exc is None: 37 | self._convert() 38 | 39 | 40 | class SimpleConverter(object): 41 | """Base class for converters simply transforming files through a function. 42 | """ 43 | def __init__(self, writer): 44 | self.writer = writer 45 | self.dir = tempfile.TemporaryDirectory(prefix='datamart_excel_') 46 | 47 | def set_metadata(self, dataset_id, metadata): 48 | self.writer.set_metadata(dataset_id, metadata) 49 | 50 | def open_file(self, mode='wb', name=None): 51 | temp_file = os.path.join(self.dir.name, 'file.xls') 52 | 53 | # Return a proxy that will write to the destination when closed 54 | if mode == 'wb': 55 | fp = open(temp_file, mode) 56 | elif mode == 'w': 57 | fp = open(temp_file, mode, encoding='utf-8', newline='') 58 | else: 59 | raise ValueError("Invalid write mode %r" % mode) 60 | return SimpleConverterProxy( 61 | self.writer, self.transform, 62 | name, 63 | temp_file, fp, 64 | ) 65 | 66 | def finish(self): 67 | self.dir.cleanup() 68 | self.dir = None 69 | 70 | @staticmethod 71 | def transform(source_filename, dest_fileobj): 72 | raise NotImplementedError 73 | -------------------------------------------------------------------------------- /frontend/src/components/visus/PersistentComponent/PersistentComponent.tsx: -------------------------------------------------------------------------------- 1 | import {PureComponent} from 'react'; 2 | import {shallowEqual} from '../../../utils'; 3 | 4 | const cache = new Map(); 5 | 6 | // Patch PureComponent type declaration so that we can access React internal 7 | // variables. We disable eslint here because the declaration has to match 8 | // the declaration from the @types/react package. 9 | declare module 'react' { 10 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 11 | interface PureComponent

    12 | extends React.Component { 13 | _reactInternalFiber: { 14 | key: string; 15 | type: { 16 | displayName: string; 17 | name: string; 18 | }; 19 | }; 20 | } 21 | } 22 | 23 | /** 24 | * This component uses the key provided to the component to generate a cache key for the data. 25 | * We chose to use the key for the following reasons: 26 | * 1. React uses the key to identify the element associated with the component. 27 | * In some cases this helps it to identify that two instances are the same, and to avoid re-constructing the instance. 28 | * It is expected that this strategy will help React avoid destroying a component unnecessarily. 29 | * 2. React does some work to avoid siblings with the same key. This should provide some warnings when reusing a key. 30 | * 3. Since it is internal to each component, it doesn't pollute the props of components. 31 | * 32 | */ 33 | export class PersistentComponent< 34 | TProps = {}, 35 | TState = {} 36 | > extends PureComponent { 37 | componentDidMount() { 38 | if (!this._reactInternalFiber.key) { 39 | console.warn( 40 | 'When using PersistentComponent please provide the key prop' 41 | ); 42 | } 43 | const cacheKey = this.getCacheKey(); 44 | const previousState = cache.get(cacheKey); 45 | if (previousState && !shallowEqual(this.state, previousState)) { 46 | this.setState(previousState); 47 | } 48 | } 49 | 50 | componentWillUnmount() { 51 | const key = this.getCacheKey(); 52 | cache.set(key, this.state); 53 | } 54 | 55 | private getCacheKey() { 56 | const name = 57 | this._reactInternalFiber.type.displayName || 58 | this._reactInternalFiber.type.name; 59 | return `${name}-${this._reactInternalFiber.key}`; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib_core/datamart_core/augment.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import uuid 4 | 5 | from datamart_augmentation import AugmentationError, join, union 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def augment(data, newdata, metadata, task, writer, columns=None): 12 | """ 13 | Augments original data based on the task. 14 | 15 | :param data: the data to be augmented, as a binary file object. 16 | :param newdata: the path to the CSV file to augment with. 17 | :param metadata: the metadata of the data to be augmented. 18 | :param task: the augmentation task. 19 | :param writer: Writer on which to save the files. 20 | :param columns: a list of column indices from newdata that will be added 21 | to data. 
 22 | """ 23 | 24 | if 'id' not in task: 25 | raise AugmentationError("Dataset id for the augmentation task not provided") 26 | 27 | # TODO: add support for combining multiple columns before an augmentation 28 | # e.g.: [['street number', 'street', 'city']] and [['address']] 29 | # currently, Datamart does not support such cases 30 | # this means that spatial joins (with GPS) are not supported for now 31 | 32 | # Perform augmentation 33 | start = time.perf_counter() 34 | if task['augmentation']['type'] == 'join': 35 | output_metadata = join( 36 | data, 37 | newdata, 38 | metadata, 39 | task['metadata'], 40 | writer, 41 | task['augmentation']['left_columns'], 42 | task['augmentation']['right_columns'], 43 | columns=columns, 44 | agg_functions=task['augmentation'].get('agg_functions'), 45 | temporal_resolution=task['augmentation'].get('temporal_resolution'), 46 | ) 47 | elif task['augmentation']['type'] == 'union': 48 | output_metadata = union( 49 | data, 50 | newdata, 51 | metadata, 52 | task['metadata'], 53 | writer, 54 | task['augmentation']['left_columns'], 55 | task['augmentation']['right_columns'], 56 | ) 57 | else: 58 | raise AugmentationError("Augmentation task not provided") 59 | logger.info("Total augmentation: %.4fs", time.perf_counter() - start) 60 | 61 | # Write out the metadata 62 | writer.set_metadata(uuid.uuid4().hex, output_metadata) 63 | return writer.finish() 64 | -------------------------------------------------------------------------------- /scripts/migrate-types-and-attributes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script updates the index for !115 and !127. 4 | 5 | It adds the dataset "types" information (computed from column semantic types) 6 | and the "attribute_keywords" field (computed from column names). 
7 | """ 8 | 9 | import json 10 | import os 11 | import shutil 12 | import sys 13 | 14 | from datamart_profiler.core import expand_attribute_name 15 | from datamart_profiler.profile_types import determine_dataset_type 16 | 17 | 18 | def migrate(from_folder, to_folder): 19 | assert os.listdir(from_folder) 20 | assert not os.listdir(to_folder) 21 | 22 | datasets = [] 23 | lazo = [] 24 | for f in os.listdir(from_folder): 25 | if f.startswith('lazo.'): 26 | lazo.append(f) 27 | else: 28 | datasets.append(f) 29 | 30 | for i, dataset in enumerate(datasets): 31 | if i % 100 == 0: 32 | print("% 5d / %5d datasets processed" % (i, len(datasets))) 33 | 34 | with open(os.path.join(from_folder, dataset)) as fp: 35 | obj = json.load(fp) 36 | 37 | if 'attribute_keywords' not in obj: 38 | attribute_keywords = [] 39 | for col in obj['columns']: 40 | attribute_keywords.append(col['name']) 41 | kw = list(expand_attribute_name(col['name'])) 42 | if kw != [col['name']]: 43 | attribute_keywords.extend(kw) 44 | obj['attribute_keywords'] = attribute_keywords 45 | 46 | if 'types' not in obj: 47 | dataset_types = set() 48 | for col in obj['columns']: 49 | type_ = determine_dataset_type( 50 | col['structural_type'], 51 | col['semantic_types'], 52 | ) 53 | if type_: 54 | dataset_types.add(type_) 55 | obj['types'] = sorted(dataset_types) 56 | 57 | with open(os.path.join(to_folder, dataset), 'w') as fp: 58 | json.dump(obj, fp, sort_keys=True, indent=2) 59 | 60 | print("Copying lazo data...") 61 | for i, f in enumerate(lazo): 62 | if i % 1000 == 0: 63 | print("% 5d / %5d files copied" % (i, len(lazo))) 64 | shutil.copy2( 65 | os.path.join(from_folder, f), 66 | os.path.join(to_folder, f), 67 | ) 68 | 69 | 70 | if __name__ == '__main__': 71 | migrate(sys.argv[1], sys.argv[2]) 72 | -------------------------------------------------------------------------------- /frontend/src/components/visus/Card/Card.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import styled from 'styled-components'; 3 | import './card.css'; 4 | 5 | interface CardProps { 6 | title: string; 7 | className?: string; 8 | style?: React.CSSProperties; 9 | } 10 | 11 | class Card extends React.PureComponent { 12 | render() { 13 | const cardClassName = this.props.className 14 | ? 'card ' + this.props.className 15 | : 'card'; 16 | return ( 17 |

    18 |
    19 | {this.props.title ? ( 20 |
    {this.props.title}
    21 | ) : ( 22 | '' 23 | )} 24 | {this.props.children} 25 |
    26 |
    27 | ); 28 | } 29 | } 30 | 31 | interface CardShadowProps { 32 | className?: string; 33 | height?: string; 34 | } 35 | 36 | class CardShadow extends React.PureComponent { 37 | render() { 38 | const cardClassName = this.props.className 39 | ? 'card-hover card card-attributes ' + this.props.className 40 | : 'card-hover card card-attributes'; 41 | return ( 42 |
    50 |
    {this.props.children}
    51 |
    52 | ); 53 | } 54 | } 55 | 56 | interface CardAttrFieldProps { 57 | textAlign?: string; 58 | width?: string; 59 | fontWeight?: string; 60 | padding?: string; 61 | } 62 | 63 | const CardAttrField = styled.div` 64 | font-weight: ${({fontWeight}) => fontWeight || 'normal'}; 65 | text-align: ${({textAlign}) => textAlign || 'right'}; 66 | width: ${({width}) => width || '110px'}; 67 | padding: ${({padding}) => padding || '0 15px'}; 68 | `; 69 | 70 | const CardAttrValue = styled.div` 71 | flex: 1; 72 | padding-right: 15px; 73 | overflow-wrap: break-word; 74 | word-wrap: break-word; 75 | word-break: break-word; 76 | `; 77 | 78 | export const CardButton = styled.div` 79 | display: flex; 80 | justify-content: center; 81 | flex-direction: column; 82 | text-align: center; 83 | height: 100%; 84 | cursor: pointer; 85 | `; 86 | 87 | export {Card, CardShadow, CardAttrField, CardAttrValue}; 88 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@hanreev/types-ol": "^2.0.8", 7 | "@material-ui/core": "^4.11.1", 8 | "@testing-library/jest-dom": "^4.2.4", 9 | "@testing-library/react": "^9.4.1", 10 | "@testing-library/user-event": "^7.2.1", 11 | "@types/d3-scale": "^3.2.2", 12 | "@types/jest": "^24.9.1", 13 | "@types/node": "^12.12.29", 14 | "@types/ol": "^5.3.7", 15 | "@types/react": "^16.9.23", 16 | "@types/react-dom": "^16.9.5", 17 | "@types/react-router-dom": "^5.1.3", 18 | "@types/react-table": "^7.0.10", 19 | "@types/styled-components": "^5.0.1", 20 | "axios": "^0.21.4", 21 | "d3-scale": "^3.2.4", 22 | "moment": "^2.24.0", 23 | "ol": "^5.3.3", 24 | "react": "^16.13.1", 25 | "react-datepicker": "^2.13.0", 26 | "react-dnd": "^10.0.2", 27 | "react-dnd-html5-backend": "^10.0.2", 28 | "react-dom": "^16.13.0", 29 | "react-dropzone": "^10.2.1", 30 | "react-feather": "^2.0.3", 31 | "react-router-dom": "^5.1.2", 32 | "react-scripts": "4.0.3", 33 | "react-table": "^7.0.0", 34 | "react-vega": "^7.3.0", 35 | "styled-components": "^5.0.1", 36 | "vega": "^5.11.1", 37 | "vega-lite": "^4.12.0" 38 | }, 39 | "scripts": { 40 | "start": "react-scripts start", 41 | "build": "react-scripts build", 42 | "test": "react-scripts test", 43 | "eject": "react-scripts eject", 44 | "check": "gts check", 45 | "clean": "gts clean", 46 | "compile": "tsc -p .", 47 | "fix": "gts fix", 48 | "pretest": "npm run compile", 49 | "posttest": "npm run check" 50 | }, 51 | "eslintConfig": { 52 | "extends": "react-app" 53 | }, 54 | "browserslist": { 55 | "production": [ 56 | ">0.2%", 57 | "not dead", 58 | "not op_mini all" 59 | ], 60 | "development": [ 61 | "last 1 chrome version", 62 | "last 1 firefox version", 63 | "last 1 safari version" 64 | ] 65 | }, 66 | "devDependencies": { 67 | "@types/node": "^12.12.29", 68 | "@types/react-datepicker": "^2.11.0", 69 | "canvas": "^2.8.0", 70 | "eslint-plugin-react": "^7.20.6", 71 | "gts": "2.0.2", 72 | "jest-canvas-mock": "^2.2.0", 73 | "typescript": "^3.9.10" 74 | }, 75 | "jest": { 76 | "transformIgnorePatterns": [ 77 | "/node_modules/(?!ol).+\\.js$" 78 | ] 79 | }, 80 | "proxy": "https://auctus.vida-nyu.org/api/v1" 81 | } 82 | -------------------------------------------------------------------------------- /scripts/migrate-temporal-coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This 
script updates the index for !162. 4 | 5 | It creates the 'temporal_coverage' field (in the 'datasets' index) and the 6 | 'temporal_coverage' index. 7 | """ 8 | 9 | import json 10 | import os 11 | import shutil 12 | import sys 13 | 14 | 15 | def migrate(from_folder, to_folder): 16 | assert os.listdir(from_folder) 17 | assert not os.listdir(to_folder) 18 | 19 | datasets = [] 20 | lazo = [] 21 | for f in os.listdir(from_folder): 22 | if f.startswith('lazo.'): 23 | lazo.append(f) 24 | else: 25 | datasets.append(f) 26 | 27 | for i, dataset in enumerate(datasets): 28 | if i % 100 == 0: 29 | print("% 5d / %5d datasets processed" % (i, len(datasets))) 30 | 31 | with open(os.path.join(from_folder, dataset)) as fp: 32 | obj = json.load(fp) 33 | 34 | if 'temporal_coverage' not in obj: 35 | temporal_coverage = [] 36 | for idx, column in enumerate(obj['columns']): 37 | if 'http://schema.org/DateTime' in column['semantic_types']: 38 | coverage = { 39 | 'type': 'datetime', 40 | 'column_names': [column['name']], 41 | 'column_indexes': [idx], 42 | 'column_types': ['http://schema.org/DateTime'], 43 | 'ranges': column.pop('coverage'), 44 | } 45 | column.pop('mean', None) 46 | column.pop('stddev', None) 47 | if 'temporal_resolution' in column: 48 | coverage['temporal_resolution'] = \ 49 | column.pop('temporal_resolution') 50 | temporal_coverage.append(coverage) 51 | 52 | if temporal_coverage: 53 | obj['temporal_coverage'] = temporal_coverage 54 | 55 | with open(os.path.join(to_folder, dataset), 'w') as fp: 56 | json.dump(obj, fp, sort_keys=True, indent=2) 57 | 58 | print("Copying lazo data...") 59 | for i, f in enumerate(lazo): 60 | if i % 1000 == 0: 61 | print("% 5d / %5d files copied" % (i, len(lazo))) 62 | shutil.copy2( 63 | os.path.join(from_folder, f), 64 | os.path.join(to_folder, f), 65 | ) 66 | 67 | 68 | if __name__ == '__main__': 69 | migrate(sys.argv[1], sys.argv[2]) 70 | -------------------------------------------------------------------------------- /contrib/k8s/discovery/worldbank.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import '../utils.libsonnet'; 2 | 3 | function( 4 | config, 5 | schedule='0 1 * * 1,3,5', 6 | ) { 7 | 'worldbank-cronjob': config.kube('batch/v1beta1', 'CronJob', { 8 | file:: 'discovery.yml', 9 | metadata: { 10 | name: 'worldbank', 11 | labels: { 12 | app: 'auctus', 13 | what: 'worldbank', 14 | }, 15 | }, 16 | spec: { 17 | schedule: schedule, 18 | jobTemplate: { 19 | metadata: { 20 | labels: { 21 | app: 'auctus', 22 | what: 'worldbank', 23 | }, 24 | }, 25 | spec: { 26 | template: { 27 | metadata: { 28 | labels: { 29 | app: 'auctus', 30 | what: 'worldbank', 31 | }, 32 | }, 33 | spec: { 34 | restartPolicy: 'Never', 35 | securityContext: { 36 | runAsNonRoot: true, 37 | }, 38 | containers: [ 39 | { 40 | name: 'worldbank', 41 | image: config.image, 42 | imagePullPolicy: 'IfNotPresent', 43 | args: ['python', '-m', 'worldbank_discovery'], 44 | env: utils.env( 45 | { 46 | LOG_FORMAT: config.log_format, 47 | ELASTICSEARCH_HOSTS: 'elasticsearch:9200', 48 | ELASTICSEARCH_PREFIX: config.elasticsearch.prefix, 49 | AMQP_HOST: 'rabbitmq', 50 | AMQP_PORT: '5672', 51 | AMQP_USER: { 52 | secretKeyRef: { 53 | name: 'secrets', 54 | key: 'amqp.user', 55 | }, 56 | }, 57 | AMQP_PASSWORD: { 58 | secretKeyRef: { 59 | name: 'secrets', 60 | key: 'amqp.password', 61 | }, 62 | }, 63 | LAZO_SERVER_HOST: 'lazo', 64 | LAZO_SERVER_PORT: '50051', 65 | } 66 | + utils.object_store_env(config.object_store) 67 | ), 68 | }, 69 | ], 70 | }, 71 | }, 72 | }, 73 
| }, 74 | }, 75 | }), 76 | } 77 | -------------------------------------------------------------------------------- /contrib/k8s/discovery/uaz-indicators.libsonnet: -------------------------------------------------------------------------------- 1 | local utils = import '../utils.libsonnet'; 2 | 3 | function( 4 | config, 5 | schedule='20 1 * * 1,3,5', 6 | ) { 7 | 'uaz-indicators-cronjob': config.kube('batch/v1beta1', 'CronJob', { 8 | file:: 'discovery.yml', 9 | metadata: { 10 | name: 'uaz-indicators', 11 | labels: { 12 | app: 'auctus', 13 | what: 'uaz-indicators', 14 | }, 15 | }, 16 | spec: { 17 | schedule: schedule, 18 | jobTemplate: { 19 | metadata: { 20 | labels: { 21 | app: 'auctus', 22 | what: 'uaz-indicators', 23 | }, 24 | }, 25 | spec: { 26 | template: { 27 | metadata: { 28 | labels: { 29 | app: 'auctus', 30 | what: 'uaz-indicators', 31 | }, 32 | }, 33 | spec: { 34 | restartPolicy: 'Never', 35 | securityContext: { 36 | runAsNonRoot: true, 37 | }, 38 | containers: [ 39 | { 40 | name: 'uaz-indicators', 41 | image: config.image, 42 | imagePullPolicy: 'IfNotPresent', 43 | args: ['python', '-m', 'uaz_indicators'], 44 | env: utils.env( 45 | { 46 | LOG_FORMAT: config.log_format, 47 | ELASTICSEARCH_HOSTS: 'elasticsearch:9200', 48 | ELASTICSEARCH_PREFIX: config.elasticsearch.prefix, 49 | AMQP_HOST: 'rabbitmq', 50 | AMQP_PORT: '5672', 51 | AMQP_USER: { 52 | secretKeyRef: { 53 | name: 'secrets', 54 | key: 'amqp.user', 55 | }, 56 | }, 57 | AMQP_PASSWORD: { 58 | secretKeyRef: { 59 | name: 'secrets', 60 | key: 'amqp.password', 61 | }, 62 | }, 63 | LAZO_SERVER_HOST: 'lazo', 64 | LAZO_SERVER_PORT: '50051', 65 | } 66 | + utils.object_store_env(config.object_store) 67 | ), 68 | }, 69 | ], 70 | }, 71 | }, 72 | }, 73 | }, 74 | }, 75 | }), 76 | } 77 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "auctus" 3 | version = "0.10" 4 | description = "Auctus system meta-package" 5 | 6 | license = "Apache-2.0" 7 | 8 | authors = [ 9 | "Remi Rampin ", 10 | ] 11 | 12 | repository = "https://gitlab.com/ViDA-NYU/auctus/auctus" 13 | homepage = "https://auctus.vida-nyu.org/" 14 | 15 | keywords = ["auctus", "datamart"] 16 | 17 | classifiers = [ 18 | "Development Status :: 4 - Beta", 19 | "Intended Audience :: Science/Research", 20 | "Operating System :: Unix", 21 | "Programming Language :: Python :: 3 :: Only", 22 | "Topic :: Scientific/Engineering :: Information Analysis", 23 | ] 24 | 25 | packages = [] 26 | 27 | include = [] 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.8,<3.11" # Upper bound for numpy 31 | datamart-profiler = {path = "./lib_profiler", develop=true} 32 | datamart-materialize = {path = "./lib_materialize", develop=true} 33 | datamart-augmentation = {path = "./lib_augmentation", develop=true} 34 | datamart-geo = {path = "lib_geo", develop=true} 35 | datamart-core = {path = "./lib_core", develop=true} 36 | datamart-fslock = {path = "./lib_fslock", develop=true} 37 | datamart-coordinator-service = {path = "./coordinator", develop=true} 38 | datamart-profiler-service = {path = "./profiler", develop=true} 39 | datamart-api-service = {path = "./apiserver", develop=true} 40 | datamart-cache-cleaner-service = {path = "cache_cleaner", develop=true} 41 | datamart-snapshotter-service = {path = "snapshotter", develop=true} 42 | datamart-noaa-discovery-service = {path = "./discovery/noaa", develop=true} 43 | datamart-socrata-discovery-service = 
{path = "./discovery/socrata", develop=true} 44 | datamart-zenodo-discovery-service = {path = "./discovery/zenodo", develop=true} 45 | datamart-ckan-discovery-service = {path = "./discovery/ckan", develop=true} 46 | datamart-uaz-indicators-service = {path = "./discovery/uaz_indicators", develop=true} 47 | datamart-worldbank-discovery-service = {path= "./discovery/worldbank", develop=true} 48 | datamart-isi-discovery-service = {path= "./discovery/isi", develop=true} 49 | toml = "*" 50 | opentelemetry-exporter-jaeger-thrift = "*" 51 | 52 | [tool.poetry.dev-dependencies] 53 | flake8 = "*" 54 | PyYaml = "*" 55 | requests = "*" 56 | coverage = "*" # Keep it in sync with Dockerfiles for CI 57 | jsonschema = ">=3.0,<4" 58 | readme_renderer = "*" 59 | Sphinx = "*" 60 | sphinx-rtd-theme = "^0.5.0" 61 | 62 | [build-system] 63 | requires = ["poetry-core>=1.0.0"] 64 | build-backend = "poetry.core.masonry.api" 65 | --------------------------------------------------------------------------------
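As a closing aside, the lockfile-to-pip mapping implemented by docker/install_deps.py earlier in this dump can be exercised in isolation. A self-contained sketch (the sample entries below are made up for illustration, not taken from the real poetry.lock):

```python
# Mirrors the name==version / git+url@ref logic of install_deps.py above.
def requirement(package):
    source = package.get('source')
    if source is None:
        return '%s==%s' % (package['name'], package['version'])
    if source['type'] == 'git':
        return 'git+%s@%s' % (source['url'], source['reference'])
    if source['type'] == 'url':
        return source['url']
    if source['type'] == 'directory':
        return None  # directory dependencies are skipped by install_deps.py
    raise ValueError("Unknown package source %s" % source['type'])


assert requirement({'name': 'toml', 'version': '0.10.2'}) == 'toml==0.10.2'
assert requirement({
    'name': 'example', 'version': '0',
    'source': {'type': 'git', 'url': 'https://example.com/x.git',
               'reference': 'main'},
}) == 'git+https://example.com/x.git@main'
```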