├── metatab ├── test │ ├── __init__.py │ ├── test-data │ │ ├── __init__.py │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ ├── programsource.py │ │ │ ├── Py3Notebook.ipynb │ │ │ └── complex-text.txt │ │ ├── include3.csv │ │ ├── declare-only.csv │ │ ├── json │ │ │ ├── include3.json │ │ │ ├── include2.json │ │ │ ├── include1.json │ │ │ ├── datapackage_ex1.json │ │ │ ├── datapackage_ex1_web.json │ │ │ ├── issue1.json │ │ │ ├── children2.json │ │ │ ├── children.json │ │ │ ├── datapackage_ex2.json │ │ │ ├── example2.json │ │ │ ├── example1-web.json │ │ │ └── example1.json │ │ ├── include1.csv │ │ ├── include2.csv │ │ ├── line │ │ │ ├── line-oriented-doc-contacts.txt │ │ │ ├── line-oriented-doc-root.txt │ │ │ ├── line-oriented-doc-references-1.txt │ │ │ ├── line-oriented-doc-bib.txt │ │ │ ├── line-oriented-doc-references-2.txt │ │ │ └── line-oriented-doc.txt │ │ ├── short.csv │ │ ├── childpropertytype.csv │ │ ├── headers.csv │ │ ├── name.csv │ │ ├── name2.csv │ │ ├── nested.csv │ │ ├── errors │ │ │ ├── bad_include.csv │ │ │ ├── bad_declare.csv │ │ │ └── errors2.csv │ │ ├── children.csv │ │ ├── issue1.csv │ │ ├── children2.csv │ │ ├── census.csv │ │ ├── children3.csv │ │ ├── url_classes.csv │ │ ├── programsource.csv │ │ ├── resolve_urls.csv │ │ ├── packages │ │ │ └── example.com-test_package │ │ │ │ ├── metadata.csv │ │ │ │ └── notebooks │ │ │ │ └── Test_Notebook.ipynb │ │ ├── simple-text.txt │ │ ├── example2.csv │ │ ├── simple1.csv │ │ ├── resources.csv │ │ ├── short-declare.csv │ │ ├── datapackage_ex1.csv │ │ ├── datapackage_ex1_web.csv │ │ ├── geo.csv │ │ ├── datapackage_ex2.csv │ │ ├── schema.csv │ │ ├── yaml │ │ │ ├── yaml-example-1.csv │ │ │ └── yaml-example-1.yaml │ │ ├── notebooks │ │ │ ├── ImportTest.ipynb │ │ │ ├── CellExecuteError.ipynb │ │ │ └── SimpleMagicsTest.ipynb │ │ ├── example1-web.csv │ │ ├── example1.csv │ │ ├── example1-headers.csv │ │ ├── example1.txt │ │ ├── properties.csv │ │ ├── almost-everything.csv │ │ └── 
civicknowledge.com-rcfe_affordability-2015.csv │ ├── Dockerfile │ ├── core.py │ ├── Makefile │ ├── outputs │ │ ├── datapackage.json │ │ └── metadata.json │ └── test_doc.py ├── templates │ ├── __init__.py │ ├── datapackage.csv │ └── metatab.csv ├── __init__.py ├── exc.py ├── resolver.py ├── datapackage.py ├── appurl.py ├── rowgen.py ├── util.py └── cli.py ├── requirements.txt ├── MANIFEST.in ├── pyproject.toml ├── .travis.yml ├── docker ├── Dockerfile └── Makefile ├── examples ├── pandas-reporter.py └── Pandas Reporter Example.ipynb ├── develop.sh ├── Makefile ├── LICENSE ├── .gitignore ├── setup.py ├── docs ├── Census.rst ├── GeneratingRowsWithPrograms.rst ├── PrivateDatasets.rst └── Wrangling packages.rst └── README.rst /metatab/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metatab/templates/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metatab/test/test-data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metatab/test/test-data/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metatab/test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM n42org/tox 2 | 3 | WORKDIR /code -------------------------------------------------------------------------------- /metatab/test/test-data/include3.csv: -------------------------------------------------------------------------------- 1 | "Note","Include File 3" 2 | -------------------------------------------------------------------------------- 
/metatab/test/test-data/declare-only.csv: -------------------------------------------------------------------------------- 1 | "Declare","metadata.csv",,, -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | metatabdecl 2 | rowgenerators>=0.7.0 3 | tabulate -------------------------------------------------------------------------------- /metatab/test/test-data/scripts/programsource.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include README.rst 3 | include LICENSE 4 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/include3.json: -------------------------------------------------------------------------------- 1 | { 2 | "note": "Include File 3" 3 | } -------------------------------------------------------------------------------- /metatab/test/test-data/include1.csv: -------------------------------------------------------------------------------- 1 | Note,Include File 1 2 | Include,include2.csv 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta:__legacy__" 4 | 5 | [tool.setuptools_scm] -------------------------------------------------------------------------------- /metatab/test/test-data/include2.csv: -------------------------------------------------------------------------------- 1 | "Note","Include File 2" 2 | 
"Include","https://raw.githubusercontent.com/CivicKnowledge/structured_tables/master/test/data/include3.csv" 3 | -------------------------------------------------------------------------------- /metatab/test/test-data/line/line-oriented-doc-contacts.txt: -------------------------------------------------------------------------------- 1 | Section: Contacts 2 | Wrangler: Eric Busboom 3 | Wrangler.Email: eric@civicknowledge.com 4 | Wrangler.Organization: Civic Knowledge 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | install: 5 | - pip install -r requirements.txt 6 | script: python setup.py test 7 | branches: 8 | only: 9 | - master 10 | 11 | -------------------------------------------------------------------------------- /metatab/test/test-data/short.csv: -------------------------------------------------------------------------------- 1 | "Declare","short-declare.csv" 2 | "include","include3.csv" 3 | "Title","Title1" 4 | ".Language","en" 5 | "Section","Section1" 6 | "Title","Title2" 7 | "Include","include3.csv" 8 | "Title","Title3" 9 | -------------------------------------------------------------------------------- /metatab/test/test-data/childpropertytype.csv: -------------------------------------------------------------------------------- 1 | ,, 2 | "ChildPropertyType","Parent.Child","scalar" 3 | ,, 4 | "Parent","parent", 5 | "Parent.Child","child1", 6 | "Parent.Child","child2", 7 | "Parent.Child","child3", 8 | "Parent.Child","child4", 9 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/include2.json: -------------------------------------------------------------------------------- 1 | { 2 | "note": [ 3 | "Include File 2", 4 | "Include File 3" 5 | ], 6 | "include": 
"https://raw.githubusercontent.com/CivicKnowledge/structured_tables/master/test/data/include3.csv" 7 | } -------------------------------------------------------------------------------- /metatab/test/test-data/line/line-oriented-doc-root.txt: -------------------------------------------------------------------------------- 1 | Identifier: 47bc1089-7584-41f0-b804-602ec42f1249 2 | Origin: civicknowledge.com 3 | Dataset: rcfe_affordability 4 | Version: 4 5 | Time: 2015 6 | Name: civicknowledge.com-rcfe_affordability-2015-4 7 | -------------------------------------------------------------------------------- /metatab/test/test-data/headers.csv: -------------------------------------------------------------------------------- 1 | "Section ","One",, 2 | "Header","A","B","C" 3 | "one",1,2,3 4 | "two",4,5,6 5 | "three",7,8,9 6 | ,,, 7 | "Section ","One",, 8 | "Header","D","E","F" 9 | "one",10,11,12 10 | "two",13,14,15 11 | "three",16,17,18 12 | -------------------------------------------------------------------------------- /metatab/test/test-data/name.csv: -------------------------------------------------------------------------------- 1 | "Declare","metatab-latest" 2 | "Title","Registered Voters, By County" 3 | "Name","this_name_should_be_replaced" 4 | "Dataset","FooBar" 5 | "Version",1 6 | "Origin","example.com" 7 | "Time",2017 8 | "Space","CA" 9 | "Grain","people" 10 | -------------------------------------------------------------------------------- /metatab/test/test-data/name2.csv: -------------------------------------------------------------------------------- 1 | "Declare","metatab-latest" 2 | "Title","Registered Voters, By County" 3 | "Name","this_name_should_be_replaced" 4 | "Dataset","FooBar" 5 | "Version",1 6 | "Origin","example.com" 7 | "Time",2017 8 | "Space","CA" 9 | "Grain","people" 10 | -------------------------------------------------------------------------------- /metatab/test/test-data/nested.csv: 
-------------------------------------------------------------------------------- 1 | "Section","Nesting", 2 | "A",1, 3 | ".B",2, 4 | ".B",3, 5 | "X",4, 6 | ".Y",5, 7 | ".Y",6, 8 | ,, 9 | "Section","More Nesting","Alt" 10 | "A",1,"Alt" 11 | ".B",2,"b" 12 | ".B",3,"c" 13 | "X",4,"d" 14 | ".Y",5,"e" 15 | ".Y",6,"f" 16 | -------------------------------------------------------------------------------- /metatab/test/test-data/line/line-oriented-doc-references-1.txt: -------------------------------------------------------------------------------- 1 | 2 | Section: References 3 | 4 | Reference: censusreporter:B09020/140/05000US06073 5 | Reference.Name: B09020 6 | Reference.Description: Relationship by Household Type (Including Living Alone) for Population 65 Years and Over 7 | -------------------------------------------------------------------------------- /metatab/test/test-data/errors/bad_include.csv: -------------------------------------------------------------------------------- 1 | "Include","doesntexist.csv" 2 | "Title","Registered Voters, By County" 3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections." 4 | "Identifier","cdph.ca.gov-hci-registered_voters-county" 5 | -------------------------------------------------------------------------------- /metatab/test/test-data/errors/bad_declare.csv: -------------------------------------------------------------------------------- 1 | "Declare","http://example.com/doesntexist.csv" 2 | "Title","Registered Voters, By County" 3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections." 
"""Example script: load three tables through ``pandasreporter`` and preview one.

Each ``pr.get_dataframe`` call passes a table id, a second code and a
geography id, with ``cache=True``.
NOTE(review): the arguments look like (Census table id, summary level, geoid);
confirm against the pandasreporter documentation.
"""

import pandas as pd
import numpy as np
import pandasreporter as pr

b17001 = pr.get_dataframe('B17001', '140', '05000US06073', cache=True)
b17024 = pr.get_dataframe('B17024', '140', '05000US06073', cache=True)
b17017 = pr.get_dataframe('B17017', '140', '05000US06073', cache=True)

# The original ended with the Python 2 statement ``print df.head(2)``: a syntax
# error on Python 3, and ``df`` was never assigned. Preview the first frame
# loaded above instead, using the print() function.
print(b17001.head(2))
def test_data(*paths):
    """Return the absolute path of an entry below the ``test-data`` directory.

    The ``test-data`` directory is the one sitting next to this module.

    :param paths: path components, joined in order beneath ``test-data``.
        With no components the ``test-data`` directory itself is returned.
    :return: the absolute path as a string.
    """
    import os.path

    # Anchor on this module's own location so the result does not depend on
    # the current working directory.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    target = os.path.join(module_dir, 'test-data', *paths)

    return os.path.abspath(target)
"DeclareTerm","schema.fields","name",,,, 9 | -------------------------------------------------------------------------------- /metatab/test/test-data/issue1.csv: -------------------------------------------------------------------------------- 1 | ,,,, 2 | "Section","Resources","table","Grain","Title" 3 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation",, 4 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,, 5 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/datapackage_ex1_web.json: -------------------------------------------------------------------------------- 1 | { 2 | "declare": "http://assets.metatab.org/datapackage.csv", 3 | "title": "Registered Voters, By County", 4 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.", 5 | "name": "cdph.ca.gov-hci-registered_voters-county", 6 | "version": "1.3.4", 7 | "section": [ 8 | { 9 | "section": -------------------------------------------------------------------------------- /metatab/test/test-data/children2.csv: -------------------------------------------------------------------------------- 1 | "# Like children.csv, but with different values for debugging. 
",,, 2 | ,,, 3 | "Section","Arguments","prop1","prop2" 4 | "Parent","parent","prop11","prop12" 5 | ,,, 6 | "Section","ExplicitChildren",, 7 | "Parent","parent",, 8 | "Parent.Prop1","prop21",, 9 | "Parent.Prop2","prop22",, 10 | ,,, 11 | "Section","ElidedChildren",, 12 | "Parent","parent",, 13 | ".Prop1","prop31",, 14 | ".Prop2","prop32",, 15 | -------------------------------------------------------------------------------- /metatab/test/test-data/census.csv: -------------------------------------------------------------------------------- 1 | Section,DeclaredSections,,,, 2 | DeclareSection,Section,Schema,title,column_ref,indent 3 | ,,,,, 4 | ,,,,, 5 | Section,DeclaredTerms,,,, 6 | Header,Term,TermValueName,ChildPropertyType,Section, 7 | DeclareTerm,Table,Name,,Schema, 8 | DeclareTerm,Table.Universe,,,Root, 9 | DeclareTerm,Table.Segment,,,Root, 10 | DeclareTerm,Table.Topics,,,, 11 | DeclareTerm,Table.Subject,,,, 12 | DeclareTerm,Table.Column,Name,,, 13 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/issue1.json: -------------------------------------------------------------------------------- 1 | { 2 | "documentation": { 3 | "table": "Indicator Documentation for Voter Registration / Participation", 4 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections", 5 | "@value": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf" 6 | } 7 | } -------------------------------------------------------------------------------- /metatab/test/test-data/children3.csv: -------------------------------------------------------------------------------- 1 | ,,, 2 | ,,, 3 | ,,, 4 | "Section","Arguments","child1","child2" 5 | "Parent","parent","child1","child2" 6 | ,,, 7 | "Section","ExplicitChildren",, 8 | "Parent","parent",, 9 | "Parent.Child1","child1",, 10 | 
"Parent.Child2","child2",, 11 | ,,, 12 | "Section","ElidedChildren",, 13 | "Parent","parent",, 14 | ".Child1","child1",, 15 | ".Child2","child2",, 16 | "Child1.grand1","grand1",, 17 | "Child2.grand2","grand2",, 18 | "Grand1.Great1","great1",, 19 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/children2.json: -------------------------------------------------------------------------------- 1 | { 2 | "parent": [ 3 | { 4 | "prop1": "prop11", 5 | "prop2": "prop12", 6 | "@value": "parent" 7 | }, 8 | { 9 | "prop1": "prop21", 10 | "prop2": "prop22", 11 | "@value": "parent" 12 | }, 13 | { 14 | "prop1": "prop31", 15 | "prop2": "prop32", 16 | "@value": "parent" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /metatab/templates/metatab.csv: -------------------------------------------------------------------------------- 1 | Declare,metatab-latest,,, 2 | Title,,,, 3 | Description,,,, 4 | Identifier,,,, 5 | Name,,,, 6 | Dataset,,,, 7 | Origin,,,, 8 | Space,,,, 9 | Time,,,, 10 | Grain,,,, 11 | Variant,,,, 12 | Version,1,,, 13 | ,,,, 14 | Section,References,Name,Description, 15 | ,,,, 16 | Section,Resources,Name,Description, 17 | ,,,, 18 | Section ,Documentation,Title,Description, 19 | ,,,, 20 | Section,Contacts,Email,Organization,Url 21 | ,,,, 22 | Section,Schema,DataType,AltName,Description 23 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/children.json: -------------------------------------------------------------------------------- 1 | { 2 | "note": "This is a note", 3 | "parent": [ 4 | { 5 | "prop1": "prop1", 6 | "prop2": "prop2", 7 | "@value": "parent" 8 | }, 9 | { 10 | "prop1": "prop1", 11 | "prop2": "prop2", 12 | "@value": "parent" 13 | }, 14 | { 15 | "prop1": "prop1", 16 | "prop2": "prop2", 17 | "@value": "parent" 18 | } 19 | ] 20 | } 
-------------------------------------------------------------------------------- /metatab/test/test-data/url_classes.csv: -------------------------------------------------------------------------------- 1 | in_url,download_file,download_format,download_url,encoding,file_segment,is_archive,proto,target_file,target_format,url 2 | http://example.com/simple-example-altnames.csv,simple-example-altnames.csv,csv,http://example.com/simple-example-altnames.csv,,,False,http,simple-example-altnames.csv,csv,http://example.com/simple-example-altnames.csv 3 | http://example.com/test_data.zip,test_data.zip,zip,http://example.com/test_data.zip,,,True,http,test_data.zip,zip,http://example.com/test_data.zip 4 | -------------------------------------------------------------------------------- /metatab/test/test-data/line/line-oriented-doc-bib.txt: -------------------------------------------------------------------------------- 1 | Section: Bibliography 2 | Citation: ipums 3 | Citation.Type: dataset 4 | Citation.Author: Steven Ruggles; Katie Genadek; Ronald Goeken; Josiah Grover; Matthew Sobek 5 | Citation.Title: Integrated Public Use Microdata Series 6 | Citation.Year: 2017 7 | Citation.Publisher: University of Minnesota 8 | Citation.Version: 7.0 9 | Citation.AccessDate: 20170718 10 | Citation.Url: https://usa.ipums.org/usa/index.shtml 11 | Citation.Doi: https://doi.org/10.18128/D010.V7.0 12 | 13 | -------------------------------------------------------------------------------- /develop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git clone https://github.com/CivicKnowledge/rowpipe.git && (cd rowpipe && python setup.py develop) 3 | git clone https://github.com/CivicKnowledge/tableintuit.git && (cd tableintuit && python setup.py develop) 4 | git clone https://github.com/CivicKnowledge/rowgenerators.git && (cd rowgenerators && python setup.py develop) 5 | git clone https://github.com/CivicKnowledge/pandas-reporter.git && (cd 
# Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
# Revised BSD License, included in this distribution as LICENSE
"""
Record objects for the Simple Data Package format.
"""

# Default metadata file names.
# NOTE(review): judging by the extensions these are the CSV, line-oriented text
# and Jupyter notebook variants of a metadata file -- confirm at the call sites.
DEFAULT_METATAB_FILE = 'metadata.csv'
LINES_METATAB_FILE = 'metadata.txt'
IPYNB_METATAB_FILE = 'metadata.ipynb'

# Re-export the public API at package level. The star imports pull in every
# public name from .parser and .exc (including the exception classes).
# NOTE(review): these imports come after the constants above, presumably so the
# submodules can import those constants from the package while it is still
# initialising -- confirm before reordering.
from .parser import *
from .exc import *
from .doc import MetatabDoc
from .resolver import WebResolver

# Expose the installed distribution's version as ``__version__``.
from pkg_resources import get_distribution, DistributionNotFound
try:
    __version__ = get_distribution(__name__).version
except DistributionNotFound:
    # Package is not installed (e.g. running from a source checkout), so
    # ``__version__`` is left undefined; reading ``metatab.__version__`` then
    # raises AttributeError.
    pass
regional and world GDP in current US Dollars ($). Regional means collections of countries e.g. Europe & Central Asia. Data is sourced from the World Bank and turned into a standard normalized CSV.", 5 | "name": "gdp", 6 | "version": "2011", 7 | "license": "PDDL-1.0", 8 | "keyword": [ 9 | "GDP", 10 | "World", 11 | "Gross Domestic Product", 12 | "Time series" 13 | ], 14 | "image": "http://assets.okfn.org/p/opendatahandbook/img/data-wrench.png", 15 | "last-updated": "2011-09-21", 16 | "section": [ 17 | { 18 | "section": -------------------------------------------------------------------------------- /metatab/test/test-data/simple-text.txt: -------------------------------------------------------------------------------- 1 | Declare: metatab-latest 2 | Title: Registered Voters, By County 3 | Description: An Example Whatever. 4 | Origin: example.com 5 | Dataset: foobar.com 6 | 7 | Section: Contacts 8 | Wrangler: Eric Busboom 9 | Wrangler.Email: eric@civicknowledge.com 10 | 11 | Section: Resources 12 | Datafile: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv 13 | Datafile.Name: resource 14 | Datafile.Title: The First Example Data File 15 | Datafile.Startline: 5 16 | Datafile.HeaderLines: 3,4 17 | 18 | Section: References 19 | Reference: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv 20 | Reference.Name: reference 21 | Reference.Title: The First Example Data File 22 | Reference.Startline: 5 23 | Reference.HeaderLines: 3,4 24 | -------------------------------------------------------------------------------- /metatab/test/test-data/example2.csv: -------------------------------------------------------------------------------- 1 | "Term","value", 2 | "Title","Registered Voters, By County", 3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.", 4 | "Identifier","cdph.ca.gov-hci-registered_voters-county", 5 | "Version",201404, 6 | ,, 7 | 
"Section","documentation","title" 8 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","Healthy Communities Data and Indicators Project (HCI)" 9 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation" 10 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections", 11 | -------------------------------------------------------------------------------- /metatab/test/test-data/simple1.csv: -------------------------------------------------------------------------------- 1 | "Declare","metatab-latest", 2 | "Title","Registered Voters, By County", 3 | "Name","cdph.ca.gov-hci-registered_voters-county", 4 | ,, 5 | "Section","Resources","Name" 6 | "Datafile","http://example.com/example1.csv","namea" 7 | "Datafile","http://example.com/example2.csv","nameb" 8 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","namec" 9 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","named" 10 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections","namee" 11 | ,, 12 | ,, 13 | "Section","Schema","datatype" 14 | "Table","registered_voters", 15 | "Table.Column","reportyear","int" 16 | "Table.Column","type","str" 17 | -------------------------------------------------------------------------------- /metatab/test/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NS = civicknowledge.com 3 | VERSION = latest 4 | 5 | REPO = tox 6 | NAME = tox 7 | INSTANCE = default 8 | DOCKER ?= docker 9 | 10 | .PHONY: test build push shell run start stop restart reload rm rmf release 11 | CWD = $(notdir 
# Project task runner: run the tests, install in develop mode, publish, clean.
#
# Each target used to be defined twice. GNU make keeps only the LAST recipe
# for a target (and warns "overriding recipe"), so the single definitions
# below are the ones that were actually in effect.

.PHONY: default install reset check test tox readme docs publish clean develop

MAKE := $(MAKE) --no-print-directory

# First target in the file, so a bare `make` runs the test suite.
test:
	python setup.py test

# Install the package in development (editable) mode.
develop:
	python setup.py develop

# Push tags, then build a source distribution from a clean tree and upload it.
publish:
	git push --tags origin
	$(MAKE) clean
	python setup.py sdist
	twine upload dist/*
	$(MAKE) clean

# Remove build, test and coverage artifacts.
clean:
	@rm -Rf *.egg .cache .coverage .tox build dist docs/build htmlcov
#	@find . -type d -name __pycache__ -exec rm -Rf {} \;
#	@find . -type f -name '*.pyc' -delete
# This file is licensed under the terms of the
# Revised BSD License, included in this distribution as LICENSE

"""Exception classes for the metatab package.

Every exception defined here derives from :class:`MetatabError`, so callers
can catch that single base class.
"""


class MetatabError(Exception):
    """Base class for all exceptions defined in this module."""


class ReferenceError(MetatabError):
    """Metatab reference error.

    NOTE(review): this name shadows the builtin ``ReferenceError`` in any
    module that imports it; it is kept for backward compatibility.
    """


class ParserError(MetatabError):
    """Error that can carry the term it relates to.

    Args:
        *args: Positional arguments forwarded to ``Exception``.
        term: Optional keyword argument stored on ``self.term``
            (``None`` when omitted).
    """

    def __init__(self, *args, **kwargs):
        # Exception.__init__ rejects keyword arguments, so 'term' has to be
        # removed from kwargs before delegating; forwarding it raised
        # TypeError and the attribute was never set.
        self.term = kwargs.pop('term', None)
        super(ParserError, self).__init__(*args, **kwargs)


class IncludeError(MetatabError):
    """Include failure; ``message`` starts out as an empty string."""

    def __init__(self, *args, **kwargs):
        self.message = ''
        super(IncludeError, self).__init__(*args, **kwargs)


class DeclarationError(ParserError):
    """A ParserError subclass for declaration problems."""


class GenerateError(MetatabError):
    """Row-generation failure."""


class ConversionError(MetatabError):
    """Conversion failure."""


class FormatError(MetatabError):
    """Format failure."""
"Homepage","http://example.com/example8.csv","example8",,, 17 | "Citation","example9",,,, 18 | "Citation","example10",,,, 19 | -------------------------------------------------------------------------------- /metatab/test/test-data/short-declare.csv: -------------------------------------------------------------------------------- 1 | "Section","DeclaredSections",,,, 2 | "DeclareSection","DeclaredSections","Arg0","Arg1","Arg2", 3 | "DeclareSection","Root",,,, 4 | "DeclareSection","DeclaredTerms","TermValueName","ChildPropertyType","Section", 5 | "DeclareSection","Resources","Table","Grain","Title", 6 | "DeclareSection","Contacts","Email",,, 7 | "DeclareSection","Schemas","DataType","ValueType","Description", 8 | ,,,,, 9 | "Section","DeclaredTerms","TermValueName","ChildPropertyType","Section","InheritsFrom" 10 | "DeclareTerm","DeclareTerm","Term",,"DeclaredTerms", 11 | "DeclareTerm","Declare",,,"Root", 12 | "DeclareTerm","Include",,,"Root", 13 | "DeclareTerm","Section","Name","sequence","Root", 14 | "DeclareTerm","DeclareSection","Section","sequence","DeclaredSections", 15 | ,,,,, 16 | "Section ","DeclaredTerms","TermValueName","InheritsFrom","Section", 17 | "# Top Level Dataset Terms",,,,, 18 | "DeclareTerm","Root.Title",,,"Root", 19 | "DeclareTerm","Title.Language",,,"Root", 20 | "DeclareTerm","Root.Summary",,"Root.Title","Root", 21 | -------------------------------------------------------------------------------- /metatab/test/test-data/datapackage_ex1.csv: -------------------------------------------------------------------------------- 1 | "Declare","datapackage-latest.csv",, 2 | "title","Registered Voters, By County",, 3 | "description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",, 4 | "name","cdph.ca.gov-hci-registered_voters-county",, 5 | "version","1.3.4",, 6 | ,,, 7 | ,,, 8 | "Section","Resources","type","description" 9 | "resource","http://example.com/resource1.csv",, 10 | ".title","First 
Resource",, 11 | ".name","the-first-resource",, 12 | ".mediatype","text/csv",, 13 | ".format","csv",, 14 | ,,, 15 | "schema",,, 16 | "field","id","string","description" 17 | "field","state","string","description" 18 | "field","income","string","description" 19 | ,,, 20 | "resource","http://example.com/resource2.csv",, 21 | ".title","Second Resource",, 22 | ".name","the-second-resource",, 23 | ".mediatype","text/csv",, 24 | ".format","csv",, 25 | ,,, 26 | "schema",,, 27 | "field","id","string","description" 28 | "field","country","string","description" 29 | "field","gdp","string","description" 30 | -------------------------------------------------------------------------------- /metatab/test/test-data/scripts/Py3Notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from string import ascii_uppercase\n", 10 | "\n", 11 | "lst = [ascii_uppercase[:11] ] + [ list(range(10))+ [mult(i)] for i in range(10)]" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "" 21 | ] 22 | } 23 | ], 24 | "metadata": { 25 | "kernelspec": { 26 | "display_name": "Python 3", 27 | "language": "python", 28 | "name": "python3" 29 | }, 30 | "language_info": { 31 | "codemirror_mode": { 32 | "name": "ipython", 33 | "version": 3.0 34 | }, 35 | "file_extension": ".py", 36 | "mimetype": "text/x-python", 37 | "name": "python", 38 | "nbconvert_exporter": "python", 39 | "pygments_lexer": "ipython3", 40 | "version": "3.5.0" 41 | } 42 | }, 43 | "nbformat": 4, 44 | "nbformat_minor": 2 45 | } -------------------------------------------------------------------------------- /metatab/test/test-data/datapackage_ex1_web.csv: -------------------------------------------------------------------------------- 1 | 
"Declare","http://assets.metatab.org/datapackage.csv",, 2 | "title","Registered Voters, By County",, 3 | "description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",, 4 | "name","cdph.ca.gov-hci-registered_voters-county",, 5 | "version","1.3.4",, 6 | ,,, 7 | ,,, 8 | "Section","Resources","type","description" 9 | "resource","http://example.com/resource1.csv",, 10 | ".title","First Resource",, 11 | ".name","the-first-resource",, 12 | ".mediatype","text/csv",, 13 | ".format","csv",, 14 | ,,, 15 | "schema",,, 16 | "field","country","string","description" 17 | "field","country","string","description" 18 | "field","country","string","description" 19 | ,,, 20 | "resource","http://example.com/resource2.csv",, 21 | ".title","Second Resource",, 22 | ".name","the-second-resource",, 23 | ".mediatype","text/csv",, 24 | ".format","csv",, 25 | ,,, 26 | "schema",,, 27 | "field","country","string","description" 28 | "field","country","string","description" 29 | "field","country","string","description" 30 | -------------------------------------------------------------------------------- /metatab/test/test-data/geo.csv: -------------------------------------------------------------------------------- 1 | Declare,metatab-latest 2 | Title,US States 3 | Description,US States 4 | Identifier,11585edd-20f4-4b15-a0da-9b5197b5ecc5 5 | Name,us_states-1 6 | Name.Time, 7 | Name.Version,1 8 | Name.Dataset,us-states 9 | Name.Origin, 10 | Name.Grain, 11 | Name.Space, 12 | 13 | Section,Resources,Name,Description, 14 | Datafile,shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.geojson.zip,us_states,, 15 | 16 | Section,Schema,DataType,AltName,Description 17 | Table,us_states,,, 18 | Table.Column,id,integer,, 19 | Table.Column,REGION,integer,region, 20 | Table.Column,DIVISION,integer,division, 21 | Table.Column,STATEFP,integer,statefp, 22 | Table.Column,STATENS,integer,statens, 23 | 
Table.Column,GEOID,integer,geoid, 24 | Table.Column,STUSPS,text,stusps, 25 | Table.Column,NAME,text,name, 26 | Table.Column,LSAD,integer,lsad, 27 | Table.Column,MTFCC,text,mtfcc, 28 | Table.Column,FUNCSTAT,text,funcstat, 29 | Table.Column,ALAND,integer,aland, 30 | Table.Column,AWATER,integer,awater, 31 | Table.Column,INTPTLAT,number,intptlat, 32 | Table.Column,INTPTLON,number,intptlon, 33 | Table.Column,geometry,text,, 34 | -------------------------------------------------------------------------------- /metatab/test/test-data/line/line-oriented-doc-references-2.txt: -------------------------------------------------------------------------------- 1 | 2 | Section: References 3 | 4 | # 5 | # Tract crosswalk 6 | # 7 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tract-sra-msa-xwalk 8 | Reference.Name: tracts 9 | Reference.Description: Crosswalk between crosswalks, tracts, zip codes and SRAs in San Diego County 10 | 11 | # 12 | # Tract boundaries 13 | # 14 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tracts 15 | Reference.Name: tracts_geo 16 | Reference.Description: Geographics Boundaries for Tracts 17 | 18 | # 19 | # SRA boundaries 20 | # 21 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#sra 22 | Reference.Name: sra_geo 23 | Reference.Description: Geographics Boundaries for SRAs 24 | 25 | # 26 | # IPUMS Housing and Income Data 27 | # 28 | # Need to use the ZIP version b/c we need to import the Python Code 29 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval 30 | Reference.Name: incv 31 | Reference.Description: Income and Home value records from IPUMS for San Diego County 32 | -------------------------------------------------------------------------------- /docker/Makefile: 
-------------------------------------------------------------------------------- 1 | 2 | INSTANCE = default 3 | DOCKER ?= docker 4 | 5 | NS = civicknowledge 6 | VERSION = latest 7 | 8 | REPO = metatab 9 | NAME = metatab 10 | 11 | DOCKER ?= docker 12 | 13 | PORTS = 14 | 15 | VOLUMES= -v /data 16 | 17 | ENV = 18 | 19 | 20 | .PHONY: build rebuild push shell run start stop restart reload rm rmf release test 21 | 22 | build: 23 | $(DOCKER) build -t $(NS)/$(REPO):$(VERSION) . 24 | 25 | rebuild: 26 | $(DOCKER) build --no-cache=true -t $(NS)/$(REPO):$(VERSION) . 27 | 28 | push: 29 | $(DOCKER) push $(NS)/$(REPO):$(VERSION) 30 | 31 | shell: 32 | $(DOCKER) run --rm -i -t $(PORTS) $(VOLUMES) $(LINKS) $(ENV) $(NS)/$(REPO):$(VERSION) /bin/bash 33 | 34 | run: 35 | $(DOCKER) run --rm --name $(NAME) $(PORTS) $(VOLUMES) $(LINKS) $(ENV) $(NS)/$(REPO):$(VERSION) 36 | 37 | logs: 38 | $(DOCKER) logs -f $(NAME) 39 | 40 | start: 41 | $(DOCKER) run -d --name $(NAME) $(PORTS) $(VOLUMES) $(LINKS) $(ENV) $(NS)/$(REPO):$(VERSION) 42 | 43 | stop: 44 | $(DOCKER) stop $(NAME) 45 | 46 | restart: stop start 47 | 48 | reload: build rmf start 49 | 50 | rmf: 51 | $(DOCKER) rm -f $(NAME) 52 | 53 | rm: 54 | $(DOCKER) rm $(NAME) 55 | 56 | release: build 57 | make push -e VERSION=$(VERSION) 58 | 59 | default: build 60 | 61 | -------------------------------------------------------------------------------- /metatab/resolver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016 Civic Knowledge. 
# This file is licensed under the terms of the
# Revised BSD License, included in this distribution as LICENSE

"""
Generate rows from a variety of paths, references or other input
"""

from .exc import IncludeError, GenerateError


class WebResolver(object):
    """Resolve declaration names and references into row sources."""

    def fetch_row_source(self, url):
        """Placeholder: does nothing and returns ``None``."""
        pass

    def find_decl_doc(self, name):
        """Look up the declaration document for ``name``.

        Lookup is currently disabled, so this always raises.

        Raises:
            IncludeError: always, with ``name`` as its argument.
        """
        # An online lookup (HTTP HEAD against an assets URL) used to follow
        # this raise. It was unreachable and referenced METATAB_ASSETS_URL,
        # which is not defined in this module, so it has been removed.
        raise IncludeError(name)

    def get_row_generator(self, ref, cache=None):
        """Return a row generator for a reference.

        Args:
            ref: The reference handed to ``rowgenerators.get_generator``.
            cache: Accepted for interface compatibility; not used here.

        Raises:
            GenerateError: if ``get_generator`` returns a falsy value.
        """
        # Imported here rather than at module level, as in the original.
        from rowgenerators import get_generator

        g = get_generator(ref)

        if not g:
            raise GenerateError("Cant figure out how to generate rows from {} ref: {}".format(type(ref), ref))

        return g
Data is sourced from the World Bank and turned into a standard normalized CSV.",,,, 4 | "name","gdp",,,, 5 | "version",2011,,,, 6 | "license","PDDL-1.0",,,, 7 | "keyword","GDP",,,, 8 | "keyword","World",,,, 9 | "keyword","Gross Domestic Product",,,, 10 | "keyword","Time series",,,, 11 | "image","http://assets.okfn.org/p/opendatahandbook/img/data-wrench.png",,,, 12 | "last-updated","2011-09-21",,,, 13 | ,,,,, 14 | "Section","Sources","web",,, 15 | "Source","World Bank and OECD","http://data.worldbank.org/indicator/NY.GDP.MKTP.CD",,, 16 | ,,,,, 17 | "Section ","Resources","type","format","foreignkey","description" 18 | "resource","gdp",,,, 19 | "resource.path","data/gdp.csv",,,, 20 | "schema",,,,, 21 | "field","Country Name","string",,, 22 | "field","Contry Code","string",,"iso-3-geo-codes/id", 23 | "field","Year","date","yyyy",, 24 | "field","Value","number",,,"GDP in current USD" 25 | ,,,,, 26 | "resource","another_gdp_resource",,,, 27 | "resource.path","data/other_gdp.csv",,,, 28 | "schema",,,,, 29 | "field","Country Name","string",,, 30 | "field","Contry Code","string",,"iso-3-geo-codes/id", 31 | "field","Year","date","yyyy",, 32 | "field","Value","number",,,"GDP in current USD" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Civic Knowledge 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | * Neither the name of Civic Knowledge nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ## 3 | ## Python Ignores 4 | ## 5 | 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Packages 12 | *.egg 13 | *.egg-info 14 | *.eggs 15 | *.cache 16 | dist 17 | build 18 | eggs 19 | parts 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | nosetests.xml 35 | htmlcov/* 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | .idea 45 | test/testbundle/build-save 46 | test/bundles/testbundle/meta/schema-old.csv 47 | bundle.yaml.old 48 | schema-revised.csv 49 | build-save 50 | *.sqlite3 51 | 52 | test/coverage 53 | meta/coverage.yaml 54 | 55 | ## 56 | ## Javascript Ignores 57 | ## 58 | 59 | # Logs 60 | logs 61 | *.log 62 | npm-debug.log* 63 | 64 | # Runtime data 65 | pids 66 | *.pid 67 | *.seed 68 | 69 | # Directory for instrumented libs generated by jscoverage/JSCover 70 | lib-cov 71 | 72 | # Coverage directory used by tools like istanbul 73 | coverage 74 | 75 | # nyc test coverage 76 | .nyc_output 77 | 78 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 79 | .grunt 80 | 81 | # node-waf configuration 82 | .lock-wscript 83 | 84 | # Compiled binary addons (http://nodejs.org/api/addons.html) 85 | build/Release 86 | 87 | # Dependency directories 88 | node_modules 89 | jspm_packages 90 | 91 | # Optional npm cache directory 92 | .npm 93 | 94 | # Optional REPL history 95 | .node_repl_history 96 | 97 | _metapack 98 | .DS_Store 99 | -------------------------------------------------------------------------------- /metatab/datapackage.py: -------------------------------------------------------------------------------- 1 | # 
# Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
# Revised BSD License, included in this distribution as LICENSE

"""
Convert Metatab terms into datapackage.json file
"""

from metatab.exc import ConversionError

# Metatab datatype name -> datapackage field type. Datatypes that are not
# listed here are passed through unchanged.
type_map = {
    'str': 'string',
    'text': 'string',
    'unicode': 'string',
    'int': 'integer',
    'float': 'number'
}


def _field_descriptor(c):
    """Build one datapackage field dict from a Metatab column dict."""
    d = {}

    # Copy only the descriptive properties that are present and non-empty.
    for prop in ('name', 'title', 'description'):
        if c.get(prop):
            d[prop] = c[prop]

    d['type'] = type_map.get(c.get('datatype'), c.get('datatype'))

    return d


def convert_to_datapackage(doc):
    """Convert a Metatab document into a datapackage-style dict.

    Args:
        doc: Mapping of section name to section. This function uses
            ``doc['root']`` and, when present, ``doc['identity']`` (both via
            ``as_dict()``), and iterates the terms of ``doc['schema']`` and
            ``doc['resources']``.

    Returns:
        dict: The root properties plus a ``resources`` list with one entry
        (``path``, ``name``, ``schema``) for each ``root.datafile`` resource
        whose ``name`` or ``table`` matches a schema table.

    Raises:
        ConversionError: if neither a name nor an identifier is available,
            or if the table schemas cannot be read.
    """
    dp = doc['root'].as_dict()

    try:
        dp.update(doc['identity'].as_dict())
    except KeyError:
        pass  # The Identity section is optional.

    if 'name' not in dp:
        # The key was previously misspelled 'indentifier', so this fallback
        # never matched and a document with only an identifier was rejected.
        if 'identifier' in dp:
            dp['name'] = dp['identifier']
        else:
            raise ConversionError("Datapackage.json requires a Name or Identity term")

    try:
        table_schemas = {t.value: t.as_dict()['column'] for t in doc['schema']}
    except KeyError as e:
        raise ConversionError("Failed to get schemas: " + str(e))

    file_resources = [fr.arg_props for fr in doc['resources'] if fr.term_is('root.datafile')]

    dp['resources'] = []

    for r in file_resources:

        # Match the schema by resource name first, then by its 'table'
        # property; resources with no matching schema are skipped.
        try:
            if r.get('name', '') in table_schemas:
                columns = table_schemas[r['name']]
            else:
                columns = table_schemas[r['table']]
        except KeyError:
            continue

        dp['resources'].append(dict(
            path=r['url'],
            name=r['name'],
            schema={'fields': [_field_descriptor(c) for c in columns]}
        ))

    return dp
"Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,, 4 | ,,,, 5 | "Section","Resources","table","Grain","Title" 6 | "Datafile","http://example.com/example1.csv","registered_voters","County","The First Example Data File" 7 | "Datafile","http://example.com/example2.csv","registered_voters","Tract","The Second Example Data File" 8 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","Healthy Communities Data and Indicators Project (HCI)",, 9 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation",, 10 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,, 11 | ,,,, 12 | "Section ","Contacts","email",, 13 | "Creator","Office of Health Equity","HCIOHE@cdph.ca.gov",, 14 | "Wrangler","Eric Busboom","eric@civicknowledge.com",, 15 | ,,,, 16 | "Section","Schema","datatype","valuetype","description" 17 | "Table","Table1",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters" 18 | "Column","Column1","int","year range","Year or years that indicator was reported" 19 | "Column","Column2","str","dimension","Type of record" 20 | "Column","Column3","str","gvid","GVid version of the geotype and geotypeval" 21 | "Column","Column4","str","label for gvid","Census name of geographic area" 22 | "Table","Table1",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters" 23 | "Column","Column1","int","year range","Year or years that indicator was reported" 24 | "Column","Column2","str","dimension","Type of record" 25 | "Column","Column3","str","gvid","GVid version of the geotype and geotypeval" 26 | "Column","Column4","str","label for gvid","Census name of geographic area" 27 | 
"Column.Foo","Bingo",,, 28 | ,,"Bingo 1","BIngo 2", 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import os 6 | import sys 7 | from setuptools import setup 8 | 9 | if sys.argv[-1] == 'publish': 10 | os.system('python setup.py sdist upload') 11 | sys.exit() 12 | 13 | with open(os.path.join(os.path.dirname(__file__), 'README.rst')) as f: 14 | readme = f.read() 15 | 16 | classifiers = [ 17 | 'Development Status :: 4 - Beta', 18 | 'Intended Audience :: Developers', 19 | 'License :: OSI Approved :: BSD License', 20 | 'Operating System :: OS Independent', 21 | 'Programming Language :: Python', 22 | 'Programming Language :: Python :: 3.6', 23 | 'Topic :: Software Development :: Libraries :: Python Modules', 24 | ] 25 | 26 | # Setup a directory for a fake package for importing plugins 27 | 28 | setup( 29 | name='metatab', 30 | version='0.8.2', 31 | description='Data format for storing structured data in spreadsheet tables', 32 | long_description=readme, 33 | packages=['metatab','metatab.templates', 'metatab.test', 'metatab.test.test-data'], 34 | 35 | package_data={ 36 | '': ['*.csv','*.json','*.txt','*.ipynb',''], 37 | }, 38 | 39 | install_requires=[ 40 | 'metatabdecl', 41 | 'rowgenerators', 42 | ], 43 | 44 | # test_suite='appurl.test.test_suite', 45 | test_suite='nose.collector', 46 | tests_require=['nose', 'tabulate'], 47 | 48 | entry_points={ 49 | 'console_scripts': [ 50 | 'metatab=metatab.cli:metatab' 51 | ], 52 | 53 | 'appurl.urls': [ 54 | "metatab+ = metatab.appurl:MetatabUrl", 55 | ], 56 | 57 | 'rowgenerators': [ 58 | "metatab+.txt = metatab.rowgenerators:TextRowGenerator", 59 | ".yaml = metatab.rowgenerators:YamlMetatabSource" 60 | ] 61 | }, 62 | 63 | author='Eric Busboom', 64 | author_email='eric@civicknowledge.com', 65 | url='https://github.com/Metatab/metatab-py.git', 
66 | license='BSD', 67 | classifiers=classifiers, 68 | extras_require={ 69 | 'datapackage': ['datapackage'], 70 | } 71 | ) 72 | -------------------------------------------------------------------------------- /metatab/test/test-data/line/line-oriented-doc.txt: -------------------------------------------------------------------------------- 1 | Identifier: 47bc1089-7584-41f0-b804-602ec42f1249 2 | Origin: civicknowledge.com 3 | Dataset: rcfe_affordability 4 | Version: 4 5 | Time: 2015 6 | Name: civicknowledge.com-rcfe_affordability-2015-4 7 | 8 | Section: Contacts 9 | Wrangler: Eric Busboom 10 | Wrangler.Email: eric@civicknowledge.com 11 | Wrangler.Organization: Civic Knowledge 12 | 13 | Section: References 14 | 15 | Reference: censusreporter:B09020/140/05000US06073 16 | Reference.Name: B09020 17 | Reference.Description: Relationship by Household Type (Including Living Alone) for Population 65 Years and Over 18 | 19 | 20 | # 21 | # Tract crosswalk 22 | # 23 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tract-sra-msa-xwalk 24 | Reference.Name: tracts 25 | Reference.Description: Crosswalk between crosswalks, tracts, zip codes and SRAs in San Diego County 26 | 27 | # 28 | # Tract boundaries 29 | # 30 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tracts 31 | Reference.Name: tracts_geo 32 | Reference.Description: Geographics Boundaries for Tracts 33 | 34 | # 35 | # SRA boundaries 36 | # 37 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#sra 38 | Reference.Name: sra_geo 39 | Reference.Description: Geographics Boundaries for SRAs 40 | 41 | # 42 | # IPUMS Housing and Income Data 43 | # 44 | # Need to use the ZIP version b/c we need to import the Python Code 45 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval 46 | 
Reference.Name: incv 47 | Reference.Description: Income and Home value records from IPUMS for San Diego County 48 | 49 | 50 | ==== Bibliography 51 | Citation: ipums 52 | Citation.Type: dataset 53 | Citation.Author: Steven Ruggles; Katie Genadek; Ronald Goeken; Josiah Grover; Matthew Sobek 54 | Citation.Title: Integrated Public Use Microdata Series 55 | Citation.Year: 2017 56 | Citation.Publisher: University of Minnesota 57 | Citation.Version: 7.0 58 | Citation.AccessDate: 20170718 59 | Citation.Url: https://usa.ipums.org/usa/index.shtml 60 | Citation.Doi: https://doi.org/10.18128/D010.V7.0 61 | 62 | -------------------------------------------------------------------------------- /metatab/test/outputs/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "resources": [ 3 | { 4 | "name": "the-first-resource", 5 | "format": "csv", 6 | "url": "http://example.com/resource1.csv", 7 | "title": "First Resource", 8 | "mediatype": "text/csv", 9 | "schema": { 10 | "fields": [ 11 | { 12 | "type": "string", 13 | "description": "description", 14 | "name": "country" 15 | }, 16 | { 17 | "type": "string", 18 | "description": "description", 19 | "name": "country" 20 | }, 21 | { 22 | "type": "string", 23 | "description": "description", 24 | "name": "country" 25 | } 26 | ] 27 | } 28 | }, 29 | { 30 | "name": "the-second-resource", 31 | "format": "csv", 32 | "url": "http://example.com/resource2.csv", 33 | "title": "Second Resource", 34 | "mediatype": "text/csv", 35 | "schema": { 36 | "fields": [ 37 | { 38 | "type": "string", 39 | "description": "description", 40 | "name": "country" 41 | }, 42 | { 43 | "type": "string", 44 | "description": "description", 45 | "name": "country" 46 | }, 47 | { 48 | "type": "string", 49 | "description": "description", 50 | "name": "country" 51 | } 52 | ] 53 | } 54 | } 55 | ], 56 | "version": "1.3.4", 57 | "description": "Percent of the eligible population registered to vote and the percent who voted in 
statewide elections.", 58 | "name": "cdph.ca.gov-hci-registered_voters-county", 59 | "title": "Registered Voters, By County" 60 | } 61 | -------------------------------------------------------------------------------- /docs/Census.rst: -------------------------------------------------------------------------------- 1 | Loading Census Data With Pandas Reporter 2 | ======================================== 3 | 4 | The general process for creating a census package is similar to the package process described in the `Getting Started tutorial, `_ but with a ``DataFile`` term that uses a program to fetch data from Census Reporter. First we'll create the program, then link it into a Metatab package. The program uses the `pandas-reporter` module, so the creation process is very similar to the `Pandas-Reporter tutorial. `_ 5 | 6 | Creating a Pandas-Reporter program 7 | ---------------------------------- 8 | 9 | First, read the `Pandas-Reporter tutorial. `_ You'll need to install the `pandasreporter` Python module. 10 | 11 | Then, visit `Census Reporter `_ to locate information about tables, regions and summary levels. 12 | 13 | For this tutorial, we will use these tables: 14 | 15 | - B17001, Poverty Status by Sex by Age 16 | - B17024, Age by Ratio of Income to Poverty Level 17 | - B17017, Poverty Status by Household Type by Age of Householder 18 | 19 | For the geography, we will use tracts in San Diego County. 20 | 21 | To find the geoid code for San Diego County, visit the main page at `Census Reporter `_ and search for San Diego County. You should get a `profile page for the county `_. In the URL for the page, you should see the code `05000US06073`. This code is the geoid for San Diego County. 22 | 23 | Next, visit the page for `Cartographic Boundary File Summary Level Codes `_ to get the summary level code for tracts. It is actually listed by all of its components, in this case, "State-County-Census Tract." It is code "140". ( BTW, that is a string, not a number.
) 24 | 25 | The start of our program is similar to the program in the `Pandas-Reporter tutorial. `_, except using the table, summary level and region codes for this example: 26 | 27 | .. code-block:: python 28 | 29 | $ mkdir example-data-package 30 | $ cd example-data-package 31 | $ metapack -c -------------------------------------------------------------------------------- /metatab/test/test-data/yaml/yaml-example-1.csv: -------------------------------------------------------------------------------- 1 | Declare,metatab-latest,,, 2 | Title,San Diego County Weather,,, 3 | Description,Daily summaries from a selection of San Diego county weather stations,,, 4 | Identifier,2dc83efa-e6da-4561-bdf9-63263360ccf0,,, 5 | Name,noaa.gov-daily_summary-1998e-san-1,,, 6 | Dataset,daily_summary,,, 7 | Origin,noaa.gov,,, 8 | Time,1998e,,, 9 | Space,san,,, 10 | Grain,,,, 11 | Variant,,,, 12 | Version,1,,, 13 | Created,2018-08-17T15:44:24,,, 14 | Modified,2018-08-17T16:18:19,,, 15 | Giturl,https://github.com/san-diego-water-quality/water-datasets.git,,, 16 | ,,,, 17 | Section,Contacts,Email,Organization,Url 18 | Wrangler,Eric Busboom,eric@civicknowledge.com,Civic Knowledge,http://civicknowledge.com 19 | ,,,, 20 | ,,,, 21 | Section,Documentation,Title,Description, 22 | Documentation,file:README.md,README,, 23 | Documentation,https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf,Documentation,Main documentation, 24 | ,,,, 25 | Section,Resources,Name,Description, 26 | Datafile,http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv,daily_summary_san,Daily weather summaries, 27 | Datafile,http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv,daily_summary_la,Daily weather summaries, 28 | ,,,, 29 | ,,,, 30 | ,,,, 31 | Section,Schema,DataType,AltName,Description 32 | Table,daily_summary_san,,, 33 | Table.Column,STATION,string,station, 34 | Table.Column,NAME,string,name,Station code 35 | 
Table.Column,LATITUDE,number,latitude,Station name 36 | Table.Column,LONGITUDE,number,longitude,Station lattitude 37 | Table.Column,ELEVATION,number,elevation,Station longitude 38 | Table.Column,DATE,date,date,Station elevation 39 | Table.Column,AWND,number,awnd,Measurement date 40 | Table.Column,DAPR,string,dapr,Average daily wind speed (meters per second or miles per hour as per user preference 41 | Table.Column,FMTM,integer,fmtm,Number of days included in the multiday precipitation total (MDPR) 42 | Table.Column,MDPR,string,mdpr,"Time of fastest mile or fastest 1-minute wind (hours and minutes, i.e., HHMM)" 43 | Table.Column,PGTM,string,pgtm,"Multiday precipitation total (mm or inches as per user preference; use with DAPR and DWPR, if available)" 44 | Table.Column,PRCP,number,prcp,"Peak gust time (hours and minutes, i.e., HHMM)" 45 | Table.Column,SNOW,integer,snow,"Precipitation (mm or inches as per user preference, inches to hundredths on Daily Form pdf file)" 46 | Table.Column,SNWD,integer,snwd,"Snowfall (mm or inches as per user preference, inches to tenths on Daily Form pdf file)" 47 | Table.Column,TAVG,string,tavg,"Snow depth (mm or inches as per user preference, inches on Daily Form pdf file)" 48 | Table.Column,TMAX,integer,tmax,Average temerature 49 | Table.Column,TMIN,integer,tmin,"Maximum temperature (Fahrenheit or Celsius as per user preference, Fahrenheit to tenths on Daily Form pdf file" 50 | -------------------------------------------------------------------------------- /docs/GeneratingRowsWithPrograms.rst: -------------------------------------------------------------------------------- 1 | 2 | Row Generating Programs 3 | ======================= 4 | 5 | Metatab Datafile terms can reference programs and IPython notebooks to generate rows. 6 | 7 | To reference a program, the ``Root.Datafile`` must be a URL with a ``program`` scheme and a relative path. 
Usually, the file is placed in a subdirectory named 'scripts' at the same level as the ``metadata.csv`` file. It must be an executable program, and may be any executable program. 8 | 9 | When a data package is created, regardless of the type, a filesystem package is created first, then other types of packages are created from the filesystem package. This means that the row-generating program is only run once per resource when multiple packages are created, and also that the program can open the Metatab package being used to run the program to access previously created resource files. 10 | 11 | Program Inputs 12 | ************** 13 | 14 | The program can receive information from Metatab through program options and environment variables, and must print CSV formatted lines to stdout. 15 | 16 | There are two broad sources for inputs to the program. The first is a set of values that are passed into the program regardless of the configuration of the ``Root.DataFile`` term. The second is the set of properties of the ``Root.DataFile`` term. 17 | 18 | The inputs for all programs are: 19 | 20 | - METATAB_DOC: An env var that holds the URL for the Metatab document being processed 21 | - METATAB_PACKAGE: An env var that holds the metatab document's package URL. ( Which is usually the same as the document URL ) 22 | - METATAB_WORKING_DIR: An env var that holds the path to the directory holding the metatab file. 23 | - PROPERTIES: An env var that holds a JSON encoded dict with the three previous env values, along with the ``properties`` dict for the ``Root.DataFile`` term. 24 | 25 | Additionally, the program receives the ``Root.DataFile`` properties in these forms: 26 | 27 | - Properties that have names that are all uppercased are assigned to env variables. 28 | - Properties that have names that begin with '-' are assigned to program options. 
29 | 30 | 31 | Common Patterns 32 | *************** 33 | 34 | It is very common for a program to open the Metatab document that is being used to run the program. In Python: 35 | 36 | .. code-block:: python 37 | 38 | import metatab as mt 39 | doc = mt.MetatabDoc(environ['METATAB_DOC']) 40 | 41 | Since the program must output CSV formatted lines, a CSV writer can be constructed on ``sys.stdout``: 42 | 43 | .. code-block:: python 44 | 45 | import sys 46 | import csv 47 | 48 | w = csv.writer(sys.stdout) 49 | 50 | w.writerow(...) 51 | 52 | 53 | If the program generates logging or warnings, they must be printed to ``sys.stderr`` 54 | 55 | .. code-block:: python 56 | 57 | import sys 58 | 59 | print("ERROR!", file=sys.stderr) 60 | 61 | -------------------------------------------------------------------------------- /metatab/test/test_doc.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | from os.path import join, dirname 5 | 6 | from metatab import MetatabDoc 7 | from metatab.rowgenerators import TextRowGenerator 8 | from metatab.test.core import test_data 9 | 10 | 11 | class TestDoc(unittest.TestCase): 12 | 13 | def test_open(self): 14 | 15 | doc = MetatabDoc(test_data('almost-everything.csv')) 16 | 17 | self.assertEquals('9FC11204-B291-4E0E-A841-5372090ADEC0', doc.find_first_value('Root.Identifier')) 18 | 19 | self.assertEquals('9FC11204-B291-4E0E-A841-5372090ADEC0', doc['Root'].find_first_value('Root.Identifier')) 20 | 21 | 22 | def test_new(self): 23 | 24 | import metatab.templates as tmpl 25 | 26 | template_path = join(dirname(tmpl.__file__), 'metatab.csv') 27 | 28 | doc = MetatabDoc(template_path) 29 | doc.cleanse() 30 | 31 | print(doc.as_csv()[:200]) 32 | 33 | def test_version(self): 34 | 35 | from textwrap import dedent 36 | 37 | 38 | doc = MetatabDoc(TextRowGenerator( 39 | dedent( 40 | """ 41 | Root.Version: 42 | """))) 43 | 44 | # None because there are no 
Minor, Major, Patch value 45 | self.assertIsNone(doc.update_version()) 46 | 47 | self.assertFalse(doc._has_semver()) 48 | 49 | doc = MetatabDoc(TextRowGenerator( 50 | dedent( 51 | """ 52 | Root.Version: 10 53 | """))) 54 | 55 | # None because there are no Minor, Major, Patch value 56 | self.assertEqual("10", doc.update_version()) 57 | self.assertFalse(doc._has_semver()) 58 | 59 | doc = MetatabDoc(TextRowGenerator( 60 | dedent( 61 | """ 62 | Root.Version: 10 63 | Version.Patch: 5 64 | """))) 65 | 66 | # None because there are no Minor, Major, Patch value 67 | self.assertEqual("0.0.5", doc.update_version()) 68 | self.assertTrue(doc._has_semver()) 69 | 70 | doc = MetatabDoc(TextRowGenerator( 71 | dedent( 72 | """ 73 | Root.Version: 10 74 | Version.Major: 2 75 | Version.Patch: 5 76 | """))) 77 | 78 | # None because there are no Minor, Major, Patch value 79 | self.assertEqual("2.0.5", doc.update_version()) 80 | 81 | doc = MetatabDoc(TextRowGenerator( 82 | dedent( 83 | """ 84 | Root.Name: 85 | Root.Origin: example.com 86 | Root.Dataset: foobar 87 | Root.Version: 88 | Version.Minor: 24 89 | Version.Major: 2 90 | Version.Patch: 5 91 | """))) 92 | 93 | # None because there are no Minor, Major, Patch value 94 | self.assertEqual("2.24.5", doc.update_version()) 95 | 96 | doc.update_name() 97 | self.assertEqual('example.com-foobar-2.24', doc.get_value('Root.Name')) 98 | 99 | if __name__ == '__main__': 100 | unittest.main() 101 | -------------------------------------------------------------------------------- /metatab/test/test-data/packages/example.com-test_package/notebooks/Test_Notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "The metatab extension is already loaded. 
To reload it, use:\n", 13 | " %reload_ext metatab\n", 14 | "The autoreload extension is already loaded. To reload it, use:\n", 15 | " %reload_ext autoreload\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "%load_ext metatab\n", 22 | "\n", 23 | "%load_ext autoreload\n", 24 | "%autoreload 2\n", 25 | "\n", 26 | "import pandas as pd\n", 27 | "import numpy as np \n", 28 | "import metatab as mt" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 5, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "%mt_open_package\n", 38 | "assert mt_pkg.path.endswith('metatab-py/test-data/packages/example.com-test_package/metadata.csv')\n", 39 | "orig_path = mt_pkg.path.endswith" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 6, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from metatab.pands import MetatabDataFrame\n", 49 | "\n", 50 | "odf = MetatabDataFrame({ 'cola':range(10), 'colb': range(10)})\n", 51 | "\n", 52 | "odf.name = 'income_homeval'\n", 53 | "odf.title = 'Income and Home Value Records for San Diego County'\n", 54 | "odf.cola.description = 'Household income'\n", 55 | "odf.colb.description = 'Home value'\n", 56 | "\n", 57 | "%mt_add_dataframe odf --materialize\n", 58 | "\n", 59 | "cols = list(mt_pkg.resource('income_homeval').columns())\n", 60 | "assert 'cola' in [ c['name'] for c in cols]\n", 61 | "assert 'colb' in [ c['name'] for c in cols]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 
89 | "pygments_lexer": "ipython3", 90 | "version": "3.6.1" 91 | }, 92 | "varInspector": { 93 | "cols": { 94 | "lenName": 16, 95 | "lenType": 16, 96 | "lenVar": 40 97 | }, 98 | "kernels_config": { 99 | "python": { 100 | "delete_cmd_postfix": "", 101 | "delete_cmd_prefix": "del ", 102 | "library": "var_list.py", 103 | "varRefreshCmd": "print(var_dic_list())" 104 | }, 105 | "r": { 106 | "delete_cmd_postfix": ") ", 107 | "delete_cmd_prefix": "rm(", 108 | "library": "var_list.r", 109 | "varRefreshCmd": "cat(var_dic_list()) " 110 | } 111 | }, 112 | "types_to_exclude": [ 113 | "module", 114 | "function", 115 | "builtin_function_or_method", 116 | "instance", 117 | "_Feature" 118 | ], 119 | "window_display": false 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /metatab/test/test-data/notebooks/ImportTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%load_ext metatab\n", 12 | "%mt_lib_dir lib\n", 13 | "\n", 14 | "import file\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "assert file.__file__.endswith('test-data/notebooks/lib/file.py')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "%mt_lib_dir http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "import lib.incomedist" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | 
"execution_count": 5, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "assert lib.incomedist.__file__.endswith('ipums.org-income_homevalue-5.zip/ipums.org-income_homevalue-5/lib/incomedist.py')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "%%metatab\n", 70 | "Identifier: 47bc1089-7584-41f0-b804-602ec42f1249\n", 71 | "Name: FooBarBaz\n", 72 | "\n", 73 | "Section: References \n", 74 | "Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval\n", 75 | "Reference.Name: incv\n", 76 | "Reference.Description: Income and Home value records from IPUMS for San Diego County\n", 77 | "\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "%mt_lib_dir incv" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.6.1" 118 | }, 119 | "varInspector": { 120 | "cols": { 121 | "lenName": 16, 122 | "lenType": 16, 123 | "lenVar": 40 124 | }, 125 | "kernels_config": { 126 | "python": { 127 | "delete_cmd_postfix": "", 128 | "delete_cmd_prefix": "del ", 129 | "library": "var_list.py", 130 | "varRefreshCmd": "print(var_dic_list())" 131 | }, 132 | "r": { 133 | "delete_cmd_postfix": ") ", 134 | "delete_cmd_prefix": 
"rm(", 135 | "library": "var_list.r", 136 | "varRefreshCmd": "cat(var_dic_list()) " 137 | } 138 | }, 139 | "types_to_exclude": [ 140 | "module", 141 | "function", 142 | "builtin_function_or_method", 143 | "instance", 144 | "_Feature" 145 | ], 146 | "window_display": false 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 2 151 | } 152 | -------------------------------------------------------------------------------- /metatab/test/test-data/notebooks/CellExecuteError.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%load_ext metatab" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%metatab\n", 23 | "Origin: example.com\n", 24 | "Dataset: foobar.com \n", 25 | "Identifier: de097279-28ef-42f5-a4f5-0eaac53b7dc4\n", 26 | "Name: example.com-foobar.com \n", 27 | "\n", 28 | "Section: Contacts\n", 29 | "Wrangler: Eric Busboom\n", 30 | "Wrangler.Email: eric@civicknowledge.com\n", 31 | "\n", 32 | "Section: References\n", 33 | "Reference: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv\n", 34 | "Reference.Name: reference\n", 35 | "Reference.Title: The First Example Data File\n", 36 | "Reference.Startline: 5\n", 37 | "Reference.HeaderLines: 3,4\n", 38 | " \n", 39 | "Section: Resources\n", 40 | "Datafile: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv\n", 41 | "Datafile.Name: ext_resource\n", 42 | "Datafile.Title: An Extern CSV Resource\n", 43 | "Datafile.Startline: 5\n", 44 | "Datafile.HeaderLines: 3,4" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "ename": "ZeroDivisionError", 54 | "evalue": "division by zero", 
55 | "output_type": "error", 56 | "traceback": [ 57 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 58 | "\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", 59 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;36m1\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 60 | "\u001b[0;31mZeroDivisionError\u001b[0m: division by zero" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "1/0" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.6.1" 95 | }, 96 | "varInspector": { 97 | "cols": { 98 | "lenName": 16, 99 | "lenType": 16, 100 | "lenVar": 40 101 | }, 102 | "kernels_config": { 103 | "python": { 104 | "delete_cmd_postfix": "", 105 | "delete_cmd_prefix": "del ", 106 | "library": "var_list.py", 107 | "varRefreshCmd": "print(var_dic_list())" 108 | }, 109 | "r": { 110 | "delete_cmd_postfix": ") ", 111 | "delete_cmd_prefix": "rm(", 112 | "library": "var_list.r", 113 | "varRefreshCmd": "cat(var_dic_list()) " 114 | } 115 | }, 116 | "types_to_exclude": [ 117 | "module", 118 | "function", 119 | "builtin_function_or_method", 120 | "instance", 121 | "_Feature" 122 | ], 123 | "window_display": false 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } 129 | -------------------------------------------------------------------------------- 
/metatab/test/test-data/errors/errors2.csv: -------------------------------------------------------------------------------- 1 | "Declare","http://doesntexist.csv",,, 2 | "Title","Registered Voters, By County",,, 3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,, 4 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,, 5 | ,201404,,, 6 | ,"cdph.ca.gov-hci-registered_voters-county-201304",,, 7 | "Format","excel",,, 8 | "Spatial","California <04000US06>",,, 9 | "Time","2002-2014",,, 10 | "SpatialGrain","County <05000US>",,, 11 | ,,,, 12 | "Section","Resources","table","Grain","Title" 13 | "Datafile","http://example.com/example1.csv","registered_voters","County","The First Example Data File" 14 | "Datafile","http://example.com/example2.csv","registered_voters","Tract","The Second Example Data File" 15 | ,,"Healthy Communities Data and Indicators Project (HCI)",, 16 | ,,"Indicator Documentation for Voter Registration / Participation",, 17 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,, 18 | ,,,, 19 | "Section ","Contacts","email",, 20 | "Creator","Office of Health Equity","HCIOHE@cdph.ca.gov",, 21 | "Wrangler","Eric Busboom","eric@civicknowledge.com",, 22 | ,,,, 23 | "Section ","Notes",,, 24 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. 
",,, 25 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,, 26 | ".title","Data Bundles Packaging Specification",,, 27 | ,,,, 28 | "Section","Schema",,"valuetype","description" 29 | "Table","registered_voters",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters" 30 | "Column","reportyear","int","year range","Year or years that indicator was reported" 31 | "Column","type","str","dimension","Type of record" 32 | "Column","gvid","str","gvid","GVid version of the geotype and geotypeval" 33 | "Column","geoname","str","label for gvid","Census name of geographic area" 34 | "Column","geotype","str","label","Code for type of geographic area" 35 | "Column","geotypevalue","str","census","Census geoid code" 36 | "Column","county_fips","str","FIPS county code","County FIPS code" 37 | "Column","county_name","str","label for counrty_fips","County name" 38 | "Column","region_code","str","census code","Numeric code of region" 39 | "Column","region_name","str","label for region_code","Name of region" 40 | "Column","raceth","str","raceth/civick","Civic Knowledge race / ethnicity code." 
41 | "Column","raceth_name","str","label for raceeth","Race / Ethnicity Name" 42 | "Column","race_eth_code","str","raceth/hci","Race / ethnicity code" 43 | "Column","race_eth_name","str","label for race_eth_code","Race / ethnicity name" 44 | "Column","numerator","int","count","Adults who are registered to vote, or who voted, depending on type of record" 45 | "Column","denominator","int","count","Population of Adults, 18 years or older" 46 | "Column","percent","float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record" 47 | "Column","ll_95ci","float","ci95l for percent","Lower bound of 95% confidence interval" 48 | "Column","ul_95ci","float","ci95u for percent","Upper bound of 95% confidence interval" 49 | "Column","se","float","se for percent","Standard error" 50 | "Column","rse","float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent" 51 | "Column","ca_decile","float","decile","Statewide decile ranking" 52 | "Column","ca_rr","float","ratio","Ratio of indicator to state average" 53 | "Column","vap","float","measure","Voter age population, from CA Department of Finance." 
54 | "Column","ind_id","str","dimension", 55 | "Column","ind_definition","str","dimension", 56 | "Column","version","str","other", 57 | -------------------------------------------------------------------------------- /metatab/test/test-data/example1-web.csv: -------------------------------------------------------------------------------- 1 | "Declare","http://assets.metatab.org/metatab-0.1.csv",,, 2 | "Title","Registered Voters, By County",,, 3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,, 4 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,, 5 | "Version",201404,,, 6 | "Obsoletes","cdph.ca.gov-hci-registered_voters-county-201304",,, 7 | "Format","excel",,, 8 | "Spatial","California <04000US06>",,, 9 | "Time","2002-2014",,, 10 | "SpatialGrain","County <05000US>",,, 11 | ,,,, 12 | "Section","Resources","table","Grain","Title" 13 | "Datafile","http://example.com/example1.csv","registered_voters","County","The First Example Data File" 14 | "Datafile","http://example.com/example2.csv","registered_voters","Tract","The Second Example Data File" 15 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","Healthy Communities Data and Indicators Project (HCI)",, 16 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation",, 17 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,, 18 | ,,,, 19 | "Section ","Contacts","email",, 20 | "Creator","Office of Health Equity","HCIOHE@cdph.ca.gov",, 21 | "Wrangler","Eric Busboom","eric@civicknowledge.com",, 22 | ,,,, 23 | "Section ","Notes",,, 24 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. 
See the specification for more details. ",,, 25 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,, 26 | ".title","Data Bundles Packaging Specification",,, 27 | ,,,, 28 | "Section","Schema","datatype","valuetype","description" 29 | "Table","registered_voters",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters" 30 | "Column","reportyear","int","year range","Year or years that indicator was reported" 31 | "Column","type","str","dimension","Type of record" 32 | "Column","gvid","str","gvid","GVid version of the geotype and geotypeval" 33 | "Column","geoname","str","label for gvid","Census name of geographic area" 34 | "Column","geotype","str","label","Code for type of geographic area" 35 | "Column","geotypevalue","str","census","Census geoid code" 36 | "Column","county_fips","str","FIPS county code","County FIPS code" 37 | "Column","county_name","str","label for counrty_fips","County name" 38 | "Column","region_code","str","census code","Numeric code of region" 39 | "Column","region_name","str","label for region_code","Name of region" 40 | "Column","raceth","str","raceth/civick","Civic Knowledge race / ethnicity code." 
41 | "Column","raceth_name","str","label for raceeth","Race / Ethnicity Name" 42 | "Column","race_eth_code","str","raceth/hci","Race / ethnicity code" 43 | "Column","race_eth_name","str","label for race_eth_code","Race / ethnicity name" 44 | "Column","numerator","int","count","Adults who are registered to vote, or who voted, depending on type of record" 45 | "Column","denominator","int","count","Population of Adults, 18 years or older" 46 | "Column","percent","float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record" 47 | "Column","ll_95ci","float","ci95l for percent","Lower bound of 95% confidence interval" 48 | "Column","ul_95ci","float","ci95u for percent","Upper bound of 95% confidence interval" 49 | "Column","se","float","se for percent","Standard error" 50 | "Column","rse","float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent" 51 | "Column","ca_decile","float","decile","Statewide decile ranking" 52 | "Column","ca_rr","float","ratio","Ratio of indicator to state average" 53 | "Column","vap","float","measure","Voter age population, from CA Department of Finance." 54 | "Column","ind_id","str","dimension", 55 | "Column","ind_definition","str","dimension", 56 | "Column","version","str","other", 57 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Metatab 2 | ======= 3 | 4 | .. image:: https://travis-ci.org/Metatab/metatab.svg?branch=master 5 | :target: https://travis-ci.org/Metatab/metatab 6 | 7 | Parse and manipulate structured data and metadata in a tabular format. 8 | 9 | `Metatab `_ is a data format that allows structured 10 | metadata -- the sort you'd normally store in JSON, YAML or XML -- to be stored 11 | and edited in tabular forms like CSV or Excel. 
Metatab files look exactly like 12 | you'd expect, so they are very easy for non-technical users to read and edit, 13 | using tools they already have. Metatab is an excellent format for creating, 14 | storing and transmitting metadata. For more information about metatab, visit 15 | http://metatab.org. 16 | 17 | This repository has a Python module and executable. For a Javascript version, 18 | see the `metatab-js `_ repository. 19 | 20 | What is Metatab For? 21 | -------------------- 22 | 23 | Metatab is a format for storing metadata for demographics, 24 | health and research datasets in tabular form. The tabular format is much 25 | easier for data creators to write and for data consumers to read, and it allows 26 | a complete data package to be stored in a single Excel file. 27 | 28 | 29 | Install 30 | ------- 31 | 32 | 33 | 34 | Install the package from PyPI with: 35 | 36 | .. code-block:: bash 37 | 38 | $ pip install metatab 39 | 40 | Or, install the master branch from GitHub with: 41 | 42 | .. code-block:: bash 43 | 44 | $ pip install git+https://github.com/CivicKnowledge/metatab.git 45 | 46 | Then test parsing using a remote file with: 47 | 48 | .. code-block:: bash 49 | 50 | $ metatab -j https://raw.githubusercontent.com/CivicKnowledge/metatab/master/test-data/example1.csv 51 | 52 | Run ``metatab -h`` to get other program options. 53 | 54 | The ``test-data`` directory has test files that also serve as examples to 55 | parse. You can either clone the repo and parse them from the files, or from the 56 | Github page for the file, click on the ``raw`` button to get the raw view of the 57 | file, then copy the URL. 58 | 59 | 60 | Running tests 61 | +++++++++++++ 62 | 63 | Run ``python setup.py test`` to run normal development tests. You can also run 64 | ``tox``, which will try to run the tests with python 3.4, 3.5 and 3.6, ignoring 65 | non-existent interpreters. 
66 | 67 | 68 | Development Testing with Docker 69 | +++++++++++++++++++++++++++++++ 70 | 71 | Testing during development for other versions of Python is a bit of a pain, 72 | since you have to install the alternate version, and Tox will run all of the 73 | tests, not just the one you want. 74 | 75 | One way to deal with this is to install Docker locally, then run the docker 76 | test container on the source directory. This is done automatically from the 77 | Makefile in metatab/test; just run: 78 | 79 | .. code-block:: bash 80 | 81 | $ cd metatab/test 82 | $ make build # to create the container image 83 | $ make test 84 | # or just .. 85 | $ make 86 | 87 | You can also run the container shell, and run tests from the command line. 88 | 89 | .. code-block:: bash 90 | 91 | $ cd metatab/test 92 | $ make build # to create the container image 93 | $ make shell # to run bash in the container 94 | 95 | You now have a docker container where the /code directory is the metatab source dir. 96 | 97 | Now, run tox to build the tox virtual environments, then enter the directory for the specific version you want to 98 | run tests for and activate the virtual environment. 99 | 100 | .. code-block:: bash 101 | 102 | # tox 103 | # cd .tox/py34 104 | # source bin/activate # Activate the python 3.4 virtual env 105 | # cd ../../ 106 | # python setup.py test # Cause test deps to get installed 107 | # 108 | # python -munittest metatab.test.test_parser.TestParser.test_parse_everython # Run one test 109 | 110 | Note that your development environment is mounted into the Docker container, so you can edit local 111 | files and test the changes in Docker. 
112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /metatab/test/test-data/yaml/yaml-example-1.yaml: -------------------------------------------------------------------------------- 1 | declare: metatab-latest 2 | title: San Diego County Weather 3 | description: Daily summaries from a selection of San Diego county weather stations 4 | identifier: 2dc83efa-e6da-4561-bdf9-63263360ccf0 5 | name: noaa.gov-daily_summary-1998e-san-1 6 | dataset: daily_summary 7 | origin: noaa.gov 8 | time: 1998e 9 | space: san 10 | grain: null 11 | variant: null 12 | version: '1' 13 | created: '2018-08-17T15:44:24' 14 | modified: '2018-08-17T16:18:19' 15 | giturl: https://github.com/san-diego-water-quality/water-datasets.git 16 | wrangler: 17 | - email: eric@civicknowledge.com 18 | organization: Civic Knowledge 19 | url: http://civicknowledge.com 20 | name: Eric Busboom 21 | documentation: 22 | - title: README 23 | url: file:README.md 24 | - title: Documentation 25 | description: Main documentation 26 | url: https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf 27 | datafile: 28 | - name: daily_summary_san 29 | description: Daily weather summaries 30 | url: http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv 31 | - name: daily_summary_la 32 | description: Daily weather summaries 33 | url: http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv 34 | table: 35 | - column: 36 | - datatype: string 37 | altname: station 38 | name: STATION 39 | - datatype: string 40 | altname: name 41 | description: Station code 42 | name: NAME 43 | - datatype: number 44 | altname: latitude 45 | description: Station name 46 | name: LATITUDE 47 | - datatype: number 48 | altname: longitude 49 | description: Station lattitude 50 | name: LONGITUDE 51 | - datatype: number 52 | altname: elevation 53 | description: Station longitude 54 | name: ELEVATION 55 | 
- datatype: date 56 | altname: date 57 | description: Station elevation 58 | name: DATE 59 | - datatype: number 60 | altname: awnd 61 | description: Measurement date 62 | name: AWND 63 | - datatype: string 64 | altname: dapr 65 | description: Average daily wind speed (meters per second or miles per hour 66 | as per user preference 67 | name: DAPR 68 | - datatype: integer 69 | altname: fmtm 70 | description: Number of days included in the multiday precipitation total (MDPR) 71 | name: FMTM 72 | - datatype: string 73 | altname: mdpr 74 | description: Time of fastest mile or fastest 1-minute wind (hours and minutes, 75 | i.e., HHMM) 76 | name: MDPR 77 | - datatype: string 78 | altname: pgtm 79 | description: Multiday precipitation total (mm or inches as per user preference; 80 | use with DAPR and DWPR, if available) 81 | name: PGTM 82 | - datatype: number 83 | altname: prcp 84 | description: Peak gust time (hours and minutes, i.e., HHMM) 85 | name: PRCP 86 | - datatype: integer 87 | altname: snow 88 | description: Precipitation (mm or inches as per user preference, inches to 89 | hundredths on Daily Form pdf file) 90 | name: SNOW 91 | - datatype: integer 92 | altname: snwd 93 | description: Snowfall (mm or inches as per user preference, inches to tenths 94 | on Daily Form pdf file) 95 | name: SNWD 96 | - datatype: string 97 | altname: tavg 98 | description: Snow depth (mm or inches as per user preference, inches on Daily 99 | Form pdf file) 100 | name: TAVG 101 | - datatype: integer 102 | altname: tmax 103 | description: Average temerature 104 | name: TMAX 105 | - datatype: integer 106 | altname: tmin 107 | description: Maximum temperature (Fahrenheit or Celsius as per user preference, 108 | Fahrenheit to tenths on Daily Form pdf file 109 | name: TMIN 110 | name: daily_summary_san 111 | 112 | -------------------------------------------------------------------------------- /metatab/test/test-data/example1.csv: 
-------------------------------------------------------------------------------- 1 | "Declare","metatab-latest",,,, 2 | "Title","Registered Voters, By County",,,, 3 | "Name","cdph.ca.gov-hci-registered_voters-county",,,, 4 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,, 5 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,, 6 | "Version",201404,,,, 7 | "Obsoletes","cdph.ca.gov-hci-registered_voters-county-201304",,,, 8 | "Dataset","voters",,,, 9 | "Origin","example.com",,,, 10 | "Space","Ca",,,, 11 | "Time","2002-2014",,,, 12 | "Grain","County",,,, 13 | "Format","excel",,,, 14 | ,,,,, 15 | ,,,,, 16 | "Section","Resources",,,, 17 | "Header","url","name","schema","Grain","Title" 18 | "Datafile","http://example.com/example1.csv","example1","registered_voters","County","The First Example Data File" 19 | "Datafile","http://example.com/example2.csv","example2","registered_voters","Tract","The Second Example Data File" 20 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx",,"Healthy Communities Data and Indicators Project (HCI)",, 21 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf",,"Indicator Documentation for Voter Registration / Participation",, 22 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,, 23 | ,,,,, 24 | "Section ","Contacts",,"email",, 25 | "Creator","Office of Health Equity",,"HCIOHE@cdph.ca.gov",, 26 | "Wrangler","Eric Busboom",,"eric@civicknowledge.com",, 27 | ,,,,, 28 | "Section ","Notes",,,, 29 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. 
",,,, 30 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,,, 31 | ".Title","Data Bundles Packaging Specification",,,, 32 | ,,,,, 33 | "Section","Schema",,"datatype","valuetype","description" 34 | "Table","registered_voters",,,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters" 35 | "Table.Column","reportyear",,"int","year range","Year or years that indicator was reported" 36 | "Table.Column","type",,"str","dimension","Type of record" 37 | "Table.Column","gvid",,"str","gvid","GVid version of the geotype and geotypeval" 38 | "Table.Column","geoname",,"str","label for gvid","Census name of geographic area" 39 | "Table.Column","geotype",,"str","label","Code for type of geographic area" 40 | "Table.Column","geotypevalue",,"str","census","Census geoid code" 41 | "Table.Column","county_fips",,"str","FIPS county code","County FIPS code" 42 | "Table.Column","county_name",,"str","label for counrty_fips","County name" 43 | "Table.Column","region_code",,"str","census code","Numeric code of region" 44 | "Table.Column","region_name",,"str","label for region_code","Name of region" 45 | "Table.Column","raceth",,"str","raceth/civick","Civic Knowledge race / ethnicity code." 
46 | "Table.Column","raceth_name",,"str","label for raceeth","Race / Ethnicity Name" 47 | "Table.Column","race_eth_code",,"str","raceth/hci","Race / ethnicity code" 48 | "Table.Column","race_eth_name",,"str","label for race_eth_code","Race / ethnicity name" 49 | "Table.Column","numerator",,"int","count","Adults who are registered to vote, or who voted, depending on type of record" 50 | "Table.Column","denominator",,"int","count","Population of Adults, 18 years or older" 51 | "Table.Column","percent",,"float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record" 52 | "Table.Column","ll_95ci",,"float","ci95l for percent","Lower bound of 95% confidence interval" 53 | "Table.Column","ul_95ci",,"float","ci95u for percent","Upper bound of 95% confidence interval" 54 | "Table.Column","se",,"float","se for percent","Standard error" 55 | "Table.Column","rse",,"float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent" 56 | "Table.Column","ca_decile",,"float","decile","Statewide decile ranking" 57 | "Table.Column","ca_rr",,"float","ratio","Ratio of indicator to state average" 58 | "Table.Column","vap",,"float","measure","Voter age population, from CA Department of Finance." 
59 | "Table.Column","ind_id",,"str","dimension", 60 | "Table.Column","ind_definition",,"str","dimension", 61 | "Table.Column","version",,"str","other", 62 | -------------------------------------------------------------------------------- /metatab/test/test-data/example1-headers.csv: -------------------------------------------------------------------------------- 1 | "Declare","metatab-latest",,,, 2 | "Title","Registered Voters, By County",,,, 3 | "Name","cdph.ca.gov-hci-registered_voters-county",,,, 4 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,, 5 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,, 6 | "Version",201404,,,, 7 | "Obsoletes","cdph.ca.gov-hci-registered_voters-county-201304",,,, 8 | "Dataset","voters",,,, 9 | "Origin","example.com",,,, 10 | "Space","Ca",,,, 11 | "Time","2002-2014",,,, 12 | "Grain","County",,,, 13 | "Format","excel",,,, 14 | ,,,,, 15 | ,,,,, 16 | ,,,,, 17 | "Section","Resources",,,, 18 | "Header","url","name","schema","Grain","Title" 19 | "Datafile","http://example.com/example1.csv","example1","registered_voters","County","The First Example Data File" 20 | "Datafile","http://example.com/example2.csv","example2","registered_voters","Tract","The Second Example Data File" 21 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx",,"Healthy Communities Data and Indicators Project (HCI)",, 22 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf",,"Indicator Documentation for Voter Registration / Participation",, 23 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,, 24 | ,,,,, 25 | "Section ","Contacts",,"email",, 26 | "Creator","Office of Health Equity",,"HCIOHE@cdph.ca.gov",, 27 | "Wrangler","Eric Busboom",,"eric@civicknowledge.com",, 28 | ,,,,, 29 
| "Section ","Notes",,,, 30 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. ",,,, 31 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,,, 32 | ".Title","Data Bundles Packaging Specification",,,, 33 | ,,,,, 34 | "Section","Schema",,"datatype","valuetype","description" 35 | "Table","registered_voters",,,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters" 36 | "Table.Column","reportyear",,"int","year range","Year or years that indicator was reported" 37 | "Table.Column","type",,"str","dimension","Type of record" 38 | "Table.Column","gvid",,"str","gvid","GVid version of the geotype and geotypeval" 39 | "Table.Column","geoname",,"str","label for gvid","Census name of geographic area" 40 | "Table.Column","geotype",,"str","label","Code for type of geographic area" 41 | "Table.Column","geotypevalue",,"str","census","Census geoid code" 42 | "Table.Column","county_fips",,"str","FIPS county code","County FIPS code" 43 | "Table.Column","county_name",,"str","label for counrty_fips","County name" 44 | "Table.Column","region_code",,"str","census code","Numeric code of region" 45 | "Table.Column","region_name",,"str","label for region_code","Name of region" 46 | "Table.Column","raceth",,"str","raceth/civick","Civic Knowledge race / ethnicity code." 
# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# Revised BSD License, included in this distribution as LICENSE

"""Application URL class for Metatab documents (URLs with the ``metatab`` proto)."""

from metatab import DEFAULT_METATAB_FILE
from os.path import basename, join
from rowgenerators import Url
from rowgenerators.appurl.file.file import InnerFile
from rowgenerators.appurl.util import file_ext
from rowgenerators.appurl.web.web import WebUrl


class MetatabUrl(InnerFile, Url):
    """URL that refers to a Metatab document or package.

    The URL path may name a single-file document (``csv``, ``txt``,
    ``ipynb``), an Excel package (``xlsx``), a ZIP package (``zip``), or a
    directory. A path with none of those extensions gets
    ``DEFAULT_METATAB_FILE`` appended in ``__init__``.
    """

    # One less than WebUrl's priority.
    # NOTE(review): presumably this makes the matcher try this class before
    # WebUrl -- confirm against the rowgenerators URL matching rules.
    match_priority = WebUrl.match_priority - 1

    # Extensions for which the resource file is itself the metatab document.
    simple_file_formats = ('csv', 'txt', 'ipynb')

    def __init__(self, url=None, downloader=None, **kwargs):
        """Build the URL, forcing ``proto='metatab'``.

        :param url: URL string or object, parsed with ``Url(url, **kwargs)``.
        :param downloader: Required; passed on to the superclass constructor.
        :param kwargs: Extra keyword arguments passed to ``Url`` and to the
            superclass constructor.
        :raises AssertionError: if ``downloader`` is falsy.
        """
        kwargs['proto'] = 'metatab'

        u = Url(url, **kwargs)

        assert downloader, "MetatabUrl requires a downloader"

        # If there is no file with an extension in the path, assume that this
        # is a filesystem package, and that the path should have DEFAULT_METATAB_FILE
        if file_ext(basename(u.path)) not in ('zip', 'xlsx') + self.simple_file_formats:
            u.path = join(u.path, DEFAULT_METATAB_FILE)

        super().__init__(str(u), downloader=downloader, **kwargs)

        self.scheme_extension = 'metatab'

        if basename(self.path) == DEFAULT_METATAB_FILE:
            frag = ''
        elif self.resource_format == 'xlsx':
            # Excel packages: the fragment names the 'meta' entry.
            frag = 'meta'
        elif self.resource_format == 'zip':
            # ZIP packages: the fragment names the metadata file in the archive.
            frag = DEFAULT_METATAB_FILE
        else:
            # Simple single-file formats need no fragment. This branch is also
            # the fallback, so ``frag`` is always bound.
            frag = ''

        # Two-element fragment; the second element is always left unset here.
        self.fragment = [frag, None]

    @classmethod
    def _match(cls, url, **kwargs):
        """Return True when ``url.proto`` is ``'metatab'``."""
        return url.proto == 'metatab'

    @property
    def resource_format(self):
        """File extension of the last component of ``self.path``.

        :raises AssertionError: if the path has no extension.
        """
        resource_format = file_ext(basename(self.path))

        # Should have either a definite file, or have added one in __init__
        assert resource_format, self.path

        return resource_format

    @property
    def resource_file(self):
        """Base name of ``self.resource_url``.

        :raises AssertionError: if the resource URL has an empty base name.
        """
        name = basename(self.resource_url)

        assert name

        return name

    @property
    def target_file(self):
        """Name of the metadata target inside the resource.

        Returns ``DEFAULT_METATAB_FILE`` when the path ends with it or the
        resource is a ZIP package, ``'meta'`` for Excel packages, and the
        resource file name otherwise.
        """
        if self.path.endswith(DEFAULT_METATAB_FILE):
            return DEFAULT_METATAB_FILE

        fmt = self.resource_format

        if fmt in self.simple_file_formats:
            return self.resource_file
        if fmt == 'xlsx':
            return 'meta'
        if fmt == 'zip':
            # Use the shared constant, as __init__ does for the ZIP fragment,
            # rather than a hard-coded 'metadata.csv'.
            return DEFAULT_METATAB_FILE

        return self.resource_file

    @property
    def target_format(self):
        """Format of the metadata target.

        Simple formats and ``xlsx`` report their own extension; everything
        else, including ZIP packages, reports ``'csv'``.
        """
        fmt = self.resource_format

        if fmt in self.simple_file_formats or fmt == 'xlsx':
            return fmt

        return 'csv'

    @property
    def doc(self):
        """Return the metatab document for the URL"""
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import with the
        # ``metatab`` package -- confirm.
        from metatab import MetatabDoc

        t = self.get_resource().get_target()
        return MetatabDoc(t.inner)

    @property
    def generator(self):
        """Return ``get_generator()`` applied to the resolved target URL."""
        from rowgenerators import get_generator

        # HACK: this used to be
        #   target = self.get_resource().get_target().inner
        target = self.get_resource().get_target()

        return get_generator(target)

    def get_resource(self):
        """Return a ``MetatabUrl`` for the resource.

        For the ``file`` scheme the URL is re-wrapped as is. Any other scheme
        is first resolved through ``WebUrl(...).get_resource()``.
        """
        if self.scheme == 'file':
            u = self
        else:
            u = WebUrl(str(self), downloader=self._downloader).get_resource()

        return MetatabUrl(str(u), downloader=self._downloader)

    def get_target(self):
        """Return a ``MetatabUrl`` wrapping ``self.inner.get_target()``."""
        return MetatabUrl(str(self.inner.get_target()), downloader=self._downloader)

    def join_target(self, tf):
        """Join ``tf`` to this URL and return the result from the inner URL.

        When the target is ``DEFAULT_METATAB_FILE`` the join goes through
        ``self.inner.join_dir``; otherwise it goes through
        ``self.inner.join_target``.
        """
        if self.target_file == DEFAULT_METATAB_FILE:
            return self.inner.join_dir(tf)

        return self.inner.join_target(tf)

    def exists(self):
        """Return ``self.inner.exists()``."""
        return self.inner.exists()
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# This is the Title of the Notebook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "And this is the description" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from metatab.jupyter.script import get_ipython\n", 26 | "import pandas as pd\n", 27 | "from os.path import exists\n", 28 | "from os import remove" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "This is a Bash cell\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "%%bash\n", 46 | "echo \"This is a Bash cell\"\n", 47 | "touch /tmp/footouched" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "assert exists('/tmp/footouched')\n", 59 | "remove('/tmp/footouched')\n", 60 | "assert not exists('/tmp/footouched')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "MagicsTest.ipynb\r\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "!ls" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "%%metatab -p . 
\n", 89 | "Origin: example.com\n", 90 | "Dataset: foobar.com \n", 91 | "Identifier: de097279-28ef-42f5-a4f5-0eaac53b7dc4\n", 92 | "Name: example.com-foobar.com" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "assert mt_pkg.find_first_value('Root.Name') == 'example.com-foobar.com'" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 1, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "20" 115 | ] 116 | }, 117 | "execution_count": 1, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "foo = 10\n", 124 | "bar = 20\n", 125 | "print(bar)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "print(\"DIsplaying Locals\")\n", 137 | "print(locals())" 138 | ] 139 | } 140 | ], 141 | "metadata": { 142 | "celltoolbar": "Tags", 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.6.1" 159 | }, 160 | "varInspector": { 161 | "cols": { 162 | "lenName": 16, 163 | "lenType": 16, 164 | "lenVar": 40 165 | }, 166 | "kernels_config": { 167 | "python": { 168 | "delete_cmd_postfix": "", 169 | "delete_cmd_prefix": "del ", 170 | "library": "var_list.py", 171 | "varRefreshCmd": "print(var_dic_list())" 172 | }, 173 | "r": { 174 | "delete_cmd_postfix": ") ", 175 | "delete_cmd_prefix": "rm(", 176 | "library": "var_list.r", 177 | "varRefreshCmd": "cat(var_dic_list()) " 178 | } 179 | 
}, 180 | "types_to_exclude": [ 181 | "module", 182 | "function", 183 | "builtin_function_or_method", 184 | "instance", 185 | "_Feature" 186 | ], 187 | "window_display": false 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | -------------------------------------------------------------------------------- /metatab/test/test-data/scripts/complex-text.txt: -------------------------------------------------------------------------------- 1 | Identifier: 47bc1089-7584-41f0-b804-602ec42f1249 2 | Origin: civicknowledge.com 3 | Dataset: rcfe_affordability 4 | Version: 4 5 | Time: 2015 6 | Name: civicknowledge.com-rcfe_affordability-2015-4 7 | 8 | Section: Contacts 9 | Wrangler: Eric Busboom 10 | Wrangler.Email: eric@civicknowledge.com 11 | Wrangler.Organization: Civic Knowledge 12 | 13 | Section: References 14 | 15 | Reference: censusreporter:B09020/140/05000US06073 16 | Reference.Name: B09020 17 | Reference.Description: Relationship by Household Type (Including Living Alone) for Population 65 Years and Over 18 | 19 | Reference: censusreporter:B25007/140/05000US06073 20 | Reference.Name: B25007 21 | Reference.Description: Tenure by Age of Householder 22 | 23 | # 24 | # Household Income 25 | # 26 | 27 | Reference: censusreporter:B19049/140/05000US06073 28 | Reference.Name: B19049 29 | Reference.Description: Median Household Income by Age of Householder 30 | 31 | 32 | # For whole county 33 | 34 | Reference: censusreporter:B19049/050/05000US06073 35 | Reference.Name: B19049_county 36 | Reference.Description: Median Household Income by Age of Householder 37 | 38 | # 39 | # Home value distributions, by tract 40 | # 41 | 42 | Reference: censusreporter:B25076/140/05000US06073 43 | Reference.Name: B25076 44 | Reference.Description: Lower Value Quartile (Dollars) 45 | 46 | Reference: censusreporter:B25077/140/05000US06073 47 | Reference.Name: B25077 48 | Reference.Description: Median Value 49 | 50 | Reference: censusreporter:B25078/140/05000US06073 51 | 
Reference.Name: B25078 52 | Reference.Description: Upper Value Quartile (Dollars) 53 | 54 | # 55 | # Home value distributions, for SD County 56 | # 57 | Reference: censusreporter:B25076/050/05000US06073 58 | Reference.Name: B25076_county 59 | Reference.Description: Lower Value Quartile (Dollars) 60 | 61 | Reference: censusreporter:B25077/050/05000US06073 62 | Reference.Name: B25077_county 63 | Reference.Description: Median Value 64 | 65 | Reference: censusreporter:B25078/050/05000US06073 66 | Reference.Name: B25078_county 67 | Reference.Description: Upper Value Quartile (Dollars) 68 | 69 | # 70 | # Tract crosswalk 71 | # 72 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tract-sra-msa-xwalk 73 | Reference.Name: tracts 74 | Reference.Description: Crosswalk between crosswalks, tracts, zip codes and SRAs in San Diego County 75 | 76 | # 77 | # Tract boundaries 78 | # 79 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tracts 80 | Reference.Name: tracts_geo 81 | Reference.Description: Geographics Boundaries for Tracts 82 | 83 | # 84 | # SRA boundaries 85 | # 86 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#sra 87 | Reference.Name: sra_geo 88 | Reference.Description: Geographics Boundaries for SRAs 89 | 90 | # 91 | # IPUMS Housing and Income Data 92 | # 93 | # Need to use the ZIP version b/c we need to import the Python Code 94 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval 95 | Reference.Name: incv 96 | Reference.Description: Income and Home value records from IPUMS for San Diego County 97 | Section: Resources 98 | 99 | 100 | Section: Bibliography 101 | Citation: ipums 102 | Citation.Type: dataset 103 | Citation.Author: Steven Ruggles; Katie Genadek; Ronald Goeken; Josiah Grover; Matthew Sobek 104 | Citation.Title: 
Integrated Public Use Microdata Series 105 | Citation.Year: 2017 106 | Citation.Publisher: University of Minnesota 107 | Citation.Version: 7.0 108 | Citation.AccessDate: 20170718 109 | Citation.Url: https://usa.ipums.org/usa/index.shtml 110 | Citation.Doi: https://doi.org/10.18128/D010.V7.0 111 | 112 | Citation: bordley 113 | Citation.Type: article 114 | Citation.Author: Robert F. Bordley; James B. McDonald; Anand Mantrala 115 | Citation.Title: Something New, Something Old: Parametric Models for the Size of Distribution of Income 116 | Citation.Year: 1997 117 | Citation.Month: June 118 | Citation.Journal: Journal of Income Distribution 119 | Citation.Volume: 6 120 | Citation.Number: 1 121 | Citation.Pages: 5-5 122 | Citation.Url: https://ideas.repec.org/a/jid/journl/y1997v06i1p5-5.html 123 | 124 | Citation: mcdonald 125 | Citation.Type: article 126 | Citation.Author: McDonald, James B.; Mantrala, Anand 127 | Citation.Title: The distribution of personal income: Revisited 128 | Citation.Journal: Journal of Applied Econometrics 129 | Citation.Volume: 10 130 | Citation.Number: 2 131 | Citation.Publisher: Wiley Subscription Services, Inc., A Wiley Company 132 | Citation.Issn: 1099-1255 133 | Citation.Doi: 10.1002/jae.3950100208 134 | Citation.Pages: 201--204, 135 | Citation.Year: 1995 136 | 137 | Citation: majumder 138 | Citation.Type: article 139 | Citation.Author: Majumder, Amita; Chakravarty, Satya Ranjan 140 | Citation.Title: Distribution of personal income: Development of a new model and its application to U.S. 
income data 141 | Citation.Journal: Journal of Applied Econometrics 142 | Citation.Volume: 5 143 | Citation.Number: 2 144 | Citation.Publisher: Wiley Subscription Services, Inc., A Wiley Company 145 | Citation.Issn: 1099-1255 146 | Citation.Doi: 10.1002/jae.3950050206 147 | Citation.Pages: 189--196 148 | Citation.Year: 1990 -------------------------------------------------------------------------------- /docs/PrivateDatasets.rst: -------------------------------------------------------------------------------- 1 | 2 | Private Datasets 3 | ================ 4 | 5 | Datasets that should be protected from unauthorized access can be written to S3 with a private ACL and accessed using S3 credentials. To use private datasets: 6 | 7 | - Use the **metaaws** program to set up an S3 bucket with a policy and users 8 | - Add a ``Root.Access`` term to the dataset's metatab document. 9 | - Synchronize the dataset to s3 with **metasync** 10 | - Set up credentials for an S3 user 11 | - Access the dataset using an S3 url. 12 | 13 | Setup The S3 Bucket 14 | ------------------- 15 | 16 | Suppose we want to store datasets in a bucket ``bucket.example.com``. After creating the bucket, initialize it with subdirectories and policies with the **metaaws** program. 17 | 18 | .. code-block:: bash 19 | 20 | $ metaaws init-bucket bucket.example.com 21 | 22 | 23 | 24 | Configure and Sync a Dataset 25 | ---------------------------- 26 | 27 | To make a dataset private, add a ``Root.Access`` term to the ``Root`` section, with a value of ``private``. 28 | 29 | 30 | 31 | Create S3 Users 32 | --------------- 33 | 34 | Use the **metaaws** program to create users and add permissions to the bucket. First, initialize a bucket with the appropriate policies: 35 | 36 | .. code-block:: bash 37 | 38 | $ metaaws init-bucket bucket.example.com 39 | 40 | Then, create a new user. 41 | 42 | ..
code-block:: bash 43 | 44 | $ metaaws new-user foobar 45 | Created user : foobar 46 | arn : arn:aws:iam::095555823111:user/metatab/foobar 47 | Access Key : AKIAJXMFAP3X5TRYYQ5Q 48 | Secret Key : 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 49 | 50 | The secret key and access key should be given to the user, to set up as described in the next 51 | section. 52 | 53 | Setup S3 Credentials 54 | -------------------- 55 | 56 | The access and secret keys should be stored in a boto configuration file, such as ``~/.aws/credentials``. See 57 | the `boto3 configuration documentation <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html>`_ for details. Here is an example of a ``credentials`` file: 58 | 59 | .. code-block:: 60 | 61 | [default] 62 | aws_access_key_id = AKIAJXMFAP3X5TRYYQ5Q 63 | aws_secret_access_key = b81zw4LRDKVILzrZbS0B8KMn88xbY9BEEnwzKrz2 64 | 65 | 66 | If you have multiple credentials, you can put them in different sections by changing ``[default]`` to the name of another profile. For instance, here is a credentials file with a default and alternate profile: 67 | 68 | .. code-block:: 69 | 70 | [default] 71 | aws_access_key_id = AKIAJXMFAP3X5TRYYQ5Q 72 | aws_secret_access_key = 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 73 | [fooprofile] 74 | aws_access_key_id = AKIAX5TRYYQ5QJXMFAP3 75 | aws_secret_access_key = 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 76 | 77 | To use the alternate credentials with the ``metasync`` program, use the ``-p`` option: 78 | 79 | .. code-block:: bash 80 | 81 | $ metasync -p fooprofile -S library.metatab.org 82 | 83 | To use the alternate credentials with the ``open_package()`` function, you will need to set them in the shell before you run any programs. The ``metasync -C`` program will display the credentials in a form that can be shell eval'd, and the ``-p`` option can select an alternate profile. 84 | 85 | ..
code-block:: bash 86 | 87 | $ metasync -C -p fooprofile 88 | export AWS_ACCESS_KEY_ID=AKIAX5TRYYQ5QJXMFAP3 89 | export AWS_SECRET_ACCESS_KEY=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 90 | # Run 'eval $(metasync -C -p fooprofile )' to configure credentials in a shell 91 | 92 | The last line of the output shows the command to run to set the credentials in the shell: 93 | 94 | .. code-block:: bash 95 | 96 | $ eval $(metasync -C -p fooprofile ) 97 | 98 | Setting credentials in the shell is only required if you access the private dataset via ``open_package()``, although it should also work when using the ``metasync`` and ``metapack`` programs. 99 | 100 | Using Private Files 101 | ------------------- 102 | 103 | Private files can't be easily downloaded using a web browser, but there are a few other ways to fetch them. 104 | 105 | * Use an S3 client, such as CyberDuck, S3 Browser, CloudBerry or S3 Tools. 106 | * Use the ``metapack`` program to dump a CSV file. 107 | 108 | To use the ``metapack`` program, first list the resources in the remote package: 109 | 110 | .. code-block:: bash 111 | 112 | $ metapack -r s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1.csv 113 | seniors s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/seniors.csv 114 | rcfe_tract s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/rcfe_tract.csv 115 | rcfe_sra s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/rcfe_sra.csv 116 | rcfe_seniors_tract s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/rcfe_seniors_tract.csv 117 | 118 | Then, run the same command again, but appending a fragment to the url, and redirecting to a csv file. For instance, for the 'seniors' file, append ``#seniors`` to the url: 119 | 120 | 121 | ..
code-block:: bash 122 | 123 | $ metapack -r s3://.../civicknowledge.com-rcfe_health-1.csv#seniors > seniors.csv 124 | 125 | You can also fetch the entire data package, downloading all of the data files, by creating a local file system, zip or excel package. The easiest to use is the Filesystem package, created with ``metapack -f`` 126 | 127 | .. code-block:: bash 128 | 129 | $ metapack -f s3://.../civicknowledge.com-rcfe_health-1.csv 130 | 131 | The command will create a complete data package with unpacked CSV files in the ``_packages`` subdirectory. 132 | 133 | 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /metatab/test/test-data/example1.txt: -------------------------------------------------------------------------------- 1 | Declare: metatab-latest 2 | Title: Registered Voters, By County 3 | Name: cdph.ca.gov-hci-registered_voters-county 4 | Description: Percent of the eligible population registered to vote and the percent who voted in statewide elections. 
5 | Identifier: cdph.ca.gov-hci-registered_voters-county 6 | Version: 201404 7 | Obsoletes: cdph.ca.gov-hci-registered_voters-county-201304 8 | Format: excel 9 | Spatial: California <04000US06> 10 | Time: 2002-2014 11 | Spatialgrain: County <05000US> 12 | Section: Resources 13 | Datafile: http://example.com/example1.csv 14 | Datafile.Name: example1 15 | Datafile.Schema: registered_voters 16 | Datafile.Grain: County 17 | Datafile.Title: The First Example Data File 18 | Datafile: http://example.com/example2.csv 19 | Datafile.Name: example2 20 | Datafile.Schema: registered_voters 21 | Datafile.Grain: Tract 22 | Datafile.Title: The Second Example Data File 23 | Homepage: https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx 24 | Homepage.Schema: Healthy Communities Data and Indicators Project (HCI) 25 | Documentation: https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf 26 | Documentation.Schema: Indicator Documentation for Voter Registration / Participation 27 | Documentation.Description: Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections 28 | Section: Contacts 29 | Creator: Office of Health Equity 30 | Creator.Email: HCIOHE@cdph.ca.gov 31 | Wrangler: Eric Busboom 32 | Wrangler.Email: eric@civicknowledge.com 33 | Section: Notes 34 | Note: This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. 
35 | Documentation: https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit# 36 | Documentation.Title: Data Bundles Packaging Specification 37 | Section: Schema 38 | Table: registered_voters 39 | Table.Description: HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters 40 | Table.Column: reportyear 41 | Column.Datatype: int 42 | Column.Valuetype: year range 43 | Column.Description: Year or years that indicator was reported 44 | Table.Column: type 45 | Column.Datatype: str 46 | Column.Valuetype: dimension 47 | Column.Description: Type of record 48 | Table.Column: gvid 49 | Column.Datatype: str 50 | Column.Valuetype: gvid 51 | Column.Description: GVid version of the geotype and geotypeval 52 | Table.Column: geoname 53 | Column.Datatype: str 54 | Column.Valuetype: label for gvid 55 | Column.Description: Census name of geographic area 56 | Table.Column: geotype 57 | Column.Datatype: str 58 | Column.Valuetype: label 59 | Column.Description: Code for type of geographic area 60 | Table.Column: geotypevalue 61 | Column.Datatype: str 62 | Column.Valuetype: census 63 | Column.Description: Census geoid code 64 | Table.Column: county_fips 65 | Column.Datatype: str 66 | Column.Valuetype: FIPS county code 67 | Column.Description: County FIPS code 68 | Table.Column: county_name 69 | Column.Datatype: str 70 | Column.Valuetype: label for counrty_fips 71 | Column.Description: County name 72 | Table.Column: region_code 73 | Column.Datatype: str 74 | Column.Valuetype: census code 75 | Column.Description: Numeric code of region 76 | Table.Column: region_name 77 | Column.Datatype: str 78 | Column.Valuetype: label for region_code 79 | Column.Description: Name of region 80 | Table.Column: raceth 81 | Column.Datatype: str 82 | Column.Valuetype: raceth/civick 83 | Column.Description: Civic Knowledge race / ethnicity code. 
84 | Table.Column: raceth_name 85 | Column.Datatype: str 86 | Column.Valuetype: label for raceeth 87 | Column.Description: Race / Ethnicity Name 88 | Table.Column: race_eth_code 89 | Column.Datatype: str 90 | Column.Valuetype: raceth/hci 91 | Column.Description: Race / ethnicity code 92 | Table.Column: race_eth_name 93 | Column.Datatype: str 94 | Column.Valuetype: label for race_eth_code 95 | Column.Description: Race / ethnicity name 96 | Table.Column: numerator 97 | Column.Datatype: int 98 | Column.Valuetype: count 99 | Column.Description: Adults who are registered to vote, or who voted, depending on type of record 100 | Table.Column: denominator 101 | Column.Datatype: int 102 | Column.Valuetype: count 103 | Column.Description: Population of Adults, 18 years or older 104 | Table.Column: percent 105 | Column.Datatype: float 106 | Column.Valuetype: percent of numerator over denominator 107 | Column.Description: Percent of adults who are registered to vote, or who voted, depending on type of record 108 | Table.Column: ll_95ci 109 | Column.Datatype: float 110 | Column.Valuetype: ci95l for percent 111 | Column.Description: Lower bound of 95% confidence interval 112 | Table.Column: ul_95ci 113 | Column.Datatype: float 114 | Column.Valuetype: ci95u for percent 115 | Column.Description: Upper bound of 95% confidence interval 116 | Table.Column: se 117 | Column.Datatype: float 118 | Column.Valuetype: se for percent 119 | Column.Description: Standard error 120 | Table.Column: rse 121 | Column.Datatype: float 122 | Column.Valuetype: rse for percent 123 | Column.Description: Relative standard error (se/percent * 100) expressed as a percent 124 | Table.Column: ca_decile 125 | Column.Datatype: float 126 | Column.Valuetype: decile 127 | Column.Description: Statewide decile ranking 128 | Table.Column: ca_rr 129 | Column.Datatype: float 130 | Column.Valuetype: ratio 131 | Column.Description: Ratio of indicator to state average 132 | Table.Column: vap 133 | Column.Datatype: float 
class YamlMetatabSource(Source):
    """Turn a metatab-formatted YAML file into Metatab rows.

    The YAML document is loaded into plain dicts/lists, walked with
    :meth:`yield_dict` to produce ``(term_name, value, parent)`` tuples, and
    those tuples are replayed into a fresh ``MetatabDoc`` whose ``rows`` are
    then yielded.
    """

    def __init__(self, ref, table=None, cache=None, working_dir=None, env=None, **kwargs):
        """Store the reference to the YAML file.

        :param ref: URL-like object for the YAML file; ``ref.fspath`` is
            opened when the source is iterated.
        :param table: accepted for signature compatibility; not used here.
        :param cache: passed through to ``Source.__init__``.
        :param working_dir: passed through to ``Source.__init__``.
        :param env: accepted for signature compatibility; not used here.
        """
        super().__init__(ref, cache, working_dir, **kwargs)

        self.url = ref
        self.section_map = {}  # lower-cased term name -> section name
        self.sections = {}  # section name -> section object of the built doc

    def yield_dict(self, doc, d, parent=None):
        """Walk a nested dict and yield ``(term_name, value, parent)`` tuples.

        The term name is ``<last component of parent or 'Root'>.<key>``,
        lower-cased. For a list of dicts, the entry stored under the term's
        declared ``termvaluename`` is removed from the dict and emitted as the
        term's own value before the remaining keys are emitted as children.

        :param doc: document whose ``decl_terms`` mapping describes the terms.
        :param d: mapping to walk. Dicts found inside lists are modified in
            place (their value-name key is removed).
        :param parent: term name of the enclosing term, or ``None`` at the top.
        """

        for k, v in d.items():

            tn = "{}.{}".format((parent or 'Root').split('.')[-1], k).lower()
            t = doc.decl_terms.get(tn, {})
            vtn = t.get('termvaluename', '').lower()

            if isinstance(v, list):
                for e in v:
                    if not isinstance(e, dict):
                        # A plain scalar in a list is one occurrence of the
                        # term; indexing it by name used to raise TypeError.
                        yield (tn, e, parent)
                        continue

                    if vtn in e:
                        # Pop before yielding so the value is not emitted a
                        # second time as a child property below.
                        yield (tn, e.pop(vtn), parent)

                    yield from self.yield_dict(doc, e, tn)
            elif isinstance(v, dict):
                # NOTE(review): no tuple is emitted for the enclosing term
                # itself in this branch, only for its children -- confirm this
                # is intended, since __iter__ looks the parent term up by name.
                yield from self.yield_dict(doc, v, tn)
            else:
                yield (tn, v, parent)

    def __iter__(self):
        """Iterate over all of the Metatab rows generated from the YAML file."""

        import yaml
        from metatab import MetatabDoc

        with open(self.url.fspath) as f:
            # safe_load: the file is plain data, and yaml.load() without an
            # explicit Loader can construct arbitrary objects (and is a
            # TypeError on PyYAML >= 6). An empty file loads as None.
            d = yaml.safe_load(f) or {}

        decl = d.get('declare', 'metatab-latest')

        doc = MetatabDoc(decl=decl)

        section_names = ['root', 'contacts', 'documentation', 'resources', 'references', 'schema']

        for section_name in section_names:
            section = doc.decl_sections[section_name]

            for tn in section.get('terms', []):
                self.section_map[tn.lower()] = section_name

            self.sections[section_name] = doc.get_or_new_section(section_name, section['args'])

        # Most recently created term for each term name, so children can be
        # attached to the term that precedes them.
        last_term = {}
        for term_name, value, parent in self.yield_dict(doc, d):

            section = self.sections.get(self.section_map.get(term_name) or 'root')

            if parent is None:
                term = section.new_term(term_name, value)
            else:
                parent_term = last_term[parent]
                term = parent_term.new_child(term_name, value)

            last_term[term_name] = term

        yield from doc.rows
120 | This is the core of the Lines format implementation""" 121 | 122 | def __init__(self, ref, cache=None, working_dir=None, path = None, **kwargs): 123 | super().__init__(ref, cache, working_dir, path, **kwargs) 124 | 125 | while True: 126 | 127 | try: 128 | # Pathlib Path 129 | with ref.open() as r: 130 | text = r.read() 131 | break 132 | except: 133 | pass 134 | 135 | try: 136 | # Filehandle 137 | text = ref.read() 138 | break 139 | except: 140 | pass 141 | 142 | try: 143 | # Url 144 | with ref.inner.fspath.open() as f: 145 | text = f.read() 146 | break 147 | except: 148 | 149 | pass 150 | 151 | try: 152 | # File name 153 | with open(ref) as r: 154 | text = r.read() 155 | break 156 | except: 157 | pass 158 | 159 | try: 160 | text = ref 161 | text.splitlines() 162 | break 163 | except AttributeError: 164 | pass 165 | 166 | 167 | raise SourceError("Can't handle ref of type {}".format(type(ref))) 168 | 169 | self._text = text 170 | self._text_lines = text.splitlines() 171 | self._path = path or '' 172 | 173 | @property 174 | def path(self): 175 | return self._path 176 | 177 | def open(self): 178 | pass 179 | 180 | def close(self): 181 | pass 182 | 183 | def __iter__(self): 184 | import re 185 | 186 | for row in self._text_lines: 187 | if re.match(r'^\s*#', row): # Skip comments 188 | continue 189 | 190 | # Special handling for ====, which implies a section: 191 | # ==== Schema 192 | # is also 193 | # Section: Schema 194 | 195 | if row.startswith('===='): 196 | row = re.sub(r'^=*','Section:', row) 197 | 198 | row = [e.strip() for e in row.split(':', 1)] 199 | 200 | # Pipe characters seperate columns 201 | if len(row) > 1: 202 | row = [row[0]] + [ e.replace('\|','|') for e in re.split(r'(?/') 111 | 112 | The result should be the same documentation, but with different URLs. 
def declaration_path(name):
    """Locate a declaration file shipped with the ``metatabdecl`` package.

    The name is tried as given first, then with a ``.csv`` suffix appended.

    :param name: file name of the declaration, with or without ``.csv``.
    :return: path of the first candidate that exists.
    :raises IncludeError: when neither candidate exists.
    """
    import os.path

    import metatabdecl
    from metatab.exc import IncludeError

    decl_dir = os.path.dirname(metatabdecl.__file__)

    candidate = os.path.join(decl_dir, name)
    if os.path.exists(candidate):
        return candidate

    candidate = os.path.join(decl_dir, name + '.csv')
    if os.path.exists(candidate):
        return candidate

    raise IncludeError("No local declaration file for name '{}' ".format(name))
def flatten(d, sep='.'):
    """Flatten a nested data structure into a tuple of ``(key_path, value)`` pairs.

    Mutable mappings contribute their keys and mutable sequences contribute
    their indexes (as strings) to the key path; path components are joined
    with ``sep``. Anything else is a leaf value. A bare leaf passed as ``d``
    yields a single pair with an empty key path, and empty containers
    contribute nothing.

    :param d: the structure to flatten.
    :param sep: separator placed between key path components.
    :return: tuple of ``(key_path, value)`` tuples, in iteration order.
    """
    # The ABCs live in collections.abc; the aliases in the top-level
    # ``collections`` module were removed in Python 3.10.
    from collections.abc import MutableMapping, MutableSequence

    def _flatten(e, parent_key='', sep='.'):
        prefix = parent_key + sep if parent_key else ''

        if isinstance(e, MutableMapping):
            return tuple((prefix + k2, v2) for k, v in e.items() for k2, v2 in _flatten(v, k, sep))
        elif isinstance(e, MutableSequence):
            return tuple((prefix + k2, v2) for i, v in enumerate(e) for k2, v2 in _flatten(v, str(i), sep))
        else:
            # Leaves are wrapped in a 1-tuple so every level can unpack
            # ``(key, value)`` uniformly; the wrapper is removed below.
            return (parent_key, (e,)),

    return tuple((k, v[0]) for k, v in _flatten(d, '', sep))
def walk_up(bottom):
    """Mimic ``os.walk``, but climb towards the filesystem root.

    Starting at the real path of ``bottom``, yield one tuple per directory
    and then move to its parent, stopping after the directory whose parent
    resolves to itself.

    :param bottom: directory to start from.
    :return: generator of ``(directory, dirnames, other_names)`` tuples.
    """
    import os

    current = os.path.realpath(bottom)

    while True:
        # Any error from listing the directory propagates to the caller.
        entries = os.listdir(current)

        dirs = []
        nondirs = []
        for entry in entries:
            bucket = dirs if os.path.isdir(os.path.join(current, entry)) else nondirs
            bucket.append(entry)

        yield current, dirs, nondirs

        parent = os.path.realpath(os.path.join(current, '..'))

        # At the top, the parent of the directory is the directory itself.
        if parent == current:
            return

        current = parent
def import_name_or_class(name):
    """Resolve a fully qualified, dotted name to the object it names.

    Anything that is not a string is assumed to already be the wanted object
    and is returned unchanged.

    :param name: dotted name such as ``'package.module.attr'``, a bare module
        name such as ``'json'``, or an already-imported object.
    :return: the module, class, function or other object the name refers to.
    :raises ImportError: when the module part of the name cannot be imported.
    :raises AttributeError: when the module does not have the final component
        and the full name is not an importable module either.
    """
    if not isinstance(name, str):
        return name  # Assume it is already the thing we want to import

    import importlib

    # for "a.b.c.d" -> ('a.b.c', '.', 'd'); for "a" -> ('', '', 'a')
    module_name, _, object_name = name.rpartition('.')

    if not module_name:
        # A bare module name has nothing to descend into.
        return importlib.import_module(name)

    mod = importlib.import_module(module_name)

    try:
        return getattr(mod, object_name)
    except AttributeError:
        # The last component may be a submodule that is not loaded yet, in
        # which case it is not an attribute of its package until imported.
        try:
            return importlib.import_module(name)
        except ImportError:
            pass
        raise  # re-raise the AttributeError callers have always seen
def metatab():
    """Entry point of the ``metatab`` command line program.

    Parses ``sys.argv``, then either creates a new Metatab file from a
    template (``-C``), dumps a declaration (``-d``), prints the first value of
    a term (``-f``), or converts the document to one of the output types
    (terms, JSON, YAML, line, CSV, pretty-printed dict). With ``-W`` the
    JSON/YAML/line/CSV output is written next to the source file instead of
    being printed. The function always ends the process through ``sys.exit``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser',
        epilog='Cache dir: {}\n'.format(str(cache.getsyspath('/'))))

    g = parser.add_mutually_exclusive_group()

    g.add_argument('-C', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_const', dest='out_type', const='terms',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-j', '--json', default=False, action='store_const', dest='out_type', const='json',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_const', dest='out_type', const='yaml',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-l', '--line', default=False, action='store_const', dest='out_type', const='line',
                   help='Parse a file and print out a Metatab Line representation')

    g.add_argument('-c', '--csv', default=False, action='store_const', dest='out_type', const='csv',
                   help='Parse a file and print out a Metatab CSV representation')

    # The option is spelled 'prety'; it is kept as is for compatibility.
    g.add_argument('-p', '--prety', default=False, action='store_const', dest='out_type', const='prety',
                   help='Pretty print the python Dict representation ')

    parser.add_argument('-W', '--write-in-place',
                        help='When outputting as yaml, json, csv or line, write the file instead of printing it, '
                             'to a file with same base name and appropriate extension ', action='store_true')

    # Parser-level default: with no output option given, emit CSV.
    parser.set_defaults(out_type='csv')

    parser.add_argument('-f', '--find-first',
                        help='Find and print the first value for a fully qualified term name')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE, help='Path to a Metatab file')

    cli_init()

    args = parser.parse_args(sys.argv[1:])

    # Specifying only a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    if args.create is not False:
        if new_metatab_file(args.file, args.create):
            prt("Created ", args.file)
        else:
            warn("File", args.file, 'already exists.')

        sys.exit(0)

    metadata_url = parse_app_url(args.file, proto='metatab')
    try:
        doc = MetatabDoc(metadata_url, cache=cache)
    except IOError as e:
        err("Failed to open '{}': {}".format(metadata_url, e))  # exits

    def write_or_print(t):
        """Print the text, or with -W write it beside the source file."""
        if not args.write_in_place:
            print(t)
            return

        # Only writing needs a local path; printing a remote document is fine.
        if metadata_url.scheme != 'file':
            err("Can only use -W with local files")  # exits

        ext = 'txt' if args.out_type == 'line' else args.out_type

        with metadata_url.fspath.with_suffix('.' + ext).open('w') as f:
            f.write(t)

    if args.show_declaration:

        decl_doc = MetatabDoc('', cache=cache, decl=metadata_url.path)

        d = {
            'terms': decl_doc.decl_terms,
            'sections': decl_doc.decl_sections
        }

        if args.out_type == 'json':
            print(json.dumps(d, indent=4))

        elif args.out_type == 'yaml':
            import yaml
            print(yaml.safe_dump(d, default_flow_style=False, indent=4))

        else:
            # Previously this case printed nothing at all.
            warn("Use -j or -y to select the output format for the declaration")

    elif args.find_first:

        t = doc.find_first(args.find_first)

        if t is None:
            err("Term '{}' not found in '{}'".format(args.find_first, metadata_url))  # exits

        print(t.value)

    elif args.out_type == 'terms':
        for t in doc._term_parser:
            print(t)

    elif args.out_type == 'json':
        write_or_print(json.dumps(doc.as_dict(), indent=4))

    elif args.out_type == 'yaml':
        import yaml
        from collections import OrderedDict

        def ordered_dump(data, stream=None, Dumper=yaml.Dumper, **kwds):
            """Dump YAML, emitting OrderedDict as a plain mapping in key order."""

            class OrderedDumper(Dumper):
                pass

            def _dict_representer(dumper, data):
                return dumper.represent_mapping(
                    yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
                    data.items())

            OrderedDumper.add_representer(OrderedDict, _dict_representer)
            return yaml.dump(data, stream, OrderedDumper, **kwds)

        write_or_print(ordered_dump(doc.as_dict(), default_flow_style=False, indent=4, Dumper=yaml.SafeDumper))

    elif args.out_type == 'line':
        write_or_print(doc.as_lines())

    elif args.out_type == 'csv':
        write_or_print(doc.as_csv())

    elif args.out_type == 'prety':
        from pprint import pprint
        pprint(doc.as_dict())

    sys.exit(0)
def get_table(doc, name):
    """Return the ``Root.Table`` term of ``doc`` whose value is ``name``.

    When no such term is found, report the failure through ``err`` together
    with the quoted names of the tables the document does have.

    :param doc: document providing ``find_first`` and ``find``.
    :param name: table name to look up.
    :return: the matching term.
    """
    table = doc.find_first('Root.Table', value=name)

    if table:
        return table

    quoted_names = []
    for term in doc.find('Root.Table'):
        quoted_names.append("'" + term.value + "'")

    listing = " ".join(quoted_names or [""])

    err("Did not find schema for table name '{}' Tables are: {}"
        .format(name, listing))

    return table
Participation", 33 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections", 34 | "url": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf" 35 | }, 36 | { 37 | "title": "Data Bundles Packaging Specification", 38 | "url": "https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#" 39 | } 40 | ], 41 | "creator": { 42 | "email": "HCIOHE@cdph.ca.gov", 43 | "name": "Office of Health Equity" 44 | }, 45 | "wrangler": { 46 | "email": "eric@civicknowledge.com", 47 | "name": "Eric Busboom" 48 | }, 49 | "note": "This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details.", 50 | "table": { 51 | "description": "HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters", 52 | "column": [ 53 | { 54 | "datatype": "int", 55 | "valuetype": "year range", 56 | "description": "Year or years that indicator was reported", 57 | "name": "reportyear" 58 | }, 59 | { 60 | "datatype": "str", 61 | "valuetype": "dimension", 62 | "description": "Type of record", 63 | "name": "type" 64 | }, 65 | { 66 | "datatype": "str", 67 | "valuetype": "gvid", 68 | "description": "GVid version of the geotype and geotypeval", 69 | "name": "gvid" 70 | }, 71 | { 72 | "datatype": "str", 73 | "valuetype": "label for gvid", 74 | "description": "Census name of geographic area", 75 | "name": "geoname" 76 | }, 77 | { 78 | "datatype": "str", 79 | "valuetype": "label", 80 | "description": "Code for type of geographic area", 81 | "name": "geotype" 82 | }, 83 | { 84 | "datatype": "str", 85 | "valuetype": "census", 86 | "description": "Census geoid code", 87 | "name": "geotypevalue" 88 | }, 89 | { 90 | "datatype": "str", 91 | "valuetype": "FIPS county code", 92 | "description": "County FIPS code", 93 | "name": "county_fips" 94 | }, 
95 | { 96 | "datatype": "str", 97 | "valuetype": "label for counrty_fips", 98 | "description": "County name", 99 | "name": "county_name" 100 | }, 101 | { 102 | "datatype": "str", 103 | "valuetype": "census code", 104 | "description": "Numeric code of region", 105 | "name": "region_code" 106 | }, 107 | { 108 | "datatype": "str", 109 | "valuetype": "label for region_code", 110 | "description": "Name of region", 111 | "name": "region_name" 112 | }, 113 | { 114 | "datatype": "str", 115 | "valuetype": "raceth/civick", 116 | "description": "Civic Knowledge race / ethnicity code.", 117 | "name": "raceth" 118 | }, 119 | { 120 | "datatype": "str", 121 | "valuetype": "label for raceeth", 122 | "description": "Race / Ethnicity Name", 123 | "name": "raceth_name" 124 | }, 125 | { 126 | "datatype": "str", 127 | "valuetype": "raceth/hci", 128 | "description": "Race / ethnicity code", 129 | "name": "race_eth_code" 130 | }, 131 | { 132 | "datatype": "str", 133 | "valuetype": "label for race_eth_code", 134 | "description": "Race / ethnicity name", 135 | "name": "race_eth_name" 136 | }, 137 | { 138 | "datatype": "int", 139 | "valuetype": "count", 140 | "description": "Adults who are registered to vote, or who voted, depending on type of record", 141 | "name": "numerator" 142 | }, 143 | { 144 | "datatype": "int", 145 | "valuetype": "count", 146 | "description": "Population of Adults, 18 years or older", 147 | "name": "denominator" 148 | }, 149 | { 150 | "datatype": "float", 151 | "valuetype": "percent of numerator over denominator", 152 | "description": "Percent of adults who are registered to vote, or who voted, depending on type of record", 153 | "name": "percent" 154 | }, 155 | { 156 | "datatype": "float", 157 | "valuetype": "ci95l for percent", 158 | "description": "Lower bound of 95% confidence interval", 159 | "name": "ll_95ci" 160 | }, 161 | { 162 | "datatype": "float", 163 | "valuetype": "ci95u for percent", 164 | "description": "Upper bound of 95% confidence interval", 165 | 
"name": "ul_95ci" 166 | }, 167 | { 168 | "datatype": "float", 169 | "valuetype": "se for percent", 170 | "description": "Standard error", 171 | "name": "se" 172 | }, 173 | { 174 | "datatype": "float", 175 | "valuetype": "rse for percent", 176 | "description": "Relative standard error (se/percent * 100) expressed as a percent", 177 | "name": "rse" 178 | }, 179 | { 180 | "datatype": "float", 181 | "valuetype": "decile", 182 | "description": "Statewide decile ranking", 183 | "name": "ca_decile" 184 | }, 185 | { 186 | "datatype": "float", 187 | "valuetype": "ratio", 188 | "description": "Ratio of indicator to state average", 189 | "name": "ca_rr" 190 | }, 191 | { 192 | "datatype": "float", 193 | "valuetype": "measure", 194 | "description": "Voter age population, from CA Department of Finance.", 195 | "name": "vap" 196 | }, 197 | { 198 | "datatype": "str", 199 | "valuetype": "dimension", 200 | "name": "ind_id" 201 | }, 202 | { 203 | "datatype": "str", 204 | "valuetype": "dimension", 205 | "name": "ind_definition" 206 | }, 207 | { 208 | "datatype": "str", 209 | "valuetype": "other", 210 | "name": "version" 211 | } 212 | ], 213 | "name": "registered_voters" 214 | } 215 | } -------------------------------------------------------------------------------- /metatab/test/outputs/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "sections": { 3 | "contacts": { 4 | "terms": [ 5 | "Wrangler", 6 | "Wrangler.Email", 7 | "Wrangler.Name", 8 | "Creator", 9 | "Creator.Email", 10 | "Creator.Name", 11 | "Publisher", 12 | "Publisher.Email", 13 | "Publisher.Name" 14 | ], 15 | "args": [ 16 | "email" 17 | ] 18 | }, 19 | "declaredterms": { 20 | "terms": [ 21 | "DeclareTerm" 22 | ], 23 | "args": [] 24 | }, 25 | "resources": { 26 | "terms": [ 27 | "Datafile", 28 | "Datafile.Grain", 29 | "Datafile.Table", 30 | "Datafile.Title", 31 | "Datafile.Url", 32 | "Documentation", 33 | "Documentation.Description", 34 | "Documentation.Title", 35 | 
"Documentation.Url", 36 | "Homepage", 37 | "Homepage.Title", 38 | "Homepage.Url" 39 | ], 40 | "args": [ 41 | "table", 42 | "grain", 43 | "title" 44 | ] 45 | }, 46 | "declaredsections": { 47 | "terms": [ 48 | "DeclareSection" 49 | ], 50 | "args": [] 51 | }, 52 | "schema": { 53 | "terms": [ 54 | "Table", 55 | "Table.Column", 56 | "Table.Description", 57 | "Table.Name", 58 | "Column", 59 | "Column.Datatype", 60 | "Column.Description", 61 | "Column.Name", 62 | "Column.Valuetype" 63 | ], 64 | "args": [ 65 | "datatype", 66 | "valuetype", 67 | "description" 68 | ] 69 | }, 70 | "root": { 71 | "terms": [ 72 | "Declare", 73 | "Include", 74 | "Section", 75 | "Synonym", 76 | "Title", 77 | "Description", 78 | "Time", 79 | "Version", 80 | "Format", 81 | "Identifier", 82 | "Note", 83 | "Obsoletes", 84 | "Spatial", 85 | "SpatialGrain" 86 | ], 87 | "args": [] 88 | } 89 | }, 90 | "terms": { 91 | ".include": { 92 | "term_name": "Include", 93 | "section": "root" 94 | }, 95 | ".declare": { 96 | "term_name": "Declare", 97 | "section": "root" 98 | }, 99 | ".title": { 100 | "term_name": "Title", 101 | "section": "root" 102 | }, 103 | "datafile.grain": { 104 | "term_name": "Datafile.Grain", 105 | "section": "resources" 106 | }, 107 | "homepage.url": { 108 | "term_name": "Homepage.Url", 109 | "section": "resources" 110 | }, 111 | ".synonym": { 112 | "term_name": "Synonym", 113 | "childpropertytype": "sequence", 114 | "termvaluename": "term", 115 | "section": "root" 116 | }, 117 | "homepage.title": { 118 | "term_name": "Homepage.Title", 119 | "section": "resources" 120 | }, 121 | ".datafile": { 122 | "term_name": "Datafile", 123 | "termvaluename": "url", 124 | "section": "resources" 125 | }, 126 | ".obsoletes": { 127 | "term_name": "Obsoletes", 128 | "section": "root" 129 | }, 130 | "documentation.url": { 131 | "term_name": "Documentation.Url", 132 | "section": "resources" 133 | }, 134 | "table.description": { 135 | "term_name": "Table.Description", 136 | "section": "schema" 137 | }, 138 | 
".table": { 139 | "term_name": "Table", 140 | "termvaluename": "name", 141 | "section": "schema" 142 | }, 143 | "documentation.description": { 144 | "term_name": "Documentation.Description", 145 | "section": "resources" 146 | }, 147 | ".publisher": { 148 | "term_name": "Publisher", 149 | "termvaluename": "name", 150 | "section": "contacts" 151 | }, 152 | "wrangler.email": { 153 | "term_name": "Wrangler.Email", 154 | "section": "contacts" 155 | }, 156 | "publisher.name": { 157 | "term_name": "Publisher.Name", 158 | "section": "contacts" 159 | }, 160 | ".note": { 161 | "term_name": "Note", 162 | "section": "root" 163 | }, 164 | ".description": { 165 | "term_name": "Description", 166 | "section": "root" 167 | }, 168 | "creator.email": { 169 | "term_name": "Creator.Email", 170 | "section": "contacts" 171 | }, 172 | "column.valuetype": { 173 | "term_name": "Column.Valuetype", 174 | "section": "schema" 175 | }, 176 | ".declareterm": { 177 | "term_name": "DeclareTerm", 178 | "termvaluename": "term", 179 | "section": "DeclaredTerms" 180 | }, 181 | "datafile.table": { 182 | "term_name": "Datafile.Table", 183 | "section": "resources" 184 | }, 185 | "table.column": { 186 | "term_name": "Table.Column", 187 | "childpropertytype": "sequence", 188 | "termvaluename": "name", 189 | "section": "schema" 190 | }, 191 | ".documentation": { 192 | "term_name": "Documentation", 193 | "section": "resources" 194 | }, 195 | "wrangler.name": { 196 | "term_name": "Wrangler.Name", 197 | "section": "contacts" 198 | }, 199 | "column.description": { 200 | "term_name": "Column.Description", 201 | "section": "schema" 202 | }, 203 | "documentation.title": { 204 | "term_name": "Documentation.Title", 205 | "section": "resources" 206 | }, 207 | ".column": { 208 | "term_name": "Column", 209 | "termvaluename": "name", 210 | "synonym": "Table.Column", 211 | "section": "schema" 212 | }, 213 | ".identifier": { 214 | "term_name": "Identifier", 215 | "section": "root" 216 | }, 217 | "column.datatype": { 218 | 
"term_name": "Column.Datatype", 219 | "section": "schema" 220 | }, 221 | "creator.name": { 222 | "term_name": "Creator.Name", 223 | "section": "contacts" 224 | }, 225 | "column.name": { 226 | "term_name": "Column.Name", 227 | "section": "schema" 228 | }, 229 | ".format": { 230 | "term_name": "Format", 231 | "section": "root" 232 | }, 233 | ".spatialgrain": { 234 | "term_name": "SpatialGrain", 235 | "section": "root" 236 | }, 237 | ".section": { 238 | "term_name": "Section", 239 | "childpropertytype": "sequence", 240 | "termvaluename": "name", 241 | "section": "root" 242 | }, 243 | ".declaresection": { 244 | "term_name": "DeclareSection", 245 | "childpropertytype": "sequence", 246 | "termvaluename": "section", 247 | "section": "DeclaredSections" 248 | }, 249 | "datafile.url": { 250 | "term_name": "Datafile.Url", 251 | "section": "resources" 252 | }, 253 | "table.name": { 254 | "term_name": "Table.Name", 255 | "section": "schema" 256 | }, 257 | ".time": { 258 | "term_name": "Time", 259 | "section": "root" 260 | }, 261 | "datafile.title": { 262 | "term_name": "Datafile.Title", 263 | "section": "resources" 264 | }, 265 | ".creator": { 266 | "term_name": "Creator", 267 | "termvaluename": "name", 268 | "section": "contacts" 269 | }, 270 | ".homepage": { 271 | "term_name": "Homepage", 272 | "termvaluename": "url", 273 | "section": "resources" 274 | }, 275 | ".spatial": { 276 | "term_name": "Spatial", 277 | "section": "root" 278 | }, 279 | ".wrangler": { 280 | "term_name": "Wrangler", 281 | "termvaluename": "name", 282 | "section": "contacts" 283 | }, 284 | "publisher.email": { 285 | "term_name": "Publisher.Email", 286 | "section": "contacts" 287 | }, 288 | ".version": { 289 | "term_name": "Version", 290 | "section": "root" 291 | } 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /metatab/test/test-data/json/example1.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"declare": "metatab-latest", 3 | "title": "Registered Voters, By County", 4 | "name": "cdph.ca.gov-hci-registered_voters-county", 5 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.", 6 | "identifier": "cdph.ca.gov-hci-registered_voters-county", 7 | "version": "201404", 8 | "obsoletes": "cdph.ca.gov-hci-registered_voters-county-201304", 9 | "dataset": "voters", 10 | "origin": "example.com", 11 | "space": "Ca", 12 | "time": "2002-2014", 13 | "grain": "County", 14 | "format": "excel", 15 | "datafile": [ 16 | { 17 | "name": "example1", 18 | "schema": "registered_voters", 19 | "grain": "County", 20 | "title": "The First Example Data File", 21 | "url": "http://example.com/example1.csv" 22 | }, 23 | { 24 | "name": "example2", 25 | "schema": "registered_voters", 26 | "grain": "Tract", 27 | "title": "The Second Example Data File", 28 | "url": "http://example.com/example2.csv" 29 | } 30 | ], 31 | "homepage": [ 32 | { 33 | "schema": "Healthy Communities Data and Indicators Project (HCI)", 34 | "url": "https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx" 35 | } 36 | ], 37 | "documentation": [ 38 | { 39 | "schema": "Indicator Documentation for Voter Registration / Participation", 40 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections", 41 | "url": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf" 42 | }, 43 | { 44 | "title": "Data Bundles Packaging Specification", 45 | "url": "https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#" 46 | } 47 | ], 48 | "creator": [ 49 | { 50 | "email": "HCIOHE@cdph.ca.gov", 51 | "name": "Office of Health Equity" 52 | } 53 | ], 54 | "wrangler": [ 55 | { 56 | "email": "eric@civicknowledge.com", 57 | "name": "Eric Busboom" 58 | } 59 | ], 60 | "note": [ 61 | "This file is an 
example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details." 62 | ], 63 | "table": [ 64 | { 65 | "description": "HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters", 66 | "column": [ 67 | { 68 | "datatype": "int", 69 | "valuetype": "year range", 70 | "description": "Year or years that indicator was reported", 71 | "name": "reportyear" 72 | }, 73 | { 74 | "datatype": "str", 75 | "valuetype": "dimension", 76 | "description": "Type of record", 77 | "name": "type" 78 | }, 79 | { 80 | "datatype": "str", 81 | "valuetype": "gvid", 82 | "description": "GVid version of the geotype and geotypeval", 83 | "name": "gvid" 84 | }, 85 | { 86 | "datatype": "str", 87 | "valuetype": "label for gvid", 88 | "description": "Census name of geographic area", 89 | "name": "geoname" 90 | }, 91 | { 92 | "datatype": "str", 93 | "valuetype": "label", 94 | "description": "Code for type of geographic area", 95 | "name": "geotype" 96 | }, 97 | { 98 | "datatype": "str", 99 | "valuetype": "census", 100 | "description": "Census geoid code", 101 | "name": "geotypevalue" 102 | }, 103 | { 104 | "datatype": "str", 105 | "valuetype": "FIPS county code", 106 | "description": "County FIPS code", 107 | "name": "county_fips" 108 | }, 109 | { 110 | "datatype": "str", 111 | "valuetype": "label for counrty_fips", 112 | "description": "County name", 113 | "name": "county_name" 114 | }, 115 | { 116 | "datatype": "str", 117 | "valuetype": "census code", 118 | "description": "Numeric code of region", 119 | "name": "region_code" 120 | }, 121 | { 122 | "datatype": "str", 123 | "valuetype": "label for region_code", 124 | "description": "Name of region", 125 | "name": "region_name" 126 | }, 127 | { 128 | "datatype": "str", 129 | "valuetype": "raceth/civick", 130 | "description": "Civic Knowledge race / ethnicity code.", 131 | "name": "raceth" 132 | }, 133 | { 134 | "datatype": "str", 135 | "valuetype": "label for 
raceeth", 136 | "description": "Race / Ethnicity Name", 137 | "name": "raceth_name" 138 | }, 139 | { 140 | "datatype": "str", 141 | "valuetype": "raceth/hci", 142 | "description": "Race / ethnicity code", 143 | "name": "race_eth_code" 144 | }, 145 | { 146 | "datatype": "str", 147 | "valuetype": "label for race_eth_code", 148 | "description": "Race / ethnicity name", 149 | "name": "race_eth_name" 150 | }, 151 | { 152 | "datatype": "int", 153 | "valuetype": "count", 154 | "description": "Adults who are registered to vote, or who voted, depending on type of record", 155 | "name": "numerator" 156 | }, 157 | { 158 | "datatype": "int", 159 | "valuetype": "count", 160 | "description": "Population of Adults, 18 years or older", 161 | "name": "denominator" 162 | }, 163 | { 164 | "datatype": "float", 165 | "valuetype": "percent of numerator over denominator", 166 | "description": "Percent of adults who are registered to vote, or who voted, depending on type of record", 167 | "name": "percent" 168 | }, 169 | { 170 | "datatype": "float", 171 | "valuetype": "ci95l for percent", 172 | "description": "Lower bound of 95% confidence interval", 173 | "name": "ll_95ci" 174 | }, 175 | { 176 | "datatype": "float", 177 | "valuetype": "ci95u for percent", 178 | "description": "Upper bound of 95% confidence interval", 179 | "name": "ul_95ci" 180 | }, 181 | { 182 | "datatype": "float", 183 | "valuetype": "se for percent", 184 | "description": "Standard error", 185 | "name": "se" 186 | }, 187 | { 188 | "datatype": "float", 189 | "valuetype": "rse for percent", 190 | "description": "Relative standard error (se/percent * 100) expressed as a percent", 191 | "name": "rse" 192 | }, 193 | { 194 | "datatype": "float", 195 | "valuetype": "decile", 196 | "description": "Statewide decile ranking", 197 | "name": "ca_decile" 198 | }, 199 | { 200 | "datatype": "float", 201 | "valuetype": "ratio", 202 | "description": "Ratio of indicator to state average", 203 | "name": "ca_rr" 204 | }, 205 | { 206 | 
"datatype": "float", 207 | "valuetype": "measure", 208 | "description": "Voter age population, from CA Department of Finance.", 209 | "name": "vap" 210 | }, 211 | { 212 | "datatype": "str", 213 | "valuetype": "dimension", 214 | "name": "ind_id" 215 | }, 216 | { 217 | "datatype": "str", 218 | "valuetype": "dimension", 219 | "name": "ind_definition" 220 | }, 221 | { 222 | "datatype": "str", 223 | "valuetype": "other", 224 | "name": "version" 225 | } 226 | ], 227 | "name": "registered_voters" 228 | } 229 | ] 230 | } -------------------------------------------------------------------------------- /examples/Pandas Reporter Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import pandasreporter as pr\n", 12 | "\n", 13 | "\n", 14 | "# B17001, Poverty Status by Sex by Age\n", 15 | "b17001 = pr.get_dataframe('B17001', '140', '05000US06073', cache=True).ct_columns\n", 16 | "# B17024, Age by Ratio of Income to Poverty Level\n", 17 | "b17024 = pr.get_dataframe('B17024', '140', '05000US06073', cache=True).ct_columns\n", 18 | "# B17017, Poverty Status by Household Type by Age of Householder\n", 19 | "b17017 = pr.get_dataframe('B17017', '140', '05000US06073', cache=True).ct_columns" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# B17001 Poverty Status by Sex by Age\n", 27 | "\n", 28 | "For the [Poverty Status by Sex by Age](https://censusreporter.org/tables/B17001/) we'll select the columns for male and female, below poverty, 65 and older. \n", 29 | "\n", 30 | "**NOTE** if you want to get seniors of a particular race, use table `C17001a-g`, condensed race iterations. 
The 'C' tables have fewer age ranges, but there is no 'C' table for all races: There is a `C17001a` for Whites, a condensed version of `B17001a`, but there is no `C17001` for a condensed version of `B17001`\n", 31 | "\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "['B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 years',\n", 43 | " 'Margins for B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 years',\n", 44 | " 'B17001016 Total Income in the past 12 months below poverty level Male 75 years and over',\n", 45 | " 'Margins for B17001016 Total Income in the past 12 months below poverty level Male 75 years and over',\n", 46 | " 'B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 years',\n", 47 | " 'Margins for B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 years',\n", 48 | " 'B17001030 Total Income in the past 12 months below poverty level Female 75 years and over',\n", 49 | " 'Margins for B17001030 Total Income in the past 12 months below poverty level Female 75 years and over',\n", 50 | " 'B17001044 Total Income in the past 12 months at or above poverty level Male 65 to 74 years',\n", 51 | " 'Margins for B17001044 Total Income in the past 12 months at or above poverty level Male 65 to 74 years',\n", 52 | " 'B17001045 Total Income in the past 12 months at or above poverty level Male 75 years and over',\n", 53 | " 'Margins for B17001045 Total Income in the past 12 months at or above poverty level Male 75 years and over',\n", 54 | " 'B17001058 Total Income in the past 12 months at or above poverty level Female 65 to 74 years',\n", 55 | " 'Margins for B17001058 Total Income in the past 12 months at or above poverty level Female 65 to 74 years',\n", 56 | " 'B17001059 Total Income in the past 12 months at or above poverty level Female 
75 years and over',\n", 57 | " 'Margins for B17001059 Total Income in the past 12 months at or above poverty level Female 75 years and over']" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "output_type": "execute_result", 62 | "metadata": {} 63 | } 64 | ], 65 | "source": [ 66 | "[e for e in b17001.columns if '65 to 74' in str(e) or '75 years' in str(e) ]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/html": [ 77 | "
\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
geoidB17001015 Total Income in the past 12 months below poverty level Male 65 to 74 yearsMargins for B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 yearsB17001016 Total Income in the past 12 months below poverty level Male 75 years and overMargins for B17001016 Total Income in the past 12 months below poverty level Male 75 years and overB17001029 Total Income in the past 12 months below poverty level Female 65 to 74 yearsMargins for B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 yearsB17001030 Total Income in the past 12 months below poverty level Female 75 years and overMargins for B17001030 Total Income in the past 12 months below poverty level Female 75 years and over
014000US0607300450110.018.00.012.013.022.07.012.0
114000US060730198030.012.00.012.08.012.011.017.0
214000US0607300600018.030.00.012.00.012.00.012.0
314000US060730083640.017.07.018.07.017.00.017.0
414000US060730085070.017.067.061.017.026.026.041.0
\n", 156 | "
" 157 | ] 158 | }, 159 | "output_type": "execute_result", 160 | "metadata": {} 161 | } 162 | ], 163 | "source": [ 164 | "# Now create a subset dataframe with just the columns we need. \n", 165 | "b17001s = b17001[['geoid', 'B17001015', 'B17001016','B17001029','B17001030']]\n", 166 | "b17001s.head()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Senior poverty rates\n", 174 | "\n", 175 | "Creating the sums for the senior below poverty rates at the tract level is easy, but there is a *serious problem* with the results: the numbers are completely unstable. The minimum RSE is 22%, and the median is about 60%. These are useless results. " 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "count 576.000000\n", 187 | "mean 87.621218\n", 188 | "std 156.710591\n", 189 | "min 22.150407\n", 190 | "25% 43.645038\n", 191 | "50% 58.919310\n", 192 | "75% 82.136436\n", 193 | "max 1806.402183\n", 194 | "dtype: float64" 195 | ] 196 | }, 197 | "execution_count": 4, 198 | "output_type": "execute_result", 199 | "metadata": {} 200 | } 201 | ], 202 | "source": [ 203 | "b17001_65mf = pr.CensusDataFrame()\n", 204 | "b17001_65mf['geoid'] = b17001['geoid']\n", 205 | "b17001_65mf['poverty_65'], b17001_65mf['poverty_65_m90'] = b17001.sum_m('B17001015', 'B17001016','B17001029','B17001030')\n", 206 | "b17001_65mf.add_rse('poverty_65')\n", 207 | "b17001_65mf.poverty_65_rse.replace([np.inf, -np.inf], np.nan).dropna().describe()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 
| "version": 3.0 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.5.0" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 2 241 | } --------------------------------------------------------------------------------