├── metatab
├── test
│ ├── __init__.py
│ ├── test-data
│ │ ├── __init__.py
│ │ ├── scripts
│ │ │ ├── __init__.py
│ │ │ ├── programsource.py
│ │ │ ├── Py3Notebook.ipynb
│ │ │ └── complex-text.txt
│ │ ├── include3.csv
│ │ ├── declare-only.csv
│ │ ├── json
│ │ │ ├── include3.json
│ │ │ ├── include2.json
│ │ │ ├── include1.json
│ │ │ ├── datapackage_ex1.json
│ │ │ ├── datapackage_ex1_web.json
│ │ │ ├── issue1.json
│ │ │ ├── children2.json
│ │ │ ├── children.json
│ │ │ ├── datapackage_ex2.json
│ │ │ ├── example2.json
│ │ │ ├── example1-web.json
│ │ │ └── example1.json
│ │ ├── include1.csv
│ │ ├── include2.csv
│ │ ├── line
│ │ │ ├── line-oriented-doc-contacts.txt
│ │ │ ├── line-oriented-doc-root.txt
│ │ │ ├── line-oriented-doc-references-1.txt
│ │ │ ├── line-oriented-doc-bib.txt
│ │ │ ├── line-oriented-doc-references-2.txt
│ │ │ └── line-oriented-doc.txt
│ │ ├── short.csv
│ │ ├── childpropertytype.csv
│ │ ├── headers.csv
│ │ ├── name.csv
│ │ ├── name2.csv
│ │ ├── nested.csv
│ │ ├── errors
│ │ │ ├── bad_include.csv
│ │ │ ├── bad_declare.csv
│ │ │ └── errors2.csv
│ │ ├── children.csv
│ │ ├── issue1.csv
│ │ ├── children2.csv
│ │ ├── census.csv
│ │ ├── children3.csv
│ │ ├── url_classes.csv
│ │ ├── programsource.csv
│ │ ├── resolve_urls.csv
│ │ ├── packages
│ │ │ └── example.com-test_package
│ │ │ │ ├── metadata.csv
│ │ │ │ └── notebooks
│ │ │ │ └── Test_Notebook.ipynb
│ │ ├── simple-text.txt
│ │ ├── example2.csv
│ │ ├── simple1.csv
│ │ ├── resources.csv
│ │ ├── short-declare.csv
│ │ ├── datapackage_ex1.csv
│ │ ├── datapackage_ex1_web.csv
│ │ ├── geo.csv
│ │ ├── datapackage_ex2.csv
│ │ ├── schema.csv
│ │ ├── yaml
│ │ │ ├── yaml-example-1.csv
│ │ │ └── yaml-example-1.yaml
│ │ ├── notebooks
│ │ │ ├── ImportTest.ipynb
│ │ │ ├── CellExecuteError.ipynb
│ │ │ └── SimpleMagicsTest.ipynb
│ │ ├── example1-web.csv
│ │ ├── example1.csv
│ │ ├── example1-headers.csv
│ │ ├── example1.txt
│ │ ├── properties.csv
│ │ ├── almost-everything.csv
│ │ └── civicknowledge.com-rcfe_affordability-2015.csv
│ ├── Dockerfile
│ ├── core.py
│ ├── Makefile
│ ├── outputs
│ │ ├── datapackage.json
│ │ └── metadata.json
│ └── test_doc.py
├── templates
│ ├── __init__.py
│ ├── datapackage.csv
│ └── metatab.csv
├── __init__.py
├── exc.py
├── resolver.py
├── datapackage.py
├── appurl.py
├── rowgen.py
├── util.py
└── cli.py
├── requirements.txt
├── MANIFEST.in
├── pyproject.toml
├── .travis.yml
├── docker
├── Dockerfile
└── Makefile
├── examples
├── pandas-reporter.py
└── Pandas Reporter Example.ipynb
├── develop.sh
├── Makefile
├── LICENSE
├── .gitignore
├── setup.py
├── docs
├── Census.rst
├── GeneratingRowsWithPrograms.rst
├── PrivateDatasets.rst
└── Wrangling packages.rst
└── README.rst
/metatab/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metatab/templates/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metatab/test/test-data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metatab/test/test-data/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metatab/test/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM n42org/tox
2 |
3 | WORKDIR /code
--------------------------------------------------------------------------------
/metatab/test/test-data/include3.csv:
--------------------------------------------------------------------------------
1 | "Note","Include File 3"
2 |
--------------------------------------------------------------------------------
/metatab/test/test-data/declare-only.csv:
--------------------------------------------------------------------------------
1 | "Declare","metadata.csv",,,
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | metatabdecl
2 | rowgenerators>=0.7.0
3 | tabulate
--------------------------------------------------------------------------------
/metatab/test/test-data/scripts/programsource.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include README.rst
3 | include LICENSE
4 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/include3.json:
--------------------------------------------------------------------------------
1 | {
2 | "note": "Include File 3"
3 | }
--------------------------------------------------------------------------------
/metatab/test/test-data/include1.csv:
--------------------------------------------------------------------------------
1 | Note,Include File 1
2 | Include,include2.csv
3 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta:__legacy__"
4 |
5 | [tool.setuptools_scm]
--------------------------------------------------------------------------------
/metatab/test/test-data/include2.csv:
--------------------------------------------------------------------------------
1 | "Note","Include File 2"
2 | "Include","https://raw.githubusercontent.com/CivicKnowledge/structured_tables/master/test/data/include3.csv"
3 |
--------------------------------------------------------------------------------
/metatab/test/test-data/line/line-oriented-doc-contacts.txt:
--------------------------------------------------------------------------------
1 | Section: Contacts
2 | Wrangler: Eric Busboom
3 | Wrangler.Email: eric@civicknowledge.com
4 | Wrangler.Organization: Civic Knowledge
5 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | install:
5 | - pip install -r requirements.txt
6 | script: python setup.py test
7 | branches:
8 | only:
9 | - master
10 |
11 |
--------------------------------------------------------------------------------
/metatab/test/test-data/short.csv:
--------------------------------------------------------------------------------
1 | "Declare","short-declare.csv"
2 | "include","include3.csv"
3 | "Title","Title1"
4 | ".Language","en"
5 | "Section","Section1"
6 | "Title","Title2"
7 | "Include","include3.csv"
8 | "Title","Title3"
9 |
--------------------------------------------------------------------------------
/metatab/test/test-data/childpropertytype.csv:
--------------------------------------------------------------------------------
1 | ,,
2 | "ChildPropertyType","Parent.Child","scalar"
3 | ,,
4 | "Parent","parent",
5 | "Parent.Child","child1",
6 | "Parent.Child","child2",
7 | "Parent.Child","child3",
8 | "Parent.Child","child4",
9 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/include2.json:
--------------------------------------------------------------------------------
1 | {
2 | "note": [
3 | "Include File 2",
4 | "Include File 3"
5 | ],
6 | "include": "https://raw.githubusercontent.com/CivicKnowledge/structured_tables/master/test/data/include3.csv"
7 | }
--------------------------------------------------------------------------------
/metatab/test/test-data/line/line-oriented-doc-root.txt:
--------------------------------------------------------------------------------
1 | Identifier: 47bc1089-7584-41f0-b804-602ec42f1249
2 | Origin: civicknowledge.com
3 | Dataset: rcfe_affordability
4 | Version: 4
5 | Time: 2015
6 | Name: civicknowledge.com-rcfe_affordability-2015-4
7 |
--------------------------------------------------------------------------------
/metatab/test/test-data/headers.csv:
--------------------------------------------------------------------------------
1 | "Section ","One",,
2 | "Header","A","B","C"
3 | "one",1,2,3
4 | "two",4,5,6
5 | "three",7,8,9
6 | ,,,
7 | "Section ","One",,
8 | "Header","D","E","F"
9 | "one",10,11,12
10 | "two",13,14,15
11 | "three",16,17,18
12 |
--------------------------------------------------------------------------------
/metatab/test/test-data/name.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest"
2 | "Title","Registered Voters, By County"
3 | "Name","this_name_should_be_replaced"
4 | "Dataset","FooBar"
5 | "Version",1
6 | "Origin","example.com"
7 | "Time",2017
8 | "Space","CA"
9 | "Grain","people"
10 |
--------------------------------------------------------------------------------
/metatab/test/test-data/name2.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest"
2 | "Title","Registered Voters, By County"
3 | "Name","this_name_should_be_replaced"
4 | "Dataset","FooBar"
5 | "Version",1
6 | "Origin","example.com"
7 | "Time",2017
8 | "Space","CA"
9 | "Grain","people"
10 |
--------------------------------------------------------------------------------
/metatab/test/test-data/nested.csv:
--------------------------------------------------------------------------------
1 | "Section","Nesting",
2 | "A",1,
3 | ".B",2,
4 | ".B",3,
5 | "X",4,
6 | ".Y",5,
7 | ".Y",6,
8 | ,,
9 | "Section","More Nesting","Alt"
10 | "A",1,"Alt"
11 | ".B",2,"b"
12 | ".B",3,"c"
13 | "X",4,"d"
14 | ".Y",5,"e"
15 | ".Y",6,"f"
16 |
--------------------------------------------------------------------------------
/metatab/test/test-data/line/line-oriented-doc-references-1.txt:
--------------------------------------------------------------------------------
1 |
2 | Section: References
3 |
4 | Reference: censusreporter:B09020/140/05000US06073
5 | Reference.Name: B09020
6 | Reference.Description: Relationship by Household Type (Including Living Alone) for Population 65 Years and Over
7 |
--------------------------------------------------------------------------------
/metatab/test/test-data/errors/bad_include.csv:
--------------------------------------------------------------------------------
1 | "Include","doesntexist.csv"
2 | "Title","Registered Voters, By County"
3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections."
4 | "Identifier","cdph.ca.gov-hci-registered_voters-county"
5 |
--------------------------------------------------------------------------------
/metatab/test/test-data/errors/bad_declare.csv:
--------------------------------------------------------------------------------
1 | "Declare","http://example.com/doesntexist.csv"
2 | "Title","Registered Voters, By County"
3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections."
4 | "Identifier","cdph.ca.gov-hci-registered_voters-county"
5 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | FROM python:3.6.1-alpine
3 |
4 | MAINTAINER Eric Busboom "eric@civicknowledge.com"
5 |
6 | VOLUME /opt/metatab
7 |
8 | RUN apk add --update --no-cache g++ gcc python-dev py-lxml libxslt-dev==1.1.29-r0 bash git
9 |
10 | RUN pip install https://github.com/CivicKnowledge/metatab-py/archive/master.zip # 9
11 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/include1.json:
--------------------------------------------------------------------------------
1 | {
2 | "note": [
3 | "Include File 1",
4 | "Include File 2",
5 | "Include File 3"
6 | ],
7 | "include": [
8 | "include2.csv",
9 | "https://raw.githubusercontent.com/CivicKnowledge/structured_tables/master/test/data/include3.csv"
10 | ]
11 | }
--------------------------------------------------------------------------------
/examples/pandas-reporter.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 | import numpy as np
4 | import pandasreporter as pr
5 |
# Three tables are requested with the same '140' / '05000US06073' arguments
# (presumably a summary level and a geoid -- confirm against pandasreporter).
b17001 = pr.get_dataframe('B17001', '140', '05000US06073', cache=True)
b17024 = pr.get_dataframe('B17024', '140', '05000US06073', cache=True)
b17017 = pr.get_dataframe('B17017', '140', '05000US06073', cache=True)

# The original line was the Python 2 statement `print df.head(2)`: a syntax
# error on Python 3, and `df` was never defined in this script.
# NOTE(review): previewing the first table is an assumption -- confirm which
# frame was meant to be shown.
print(b17001.head(2))
--------------------------------------------------------------------------------
/metatab/test/core.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """
5 |
6 | """
7 |
8 |
def test_data(*paths):
    """Return the absolute path of an entry under the ``test-data`` directory.

    ``test-data`` is resolved relative to the directory containing this
    module; ``paths`` are joined, in order, beneath it.
    """
    import os.path

    module_dir = os.path.dirname(os.path.abspath(__file__))
    joined = os.path.join(module_dir, 'test-data', *paths)
    return os.path.abspath(joined)
--------------------------------------------------------------------------------
/metatab/test/test-data/children.csv:
--------------------------------------------------------------------------------
1 | "NOte","This is a note",,
2 | ,,,
3 | "Section","Arguments","prop1","prop2"
4 | "Parent","parent","prop1","prop2"
5 | ,,,
6 | "Section","ExplicitChildren",,
7 | "Parent","parent",,
8 | "Parent.Prop1","prop1",,
9 | "Parent.Prop2","prop2",,
10 | ,,,
11 | "Section","ElidedChildren",,
12 | "Parent","parent",,
13 | ".Prop1","prop1",,
14 | ".Prop2","prop2",,
15 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/datapackage_ex1.json:
--------------------------------------------------------------------------------
1 | {
2 | "declare": "datapackage-latest.csv",
3 | "title": "Registered Voters, By County",
4 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
5 | "name": "cdph.ca.gov-hci-registered_voters-county",
6 | "version": "1.3.4",
7 | "section": [
8 | {
9 | "section":
--------------------------------------------------------------------------------
/metatab/templates/datapackage.csv:
--------------------------------------------------------------------------------
1 | "# ","Declarations for producing package.json files",,,,,
2 | ,,,,,,
3 | "Section","DeclaredTerms","TermValueName","ChildPropertyType","Section","Synonym","ValueSet"
4 | "DeclareTerm","resources","url",,,,
5 | "DeclareTerm","resource",,,,"resources",
6 | "DeclareTerm","schema",,,,"resources.schema",
7 | "DeclareTerm","field","name",,,"schema.fields",
8 | "DeclareTerm","schema.fields","name",,,,
9 |
--------------------------------------------------------------------------------
/metatab/test/test-data/issue1.csv:
--------------------------------------------------------------------------------
1 | ,,,,
2 | "Section","Resources","table","Grain","Title"
3 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation",,
4 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,
5 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/datapackage_ex1_web.json:
--------------------------------------------------------------------------------
1 | {
2 | "declare": "http://assets.metatab.org/datapackage.csv",
3 | "title": "Registered Voters, By County",
4 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
5 | "name": "cdph.ca.gov-hci-registered_voters-county",
6 | "version": "1.3.4",
7 | "section": [
8 | {
9 | "section":
--------------------------------------------------------------------------------
/metatab/test/test-data/children2.csv:
--------------------------------------------------------------------------------
1 | "# Like children.csv, but with different values for debugging. ",,,
2 | ,,,
3 | "Section","Arguments","prop1","prop2"
4 | "Parent","parent","prop11","prop12"
5 | ,,,
6 | "Section","ExplicitChildren",,
7 | "Parent","parent",,
8 | "Parent.Prop1","prop21",,
9 | "Parent.Prop2","prop22",,
10 | ,,,
11 | "Section","ElidedChildren",,
12 | "Parent","parent",,
13 | ".Prop1","prop31",,
14 | ".Prop2","prop32",,
15 |
--------------------------------------------------------------------------------
/metatab/test/test-data/census.csv:
--------------------------------------------------------------------------------
1 | Section,DeclaredSections,,,,
2 | DeclareSection,Section,Schema,title,column_ref,indent
3 | ,,,,,
4 | ,,,,,
5 | Section,DeclaredTerms,,,,
6 | Header,Term,TermValueName,ChildPropertyType,Section,
7 | DeclareTerm,Table,Name,,Schema,
8 | DeclareTerm,Table.Universe,,,Root,
9 | DeclareTerm,Table.Segment,,,Root,
10 | DeclareTerm,Table.Topics,,,,
11 | DeclareTerm,Table.Subject,,,,
12 | DeclareTerm,Table.Column,Name,,,
13 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/issue1.json:
--------------------------------------------------------------------------------
1 | {
2 | "documentation": {
3 | "table": "Indicator Documentation for Voter Registration / Participation",
4 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",
5 | "@value": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf"
6 | }
7 | }
--------------------------------------------------------------------------------
/metatab/test/test-data/children3.csv:
--------------------------------------------------------------------------------
1 | ,,,
2 | ,,,
3 | ,,,
4 | "Section","Arguments","child1","child2"
5 | "Parent","parent","child1","child2"
6 | ,,,
7 | "Section","ExplicitChildren",,
8 | "Parent","parent",,
9 | "Parent.Child1","child1",,
10 | "Parent.Child2","child2",,
11 | ,,,
12 | "Section","ElidedChildren",,
13 | "Parent","parent",,
14 | ".Child1","child1",,
15 | ".Child2","child2",,
16 | "Child1.grand1","grand1",,
17 | "Child2.grand2","grand2",,
18 | "Grand1.Great1","great1",,
19 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/children2.json:
--------------------------------------------------------------------------------
1 | {
2 | "parent": [
3 | {
4 | "prop1": "prop11",
5 | "prop2": "prop12",
6 | "@value": "parent"
7 | },
8 | {
9 | "prop1": "prop21",
10 | "prop2": "prop22",
11 | "@value": "parent"
12 | },
13 | {
14 | "prop1": "prop31",
15 | "prop2": "prop32",
16 | "@value": "parent"
17 | }
18 | ]
19 | }
--------------------------------------------------------------------------------
/metatab/templates/metatab.csv:
--------------------------------------------------------------------------------
1 | Declare,metatab-latest,,,
2 | Title,,,,
3 | Description,,,,
4 | Identifier,,,,
5 | Name,,,,
6 | Dataset,,,,
7 | Origin,,,,
8 | Space,,,,
9 | Time,,,,
10 | Grain,,,,
11 | Variant,,,,
12 | Version,1,,,
13 | ,,,,
14 | Section,References,Name,Description,
15 | ,,,,
16 | Section,Resources,Name,Description,
17 | ,,,,
18 | Section ,Documentation,Title,Description,
19 | ,,,,
20 | Section,Contacts,Email,Organization,Url
21 | ,,,,
22 | Section,Schema,DataType,AltName,Description
23 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/children.json:
--------------------------------------------------------------------------------
1 | {
2 | "note": "This is a note",
3 | "parent": [
4 | {
5 | "prop1": "prop1",
6 | "prop2": "prop2",
7 | "@value": "parent"
8 | },
9 | {
10 | "prop1": "prop1",
11 | "prop2": "prop2",
12 | "@value": "parent"
13 | },
14 | {
15 | "prop1": "prop1",
16 | "prop2": "prop2",
17 | "@value": "parent"
18 | }
19 | ]
20 | }
--------------------------------------------------------------------------------
/metatab/test/test-data/url_classes.csv:
--------------------------------------------------------------------------------
1 | in_url,download_file,download_format,download_url,encoding,file_segment,is_archive,proto,target_file,target_format,url
2 | http://example.com/simple-example-altnames.csv,simple-example-altnames.csv,csv,http://example.com/simple-example-altnames.csv,,,False,http,simple-example-altnames.csv,csv,http://example.com/simple-example-altnames.csv
3 | http://example.com/test_data.zip,test_data.zip,zip,http://example.com/test_data.zip,,,True,http,test_data.zip,zip,http://example.com/test_data.zip
4 |
--------------------------------------------------------------------------------
/metatab/test/test-data/line/line-oriented-doc-bib.txt:
--------------------------------------------------------------------------------
1 | Section: Bibliography
2 | Citation: ipums
3 | Citation.Type: dataset
4 | Citation.Author: Steven Ruggles; Katie Genadek; Ronald Goeken; Josiah Grover; Matthew Sobek
5 | Citation.Title: Integrated Public Use Microdata Series
6 | Citation.Year: 2017
7 | Citation.Publisher: University of Minnesota
8 | Citation.Version: 7.0
9 | Citation.AccessDate: 20170718
10 | Citation.Url: https://usa.ipums.org/usa/index.shtml
11 | Citation.Doi: https://doi.org/10.18128/D010.V7.0
12 |
13 |
--------------------------------------------------------------------------------
/develop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | git clone https://github.com/CivicKnowledge/rowpipe.git && (cd rowpipe && python setup.py develop)
3 | git clone https://github.com/CivicKnowledge/tableintuit.git && (cd tableintuit && python setup.py develop)
4 | git clone https://github.com/CivicKnowledge/rowgenerators.git && (cd rowgenerators && python setup.py develop)
5 | git clone https://github.com/CivicKnowledge/pandas-reporter.git && (cd pandas-reporter && python setup.py develop)
6 | git clone https://github.com/CivicKnowledge/metatab-py.git; (cd metatab-py && python setup.py develop)
7 |
--------------------------------------------------------------------------------
/metatab/test/test-data/programsource.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest",,,
2 | "Title","Program Source Text",,,
3 | "Description","Test using a program to generate the data",,,
4 | "Identifier","6e5cc47a-b712-4868-afc1-76a5797d1e98",,,
5 | "Name","program_source-1",,,
6 | "Name.Origin",,,,
7 | "Name.Space",,,,
8 | "Name.Time",,,,
9 | "Name.Dataset","program-source",,,
10 | "Name.Version",1,,,
11 | "Name.Grain",,,,
12 | ,,,,
13 | "Section","Resources","Name","VarName","GeoType"
14 | "Datafile","program:scripts/dumpvar.py","Obesity","OBESEA","ZCTA"
15 | ,,,,
16 | ,,,,
17 | "Section","Schema","DataType","AltName","Description"
18 |
--------------------------------------------------------------------------------
/metatab/test/test-data/resolve_urls.csv:
--------------------------------------------------------------------------------
1 | "doc","base_url","resource_url","url"
2 | "example1.csv",,"c/d.csv","file:/c/d.csv"
3 | "example1.csv","http://example/a/b","c/d.csv","http://example/a/c/d.csv"
4 | "example1.csv",,"program:c/d.csv","program+file:/c/d.csv"
5 | "example1.csv","http://example/a/b","program:c/d.csv","program+http://example/a/c/d.csv"
6 | "example1.csv",,"/c/d.csv","file:/c/d.csv"
7 | "example1.csv","http://example/a/b","/c/d.csv","http://example/c/d.csv"
8 | "example1.csv",,"program:/c/d.csv","program+file:/c/d.csv"
9 | "example1.csv","http://example/a/b","program:/c/d.csv","program+http://example/c/d.csv"
10 |
--------------------------------------------------------------------------------
/metatab/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 | """
4 | Record objects for the Simple Data Package format.
5 | """
6 |
7 | # default metadata file
8 | DEFAULT_METATAB_FILE = 'metadata.csv'
9 | LINES_METATAB_FILE = 'metadata.txt'
10 | IPYNB_METATAB_FILE = 'metadata.ipynb'
11 |
12 | from .parser import *
13 | from .exc import *
14 | from .doc import MetatabDoc
15 | from .resolver import WebResolver
16 |
17 | from pkg_resources import get_distribution, DistributionNotFound
try:
    __version__ = get_distribution(__name__).version
except DistributionNotFound:
    # The package is not installed (no distribution metadata was found).
    # Still define the attribute so that reading ``__version__`` from this
    # module never raises AttributeError.
    __version__ = None
23 |
24 |
--------------------------------------------------------------------------------
/metatab/test/test-data/packages/example.com-test_package/metadata.csv:
--------------------------------------------------------------------------------
1 | Declare,metatab-latest
2 | Title,Test Package
3 | Description,Package for Testing
4 | Identifier,
5 | Identifier,36c7e945-943c-435e-923c-1af21d831b3b
6 | Name,example.com-test_package-1
7 | Dataset,test_package
8 | Origin,example.com
9 | Time,
10 | Space,
11 | Grain,
12 | Version,1
13 | Created,2017-08-03T21:15:56
14 | Modified,2017-08-03T21:15:56
15 | Modified,2017-08-03T21:16:42
16 | Giturl,https://github.com/CivicKnowledge/metatab-py.git
17 |
18 | Section,Resources,Name,Description,
19 | Datafile,http://example.com/data.csv,,,
20 |
21 | Section,Documentation,Title,Description,
22 | Note,,,,
23 |
24 | Section,Contacts,Email,Organization,Url
25 |
26 | Section,Schema,DataType,AltName,Description
27 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/datapackage_ex2.json:
--------------------------------------------------------------------------------
1 | {
2 | "declare": "datapackage-latest",
3 | "title": "Country, Regional and World GDP (Gross Domestic Product)",
4 | "description": "Country, regional and world GDP in current US Dollars ($). Regional means collections of countries e.g. Europe & Central Asia. Data is sourced from the World Bank and turned into a standard normalized CSV.",
5 | "name": "gdp",
6 | "version": "2011",
7 | "license": "PDDL-1.0",
8 | "keyword": [
9 | "GDP",
10 | "World",
11 | "Gross Domestic Product",
12 | "Time series"
13 | ],
14 | "image": "http://assets.okfn.org/p/opendatahandbook/img/data-wrench.png",
15 | "last-updated": "2011-09-21",
16 | "section": [
17 | {
18 | "section":
--------------------------------------------------------------------------------
/metatab/test/test-data/simple-text.txt:
--------------------------------------------------------------------------------
1 | Declare: metatab-latest
2 | Title: Registered Voters, By County
3 | Description: An Example Whatever.
4 | Origin: example.com
5 | Dataset: foobar.com
6 |
7 | Section: Contacts
8 | Wrangler: Eric Busboom
9 | Wrangler.Email: eric@civicknowledge.com
10 |
11 | Section: Resources
12 | Datafile: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv
13 | Datafile.Name: resource
14 | Datafile.Title: The First Example Data File
15 | Datafile.Startline: 5
16 | Datafile.HeaderLines: 3,4
17 |
18 | Section: References
19 | Reference: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv
20 | Reference.Name: reference
21 | Reference.Title: The First Example Data File
22 | Reference.Startline: 5
23 | Reference.HeaderLines: 3,4
24 |
--------------------------------------------------------------------------------
/metatab/test/test-data/example2.csv:
--------------------------------------------------------------------------------
1 | "Term","value",
2 | "Title","Registered Voters, By County",
3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
4 | "Identifier","cdph.ca.gov-hci-registered_voters-county",
5 | "Version",201404,
6 | ,,
7 | "Section","documentation","title"
8 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","Healthy Communities Data and Indicators Project (HCI)"
9 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation"
10 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",
11 |
--------------------------------------------------------------------------------
/metatab/test/test-data/simple1.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest",
2 | "Title","Registered Voters, By County",
3 | "Name","cdph.ca.gov-hci-registered_voters-county",
4 | ,,
5 | "Section","Resources","Name"
6 | "Datafile","http://example.com/example1.csv","namea"
7 | "Datafile","http://example.com/example2.csv","nameb"
8 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","namec"
9 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","named"
10 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections","namee"
11 | ,,
12 | ,,
13 | "Section","Schema","datatype"
14 | "Table","registered_voters",
15 | "Table.Column","reportyear","int"
16 | "Table.Column","type","str"
17 |
--------------------------------------------------------------------------------
/metatab/test/Makefile:
--------------------------------------------------------------------------------
1 |
# Docker wrapper: the `test` target runs `tox` inside the $(NS)/$(REPO):$(VERSION) image.
NS = civicknowledge.com
VERSION = latest

REPO = tox
NAME = tox
INSTANCE = default
DOCKER ?= docker

# `logs` and `default` have recipes/prerequisites below but were missing here.
.PHONY: test build push shell run start stop restart reload rm rmf release logs default
CWD = $(notdir $(shell pwd))

# $(PORTS) and $(ENV) are not set in this file; they expand to nothing unless
# supplied on the command line or through the environment.
VOLUMES=-v $(abspath $(CWD)/../../..):/code

test:
	$(DOCKER) run --rm --name $(NAME) $(PORTS) $(VOLUMES) $(ENV) $(NS)/$(REPO):$(VERSION) tox

build:
	$(DOCKER) build -t $(NS)/$(REPO):$(VERSION) .

push:
	$(DOCKER) push $(NS)/$(REPO):$(VERSION)

shell:
	$(DOCKER) run --rm -i -t $(PORTS) $(VOLUMES) $(ENV) $(NS)/$(REPO):$(VERSION) /bin/bash

logs:
	$(DOCKER) logs -f $(NAME)

rmf:
	$(DOCKER) rm -f $(NAME)

rm:
	$(DOCKER) rm $(NAME)

# Use $(MAKE) rather than a literal `make` so the sub-make is the same program
# that is running this file and inherits its flags.
release: build
	$(MAKE) push -e VERSION=$(VERSION)

default: test
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Developer convenience targets. `test` is the first target, so a bare `make`
# runs the tests.
#
# Each of test/develop/publish/clean used to be defined twice in this file;
# make warned about the overridden recipes and used the later definition.
# Only that effective (later) definition is kept here.
.PHONY: default install reset check test tox readme docs publish clean develop

MAKE := $(MAKE) --no-print-directory

test:
	python setup.py test

develop:
	python setup.py develop

publish:
	git push --tags origin
	$(MAKE) clean
	python setup.py sdist
	twine upload dist/*
	$(MAKE) clean

clean:
	@rm -Rf *.egg .cache .coverage .tox build dist docs/build htmlcov
# Bytecode cleanup was commented out in the recipe that was in effect; it is
# kept disabled here:
#@find . -type d -name __pycache__ -exec rm -Rf {} \;
#@find . -type f -name '*.pyc' -delete
--------------------------------------------------------------------------------
/metatab/test/test-data/json/example2.json:
--------------------------------------------------------------------------------
1 | {
2 | "term": "value",
3 | "title": "Registered Voters, By County",
4 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
5 | "identifier": "cdph.ca.gov-hci-registered_voters-county",
6 | "version": "201404",
7 | "homepage": {
8 | "title": "Healthy Communities Data and Indicators Project (HCI)",
9 | "@value": "https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx"
10 | },
11 | "documentation": {
12 | "title": "Indicator Documentation for Voter Registration / Participation",
13 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",
14 | "@value": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf"
15 | }
16 | }
--------------------------------------------------------------------------------
/metatab/exc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """
5 |
6 | """
7 |
8 |
class MetatabError(Exception):
    """Base class for every exception defined in this module."""


class ReferenceError(MetatabError):
    """Metatab reference error.

    NOTE: this name shadows the builtin ``ReferenceError`` in any module that
    imports it from here.
    """


class ParserError(MetatabError):
    """Parser error that can carry the term it relates to.

    An optional ``term`` keyword argument is stored on ``self.term``; the
    attribute is ``None`` when the keyword is not given. Positional arguments
    are passed through to ``Exception`` unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Exception.__init__ rejects keyword arguments, so ``term`` has to be
        # taken out of kwargs before delegating; forwarding it raised
        # TypeError and the attribute was never set.
        self.term = kwargs.pop('term', None)
        super(ParserError, self).__init__(*args, **kwargs)


class IncludeError(MetatabError):
    """Include error; ``message`` starts out as an empty string."""

    def __init__(self, *args, **kwargs):
        self.message = ''
        super(IncludeError, self).__init__(*args, **kwargs)


class DeclarationError(ParserError):
    """Declaration error; inherits the ``term`` handling of ParserError."""


class GenerateError(MetatabError):
    """Raised when rows cannot be generated from a reference."""


class ConversionError(MetatabError):
    """Raised when a document cannot be converted to another format."""


class FormatError(MetatabError):
    """Metatab format error."""
44 |
--------------------------------------------------------------------------------
/metatab/test/test-data/resources.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest",,,,
2 | "Title","Resource test",,,,
3 | "Name","resource-test",,,,
4 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,,
5 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,,
6 | ,,,,,
7 | "Section","Resources",,,,
8 | "Header","url","name",,,"Title"
9 | "Datafile","http://example.com/example1.csv","example1",,,"The First Example Data File"
10 | "Datafile","http://example.com/example3.csv","example2",,,"The Second Example Data File"
11 | "Reference","http://example.com/example3.csv","example3",,,
12 | "Reference","http://example.com/example4.csv","example4",,,
13 | "Documentation","http://example.com/example5.csv","example5",,,
14 | "Documentation","http://example.com/example6.csv","example6",,,
15 | "Homepage","http://example.com/example7.csv","example7",,,
16 | "Homepage","http://example.com/example8.csv","example8",,,
17 | "Citation","example9",,,,
18 | "Citation","example10",,,,
19 |
--------------------------------------------------------------------------------
/metatab/test/test-data/short-declare.csv:
--------------------------------------------------------------------------------
1 | "Section","DeclaredSections",,,,
2 | "DeclareSection","DeclaredSections","Arg0","Arg1","Arg2",
3 | "DeclareSection","Root",,,,
4 | "DeclareSection","DeclaredTerms","TermValueName","ChildPropertyType","Section",
5 | "DeclareSection","Resources","Table","Grain","Title",
6 | "DeclareSection","Contacts","Email",,,
7 | "DeclareSection","Schemas","DataType","ValueType","Description",
8 | ,,,,,
9 | "Section","DeclaredTerms","TermValueName","ChildPropertyType","Section","InheritsFrom"
10 | "DeclareTerm","DeclareTerm","Term",,"DeclaredTerms",
11 | "DeclareTerm","Declare",,,"Root",
12 | "DeclareTerm","Include",,,"Root",
13 | "DeclareTerm","Section","Name","sequence","Root",
14 | "DeclareTerm","DeclareSection","Section","sequence","DeclaredSections",
15 | ,,,,,
16 | "Section ","DeclaredTerms","TermValueName","InheritsFrom","Section",
17 | "# Top Level Dataset Terms",,,,,
18 | "DeclareTerm","Root.Title",,,"Root",
19 | "DeclareTerm","Title.Language",,,"Root",
20 | "DeclareTerm","Root.Summary",,"Root.Title","Root",
21 |
--------------------------------------------------------------------------------
/metatab/test/test-data/datapackage_ex1.csv:
--------------------------------------------------------------------------------
1 | "Declare","datapackage-latest.csv",,
2 | "title","Registered Voters, By County",,
3 | "description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,
4 | "name","cdph.ca.gov-hci-registered_voters-county",,
5 | "version","1.3.4",,
6 | ,,,
7 | ,,,
8 | "Section","Resources","type","description"
9 | "resource","http://example.com/resource1.csv",,
10 | ".title","First Resource",,
11 | ".name","the-first-resource",,
12 | ".mediatype","text/csv",,
13 | ".format","csv",,
14 | ,,,
15 | "schema",,,
16 | "field","id","string","description"
17 | "field","state","string","description"
18 | "field","income","string","description"
19 | ,,,
20 | "resource","http://example.com/resource2.csv",,
21 | ".title","Second Resource",,
22 | ".name","the-second-resource",,
23 | ".mediatype","text/csv",,
24 | ".format","csv",,
25 | ,,,
26 | "schema",,,
27 | "field","id","string","description"
28 | "field","country","string","description"
29 | "field","gdp","string","description"
30 |
--------------------------------------------------------------------------------
/metatab/test/test-data/scripts/Py3Notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from string import ascii_uppercase\n",
10 | "\n",
11 | "lst = [ascii_uppercase[:11] ] + [ list(range(10))+ [mult(i)] for i in range(10)]"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | ""
21 | ]
22 | }
23 | ],
24 | "metadata": {
25 | "kernelspec": {
26 | "display_name": "Python 3",
27 | "language": "python",
28 | "name": "python3"
29 | },
30 | "language_info": {
31 | "codemirror_mode": {
32 | "name": "ipython",
33 | "version": 3.0
34 | },
35 | "file_extension": ".py",
36 | "mimetype": "text/x-python",
37 | "name": "python",
38 | "nbconvert_exporter": "python",
39 | "pygments_lexer": "ipython3",
40 | "version": "3.5.0"
41 | }
42 | },
43 | "nbformat": 4,
44 | "nbformat_minor": 2
45 | }
--------------------------------------------------------------------------------
/metatab/test/test-data/datapackage_ex1_web.csv:
--------------------------------------------------------------------------------
1 | "Declare","http://assets.metatab.org/datapackage.csv",,
2 | "title","Registered Voters, By County",,
3 | "description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,
4 | "name","cdph.ca.gov-hci-registered_voters-county",,
5 | "version","1.3.4",,
6 | ,,,
7 | ,,,
8 | "Section","Resources","type","description"
9 | "resource","http://example.com/resource1.csv",,
10 | ".title","First Resource",,
11 | ".name","the-first-resource",,
12 | ".mediatype","text/csv",,
13 | ".format","csv",,
14 | ,,,
15 | "schema",,,
16 | "field","country","string","description"
17 | "field","country","string","description"
18 | "field","country","string","description"
19 | ,,,
20 | "resource","http://example.com/resource2.csv",,
21 | ".title","Second Resource",,
22 | ".name","the-second-resource",,
23 | ".mediatype","text/csv",,
24 | ".format","csv",,
25 | ,,,
26 | "schema",,,
27 | "field","country","string","description"
28 | "field","country","string","description"
29 | "field","country","string","description"
30 |
--------------------------------------------------------------------------------
/metatab/test/test-data/geo.csv:
--------------------------------------------------------------------------------
1 | Declare,metatab-latest
2 | Title,US States
3 | Description,US States
4 | Identifier,11585edd-20f4-4b15-a0da-9b5197b5ecc5
5 | Name,us_states-1
6 | Name.Time,
7 | Name.Version,1
8 | Name.Dataset,us-states
9 | Name.Origin,
10 | Name.Grain,
11 | Name.Space,
12 |
13 | Section,Resources,Name,Description,
14 | Datafile,shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.geojson.zip,us_states,,
15 |
16 | Section,Schema,DataType,AltName,Description
17 | Table,us_states,,,
18 | Table.Column,id,integer,,
19 | Table.Column,REGION,integer,region,
20 | Table.Column,DIVISION,integer,division,
21 | Table.Column,STATEFP,integer,statefp,
22 | Table.Column,STATENS,integer,statens,
23 | Table.Column,GEOID,integer,geoid,
24 | Table.Column,STUSPS,text,stusps,
25 | Table.Column,NAME,text,name,
26 | Table.Column,LSAD,integer,lsad,
27 | Table.Column,MTFCC,text,mtfcc,
28 | Table.Column,FUNCSTAT,text,funcstat,
29 | Table.Column,ALAND,integer,aland,
30 | Table.Column,AWATER,integer,awater,
31 | Table.Column,INTPTLAT,number,intptlat,
32 | Table.Column,INTPTLON,number,intptlon,
33 | Table.Column,geometry,text,,
34 |
--------------------------------------------------------------------------------
/metatab/test/test-data/line/line-oriented-doc-references-2.txt:
--------------------------------------------------------------------------------
1 |
2 | Section: References
3 |
4 | #
5 | # Tract crosswalk
6 | #
7 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tract-sra-msa-xwalk
8 | Reference.Name: tracts
9 | Reference.Description: Crosswalk between crosswalks, tracts, zip codes and SRAs in San Diego County
10 |
11 | #
12 | # Tract boundaries
13 | #
14 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tracts
15 | Reference.Name: tracts_geo
16 | Reference.Description: Geographics Boundaries for Tracts
17 |
18 | #
19 | # SRA boundaries
20 | #
21 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#sra
22 | Reference.Name: sra_geo
23 | Reference.Description: Geographics Boundaries for SRAs
24 |
25 | #
26 | # IPUMS Housing and Income Data
27 | #
28 | # Need to use the ZIP version b/c we need to import the Python Code
29 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval
30 | Reference.Name: incv
31 | Reference.Description: Income and Home value records from IPUMS for San Diego County
32 |
--------------------------------------------------------------------------------
/docker/Makefile:
--------------------------------------------------------------------------------
1 |
# Build, run and publish the metatab Docker image.

INSTANCE = default
DOCKER ?= docker

NS = civicknowledge
VERSION = latest

REPO = metatab
NAME = metatab

PORTS =

VOLUMES= -v /data

ENV =

# Used by the run/start/shell recipes but not set anywhere in this file; stays
# empty unless supplied from the environment or the command line.
LINKS ?=


.PHONY: build rebuild push shell run start stop restart reload rm rmf release test logs default

# First target in the file, hence the default goal.
build:
	$(DOCKER) build -t $(NS)/$(REPO):$(VERSION) .

rebuild:
	$(DOCKER) build --no-cache=true -t $(NS)/$(REPO):$(VERSION) .

push:
	$(DOCKER) push $(NS)/$(REPO):$(VERSION)

shell:
	$(DOCKER) run --rm -i -t $(PORTS) $(VOLUMES) $(LINKS) $(ENV) $(NS)/$(REPO):$(VERSION) /bin/bash

run:
	$(DOCKER) run --rm --name $(NAME) $(PORTS) $(VOLUMES) $(LINKS) $(ENV) $(NS)/$(REPO):$(VERSION)

logs:
	$(DOCKER) logs -f $(NAME)

start:
	$(DOCKER) run -d --name $(NAME) $(PORTS) $(VOLUMES) $(LINKS) $(ENV) $(NS)/$(REPO):$(VERSION)

stop:
	$(DOCKER) stop $(NAME)

restart: stop start

reload: build rmf start

rmf:
	$(DOCKER) rm -f $(NAME)

rm:
	$(DOCKER) rm $(NAME)

# Use $(MAKE) rather than a literal `make` so the recursive call runs the same
# make binary and inherits its flags and jobserver.
release: build
	$(MAKE) push -e VERSION=$(VERSION)

default: build
60 |
61 |
--------------------------------------------------------------------------------
/metatab/resolver.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """
5 | Generate rows from a variety of paths, references or other input
6 | """
7 |
8 | from .exc import IncludeError, GenerateError
9 |
class WebResolver(object):
    """Resolve declaration names and references into row sources."""

    def fetch_row_source(self, url):
        """Placeholder: does nothing and returns ``None``."""
        pass

    def find_decl_doc(self, name):
        """Look up the declaration document for ``name``.

        The lookup is disabled: this method always raises.

        :param name: declaration name, used as the exception argument.
        :raises IncludeError: always.
        """
        # An HTTP HEAD probe of METATAB_ASSETS_URL + name + '.csv' used to
        # follow this raise. It could never run, and METATAB_ASSETS_URL is not
        # defined in this module, so the dead code was removed.
        raise IncludeError(name)

    def get_row_generator(self, ref, cache=None):
        """Return a row generator for a reference.

        :param ref: reference handed to ``rowgenerators.get_generator``.
        :param cache: unused here; kept so existing callers keep working.
        :return: the generator produced for ``ref``.
        :raises GenerateError: if no generator is produced for ``ref``.
        """
        from rowgenerators import get_generator

        g = get_generator(ref)

        if not g:
            raise GenerateError("Cant figure out how to generate rows from {} ref: {}".format(type(ref), ref))

        return g
46 |
47 |
--------------------------------------------------------------------------------
/metatab/test/test-data/datapackage_ex2.csv:
--------------------------------------------------------------------------------
1 | "Declare","datapackage-latest",,,,
2 | "title","Country, Regional and World GDP (Gross Domestic Product)",,,,
3 | "description","Country, regional and world GDP in current US Dollars ($). Regional means collections of countries e.g. Europe & Central Asia. Data is sourced from the World Bank and turned into a standard normalized CSV.",,,,
4 | "name","gdp",,,,
5 | "version",2011,,,,
6 | "license","PDDL-1.0",,,,
7 | "keyword","GDP",,,,
8 | "keyword","World",,,,
9 | "keyword","Gross Domestic Product",,,,
10 | "keyword","Time series",,,,
11 | "image","http://assets.okfn.org/p/opendatahandbook/img/data-wrench.png",,,,
12 | "last-updated","2011-09-21",,,,
13 | ,,,,,
14 | "Section","Sources","web",,,
15 | "Source","World Bank and OECD","http://data.worldbank.org/indicator/NY.GDP.MKTP.CD",,,
16 | ,,,,,
17 | "Section ","Resources","type","format","foreignkey","description"
18 | "resource","gdp",,,,
19 | "resource.path","data/gdp.csv",,,,
20 | "schema",,,,,
21 | "field","Country Name","string",,,
22 | "field","Contry Code","string",,"iso-3-geo-codes/id",
23 | "field","Year","date","yyyy",,
24 | "field","Value","number",,,"GDP in current USD"
25 | ,,,,,
26 | "resource","another_gdp_resource",,,,
27 | "resource.path","data/other_gdp.csv",,,,
28 | "schema",,,,,
29 | "field","Country Name","string",,,
30 | "field","Contry Code","string",,"iso-3-geo-codes/id",
31 | "field","Year","date","yyyy",,
32 | "field","Value","number",,,"GDP in current USD"
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017, Civic Knowledge
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * Neither the name of Civic Knowledge nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | ##
3 | ## Python Ignores
4 | ##
5 |
6 | *.py[cod]
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Packages
12 | *.egg
13 | *.egg-info
14 | *.eggs
15 | *.cache
16 | dist
17 | build
18 | eggs
19 | parts
20 | var
21 | sdist
22 | develop-eggs
23 | .installed.cfg
24 | lib
25 | lib64
26 | __pycache__
27 |
28 | # Installer logs
29 | pip-log.txt
30 |
31 | # Unit test / coverage reports
32 | .coverage
33 | .tox
34 | nosetests.xml
35 | htmlcov/*
36 |
37 | # Translations
38 | *.mo
39 |
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | .idea
45 | test/testbundle/build-save
46 | test/bundles/testbundle/meta/schema-old.csv
47 | bundle.yaml.old
48 | schema-revised.csv
49 | build-save
50 | *.sqlite3
51 |
52 | test/coverage
53 | meta/coverage.yaml
54 |
55 | ##
56 | ## Javascript Ignores
57 | ##
58 |
59 | # Logs
60 | logs
61 | *.log
62 | npm-debug.log*
63 |
64 | # Runtime data
65 | pids
66 | *.pid
67 | *.seed
68 |
69 | # Directory for instrumented libs generated by jscoverage/JSCover
70 | lib-cov
71 |
72 | # Coverage directory used by tools like istanbul
73 | coverage
74 |
75 | # nyc test coverage
76 | .nyc_output
77 |
78 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
79 | .grunt
80 |
81 | # node-waf configuration
82 | .lock-wscript
83 |
84 | # Compiled binary addons (http://nodejs.org/api/addons.html)
85 | build/Release
86 |
87 | # Dependency directories
88 | node_modules
89 | jspm_packages
90 |
91 | # Optional npm cache directory
92 | .npm
93 |
94 | # Optional REPL history
95 | .node_repl_history
96 |
97 | _metapack
98 | .DS_Store
99 |
--------------------------------------------------------------------------------
/metatab/datapackage.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """
5 | Convert Metatab terms into datapackage.json file
6 | """
7 |
8 | from metatab.exc import ConversionError
9 |
# Metatab column datatypes and the datapackage field type each one maps to.
# Datatypes that are not listed are passed through unchanged.
type_map = {
    'str': 'string',
    'text': 'string',
    'unicode': 'string',
    'int': 'integer',
    'float': 'number',
}


def _column_to_field(c):
    """Build one datapackage field dict from a schema column dict."""
    # Copy only the descriptive properties that have a non-empty value.
    d = {prop: c[prop] for prop in ('name', 'title', 'description') if c.get(prop)}

    datatype = c.get('datatype')
    d['type'] = type_map.get(datatype, datatype)

    return d


def convert_to_datapackage(doc):
    """Convert a Metatab document into a datapackage-style dict.

    The result starts as the dict form of the ``root`` section, merged with
    the ``identity`` section when the document has one. A ``name`` is
    required; when it is missing, the ``identifier`` value is used instead.
    One resource entry is added for each ``root.datafile`` term in the
    ``resources`` section whose ``name`` or ``table`` matches a table in the
    ``schema`` section; datafiles without a matching table are skipped.

    :param doc: mapping-like document indexed by section name.
    :return: dict with the root properties and a ``resources`` list.
    :raises ConversionError: if there is neither a name nor an identifier,
        or if the ``schema`` section cannot be read.
    """
    dp = doc['root'].as_dict()

    try:
        dp.update(doc['identity'].as_dict())
    except KeyError:
        # The identity section is optional.
        pass

    if 'name' not in dp:
        # The key was previously misspelled 'indentifier', so this fallback
        # never matched and identifier-only documents always failed.
        if 'identifier' in dp:
            dp['name'] = dp['identifier']
        else:
            raise ConversionError("Datapackage.json requires a Name or Identity term")

    try:
        table_schemas = {t.value: t.as_dict()['column'] for t in doc['schema']}
    except KeyError as e:
        raise ConversionError("Failed to get schemas: " + str(e))

    file_resources = [fr.arg_props for fr in doc['resources'] if fr.term_is('root.datafile')]

    dp['resources'] = []

    for r in file_resources:

        try:
            # Prefer a table named after the resource, else its 'table' property.
            if r.get('name', '') in table_schemas:
                columns = table_schemas[r['name']]
            else:
                columns = table_schemas[r['table']]
        except KeyError:
            # No schema table for this datafile: leave it out of the package.
            continue

        dp['resources'].append(dict(
            path=r['url'],
            name=r['name'],
            schema={'fields': [_column_to_field(c) for c in columns]},
        ))

    return dp
70 |
--------------------------------------------------------------------------------
/metatab/test/test-data/schema.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-0.1",,,
2 | "Title","Registered Voters, By County",,,
3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,
4 | ,,,,
5 | "Section","Resources","table","Grain","Title"
6 | "Datafile","http://example.com/example1.csv","registered_voters","County","The First Example Data File"
7 | "Datafile","http://example.com/example2.csv","registered_voters","Tract","The Second Example Data File"
8 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","Healthy Communities Data and Indicators Project (HCI)",,
9 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation",,
10 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,
11 | ,,,,
12 | "Section ","Contacts","email",,
13 | "Creator","Office of Health Equity","HCIOHE@cdph.ca.gov",,
14 | "Wrangler","Eric Busboom","eric@civicknowledge.com",,
15 | ,,,,
16 | "Section","Schema","datatype","valuetype","description"
17 | "Table","Table1",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters"
18 | "Column","Column1","int","year range","Year or years that indicator was reported"
19 | "Column","Column2","str","dimension","Type of record"
20 | "Column","Column3","str","gvid","GVid version of the geotype and geotypeval"
21 | "Column","Column4","str","label for gvid","Census name of geographic area"
22 | "Table","Table1",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters"
23 | "Column","Column1","int","year range","Year or years that indicator was reported"
24 | "Column","Column2","str","dimension","Type of record"
25 | "Column","Column3","str","gvid","GVid version of the geotype and geotypeval"
26 | "Column","Column4","str","label for gvid","Census name of geographic area"
27 | "Column.Foo","Bingo",,,
28 | ,,"Bingo 1","BIngo 2",
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | import os
6 | import sys
7 | from setuptools import setup
8 |
# Legacy shortcut: ``python setup.py publish`` builds and uploads an sdist,
# then exits without running the normal setup below.
if sys.argv[-1] == 'publish':
    os.system('python setup.py sdist upload')
    sys.exit()

# The long description is the README that sits next to this script.
readme_path = os.path.join(os.path.dirname(__file__), 'README.rst')
with open(readme_path) as f:
    readme = f.read()

classifiers = [
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: BSD License',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Programming Language :: Python :: 3.6',
    'Topic :: Software Development :: Libraries :: Python Modules',
]

packages = [
    'metatab',
    'metatab.templates',
    'metatab.test',
    'metatab.test.test-data',
]

# File patterns shipped with every package.
package_data = {
    '': ['*.csv', '*.json', '*.txt', '*.ipynb', ''],
}

# Command-line script plus the plugin hooks registered with appurl and
# rowgenerators.
entry_points = {
    'console_scripts': [
        'metatab=metatab.cli:metatab',
    ],
    'appurl.urls': [
        "metatab+ = metatab.appurl:MetatabUrl",
    ],
    'rowgenerators': [
        "metatab+.txt = metatab.rowgenerators:TextRowGenerator",
        ".yaml = metatab.rowgenerators:YamlMetatabSource",
    ],
}

setup(
    name='metatab',
    version='0.8.2',
    description='Data format for storing structured data in spreadsheet tables',
    long_description=readme,
    packages=packages,
    package_data=package_data,
    install_requires=['metatabdecl', 'rowgenerators'],
    # test_suite='appurl.test.test_suite',
    test_suite='nose.collector',
    tests_require=['nose', 'tabulate'],
    entry_points=entry_points,
    author='Eric Busboom',
    author_email='eric@civicknowledge.com',
    url='https://github.com/Metatab/metatab-py.git',
    license='BSD',
    classifiers=classifiers,
    extras_require={'datapackage': ['datapackage']},
)
72 |
--------------------------------------------------------------------------------
/metatab/test/test-data/line/line-oriented-doc.txt:
--------------------------------------------------------------------------------
1 | Identifier: 47bc1089-7584-41f0-b804-602ec42f1249
2 | Origin: civicknowledge.com
3 | Dataset: rcfe_affordability
4 | Version: 4
5 | Time: 2015
6 | Name: civicknowledge.com-rcfe_affordability-2015-4
7 |
8 | Section: Contacts
9 | Wrangler: Eric Busboom
10 | Wrangler.Email: eric@civicknowledge.com
11 | Wrangler.Organization: Civic Knowledge
12 |
13 | Section: References
14 |
15 | Reference: censusreporter:B09020/140/05000US06073
16 | Reference.Name: B09020
17 | Reference.Description: Relationship by Household Type (Including Living Alone) for Population 65 Years and Over
18 |
19 |
20 | #
21 | # Tract crosswalk
22 | #
23 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tract-sra-msa-xwalk
24 | Reference.Name: tracts
25 | Reference.Description: Crosswalk between crosswalks, tracts, zip codes and SRAs in San Diego County
26 |
27 | #
28 | # Tract boundaries
29 | #
30 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tracts
31 | Reference.Name: tracts_geo
32 | Reference.Description: Geographics Boundaries for Tracts
33 |
34 | #
35 | # SRA boundaries
36 | #
37 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#sra
38 | Reference.Name: sra_geo
39 | Reference.Description: Geographics Boundaries for SRAs
40 |
41 | #
42 | # IPUMS Housing and Income Data
43 | #
44 | # Need to use the ZIP version b/c we need to import the Python Code
45 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval
46 | Reference.Name: incv
47 | Reference.Description: Income and Home value records from IPUMS for San Diego County
48 |
49 |
50 | ==== Bibliography
51 | Citation: ipums
52 | Citation.Type: dataset
53 | Citation.Author: Steven Ruggles; Katie Genadek; Ronald Goeken; Josiah Grover; Matthew Sobek
54 | Citation.Title: Integrated Public Use Microdata Series
55 | Citation.Year: 2017
56 | Citation.Publisher: University of Minnesota
57 | Citation.Version: 7.0
58 | Citation.AccessDate: 20170718
59 | Citation.Url: https://usa.ipums.org/usa/index.shtml
60 | Citation.Doi: https://doi.org/10.18128/D010.V7.0
61 |
62 |
--------------------------------------------------------------------------------
/metatab/test/outputs/datapackage.json:
--------------------------------------------------------------------------------
1 | {
2 | "resources": [
3 | {
4 | "name": "the-first-resource",
5 | "format": "csv",
6 | "url": "http://example.com/resource1.csv",
7 | "title": "First Resource",
8 | "mediatype": "text/csv",
9 | "schema": {
10 | "fields": [
11 | {
12 | "type": "string",
13 | "description": "description",
14 | "name": "country"
15 | },
16 | {
17 | "type": "string",
18 | "description": "description",
19 | "name": "country"
20 | },
21 | {
22 | "type": "string",
23 | "description": "description",
24 | "name": "country"
25 | }
26 | ]
27 | }
28 | },
29 | {
30 | "name": "the-second-resource",
31 | "format": "csv",
32 | "url": "http://example.com/resource2.csv",
33 | "title": "Second Resource",
34 | "mediatype": "text/csv",
35 | "schema": {
36 | "fields": [
37 | {
38 | "type": "string",
39 | "description": "description",
40 | "name": "country"
41 | },
42 | {
43 | "type": "string",
44 | "description": "description",
45 | "name": "country"
46 | },
47 | {
48 | "type": "string",
49 | "description": "description",
50 | "name": "country"
51 | }
52 | ]
53 | }
54 | }
55 | ],
56 | "version": "1.3.4",
57 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
58 | "name": "cdph.ca.gov-hci-registered_voters-county",
59 | "title": "Registered Voters, By County"
60 | }
61 |
--------------------------------------------------------------------------------
/docs/Census.rst:
--------------------------------------------------------------------------------
1 | Loading Census Data With Pandas Reporter
2 | ========================================
3 |
4 | The general process for creating a census package is similar to the package process described in the `Getting Started tutorial, `_ but with a ``DataFile`` term that uses a program to fetch data from Census Reporter. First we'll create the program, then link it into a Metatab package. The program uses the `pandas-reporter` module, so the creation process is very similar to the `Pandas-Reporter tutorial. `_
5 |
6 | Creating a Pandas-Reporter program
7 | ----------------------------------
8 |
9 | First, read the `Pandas-Reporter tutorial. `_ You'll need to install the `pandasreporter` Python module.
10 |
11 | Then, visit `Census Reporter `_ to locate information about tables, regions and summary levels.
12 |
13 | For this tutorial, we will use these tables:
14 |
15 | - B17001, Poverty Status by Sex by Age
16 | - B17024, Age by Ratio of Income to Poverty Level
17 | - B17017, Poverty Status by Household Type by Age of Householder
18 |
19 | For the geography, we will use tracts in San Diego County.
20 |
21 | To find the geoid code for San Diego County, visit the main page at `Census Reporter `_ and search for San Diego County. You should get a `profile page for the county `_. In the URL for the page, you should see the code `05000US06073`. This code is the geoid for San Diego County.
22 |
23 | Next, visit the page for `Cartographic Boundary File Summary Level Codes `_ to get the summary level code for tracts. It is actually listed by all of its components, in this case, "State-County-Census Tract." It is code "140". ( BTW, that is a string, not a number. )
24 |
25 | The start of our program is similar to the program in the `Pandas-Reporter tutorial. `_, except using the table, summary level and region codes for this example:
26 |
27 | .. code-block:: bash
28 |
29 | $ mkdir example-data-package
30 | $ cd example-data-package
31 | $ metapack -c
--------------------------------------------------------------------------------
/metatab/test/test-data/yaml/yaml-example-1.csv:
--------------------------------------------------------------------------------
1 | Declare,metatab-latest,,,
2 | Title,San Diego County Weather,,,
3 | Description,Daily summaries from a selection of San Diego county weather stations,,,
4 | Identifier,2dc83efa-e6da-4561-bdf9-63263360ccf0,,,
5 | Name,noaa.gov-daily_summary-1998e-san-1,,,
6 | Dataset,daily_summary,,,
7 | Origin,noaa.gov,,,
8 | Time,1998e,,,
9 | Space,san,,,
10 | Grain,,,,
11 | Variant,,,,
12 | Version,1,,,
13 | Created,2018-08-17T15:44:24,,,
14 | Modified,2018-08-17T16:18:19,,,
15 | Giturl,https://github.com/san-diego-water-quality/water-datasets.git,,,
16 | ,,,,
17 | Section,Contacts,Email,Organization,Url
18 | Wrangler,Eric Busboom,eric@civicknowledge.com,Civic Knowledge,http://civicknowledge.com
19 | ,,,,
20 | ,,,,
21 | Section,Documentation,Title,Description,
22 | Documentation,file:README.md,README,,
23 | Documentation,https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf,Documentation,Main documentation,
24 | ,,,,
25 | Section,Resources,Name,Description,
26 | Datafile,http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv,daily_summary_san,Daily weather summaries,
27 | Datafile,http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv,daily_summary_la,Daily weather summaries,
28 | ,,,,
29 | ,,,,
30 | ,,,,
31 | Section,Schema,DataType,AltName,Description
32 | Table,daily_summary_san,,,
33 | Table.Column,STATION,string,station,
34 | Table.Column,NAME,string,name,Station code
35 | Table.Column,LATITUDE,number,latitude,Station name
36 | Table.Column,LONGITUDE,number,longitude,Station lattitude
37 | Table.Column,ELEVATION,number,elevation,Station longitude
38 | Table.Column,DATE,date,date,Station elevation
39 | Table.Column,AWND,number,awnd,Measurement date
40 | Table.Column,DAPR,string,dapr,Average daily wind speed (meters per second or miles per hour as per user preference
41 | Table.Column,FMTM,integer,fmtm,Number of days included in the multiday precipitation total (MDPR)
42 | Table.Column,MDPR,string,mdpr,"Time of fastest mile or fastest 1-minute wind (hours and minutes, i.e., HHMM)"
43 | Table.Column,PGTM,string,pgtm,"Multiday precipitation total (mm or inches as per user preference; use with DAPR and DWPR, if available)"
44 | Table.Column,PRCP,number,prcp,"Peak gust time (hours and minutes, i.e., HHMM)"
45 | Table.Column,SNOW,integer,snow,"Precipitation (mm or inches as per user preference, inches to hundredths on Daily Form pdf file)"
46 | Table.Column,SNWD,integer,snwd,"Snowfall (mm or inches as per user preference, inches to tenths on Daily Form pdf file)"
47 | Table.Column,TAVG,string,tavg,"Snow depth (mm or inches as per user preference, inches on Daily Form pdf file)"
48 | Table.Column,TMAX,integer,tmax,Average temerature
49 | Table.Column,TMIN,integer,tmin,"Maximum temperature (Fahrenheit or Celsius as per user preference, Fahrenheit to tenths on Daily Form pdf file"
50 |
--------------------------------------------------------------------------------
/docs/GeneratingRowsWithPrograms.rst:
--------------------------------------------------------------------------------
1 |
2 | Row Generating Programs
3 | =======================
4 |
5 | Metatab Datafile terms can reference programs and IPython notebooks to generate rows.
6 |
7 | To reference a program, the ``Root.Datafile`` must be a URL with a ``program`` scheme and a relative path. Usually, the file is placed in a subdirectory named 'scripts' at the same level as the ``metadata.csv`` file. It must be an executable program, and may be any executable program.
8 |
9 | When a data package is created, regardless of the type, a filesystem package is created first, then other types of packages are created from the filesystem package. This means that the row-generating program is only run once per resource when multiple packages are created, and also that the program can open the Metatab package being used to run the program to access previously created resource files.
10 |
11 | Program Inputs
12 | **************
13 |
14 | The program can receive information from Metatab through program options and environmental variables, and must print CSV formatted lines to std out.
15 |
16 | There are two broad sources for inputs to the program. The first is a set of values that are passed into the program regardless of the configuration of the ``Root.DataFile`` term. The second is the properties of the ``Root.DataFile`` terms.
17 |
18 | The inputs for all programs are:
19 |
20 | - METATAB_DOC: An env var that holds the URL for the Metatab document being processed
21 | - METATAB_PACKAGE: An env var that holds the metatab document's package URL. ( Which is usually the same as the document URL )
22 | - METATAB_WORKING_DIR: An env var that holds the path to the directory holding the metatab file.
23 | - PROPERTIES: An env var that holds a JSON encoded dict with the three previous env values, along with the ``properties`` dict for the ``Root.DataFile`` term.
24 |
25 | Additionally, the program receives the ``Root.DataFile`` properties in these forms:
26 |
27 | - Properties that have names that are all uppercased are assigned to env variables.
28 | - Properties that have names that begin with '-' are assigned to program options.
29 |
30 |
31 | Common Patterns
32 | ***************
33 |
34 | It is very common for a program to open the Metatab document that is being used to run the program. In Python:
35 |
36 | .. code-block:: python
37 |
38 | import metatab as mt
39 | doc = mt.MetatabDoc(environ['METATAB_DOC'])
40 |
41 | Since the program must output CSV formatted lines, a CSV writer can be constructed on ``sys.stdout``:
42 |
43 | .. code-block:: python
44 |
45 | import sys
46 | import csv
47 |
48 | w = csv.writer(sys.stdout)
49 |
50 | w.writerow(...)
51 |
52 |
53 | If the program generates logging or warnings, they must be printed to ``sys.stderr``:
54 |
55 | .. code-block:: python
56 |
57 | import sys
58 |
59 | print("ERROR!", file=sys.stderr)
60 |
61 |
--------------------------------------------------------------------------------
/metatab/test/test_doc.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import unittest
4 | from os.path import join, dirname
5 |
6 | from metatab import MetatabDoc
7 | from metatab.rowgenerators import TextRowGenerator
8 | from metatab.test.core import test_data
9 |
10 |
11 | class TestDoc(unittest.TestCase):
12 |
13 | def test_open(self):
14 |
15 | doc = MetatabDoc(test_data('almost-everything.csv'))
16 |
17 | self.assertEquals('9FC11204-B291-4E0E-A841-5372090ADEC0', doc.find_first_value('Root.Identifier'))
18 |
19 | self.assertEquals('9FC11204-B291-4E0E-A841-5372090ADEC0', doc['Root'].find_first_value('Root.Identifier'))
20 |
21 |
22 | def test_new(self):
23 |
24 | import metatab.templates as tmpl
25 |
26 | template_path = join(dirname(tmpl.__file__), 'metatab.csv')
27 |
28 | doc = MetatabDoc(template_path)
29 | doc.cleanse()
30 |
31 | print(doc.as_csv()[:200])
32 |
33 | def test_version(self):
34 |
35 | from textwrap import dedent
36 |
37 |
38 | doc = MetatabDoc(TextRowGenerator(
39 | dedent(
40 | """
41 | Root.Version:
42 | """)))
43 |
44 | # None because there are no Minor, Major, Patch value
45 | self.assertIsNone(doc.update_version())
46 |
47 | self.assertFalse(doc._has_semver())
48 |
49 | doc = MetatabDoc(TextRowGenerator(
50 | dedent(
51 | """
52 | Root.Version: 10
53 | """)))
54 |
55 | # The existing version, "10", is expected back because there are no Major, Minor or Patch values
56 | self.assertEqual("10", doc.update_version())
57 | self.assertFalse(doc._has_semver())
58 |
59 | doc = MetatabDoc(TextRowGenerator(
60 | dedent(
61 | """
62 | Root.Version: 10
63 | Version.Patch: 5
64 | """)))
65 |
66 | # Only Patch is set, so the expected semantic version is "0.0.5"
67 | self.assertEqual("0.0.5", doc.update_version())
68 | self.assertTrue(doc._has_semver())
69 |
70 | doc = MetatabDoc(TextRowGenerator(
71 | dedent(
72 | """
73 | Root.Version: 10
74 | Version.Major: 2
75 | Version.Patch: 5
76 | """)))
77 |
78 | # Major and Patch are set, so the expected semantic version is "2.0.5"
79 | self.assertEqual("2.0.5", doc.update_version())
80 |
81 | doc = MetatabDoc(TextRowGenerator(
82 | dedent(
83 | """
84 | Root.Name:
85 | Root.Origin: example.com
86 | Root.Dataset: foobar
87 | Root.Version:
88 | Version.Minor: 24
89 | Version.Major: 2
90 | Version.Patch: 5
91 | """)))
92 |
93 | # Major, Minor and Patch are all set, so the expected semantic version is "2.24.5"
94 | self.assertEqual("2.24.5", doc.update_version())
95 |
96 | doc.update_name()
97 | self.assertEqual('example.com-foobar-2.24', doc.get_value('Root.Name'))
98 |
99 | if __name__ == '__main__':
100 | unittest.main()
101 |
--------------------------------------------------------------------------------
/metatab/test/test-data/packages/example.com-test_package/notebooks/Test_Notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "The metatab extension is already loaded. To reload it, use:\n",
13 | " %reload_ext metatab\n",
14 | "The autoreload extension is already loaded. To reload it, use:\n",
15 | " %reload_ext autoreload\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "%matplotlib inline\n",
21 | "%load_ext metatab\n",
22 | "\n",
23 | "%load_ext autoreload\n",
24 | "%autoreload 2\n",
25 | "\n",
26 | "import pandas as pd\n",
27 | "import numpy as np \n",
28 | "import metatab as mt"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 5,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "%mt_open_package\n",
38 | "assert mt_pkg.path.endswith('metatab-py/test-data/packages/example.com-test_package/metadata.csv')\n",
39 | "orig_path = mt_pkg.path.endswith"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 6,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "from metatab.pands import MetatabDataFrame\n",
49 | "\n",
50 | "odf = MetatabDataFrame({ 'cola':range(10), 'colb': range(10)})\n",
51 | "\n",
52 | "odf.name = 'income_homeval'\n",
53 | "odf.title = 'Income and Home Value Records for San Diego County'\n",
54 | "odf.cola.description = 'Household income'\n",
55 | "odf.colb.description = 'Home value'\n",
56 | "\n",
57 | "%mt_add_dataframe odf --materialize\n",
58 | "\n",
59 | "cols = list(mt_pkg.resource('income_homeval').columns())\n",
60 | "assert 'cola' in [ c['name'] for c in cols]\n",
61 | "assert 'colb' in [ c['name'] for c in cols]"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": []
72 | }
73 | ],
74 | "metadata": {
75 | "kernelspec": {
76 | "display_name": "Python 3",
77 | "language": "python",
78 | "name": "python3"
79 | },
80 | "language_info": {
81 | "codemirror_mode": {
82 | "name": "ipython",
83 | "version": 3
84 | },
85 | "file_extension": ".py",
86 | "mimetype": "text/x-python",
87 | "name": "python",
88 | "nbconvert_exporter": "python",
89 | "pygments_lexer": "ipython3",
90 | "version": "3.6.1"
91 | },
92 | "varInspector": {
93 | "cols": {
94 | "lenName": 16,
95 | "lenType": 16,
96 | "lenVar": 40
97 | },
98 | "kernels_config": {
99 | "python": {
100 | "delete_cmd_postfix": "",
101 | "delete_cmd_prefix": "del ",
102 | "library": "var_list.py",
103 | "varRefreshCmd": "print(var_dic_list())"
104 | },
105 | "r": {
106 | "delete_cmd_postfix": ") ",
107 | "delete_cmd_prefix": "rm(",
108 | "library": "var_list.r",
109 | "varRefreshCmd": "cat(var_dic_list()) "
110 | }
111 | },
112 | "types_to_exclude": [
113 | "module",
114 | "function",
115 | "builtin_function_or_method",
116 | "instance",
117 | "_Feature"
118 | ],
119 | "window_display": false
120 | }
121 | },
122 | "nbformat": 4,
123 | "nbformat_minor": 2
124 | }
125 |
--------------------------------------------------------------------------------
/metatab/test/test-data/notebooks/ImportTest.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%load_ext metatab\n",
12 | "%mt_lib_dir lib\n",
13 | "\n",
14 | "import file\n"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "assert file.__file__.endswith('test-data/notebooks/lib/file.py')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "%mt_lib_dir http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "import lib.incomedist"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "assert lib.incomedist.__file__.endswith('ipums.org-income_homevalue-5.zip/ipums.org-income_homevalue-5/lib/incomedist.py')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 6,
64 | "metadata": {
65 | "collapsed": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "%%metatab\n",
70 | "Identifier: 47bc1089-7584-41f0-b804-602ec42f1249\n",
71 | "Name: FooBarBaz\n",
72 | "\n",
73 | "Section: References \n",
74 | "Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval\n",
75 | "Reference.Name: incv\n",
76 | "Reference.Description: Income and Home value records from IPUMS for San Diego County\n",
77 | "\n"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 7,
83 | "metadata": {
84 | "collapsed": true
85 | },
86 | "outputs": [],
87 | "source": [
88 | "%mt_lib_dir incv"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "collapsed": true
96 | },
97 | "outputs": [],
98 | "source": []
99 | }
100 | ],
101 | "metadata": {
102 | "kernelspec": {
103 | "display_name": "Python 3",
104 | "language": "python",
105 | "name": "python3"
106 | },
107 | "language_info": {
108 | "codemirror_mode": {
109 | "name": "ipython",
110 | "version": 3
111 | },
112 | "file_extension": ".py",
113 | "mimetype": "text/x-python",
114 | "name": "python",
115 | "nbconvert_exporter": "python",
116 | "pygments_lexer": "ipython3",
117 | "version": "3.6.1"
118 | },
119 | "varInspector": {
120 | "cols": {
121 | "lenName": 16,
122 | "lenType": 16,
123 | "lenVar": 40
124 | },
125 | "kernels_config": {
126 | "python": {
127 | "delete_cmd_postfix": "",
128 | "delete_cmd_prefix": "del ",
129 | "library": "var_list.py",
130 | "varRefreshCmd": "print(var_dic_list())"
131 | },
132 | "r": {
133 | "delete_cmd_postfix": ") ",
134 | "delete_cmd_prefix": "rm(",
135 | "library": "var_list.r",
136 | "varRefreshCmd": "cat(var_dic_list()) "
137 | }
138 | },
139 | "types_to_exclude": [
140 | "module",
141 | "function",
142 | "builtin_function_or_method",
143 | "instance",
144 | "_Feature"
145 | ],
146 | "window_display": false
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 2
151 | }
152 |
--------------------------------------------------------------------------------
/metatab/test/test-data/notebooks/CellExecuteError.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%load_ext metatab"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 4,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "%%metatab\n",
23 | "Origin: example.com\n",
24 | "Dataset: foobar.com \n",
25 | "Identifier: de097279-28ef-42f5-a4f5-0eaac53b7dc4\n",
26 | "Name: example.com-foobar.com \n",
27 | "\n",
28 | "Section: Contacts\n",
29 | "Wrangler: Eric Busboom\n",
30 | "Wrangler.Email: eric@civicknowledge.com\n",
31 | "\n",
32 | "Section: References\n",
33 | "Reference: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv\n",
34 | "Reference.Name: reference\n",
35 | "Reference.Title: The First Example Data File\n",
36 | "Reference.Startline: 5\n",
37 | "Reference.HeaderLines: 3,4\n",
38 | " \n",
39 | "Section: Resources\n",
40 | "Datafile: http://public.source.civicknowledge.com/example.com/sources/renter_cost.csv\n",
41 | "Datafile.Name: ext_resource\n",
42 | "Datafile.Title: An Extern CSV Resource\n",
43 | "Datafile.Startline: 5\n",
44 | "Datafile.HeaderLines: 3,4"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 5,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "ename": "ZeroDivisionError",
54 | "evalue": "division by zero",
55 | "output_type": "error",
56 | "traceback": [
57 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
58 | "\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)",
59 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;36m1\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
60 | "\u001b[0;31mZeroDivisionError\u001b[0m: division by zero"
61 | ]
62 | }
63 | ],
64 | "source": [
65 | "1/0"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": []
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "Python 3",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "3.6.1"
95 | },
96 | "varInspector": {
97 | "cols": {
98 | "lenName": 16,
99 | "lenType": 16,
100 | "lenVar": 40
101 | },
102 | "kernels_config": {
103 | "python": {
104 | "delete_cmd_postfix": "",
105 | "delete_cmd_prefix": "del ",
106 | "library": "var_list.py",
107 | "varRefreshCmd": "print(var_dic_list())"
108 | },
109 | "r": {
110 | "delete_cmd_postfix": ") ",
111 | "delete_cmd_prefix": "rm(",
112 | "library": "var_list.r",
113 | "varRefreshCmd": "cat(var_dic_list()) "
114 | }
115 | },
116 | "types_to_exclude": [
117 | "module",
118 | "function",
119 | "builtin_function_or_method",
120 | "instance",
121 | "_Feature"
122 | ],
123 | "window_display": false
124 | }
125 | },
126 | "nbformat": 4,
127 | "nbformat_minor": 2
128 | }
129 |
--------------------------------------------------------------------------------
/metatab/test/test-data/errors/errors2.csv:
--------------------------------------------------------------------------------
1 | "Declare","http://doesntexist.csv",,,
2 | "Title","Registered Voters, By County",,,
3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,
4 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,
5 | ,201404,,,
6 | ,"cdph.ca.gov-hci-registered_voters-county-201304",,,
7 | "Format","excel",,,
8 | "Spatial","California <04000US06>",,,
9 | "Time","2002-2014",,,
10 | "SpatialGrain","County <05000US>",,,
11 | ,,,,
12 | "Section","Resources","table","Grain","Title"
13 | "Datafile","http://example.com/example1.csv","registered_voters","County","The First Example Data File"
14 | "Datafile","http://example.com/example2.csv","registered_voters","Tract","The Second Example Data File"
15 | ,,"Healthy Communities Data and Indicators Project (HCI)",,
16 | ,,"Indicator Documentation for Voter Registration / Participation",,
17 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,
18 | ,,,,
19 | "Section ","Contacts","email",,
20 | "Creator","Office of Health Equity","HCIOHE@cdph.ca.gov",,
21 | "Wrangler","Eric Busboom","eric@civicknowledge.com",,
22 | ,,,,
23 | "Section ","Notes",,,
24 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. ",,,
25 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,,
26 | ".title","Data Bundles Packaging Specification",,,
27 | ,,,,
28 | "Section","Schema",,"valuetype","description"
29 | "Table","registered_voters",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters"
30 | "Column","reportyear","int","year range","Year or years that indicator was reported"
31 | "Column","type","str","dimension","Type of record"
32 | "Column","gvid","str","gvid","GVid version of the geotype and geotypeval"
33 | "Column","geoname","str","label for gvid","Census name of geographic area"
34 | "Column","geotype","str","label","Code for type of geographic area"
35 | "Column","geotypevalue","str","census","Census geoid code"
36 | "Column","county_fips","str","FIPS county code","County FIPS code"
37 | "Column","county_name","str","label for counrty_fips","County name"
38 | "Column","region_code","str","census code","Numeric code of region"
39 | "Column","region_name","str","label for region_code","Name of region"
40 | "Column","raceth","str","raceth/civick","Civic Knowledge race / ethnicity code."
41 | "Column","raceth_name","str","label for raceeth","Race / Ethnicity Name"
42 | "Column","race_eth_code","str","raceth/hci","Race / ethnicity code"
43 | "Column","race_eth_name","str","label for race_eth_code","Race / ethnicity name"
44 | "Column","numerator","int","count","Adults who are registered to vote, or who voted, depending on type of record"
45 | "Column","denominator","int","count","Population of Adults, 18 years or older"
46 | "Column","percent","float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record"
47 | "Column","ll_95ci","float","ci95l for percent","Lower bound of 95% confidence interval"
48 | "Column","ul_95ci","float","ci95u for percent","Upper bound of 95% confidence interval"
49 | "Column","se","float","se for percent","Standard error"
50 | "Column","rse","float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent"
51 | "Column","ca_decile","float","decile","Statewide decile ranking"
52 | "Column","ca_rr","float","ratio","Ratio of indicator to state average"
53 | "Column","vap","float","measure","Voter age population, from CA Department of Finance."
54 | "Column","ind_id","str","dimension",
55 | "Column","ind_definition","str","dimension",
56 | "Column","version","str","other",
57 |
--------------------------------------------------------------------------------
/metatab/test/test-data/example1-web.csv:
--------------------------------------------------------------------------------
1 | "Declare","http://assets.metatab.org/metatab-0.1.csv",,,
2 | "Title","Registered Voters, By County",,,
3 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,
4 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,
5 | "Version",201404,,,
6 | "Obsoletes","cdph.ca.gov-hci-registered_voters-county-201304",,,
7 | "Format","excel",,,
8 | "Spatial","California <04000US06>",,,
9 | "Time","2002-2014",,,
10 | "SpatialGrain","County <05000US>",,,
11 | ,,,,
12 | "Section","Resources","table","Grain","Title"
13 | "Datafile","http://example.com/example1.csv","registered_voters","County","The First Example Data File"
14 | "Datafile","http://example.com/example2.csv","registered_voters","Tract","The Second Example Data File"
15 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx","Healthy Communities Data and Indicators Project (HCI)",,
16 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf","Indicator Documentation for Voter Registration / Participation",,
17 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,
18 | ,,,,
19 | "Section ","Contacts","email",,
20 | "Creator","Office of Health Equity","HCIOHE@cdph.ca.gov",,
21 | "Wrangler","Eric Busboom","eric@civicknowledge.com",,
22 | ,,,,
23 | "Section ","Notes",,,
24 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. ",,,
25 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,,
26 | ".title","Data Bundles Packaging Specification",,,
27 | ,,,,
28 | "Section","Schema","datatype","valuetype","description"
29 | "Table","registered_voters",,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters"
30 | "Column","reportyear","int","year range","Year or years that indicator was reported"
31 | "Column","type","str","dimension","Type of record"
32 | "Column","gvid","str","gvid","GVid version of the geotype and geotypeval"
33 | "Column","geoname","str","label for gvid","Census name of geographic area"
34 | "Column","geotype","str","label","Code for type of geographic area"
35 | "Column","geotypevalue","str","census","Census geoid code"
36 | "Column","county_fips","str","FIPS county code","County FIPS code"
37 | "Column","county_name","str","label for counrty_fips","County name"
38 | "Column","region_code","str","census code","Numeric code of region"
39 | "Column","region_name","str","label for region_code","Name of region"
40 | "Column","raceth","str","raceth/civick","Civic Knowledge race / ethnicity code."
41 | "Column","raceth_name","str","label for raceeth","Race / Ethnicity Name"
42 | "Column","race_eth_code","str","raceth/hci","Race / ethnicity code"
43 | "Column","race_eth_name","str","label for race_eth_code","Race / ethnicity name"
44 | "Column","numerator","int","count","Adults who are registered to vote, or who voted, depending on type of record"
45 | "Column","denominator","int","count","Population of Adults, 18 years or older"
46 | "Column","percent","float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record"
47 | "Column","ll_95ci","float","ci95l for percent","Lower bound of 95% confidence interval"
48 | "Column","ul_95ci","float","ci95u for percent","Upper bound of 95% confidence interval"
49 | "Column","se","float","se for percent","Standard error"
50 | "Column","rse","float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent"
51 | "Column","ca_decile","float","decile","Statewide decile ranking"
52 | "Column","ca_rr","float","ratio","Ratio of indicator to state average"
53 | "Column","vap","float","measure","Voter age population, from CA Department of Finance."
54 | "Column","ind_id","str","dimension",
55 | "Column","ind_definition","str","dimension",
56 | "Column","version","str","other",
57 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Metatab
2 | =======
3 |
4 | .. image:: https://travis-ci.org/Metatab/metatab.svg?branch=master
5 | :target: https://travis-ci.org/Metatab/metatab
6 |
7 | Parse and manipulate structured data and metadata in a tabular format.
8 |
9 | `Metatab `_ is a data format that allows structured
10 | metadata -- the sort you'd normally store in JSON, YAML or XML -- to be stored
11 | and edited in tabular forms like CSV or Excel. Metatab files look exactly like
12 | you'd expect, so they are very easy for non-technical users to read and edit,
13 | using tools they already have. Metatab is an excellent format for creating,
14 | storing and transmitting metadata. For more information about metatab, visit
15 | http://metatab.org.
16 |
17 | This repository has a Python module and executable. For a Javascript version,
18 | see the `metatab-js `_ repository.
19 |
20 | What is Metatab For?
21 | --------------------
22 |
23 | Metatab is a tabular format that allows storing metadata for demographics,
24 | health and research datasets in a tabular format. The tabular format is much
25 | easier for data creators to write and for data consumers to read, and it allows
26 | a complete data package to be stored in a single Excel file.
27 |
28 |
29 | Install
30 | -------
31 |
32 |
33 |
34 | Install the package from PyPI with:
35 |
36 | .. code-block:: bash
37 |
38 | $ pip install metatab
39 |
40 | Or, install the master branch from github with:
41 |
42 | .. code-block:: bash
43 |
44 | $ pip install git+https://github.com/CivicKnowledge/metatab.git
45 |
46 | Then test parsing using a remote file with:
47 |
48 | .. code-block:: bash
49 |
50 | $ metatab -j https://raw.githubusercontent.com/CivicKnowledge/metatab/master/test-data/example1.csv
51 |
52 | Run ``metatab -h`` to get other program options.
53 |
54 | The ``test-data`` directory has test files that also serve as examples to
55 | parse. You can either clone the repo and parse them from the files, or from the
56 | Github page for the file, click on the ``raw`` button to get raw view of the
57 | file, then copy the URL.
58 |
59 |
60 | Running tests
61 | +++++++++++++
62 |
63 | Run ``python setup.py test`` to run normal development tests. You can also run
64 | ``tox``, which will try to run the tests with python 3.4, 3.5 and 3.6, ignoring
65 | non-existent interpreters.
66 |
67 |
68 | Development Testing with Docker
69 | +++++++++++++++++++++++++++++++
70 |
71 | Testing during development for other versions of Python is a bit of a pain,
72 | since you have to install the alternate version, and Tox will run all of the
73 | tests, not just the one you want.
74 |
75 | One way to deal with this is to install Docker locally, then run the docker
76 | test container on the source directory. This is done automatically from the
77 | Makefile in metatab/test, just run:
78 |
79 | .. code-block:: bash
80 |
81 | $ cd metatab/test
82 | $ make build # to create the container image
83 | $ make test
84 | # or just ..
85 | $ make
86 |
87 | You can also run the container shell, and run tests from the command line.
88 |
89 | .. code-block:: bash
90 |
91 | $ cd metatab/test
92 | $ make build # to create the container image
93 | $ make shell # to run bash in the container
94 |
95 | You now have a docker container where the /code directory is the metatab source dir.
96 |
97 | Now, run tox to build the tox virtual environments, then enter the specific version you want to
98 | run tests for and activate the virtual environment.
99 |
100 | .. code-block:: bash
101 |
102 | # tox
103 | # cd .tox/py34
104 | # source bin/activate # Activate the python 3.4 virtual env
105 | # cd ../../
106 | # python setup.py test # Cause test deps to get installed
107 | #
108 | # python -munittest metatab.test.test_parser.TestParser.test_parse_everython # Run one test
109 |
110 | Note that your development environment is mounted into the Docker container, so you can edit local
111 | files and test the changes in Docker.
112 |
113 |
114 |
115 |
116 |
117 |
118 |
--------------------------------------------------------------------------------
/metatab/test/test-data/yaml/yaml-example-1.yaml:
--------------------------------------------------------------------------------
1 | declare: metatab-latest
2 | title: San Diego County Weather
3 | description: Daily summaries from a selection of San Diego county weather stations
4 | identifier: 2dc83efa-e6da-4561-bdf9-63263360ccf0
5 | name: noaa.gov-daily_summary-1998e-san-1
6 | dataset: daily_summary
7 | origin: noaa.gov
8 | time: 1998e
9 | space: san
10 | grain: null
11 | variant: null
12 | version: '1'
13 | created: '2018-08-17T15:44:24'
14 | modified: '2018-08-17T16:18:19'
15 | giturl: https://github.com/san-diego-water-quality/water-datasets.git
16 | wrangler:
17 | - email: eric@civicknowledge.com
18 | organization: Civic Knowledge
19 | url: http://civicknowledge.com
20 | name: Eric Busboom
21 | documentation:
22 | - title: README
23 | url: file:README.md
24 | - title: Documentation
25 | description: Main documentation
26 | url: https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf
27 | datafile:
28 | - name: daily_summary_san
29 | description: Daily weather summaries
30 | url: http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv
31 | - name: daily_summary_la
32 | description: Daily weather summaries
33 | url: http://ds.civicknowledge.org.s3.amazonaws.com/noaa.gov/daily-summary-1998-2018-san.csv
34 | table:
35 | - column:
36 | - datatype: string
37 | altname: station
38 | name: STATION
39 | - datatype: string
40 | altname: name
41 | description: Station code
42 | name: NAME
43 | - datatype: number
44 | altname: latitude
45 | description: Station name
46 | name: LATITUDE
47 | - datatype: number
48 | altname: longitude
49 | description: Station lattitude
50 | name: LONGITUDE
51 | - datatype: number
52 | altname: elevation
53 | description: Station longitude
54 | name: ELEVATION
55 | - datatype: date
56 | altname: date
57 | description: Station elevation
58 | name: DATE
59 | - datatype: number
60 | altname: awnd
61 | description: Measurement date
62 | name: AWND
63 | - datatype: string
64 | altname: dapr
65 | description: Average daily wind speed (meters per second or miles per hour
66 | as per user preference
67 | name: DAPR
68 | - datatype: integer
69 | altname: fmtm
70 | description: Number of days included in the multiday precipitation total (MDPR)
71 | name: FMTM
72 | - datatype: string
73 | altname: mdpr
74 | description: Time of fastest mile or fastest 1-minute wind (hours and minutes,
75 | i.e., HHMM)
76 | name: MDPR
77 | - datatype: string
78 | altname: pgtm
79 | description: Multiday precipitation total (mm or inches as per user preference;
80 | use with DAPR and DWPR, if available)
81 | name: PGTM
82 | - datatype: number
83 | altname: prcp
84 | description: Peak gust time (hours and minutes, i.e., HHMM)
85 | name: PRCP
86 | - datatype: integer
87 | altname: snow
88 | description: Precipitation (mm or inches as per user preference, inches to
89 | hundredths on Daily Form pdf file)
90 | name: SNOW
91 | - datatype: integer
92 | altname: snwd
93 | description: Snowfall (mm or inches as per user preference, inches to tenths
94 | on Daily Form pdf file)
95 | name: SNWD
96 | - datatype: string
97 | altname: tavg
98 | description: Snow depth (mm or inches as per user preference, inches on Daily
99 | Form pdf file)
100 | name: TAVG
101 | - datatype: integer
102 | altname: tmax
103 | description: Average temerature
104 | name: TMAX
105 | - datatype: integer
106 | altname: tmin
107 | description: Maximum temperature (Fahrenheit or Celsius as per user preference,
108 | Fahrenheit to tenths on Daily Form pdf file
109 | name: TMIN
110 | name: daily_summary_san
111 |
112 |
--------------------------------------------------------------------------------
/metatab/test/test-data/example1.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest",,,,
2 | "Title","Registered Voters, By County",,,,
3 | "Name","cdph.ca.gov-hci-registered_voters-county",,,,
4 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,,
5 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,,
6 | "Version",201404,,,,
7 | "Obsoletes","cdph.ca.gov-hci-registered_voters-county-201304",,,,
8 | "Dataset","voters",,,,
9 | "Origin","example.com",,,,
10 | "Space","Ca",,,,
11 | "Time","2002-2014",,,,
12 | "Grain","County",,,,
13 | "Format","excel",,,,
14 | ,,,,,
15 | ,,,,,
16 | "Section","Resources",,,,
17 | "Header","url","name","schema","Grain","Title"
18 | "Datafile","http://example.com/example1.csv","example1","registered_voters","County","The First Example Data File"
19 | "Datafile","http://example.com/example2.csv","example2","registered_voters","Tract","The Second Example Data File"
20 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx",,"Healthy Communities Data and Indicators Project (HCI)",,
21 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf",,"Indicator Documentation for Voter Registration / Participation",,
22 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,,
23 | ,,,,,
24 | "Section ","Contacts",,"email",,
25 | "Creator","Office of Health Equity",,"HCIOHE@cdph.ca.gov",,
26 | "Wrangler","Eric Busboom",,"eric@civicknowledge.com",,
27 | ,,,,,
28 | "Section ","Notes",,,,
29 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. ",,,,
30 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,,,
31 | ".Title","Data Bundles Packaging Specification",,,,
32 | ,,,,,
33 | "Section","Schema",,"datatype","valuetype","description"
34 | "Table","registered_voters",,,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters"
35 | "Table.Column","reportyear",,"int","year range","Year or years that indicator was reported"
36 | "Table.Column","type",,"str","dimension","Type of record"
37 | "Table.Column","gvid",,"str","gvid","GVid version of the geotype and geotypeval"
38 | "Table.Column","geoname",,"str","label for gvid","Census name of geographic area"
39 | "Table.Column","geotype",,"str","label","Code for type of geographic area"
40 | "Table.Column","geotypevalue",,"str","census","Census geoid code"
41 | "Table.Column","county_fips",,"str","FIPS county code","County FIPS code"
42 | "Table.Column","county_name",,"str","label for counrty_fips","County name"
43 | "Table.Column","region_code",,"str","census code","Numeric code of region"
44 | "Table.Column","region_name",,"str","label for region_code","Name of region"
45 | "Table.Column","raceth",,"str","raceth/civick","Civic Knowledge race / ethnicity code."
46 | "Table.Column","raceth_name",,"str","label for raceeth","Race / Ethnicity Name"
47 | "Table.Column","race_eth_code",,"str","raceth/hci","Race / ethnicity code"
48 | "Table.Column","race_eth_name",,"str","label for race_eth_code","Race / ethnicity name"
49 | "Table.Column","numerator",,"int","count","Adults who are registered to vote, or who voted, depending on type of record"
50 | "Table.Column","denominator",,"int","count","Population of Adults, 18 years or older"
51 | "Table.Column","percent",,"float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record"
52 | "Table.Column","ll_95ci",,"float","ci95l for percent","Lower bound of 95% confidence interval"
53 | "Table.Column","ul_95ci",,"float","ci95u for percent","Upper bound of 95% confidence interval"
54 | "Table.Column","se",,"float","se for percent","Standard error"
55 | "Table.Column","rse",,"float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent"
56 | "Table.Column","ca_decile",,"float","decile","Statewide decile ranking"
57 | "Table.Column","ca_rr",,"float","ratio","Ratio of indicator to state average"
58 | "Table.Column","vap",,"float","measure","Voter age population, from CA Department of Finance."
59 | "Table.Column","ind_id",,"str","dimension",
60 | "Table.Column","ind_definition",,"str","dimension",
61 | "Table.Column","version",,"str","other",
62 |
--------------------------------------------------------------------------------
/metatab/test/test-data/example1-headers.csv:
--------------------------------------------------------------------------------
1 | "Declare","metatab-latest",,,,
2 | "Title","Registered Voters, By County",,,,
3 | "Name","cdph.ca.gov-hci-registered_voters-county",,,,
4 | "Description","Percent of the eligible population registered to vote and the percent who voted in statewide elections.",,,,
5 | "Identifier","cdph.ca.gov-hci-registered_voters-county",,,,
6 | "Version",201404,,,,
7 | "Obsoletes","cdph.ca.gov-hci-registered_voters-county-201304",,,,
8 | "Dataset","voters",,,,
9 | "Origin","example.com",,,,
10 | "Space","Ca",,,,
11 | "Time","2002-2014",,,,
12 | "Grain","County",,,,
13 | "Format","excel",,,,
14 | ,,,,,
15 | ,,,,,
16 | ,,,,,
17 | "Section","Resources",,,,
18 | "Header","url","name","schema","Grain","Title"
19 | "Datafile","http://example.com/example1.csv","example1","registered_voters","County","The First Example Data File"
20 | "Datafile","http://example.com/example2.csv","example2","registered_voters","Tract","The Second Example Data File"
21 | "Homepage","https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx",,"Healthy Communities Data and Indicators Project (HCI)",,
22 | "Documentation","https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf",,"Indicator Documentation for Voter Registration / Participation",,
23 | ".description","Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",,,,
24 | ,,,,,
25 | "Section ","Contacts",,"email",,
26 | "Creator","Office of Health Equity",,"HCIOHE@cdph.ca.gov",,
27 | "Wrangler","Eric Busboom",,"eric@civicknowledge.com",,
28 | ,,,,,
29 | "Section ","Notes",,,,
30 | "Note","This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details. ",,,,
31 | "Documentation","https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#",,,,
32 | ".Title","Data Bundles Packaging Specification",,,,
33 | ,,,,,
34 | "Section","Schema",,"datatype","valuetype","description"
35 | "Table","registered_voters",,,,"HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters"
36 | "Table.Column","reportyear",,"int","year range","Year or years that indicator was reported"
37 | "Table.Column","type",,"str","dimension","Type of record"
38 | "Table.Column","gvid",,"str","gvid","GVid version of the geotype and geotypeval"
39 | "Table.Column","geoname",,"str","label for gvid","Census name of geographic area"
40 | "Table.Column","geotype",,"str","label","Code for type of geographic area"
41 | "Table.Column","geotypevalue",,"str","census","Census geoid code"
42 | "Table.Column","county_fips",,"str","FIPS county code","County FIPS code"
43 | "Table.Column","county_name",,"str","label for counrty_fips","County name"
44 | "Table.Column","region_code",,"str","census code","Numeric code of region"
45 | "Table.Column","region_name",,"str","label for region_code","Name of region"
46 | "Table.Column","raceth",,"str","raceth/civick","Civic Knowledge race / ethnicity code."
47 | "Table.Column","raceth_name",,"str","label for raceeth","Race / Ethnicity Name"
48 | "Table.Column","race_eth_code",,"str","raceth/hci","Race / ethnicity code"
49 | "Table.Column","race_eth_name",,"str","label for race_eth_code","Race / ethnicity name"
50 | "Table.Column","numerator",,"int","count","Adults who are registered to vote, or who voted, depending on type of record"
51 | "Table.Column","denominator",,"int","count","Population of Adults, 18 years or older"
52 | "Table.Column","percent",,"float","percent of numerator over denominator","Percent of adults who are registered to vote, or who voted, depending on type of record"
53 | "Table.Column","ll_95ci",,"float","ci95l for percent","Lower bound of 95% confidence interval"
54 | "Table.Column","ul_95ci",,"float","ci95u for percent","Upper bound of 95% confidence interval"
55 | "Table.Column","se",,"float","se for percent","Standard error"
56 | "Table.Column","rse",,"float","rse for percent","Relative standard error (se/percent * 100) expressed as a percent"
57 | "Table.Column","ca_decile",,"float","decile","Statewide decile ranking"
58 | "Table.Column","ca_rr",,"float","ratio","Ratio of indicator to state average"
59 | "Table.Column","vap",,"float","measure","Voter age population, from CA Department of Finance."
60 | "Table.Column","ind_id",,"str","dimension",
61 | "Table.Column","ind_definition",,"str","dimension",
62 | "Table.Column","version",,"str","other",
63 |
--------------------------------------------------------------------------------
/metatab/appurl.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """
5 |
6 | """
7 |
8 | from metatab import DEFAULT_METATAB_FILE
9 | from os.path import basename, join
10 | from rowgenerators import Url
11 | from rowgenerators.appurl.file.file import InnerFile
12 | from rowgenerators.appurl.util import file_ext
13 | from rowgenerators.appurl.web.web import WebUrl
14 |
class MetatabUrl(InnerFile, Url):
    """Application URL for Metatab documents, matched by the ``metatab`` proto.

    The URL may point at a simple document file (``csv``, ``txt``, ``ipynb``),
    an ``xlsx`` or ``zip`` package, or a path with none of those extensions,
    in which case ``DEFAULT_METATAB_FILE`` is appended to the path.
    """

    # Sorts just ahead of WebUrl's priority value.
    match_priority = WebUrl.match_priority - 1

    # Extensions of files that are themselves the Metatab document.
    simple_file_formats = ('csv', 'txt', 'ipynb')

    def __init__(self, url=None, downloader=None, **kwargs):
        """Build the URL, forcing the ``metatab`` proto and setting the fragment.

        :param url: URL string, passed to ``Url``.
        :param downloader: Downloader object; required.
        :param kwargs: Extra arguments passed through to ``Url`` and the
            superclass constructor. ``proto`` is always overwritten.
        """
        kwargs['proto'] = 'metatab'

        u = Url(url, **kwargs)

        assert downloader, "MetatabUrl requires a downloader"

        # If there is no file with an extension in the path, assume that this
        # is a filesystem package, and that the path should have DEFAULT_METATAB_FILE
        if file_ext(basename(u.path)) not in ('zip', 'xlsx') + self.simple_file_formats:
            u.path = join(u.path, DEFAULT_METATAB_FILE)

        super().__init__(str(u), downloader=downloader, **kwargs)

        self.scheme_extension = 'metatab'

        if basename(self.path) == DEFAULT_METATAB_FILE:
            frag = ''
        elif self.resource_format in self.simple_file_formats:
            frag = ''
        elif self.resource_format == 'xlsx':
            frag = 'meta'
        elif self.resource_format == 'zip':
            frag = DEFAULT_METATAB_FILE
        else:
            # The path rewrite above should make this unreachable; fall back to
            # no fragment rather than failing on an unbound local.
            frag = ''

        self.fragment = [frag, None]

    @classmethod
    def _match(cls, url, **kwargs):
        """Return True when ``url`` has the ``metatab`` proto."""
        return url.proto == 'metatab'

    @property
    def resource_format(self):
        """File extension of the last component of the URL path."""

        resource_format = file_ext(basename(self.path))

        assert resource_format, self.path  # Should have either a definite file, or have added one in __init__

        return resource_format

    @property
    def resource_file(self):
        """Base name of ``self.resource_url``."""

        name = basename(self.resource_url)

        assert name

        return name

    @property
    def target_file(self):
        """Name of the file (or worksheet, for xlsx) holding the Metatab document."""
        if self.path.endswith(DEFAULT_METATAB_FILE):
            return DEFAULT_METATAB_FILE
        elif self.resource_format in self.simple_file_formats:
            return self.resource_file
        elif self.resource_format == 'xlsx':
            return 'meta'
        elif self.resource_format == 'zip':
            # Same constant __init__ uses for the zip fragment, instead of a
            # separately hard-coded 'metadata.csv'.
            return DEFAULT_METATAB_FILE
        else:
            return self.resource_file

    @property
    def target_format(self):
        """Format of the target: the simple extension, ``xlsx``, or ``csv``."""
        if self.resource_format in self.simple_file_formats:
            return self.resource_format
        elif self.resource_format == 'xlsx':
            return 'xlsx'
        else:
            # Zip packages and anything unrecognized are treated as CSV.
            return 'csv'

    @property
    def doc(self):
        """Return the metatab document for the URL"""
        from metatab import MetatabDoc
        t = self.get_resource().get_target()
        return MetatabDoc(t.inner)

    @property
    def generator(self):
        """Return a row generator for the resolved target URL."""

        from rowgenerators import get_generator

        ##
        ## Hack! This used to be
        ## target = self.get_resource().get_target().inner

        target = self.get_resource().get_target()

        return get_generator(target)

    def get_resource(self):
        """Return a ``MetatabUrl`` for the resource.

        ``file`` scheme URLs are re-wrapped as they are; any other scheme is
        first resolved with ``WebUrl(...).get_resource()``.
        """

        if self.scheme == 'file':
            u = self
        else:
            u = WebUrl(str(self), downloader=self._downloader).get_resource()

        return MetatabUrl(str(u), downloader=self._downloader)

    def get_target(self):
        """Return a ``MetatabUrl`` wrapping the inner URL's target."""
        return MetatabUrl(str(self.inner.get_target()), downloader=self._downloader)

    def join_target(self, tf):
        """Join ``tf`` to this URL.

        When the target is ``DEFAULT_METATAB_FILE`` the join is delegated to
        the inner URL's ``join_dir``; otherwise to its ``join_target``.
        """

        if self.target_file == DEFAULT_METATAB_FILE:
            return self.inner.join_dir(tf)
        else:
            return self.inner.join_target(tf)

    def exists(self):
        """Return whether the inner URL reports that it exists."""
        return self.inner.exists()
134 |
--------------------------------------------------------------------------------
/metatab/test/test-data/notebooks/SimpleMagicsTest.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# This is the Title of the Notebook"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "And this is the description"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "from metatab.jupyter.script import get_ipython\n",
26 | "import pandas as pd\n",
27 | "from os.path import exists\n",
28 | "from os import remove"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "This is a Bash cell\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "%%bash\n",
46 | "echo \"This is a Bash cell\"\n",
47 | "touch /tmp/footouched"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "assert exists('/tmp/footouched')\n",
59 | "remove('/tmp/footouched')\n",
60 | "assert not exists('/tmp/footouched')"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 2,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "MagicsTest.ipynb\r\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "!ls"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 2,
83 | "metadata": {
84 | "collapsed": true
85 | },
86 | "outputs": [],
87 | "source": [
88 | "%%metatab -p . \n",
89 | "Origin: example.com\n",
90 | "Dataset: foobar.com \n",
91 | "Identifier: de097279-28ef-42f5-a4f5-0eaac53b7dc4\n",
92 | "Name: example.com-foobar.com"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {
99 | "collapsed": true
100 | },
101 | "outputs": [],
102 | "source": [
103 | "assert mt_pkg.find_first_value('Root.Name') == 'example.com-foobar.com'"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 1,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "20"
115 | ]
116 | },
117 | "execution_count": 1,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "foo = 10\n",
124 | "bar = 20\n",
125 | "print(bar)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "print(\"DIsplaying Locals\")\n",
137 | "print(locals())"
138 | ]
139 | }
140 | ],
141 | "metadata": {
142 | "celltoolbar": "Tags",
143 | "kernelspec": {
144 | "display_name": "Python 3",
145 | "language": "python",
146 | "name": "python3"
147 | },
148 | "language_info": {
149 | "codemirror_mode": {
150 | "name": "ipython",
151 | "version": 3
152 | },
153 | "file_extension": ".py",
154 | "mimetype": "text/x-python",
155 | "name": "python",
156 | "nbconvert_exporter": "python",
157 | "pygments_lexer": "ipython3",
158 | "version": "3.6.1"
159 | },
160 | "varInspector": {
161 | "cols": {
162 | "lenName": 16,
163 | "lenType": 16,
164 | "lenVar": 40
165 | },
166 | "kernels_config": {
167 | "python": {
168 | "delete_cmd_postfix": "",
169 | "delete_cmd_prefix": "del ",
170 | "library": "var_list.py",
171 | "varRefreshCmd": "print(var_dic_list())"
172 | },
173 | "r": {
174 | "delete_cmd_postfix": ") ",
175 | "delete_cmd_prefix": "rm(",
176 | "library": "var_list.r",
177 | "varRefreshCmd": "cat(var_dic_list()) "
178 | }
179 | },
180 | "types_to_exclude": [
181 | "module",
182 | "function",
183 | "builtin_function_or_method",
184 | "instance",
185 | "_Feature"
186 | ],
187 | "window_display": false
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 2
192 | }
193 |
--------------------------------------------------------------------------------
/metatab/test/test-data/scripts/complex-text.txt:
--------------------------------------------------------------------------------
1 | Identifier: 47bc1089-7584-41f0-b804-602ec42f1249
2 | Origin: civicknowledge.com
3 | Dataset: rcfe_affordability
4 | Version: 4
5 | Time: 2015
6 | Name: civicknowledge.com-rcfe_affordability-2015-4
7 |
8 | Section: Contacts
9 | Wrangler: Eric Busboom
10 | Wrangler.Email: eric@civicknowledge.com
11 | Wrangler.Organization: Civic Knowledge
12 |
13 | Section: References
14 |
15 | Reference: censusreporter:B09020/140/05000US06073
16 | Reference.Name: B09020
17 | Reference.Description: Relationship by Household Type (Including Living Alone) for Population 65 Years and Over
18 |
19 | Reference: censusreporter:B25007/140/05000US06073
20 | Reference.Name: B25007
21 | Reference.Description: Tenure by Age of Householder
22 |
23 | #
24 | # Household Income
25 | #
26 |
27 | Reference: censusreporter:B19049/140/05000US06073
28 | Reference.Name: B19049
29 | Reference.Description: Median Household Income by Age of Householder
30 |
31 |
32 | # For whole county
33 |
34 | Reference: censusreporter:B19049/050/05000US06073
35 | Reference.Name: B19049_county
36 | Reference.Description: Median Household Income by Age of Householder
37 |
38 | #
39 | # Home value distributions, by tract
40 | #
41 |
42 | Reference: censusreporter:B25076/140/05000US06073
43 | Reference.Name: B25076
44 | Reference.Description: Lower Value Quartile (Dollars)
45 |
46 | Reference: censusreporter:B25077/140/05000US06073
47 | Reference.Name: B25077
48 | Reference.Description: Median Value
49 |
50 | Reference: censusreporter:B25078/140/05000US06073
51 | Reference.Name: B25078
52 | Reference.Description: Upper Value Quartile (Dollars)
53 |
54 | #
55 | # Home value distributions, for SD County
56 | #
57 | Reference: censusreporter:B25076/050/05000US06073
58 | Reference.Name: B25076_county
59 | Reference.Description: Lower Value Quartile (Dollars)
60 |
61 | Reference: censusreporter:B25077/050/05000US06073
62 | Reference.Name: B25077_county
63 | Reference.Description: Median Value
64 |
65 | Reference: censusreporter:B25078/050/05000US06073
66 | Reference.Name: B25078_county
67 | Reference.Description: Upper Value Quartile (Dollars)
68 |
69 | #
70 | # Tract crosswalk
71 | #
72 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tract-sra-msa-xwalk
73 | Reference.Name: tracts
74 | Reference.Description: Crosswalk between crosswalks, tracts, zip codes and SRAs in San Diego County
75 |
76 | #
77 | # Tract boundaries
78 | #
79 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#tracts
80 | Reference.Name: tracts_geo
81 | Reference.Description: Geographics Boundaries for Tracts
82 |
83 | #
84 | # SRA boundaries
85 | #
86 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/sangis.org-census_regions-2010-sandiego-7.csv#sra
87 | Reference.Name: sra_geo
88 | Reference.Description: Geographics Boundaries for SRAs
89 |
90 | #
91 | # IPUMS Housing and Income Data
92 | #
93 | # Need to use the ZIP version b/c we need to import the Python Code
94 | Reference: metatab+http://s3.amazonaws.com/library.metatab.org/ipums.org-income_homevalue-5.zip#income_homeval
95 | Reference.Name: incv
96 | Reference.Description: Income and Home value records from IPUMS for San Diego County
97 | Section: Resources
98 |
99 |
100 | Section: Bibliography
101 | Citation: ipums
102 | Citation.Type: dataset
103 | Citation.Author: Steven Ruggles; Katie Genadek; Ronald Goeken; Josiah Grover; Matthew Sobek
104 | Citation.Title: Integrated Public Use Microdata Series
105 | Citation.Year: 2017
106 | Citation.Publisher: University of Minnesota
107 | Citation.Version: 7.0
108 | Citation.AccessDate: 20170718
109 | Citation.Url: https://usa.ipums.org/usa/index.shtml
110 | Citation.Doi: https://doi.org/10.18128/D010.V7.0
111 |
112 | Citation: bordley
113 | Citation.Type: article
114 | Citation.Author: Robert F. Bordley; James B. McDonald; Anand Mantrala
115 | Citation.Title: Something New, Something Old: Parametric Models for the Size of Distribution of Income
116 | Citation.Year: 1997
117 | Citation.Month: June
118 | Citation.Journal: Journal of Income Distribution
119 | Citation.Volume: 6
120 | Citation.Number: 1
121 | Citation.Pages: 5-5
122 | Citation.Url: https://ideas.repec.org/a/jid/journl/y1997v06i1p5-5.html
123 |
124 | Citation: mcdonald
125 | Citation.Type: article
126 | Citation.Author: McDonald, James B.; Mantrala, Anand
127 | Citation.Title: The distribution of personal income: Revisited
128 | Citation.Journal: Journal of Applied Econometrics
129 | Citation.Volume: 10
130 | Citation.Number: 2
131 | Citation.Publisher: Wiley Subscription Services, Inc., A Wiley Company
132 | Citation.Issn: 1099-1255
133 | Citation.Doi: 10.1002/jae.3950100208
134 | Citation.Pages: 201--204,
135 | Citation.Year: 1995
136 |
137 | Citation: majumder
138 | Citation.Type: article
139 | Citation.Author: Majumder, Amita; Chakravarty, Satya Ranjan
140 | Citation.Title: Distribution of personal income: Development of a new model and its application to U.S. income data
141 | Citation.Journal: Journal of Applied Econometrics
142 | Citation.Volume: 5
143 | Citation.Number: 2
144 | Citation.Publisher: Wiley Subscription Services, Inc., A Wiley Company
145 | Citation.Issn: 1099-1255
146 | Citation.Doi: 10.1002/jae.3950050206
147 | Citation.Pages: 189--196
148 | Citation.Year: 1990
--------------------------------------------------------------------------------
/docs/PrivateDatasets.rst:
--------------------------------------------------------------------------------
1 |
2 | Private Datasets
3 | ================
4 |
5 | Datasets that should be protected from unauthorized access can be written to S3 with a private ACL and accessed using S3 credentials. To use private datasets:
6 |
7 | - Use the **metaaws** program to set up an S3 bucket with a policy and users
8 | - Add a ``Root.Access`` term to the dataset's metatab document.
9 | - Synchronize the dataset to S3 with **metasync**
10 | - Set up credentials for an S3 user
11 | - Access the dataset using an S3 url.
12 |
13 | Setup The S3 Bucket
14 | -------------------
15 |
16 | Suppose we want to store datasets in a bucket ``bucket.example.com``. After creating the bucket, initialize it with subdirectories and policies with the **metaaws** program.
17 |
18 | .. code-block:: bash
19 |
20 | $ metaaws init-bucket bucket.example.com
21 |
22 |
23 |
24 | Configure and Sync a Dataset
25 | ----------------------------
26 |
27 | To make a dataset private, add a ``Root.Access`` term to the ``Root`` section, with a value of ``private``
28 |
29 |
30 |
31 | Create S3 Users
32 | ---------------
33 |
34 | Use the **metaaws** program to create users and add permissions to the bucket. First, initialize a bucket with the appropriate policies:
35 |
36 | .. code-block:: bash
37 |
38 | $ metaaws init-bucket bucket.example.com
39 |
40 | Then, create a new user.
41 |
42 | .. code-block:: bash
43 |
44 | $ metaaws new-user foobar
45 | Created user : foobar
46 | arn : arn:aws:iam::095555823111:user/metatab/foobar
47 | Access Key : AKIAJXMFAP3X5TRYYQ5Q
48 | Secret Key : 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
49 |
50 | The secret key and access key should be given to the user, to be set up as described in the next
51 | section.
52 |
53 | Setup S3 Credentials
54 | --------------------
55 |
56 | The access and secret keys should be stored in a boto configuration file, such as ``~/.aws/credentials``. See
57 | the `boto3 configuration documentation <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html>`_ for details. Here is an example of a ``credentials`` file:
58 |
59 | .. code-block:: ini
60 |
61 | [default]
62 | aws_access_key_id = AKIAJXMFAP3X5TRYYQ5Q
63 | aws_secret_access_key = b81zw4LRDKVILzrZbS0B8KMn88xbY9BEEnwzKrz2
64 |
65 |
66 | If you have multiple credentials, you can put them in different sections by changing ``[default]`` to the name of another profile. For instance, here is a credentials file with a default and alternate profile:
67 |
68 | .. code-block:: ini
69 |
70 | [default]
71 | aws_access_key_id = AKIAJXMFAP3X5TRYYQ5Q
72 | aws_secret_access_key = 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
73 | [fooprofile]
74 | aws_access_key_id = AKIAX5TRYYQ5QJXMFAP3
75 | aws_secret_access_key = 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
76 |
77 | To use the alternate credentials with the ``metasync`` program, use the ``-p`` option:
78 |
79 | .. code-block:: bash
80 |
81 | $ metasync -p fooprofile -S library.metatab.org
82 |
83 | To use the alternate credentials with the ``open_package()`` function, you will need to set them in the shell before you run any programs. The ``metasync -C`` program will display the credentials in a form that can be shell eval'd, and the ``-p`` option can select an alternate profile.
84 |
85 | .. code-block:: bash
86 |
87 | $ metasync -C -p fooprofile
88 | export AWS_ACCESS_KEY_ID=AKIAX5TRYYQ5QJXMFAP3
89 | export AWS_SECRET_ACCESS_KEY=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
90 | # Run 'eval $(metasync -C -p fooprofile )' to configure credentials in a shell
91 |
92 | The last line of the output shows the command to run to set the credentials in the shell:
93 |
94 | .. code-block:: bash
95 |
96 | $ eval $(metasync -C -p fooprofile )
97 |
98 | Setting credentials in the shell is only required if you access the private dataset via ``open_package()``, although it should also work when using the ``metasync`` and ``metapack`` programs.
99 |
100 | Using Private Files
101 | -------------------
102 |
103 | Private files can't be easily downloaded using a web browser, but there are a few other ways to fetch them.
104 |
105 | * Use an S3 client, such as CyberDuck, S3 Browser, CloudBerry or S3 Tools.
106 | * Use the ``metapack`` program to dump a CSV file.
107 |
108 | To use the ``metapack`` program, first list the resources in the remote package:
109 |
110 | .. code-block:: bash
111 |
112 | $ metapack -r s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1.csv
113 | seniors s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/seniors.csv
114 | rcfe_tract s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/rcfe_tract.csv
115 | rcfe_sra s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/rcfe_sra.csv
116 | rcfe_seniors_tract s3://library.civicknowledge.com/private/carr/civicknowledge.com-rcfe_health-1/data/rcfe_seniors_tract.csv
117 |
118 | Then, run the same command again, but appending a fragment to the url, and redirecting to a csv file. For instance, for the 'seniors' file, append ``#seniors`` to the url:
119 |
120 |
121 | .. code-block:: bash
122 |
123 | $ metapack -r s3://.../civicknowledge.com-rcfe_health-1.csv#seniors > seniors.csv
124 |
125 | You can also fetch the entire data package, downloading all of the data files, by creating a local file system, zip or excel package. The easiest to use is the Filesystem package, created with ``metapack -f``
126 |
127 | .. code-block:: bash
128 |
129 | $ metapack -f s3://.../civicknowledge.com-rcfe_health-1.csv
130 |
131 | The command will create a complete data package with unpacked CSV files in the ``_packages`` subdirectory.
132 |
133 |
134 |
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/metatab/test/test-data/example1.txt:
--------------------------------------------------------------------------------
1 | Declare: metatab-latest
2 | Title: Registered Voters, By County
3 | Name: cdph.ca.gov-hci-registered_voters-county
4 | Description: Percent of the eligible population registered to vote and the percent who voted in statewide elections.
5 | Identifier: cdph.ca.gov-hci-registered_voters-county
6 | Version: 201404
7 | Obsoletes: cdph.ca.gov-hci-registered_voters-county-201304
8 | Format: excel
9 | Spatial: California <04000US06>
10 | Time: 2002-2014
11 | Spatialgrain: County <05000US>
12 | Section: Resources
13 | Datafile: http://example.com/example1.csv
14 | Datafile.Name: example1
15 | Datafile.Schema: registered_voters
16 | Datafile.Grain: County
17 | Datafile.Title: The First Example Data File
18 | Datafile: http://example.com/example2.csv
19 | Datafile.Name: example2
20 | Datafile.Schema: registered_voters
21 | Datafile.Grain: Tract
22 | Datafile.Title: The Second Example Data File
23 | Homepage: https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx
24 | Homepage.Schema: Healthy Communities Data and Indicators Project (HCI)
25 | Documentation: https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf
26 | Documentation.Schema: Indicator Documentation for Voter Registration / Participation
27 | Documentation.Description: Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections
28 | Section: Contacts
29 | Creator: Office of Health Equity
30 | Creator.Email: HCIOHE@cdph.ca.gov
31 | Wrangler: Eric Busboom
32 | Wrangler.Email: eric@civicknowledge.com
33 | Section: Notes
34 | Note: This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details.
35 | Documentation: https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#
36 | Documentation.Title: Data Bundles Packaging Specification
37 | Section: Schema
38 | Table: registered_voters
39 | Table.Description: HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters
40 | Table.Column: reportyear
41 | Column.Datatype: int
42 | Column.Valuetype: year range
43 | Column.Description: Year or years that indicator was reported
44 | Table.Column: type
45 | Column.Datatype: str
46 | Column.Valuetype: dimension
47 | Column.Description: Type of record
48 | Table.Column: gvid
49 | Column.Datatype: str
50 | Column.Valuetype: gvid
51 | Column.Description: GVid version of the geotype and geotypeval
52 | Table.Column: geoname
53 | Column.Datatype: str
54 | Column.Valuetype: label for gvid
55 | Column.Description: Census name of geographic area
56 | Table.Column: geotype
57 | Column.Datatype: str
58 | Column.Valuetype: label
59 | Column.Description: Code for type of geographic area
60 | Table.Column: geotypevalue
61 | Column.Datatype: str
62 | Column.Valuetype: census
63 | Column.Description: Census geoid code
64 | Table.Column: county_fips
65 | Column.Datatype: str
66 | Column.Valuetype: FIPS county code
67 | Column.Description: County FIPS code
68 | Table.Column: county_name
69 | Column.Datatype: str
70 | Column.Valuetype: label for counrty_fips
71 | Column.Description: County name
72 | Table.Column: region_code
73 | Column.Datatype: str
74 | Column.Valuetype: census code
75 | Column.Description: Numeric code of region
76 | Table.Column: region_name
77 | Column.Datatype: str
78 | Column.Valuetype: label for region_code
79 | Column.Description: Name of region
80 | Table.Column: raceth
81 | Column.Datatype: str
82 | Column.Valuetype: raceth/civick
83 | Column.Description: Civic Knowledge race / ethnicity code.
84 | Table.Column: raceth_name
85 | Column.Datatype: str
86 | Column.Valuetype: label for raceeth
87 | Column.Description: Race / Ethnicity Name
88 | Table.Column: race_eth_code
89 | Column.Datatype: str
90 | Column.Valuetype: raceth/hci
91 | Column.Description: Race / ethnicity code
92 | Table.Column: race_eth_name
93 | Column.Datatype: str
94 | Column.Valuetype: label for race_eth_code
95 | Column.Description: Race / ethnicity name
96 | Table.Column: numerator
97 | Column.Datatype: int
98 | Column.Valuetype: count
99 | Column.Description: Adults who are registered to vote, or who voted, depending on type of record
100 | Table.Column: denominator
101 | Column.Datatype: int
102 | Column.Valuetype: count
103 | Column.Description: Population of Adults, 18 years or older
104 | Table.Column: percent
105 | Column.Datatype: float
106 | Column.Valuetype: percent of numerator over denominator
107 | Column.Description: Percent of adults who are registered to vote, or who voted, depending on type of record
108 | Table.Column: ll_95ci
109 | Column.Datatype: float
110 | Column.Valuetype: ci95l for percent
111 | Column.Description: Lower bound of 95% confidence interval
112 | Table.Column: ul_95ci
113 | Column.Datatype: float
114 | Column.Valuetype: ci95u for percent
115 | Column.Description: Upper bound of 95% confidence interval
116 | Table.Column: se
117 | Column.Datatype: float
118 | Column.Valuetype: se for percent
119 | Column.Description: Standard error
120 | Table.Column: rse
121 | Column.Datatype: float
122 | Column.Valuetype: rse for percent
123 | Column.Description: Relative standard error (se/percent * 100) expressed as a percent
124 | Table.Column: ca_decile
125 | Column.Datatype: float
126 | Column.Valuetype: decile
127 | Column.Description: Statewide decile ranking
128 | Table.Column: ca_rr
129 | Column.Datatype: float
130 | Column.Valuetype: ratio
131 | Column.Description: Ratio of indicator to state average
132 | Table.Column: vap
133 | Column.Datatype: float
134 | Column.Valuetype: measure
135 | Column.Description: Voter age population, from CA Department of Finance.
136 | Table.Column: ind_id
137 | Column.Datatype: str
138 | Column.Valuetype: dimension
139 | Table.Column: ind_definition
140 | Column.Datatype: str
141 | Column.Valuetype: dimension
142 | Table.Column: version
143 | Column.Datatype: str
144 | Column.Valuetype: other
145 |
--------------------------------------------------------------------------------
/metatab/rowgen.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
2 | # MIT License, included in this distribution as LICENSE.txt
3 |
4 | """ """
5 | from rowgenerators import Source
6 | from rowgenerators.source import Source
7 | from rowgenerators import SourceError
8 |
class YamlMetatabSource(Source):
    """Turn a metatab-formatted YAML file into Metatab rows."""

    def __init__(self, ref, table=None, cache=None, working_dir=None, env=None, **kwargs):
        """Store the source reference and initialise the section bookkeeping.

        :param ref: Reference to the YAML file; ``__iter__`` opens ``ref.fspath``.
        :param table: Accepted for signature compatibility; not used here.
        :param cache: Forwarded to ``Source.__init__``.
        :param working_dir: Forwarded to ``Source.__init__``.
        :param env: Accepted for signature compatibility; not used here.
        """
        super().__init__(ref, cache, working_dir, **kwargs)

        self.url = ref
        self.section_map = {}  # lowercased declared term name -> section name
        self.sections = {}  # section name -> section object of the generated doc

    def yield_dict(self, doc, d, parent=None):
        """Recursively walk mapping ``d``, yielding ``(term_name, value, parent)``.

        The term name is ``"<last component of parent, or 'root'>.<key>"``,
        lowercased. For a list value, each element's entry under the declared
        ``termvaluename`` of that term is removed from the element and yielded
        as the term's value, then the rest of the element is walked with the
        term as parent. For a dict value only its children are walked; any
        other value is yielded directly.

        :param doc: Document whose ``decl_terms`` mapping describes the terms.
        :param d: Mapping to walk. List elements inside it are mutated.
        :param parent: Term name of the enclosing term, or None at the top level.
        """

        for k, v in d.items():

            tn = "{}.{}".format((parent or 'Root').split('.')[-1], k).lower()
            t = doc.decl_terms.get(tn, {})
            vtn = t.get('termvaluename', '').lower()

            if isinstance(v, list):
                for e in v:
                    try:
                        value = e[vtn]
                        del e[vtn]
                        yield (tn, value, parent)
                    except KeyError:
                        # The element has no entry for the value name, so no
                        # term is emitted for the element itself.
                        pass

                    yield from self.yield_dict(doc, e, tn)
            elif isinstance(v, dict):
                yield from self.yield_dict(doc, v, tn)
            else:
                yield (tn, v, parent)

    def __iter__(self):
        """Load the YAML file, build a MetatabDoc from it and yield the doc's rows."""

        import yaml
        from metatab import MetatabDoc

        with open(self.url.fspath) as f:
            # safe_load: never construct arbitrary Python objects from the
            # file, and a bare yaml.load(f) is rejected by current PyYAML.
            d = yaml.safe_load(f)

        decl = d.get('declare', 'metatab-latest')

        doc = MetatabDoc(decl=decl)

        section_names = ['root', 'contacts', 'documentation', 'resources', 'references', 'schema']

        for section_name in section_names:
            section = doc.decl_sections[section_name]

            for tn in section.get('terms', []):
                self.section_map[tn.lower()] = section_name

            self.sections[section_name] = doc.get_or_new_section(section_name, section['args'])

        # Most recently created term for each term name; children attach to it.
        last_term = {}
        for term_name, value, parent in self.yield_dict(doc, d):

            # Terms that no declared section claims are placed in 'root'.
            # NOTE(review): top-level names are generated as 'root.<key>' while
            # section_map is keyed by the declared term names -- confirm that
            # the two naming schemes are meant to match.
            section = self.sections.get(self.section_map.get(term_name) or 'root')

            if parent is None:
                term = section.new_term(term_name, value)
            else:
                parent_term = last_term[parent]
                term = parent_term.new_child(term_name, value)

            last_term[term_name] = term

        yield from doc.rows
90 |
91 |
class MetatabRowGenerator(Source):
    """Row source that replays an iterable of rows.

    ``ref`` is kept as the row iterable, and a ``path`` property is exposed so
    term interpreters know where the terms are coming from.
    """

    def __init__(self, ref, cache=None, working_dir=None, path = None, **kwargs):
        """Keep ``ref`` as the rows to replay and ``path`` (default '') as the origin."""
        super().__init__(ref, cache, working_dir, **kwargs)

        self._path = path if path else ''
        self._rows = ref

    @property
    def path(self):
        """Path given at construction, or '' when none was given."""
        return self._path

    def open(self):
        """No-op."""

    def close(self):
        """No-op."""

    def __iter__(self):
        """Yield each row of the wrapped iterable in order."""
        yield from self._rows
116 |
117 |
118 | class TextRowGenerator(MetatabRowGenerator):
119 | """Return lines of text of a line-oriented metatab file, breaking them to be used as Metatab rows.
120 | This is the core of the Lines format implementation"""
121 |
122 | def __init__(self, ref, cache=None, working_dir=None, path = None, **kwargs):
123 | super().__init__(ref, cache, working_dir, path, **kwargs)
124 |
125 | while True:
126 |
127 | try:
128 | # Pathlib Path
129 | with ref.open() as r:
130 | text = r.read()
131 | break
132 | except:
133 | pass
134 |
135 | try:
136 | # Filehandle
137 | text = ref.read()
138 | break
139 | except:
140 | pass
141 |
142 | try:
143 | # Url
144 | with ref.inner.fspath.open() as f:
145 | text = f.read()
146 | break
147 | except:
148 |
149 | pass
150 |
151 | try:
152 | # File name
153 | with open(ref) as r:
154 | text = r.read()
155 | break
156 | except:
157 | pass
158 |
159 | try:
160 | text = ref
161 | text.splitlines()
162 | break
163 | except AttributeError:
164 | pass
165 |
166 |
167 | raise SourceError("Can't handle ref of type {}".format(type(ref)))
168 |
169 | self._text = text
170 | self._text_lines = text.splitlines()
171 | self._path = path or ''
172 |
173 | @property
174 | def path(self):
175 | return self._path
176 |
177 | def open(self):
178 | pass
179 |
180 | def close(self):
181 | pass
182 |
183 | def __iter__(self):
184 | import re
185 |
186 | for row in self._text_lines:
187 | if re.match(r'^\s*#', row): # Skip comments
188 | continue
189 |
190 | # Special handling for ====, which implies a section:
191 | # ==== Schema
192 | # is also
193 | # Section: Schema
194 |
195 | if row.startswith('===='):
196 | row = re.sub(r'^=*','Section:', row)
197 |
198 | row = [e.strip() for e in row.split(':', 1)]
199 |
200 | # Pipe characters seperate columns
201 | if len(row) > 1:
202 | row = [row[0]] + [ e.replace('\|','|') for e in re.split(r'(?/')
111 |
112 | The result should be the same documentation, but with different URLs.
--------------------------------------------------------------------------------
/metatab/util.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """Classes to build a Metatab document
5 | """
6 | import logging
7 | import os
8 | import shutil
9 | import sys
10 | from genericpath import exists, isfile
11 | from os import makedirs
12 | from os.path import join, basename, dirname, isdir, abspath
13 |
14 | #from rowgenerators import reparse_url, parse_url_to_dict, unparse_url_dict, Url
15 |
16 | from metatab import DEFAULT_METATAB_FILE
17 | from rowgenerators import get_cache
18 |
19 |
def declaration_path(name):
    """Return the path of a declaration file shipped in the ``metatabdecl`` package.

    ``name`` is tried as given and then with a ``.csv`` suffix appended.

    :raises IncludeError: If neither candidate exists.
    """
    from os.path import dirname, join, exists
    import metatabdecl
    from metatab.exc import IncludeError

    decl_dir = dirname(metatabdecl.__file__)

    for candidate in (name, name + '.csv'):
        candidate_path = join(decl_dir, candidate)
        if exists(candidate_path):
            return candidate_path

    raise IncludeError("No local declaration file for name '{}' ".format(name))
37 |
38 |
39 | # From http://stackoverflow.com/a/295466
def slugify(value):
    """
    Convert a value to a lowercase ASCII slug.

    The value is stringified, NFKD-normalized, stripped of non-ASCII
    characters, trimmed and lowercased. Characters other than word characters,
    whitespace, hyphens and periods are then removed, and every run of
    whitespace and/or hyphens is collapsed into a single hyphen.
    """
    import re
    import unicodedata

    decomposed = unicodedata.normalize('NFKD', str(value))
    ascii_text = decomposed.encode('ascii', 'ignore').decode('utf8')
    cleaned = re.sub(r'[^\w\s\-\.]', '', ascii_text.strip().lower())
    return re.sub(r'[-\s]+', '-', cleaned)
52 |
53 |
def flatten(d, sep='.'):
    """Flatten nested mappings and sequences into a tuple of ``(key, value)`` pairs.

    Keys of nested mutable mappings, and indexes of nested mutable sequences,
    are joined with ``sep`` to form each leaf's key. Anything that is neither a
    mutable mapping nor a mutable sequence (strings and tuples included) is a
    leaf. A leaf passed at the top level gets the key ``''``.

    :param d: The structure to flatten.
    :param sep: Separator placed between key components.
    :return: Tuple of ``(joined_key, leaf_value)`` tuples, in traversal order.
    """
    # The ABCs live in collections.abc; the aliases on ``collections`` itself
    # were removed in Python 3.10.
    from collections.abc import MutableMapping, MutableSequence

    def _flatten(e, parent_key=''):
        if isinstance(e, MutableMapping):
            children = e.items()
        elif isinstance(e, MutableSequence):
            children = ((str(i), v) for i, v in enumerate(e))
        else:
            return ((parent_key, e),)

        # An empty parent key contributes no prefix, so top-level keys are not
        # emitted with a leading separator.
        prefix = parent_key + sep if parent_key else ''

        return tuple((prefix + k2, v2) for k, v in children for k2, v2 in _flatten(v, k))

    return _flatten(d)
70 |
71 |
72 | # From http://stackoverflow.com/a/2597440
class Bunch(object):
    """Object whose attributes are the entries of the dict it was built from."""

    def __init__(self, adict):
        vars(self).update(adict)
76 |
77 |
# Names of the build directories; make_dir_structure() joins the three
# sub-directories onto a base directory.
MP_DIR = '_metapack'
DOWNLOAD_DIR, PACKAGE_DIR, OLD_DIR = (
    join(MP_DIR, leaf) for leaf in ('download', 'package', 'old')
)
82 |
83 |
def make_dir_structure(base_dir):
    """Create the download, package and old directories under ``base_dir``.

    Directories that already exist are left alone.

    :raises IOError: If one of the target paths exists but is not a directory.
    """
    for sub_dir in (DOWNLOAD_DIR, PACKAGE_DIR, OLD_DIR):
        target = join(base_dir, sub_dir)

        if not exists(target):
            makedirs(target)
        elif not isdir(target):
            raise IOError("File '{}' exists but is not a directory ".format(target))
100 |
101 |
def make_metatab_file(template='metatab'):
    """Return a MetatabDoc built from ``<template>.csv`` in ``metatab.templates``."""
    from os.path import dirname
    from rowgenerators.util import fs_join
    import metatab.templates
    from metatab.doc import MetatabDoc

    templates_dir = dirname(metatab.templates.__file__)

    return MetatabDoc(fs_join(templates_dir, template + '.csv'))
113 |
114 |
115 |
import mimetypes

mimetypes.init()

# Reverse of mimetypes.types_map: MIME type -> extension without the leading
# dot. When several extensions share a MIME type, the last one iterated wins.
mime_map = dict(
    (mime_type, extension.strip('.'))
    for extension, mime_type in mimetypes.types_map.items()
)

# Explicit entries, applied last so they override whatever was derived above.
mime_map.update({
    'application/x-zip-compressed': 'zip',
    'application/vnd.ms-excel': 'xls',
    'text/html': 'html',
})
123 |
124 |
125 | # From https://gist.github.com/zdavkeos/1098474
def walk_up(bottom):
    """Mimic os.walk, but walk 'up' the directory tree instead of down.

    Starting at the real path of ``bottom``, yield ``(dirpath, dirnames,
    filenames)`` for that directory and then for each parent in turn, stopping
    once a directory is its own parent.

    :param bottom: Directory to start from.
    :return: Generator of ``(path, dirs, nondirs)`` tuples.
    """
    import os
    from os import path

    current = path.realpath(bottom)

    while True:
        dirs, nondirs = [], []
        for entry in os.listdir(current):
            bucket = dirs if path.isdir(path.join(current, entry)) else nondirs
            bucket.append(entry)

        yield current, dirs, nondirs

        parent = path.realpath(path.join(current, '..'))

        # At the top, the parent resolves to the directory itself.
        if parent == current:
            return

        current = parent
159 |
160 |
def ensure_dir(path):
    """Create ``path`` and any missing parents unless it already exists.

    A falsy ``path`` (empty string or None) is ignored.
    """
    if not path or exists(path):
        return

    makedirs(path)
164 |
165 |
def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of directory ``src`` into the existing directory ``dst``.

    Sub-directories are copied with ``shutil.copytree`` (which receives
    ``symlinks`` and ``ignore``); everything else is copied with
    ``shutil.copy2``.
    """
    for name in os.listdir(src):
        source_path = os.path.join(src, name)
        dest_path = os.path.join(dst, name)

        if not os.path.isdir(source_path):
            shutil.copy2(source_path, dest_path)
            continue

        shutil.copytree(source_path, dest_path, symlinks, ignore)
174 |
175 |
# Loggers used by cli_init(), prt(), warn() and err(): 'user' for normal
# output, 'cli-errors' for warnings and errors, 'debug' for debugging.
logger, logger_err, debug_logger = (
    logging.getLogger(channel) for channel in ('user', 'cli-errors', 'debug')
)
179 |
180 |
def cli_init(log_level=logging.INFO):
    """Attach stream handlers to the 'user' and 'cli-errors' loggers.

    The 'user' logger writes bare messages to stdout at ``log_level``; the
    'cli-errors' logger writes ``LEVEL: message`` to stderr at WARN.
    """

    def attach(target, stream, fmt, level):
        handler = logging.StreamHandler(stream)
        handler.setFormatter(logging.Formatter(fmt))
        handler.setLevel(level)
        target.addHandler(handler)
        target.setLevel(level)

    attach(logger, sys.stdout, '%(message)s', log_level)
    attach(logger_err, sys.stderr, '%(levelname)s: %(message)s', logging.WARN)
193 |
194 |
def prt(*args, **kwargs):
    """Log the space-joined string forms of ``args`` at INFO level on the 'user' logger."""
    message = ' '.join(map(str, args))
    logger.info(message, **kwargs)
197 |
198 |
def warn(*args, **kwargs):
    """Log the space-joined string forms of ``args`` at WARNING level.

    The message goes to the 'cli-errors' logger; ``kwargs`` are passed through
    to the logging call.
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    logger_err.warning(' '.join(str(e) for e in args), **kwargs)
201 |
202 |
def err(*args, **kwargs):
    """Log the space-joined ``args`` at CRITICAL level, then exit with status 1."""
    message = ' '.join(map(str, args))
    logger_err.critical(message, **kwargs)
    raise SystemExit(1)
206 |
207 |
def import_name_or_class(name):
    """Resolve a fully qualified dotted name to the object it names.

    A string such as ``"a.b.c.d"`` imports module ``a.b.c`` and returns its
    attribute ``d``. Anything that is not a string is assumed to already be the
    wanted object and is returned unchanged.

    :raises ValueError: If ``name`` is a string without a dot.
    """
    if not isinstance(name, str):
        return name

    from functools import reduce

    # for "a.b.c.d" -> [ 'a.b.c', 'd' ]
    module_name, _ = name.rsplit('.', 1)

    # __import__ loads every level of the module path but returns the top
    # level package, so descend through the remaining components from there.
    top_level = __import__(module_name)

    return reduce(getattr, name.split('.')[1:], top_level)
227 |
228 |
def md5_file(filePath):
    """Return the hex MD5 digest of a file, read in 8192-byte chunks.

    Returns None when the path does not exist or is a directory.
    """
    import hashlib

    digest = hashlib.md5()

    try:
        with open(filePath, 'rb') as stream:
            for chunk in iter(lambda: stream.read(8192), b''):
                digest.update(chunk)
    except (FileNotFoundError, IsADirectoryError):
        return None

    return digest.hexdigest()
--------------------------------------------------------------------------------
/metatab/cli.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
2 | # Revised BSD License, included in this distribution as LICENSE
3 |
4 | """
5 | CLI program for managing Metatab files
6 | """
7 |
8 | import json
9 | import sys
10 | from genericpath import exists
11 |
12 | from metatab import DEFAULT_METATAB_FILE, MetatabDoc, parse_app_url
13 | from rowgenerators.util import get_cache, clean_cache
14 | from os.path import dirname
15 | from rowgenerators.util import fs_join as join
16 |
17 | import logging
18 |
# Loggers used by cli_init(), prt(), warn() and err(): 'user' for normal
# output, 'cli-errors' for warnings and errors, 'debug' for debugging.
logger, logger_err, debug_logger = (
    logging.getLogger(channel) for channel in ('user', 'cli-errors', 'debug')
)
22 |
# Module-level cache from rowgenerators' get_cache(); metatab() shows its
# system path in the argparse epilog and passes it to each MetatabDoc it opens.
cache = get_cache()
24 |
def metatab():
    """Entry point of the ``metatab`` command line program.

    Parses ``sys.argv`` and then either creates a new Metatab file from a
    template (``-C``), prints a declaration (``-d``), prints the first value
    of a term (``-f``), or converts the document to the selected output type
    (terms, json, yaml, line, csv or a pretty-printed dict). With ``-W`` the
    json, yaml, line and csv output is written next to the source file instead
    of being printed. The function always ends the process through
    ``sys.exit``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser',
        epilog='Cache dir: {}\n'.format(str(cache.getsyspath('/'))))

    g = parser.add_mutually_exclusive_group()

    g.add_argument('-C', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_const', dest='out_type', const='terms',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-j', '--json', default=False, action='store_const', dest='out_type', const='json',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_const', dest='out_type', const='yaml',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-l', '--line', default=False, action='store_const', dest='out_type', const='line',
                   help='Parse a file and print out a Metatab Line representation')

    g.add_argument('-c', '--csv', default=False, action='store_const', dest='out_type', const='csv',
                   help='Parse a file and print out a Metatab CSV representation')

    g.add_argument('-p', '--prety', default=False, action='store_const', dest='out_type', const='prety',
                   help='Pretty print the python Dict representation ')

    parser.add_argument('-W', '--write-in-place',
                        help='When outputting as yaml, json, csv or line, write the file instead of printing it, '
                             'to a file with same base name and appropriate extension ', action='store_true')

    # Applied after the per-option defaults, so CSV is the output type when no
    # format flag is given.
    parser.set_defaults(out_type='csv')

    parser.add_argument('-f', '--find-first',
                        help='Find and print the first value for a fully qualified term name')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE, help='Path to a Metatab file')

    cli_init()

    args = parser.parse_args(sys.argv[1:])

    # Specifying only a fragment would replace the default metadata file name,
    # so put the default name back in front of it.
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    # -C without a value parses to None, so compare against the False default.
    if args.create is not False:
        if new_metatab_file(args.file, args.create):
            prt("Created ", args.file)
        else:
            warn("File", args.file, 'already exists.')

        sys.exit(0)

    metadata_url = parse_app_url(args.file, proto='metatab')
    try:
        doc = MetatabDoc(metadata_url, cache=cache)
    except IOError as e:
        err("Failed to open '{}': {}".format(metadata_url, e))

    def write_or_print(t):
        """Print ``t``, or with -W write it beside the source file."""

        if not args.write_in_place:
            print(t)
            return

        # Only writing needs a local file; printing works for any URL scheme.
        if metadata_url.scheme != 'file':
            err("Can only use -W with local files")
            return

        ext = 'txt' if args.out_type == 'line' else args.out_type

        with metadata_url.fspath.with_suffix('.' + ext).open('w') as f:
            f.write(t)

    if args.show_declaration:

        decl_doc = MetatabDoc('', cache=cache, decl=metadata_url.path)

        d = {
            'terms': decl_doc.decl_terms,
            'sections': decl_doc.decl_sections
        }

        if args.out_type == 'json':
            print(json.dumps(d, indent=4))

        elif args.out_type == 'yaml':
            import yaml
            print(yaml.safe_dump(d, default_flow_style=False, indent=4))

    elif args.find_first:

        t = doc.find_first(args.find_first)

        if t is None:
            err("Did not find term '{}'".format(args.find_first))

        print(t.value)

    elif args.out_type == 'terms':
        for t in doc._term_parser:
            print(t)

    elif args.out_type == 'json':
        write_or_print(json.dumps(doc.as_dict(), indent=4))

    elif args.out_type == 'yaml':
        import yaml
        from collections import OrderedDict

        def ordered_dump(data, stream=None, Dumper=yaml.Dumper, **kwds):
            """Dump ``data`` with OrderedDicts represented as plain mappings."""

            class OrderedDumper(Dumper):
                pass

            def _dict_representer(dumper, data):
                return dumper.represent_mapping(
                    yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
                    data.items())

            OrderedDumper.add_representer(OrderedDict, _dict_representer)
            return yaml.dump(data, stream, OrderedDumper, **kwds)

        write_or_print(ordered_dump(doc.as_dict(), default_flow_style=False, indent=4, Dumper=yaml.SafeDumper))

    elif args.out_type == 'line':
        write_or_print(doc.as_lines())

    elif args.out_type == 'csv':
        write_or_print(doc.as_csv())

    elif args.out_type == 'prety':
        from pprint import pprint
        pprint(doc.as_dict())

    sys.exit(0)
167 |
168 |
169 |
170 |
171 |
def cli_init(log_level=logging.INFO):
    """Attach stream handlers to the 'user' and 'cli-errors' loggers.

    The 'user' logger writes bare messages to stdout at ``log_level``; the
    'cli-errors' logger writes ``LEVEL: message`` to stderr at WARN.
    """
    handler_specs = (
        (logger, sys.stdout, '%(message)s', log_level),
        (logger_err, sys.stderr, '%(levelname)s: %(message)s', logging.WARN),
    )

    for target, stream, fmt, level in handler_specs:
        handler = logging.StreamHandler(stream)
        handler.setFormatter(logging.Formatter(fmt))
        handler.setLevel(level)
        target.addHandler(handler)
        target.setLevel(level)
185 |
def prt(*args, **kwargs):
    """Log the space-joined string forms of ``args`` at INFO level on the 'user' logger."""
    message = ' '.join(map(str, args))
    logger.info(message, **kwargs)
188 |
def warn(*args, **kwargs):
    """Log the space-joined string forms of ``args`` at WARNING level.

    The message goes to the 'cli-errors' logger; ``kwargs`` are passed through
    to the logging call.
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    logger_err.warning(' '.join(str(e) for e in args), **kwargs)
191 |
def err(*args, **kwargs):
    """Log the space-joined ``args`` at CRITICAL level, then exit with status 1."""
    message = ' '.join(map(str, args))
    logger_err.critical(message, **kwargs)
    raise SystemExit(1)
195 |
196 |
def make_metatab_file(template='metatab'):
    """Return a MetatabDoc built from ``<template>.csv`` in ``metatab.templates``."""
    import metatab.templates as tmpl

    templates_dir = dirname(tmpl.__file__)

    return MetatabDoc(join(templates_dir, template + '.csv'))
205 |
206 |
207 |
def new_metatab_file(mt_file, template):
    """Write a new Metatab CSV file from a template unless the file exists.

    :param mt_file: Path of the file to create.
    :param template: Template name; a falsy value selects 'metatab'.
    :return: True if the file was written, False if it already existed.
    """
    if exists(mt_file):
        return False

    make_metatab_file(template or 'metatab').write_csv(mt_file)

    return True
221 |
222 |
def get_table(doc, name):
    """Return the first ``Root.Table`` term of ``doc`` whose value is ``name``.

    When there is no such term, the names of the tables that do exist are
    reported through ``err()``, which exits the program.
    """
    found = doc.find_first('Root.Table', value=name)

    if found:
        return found

    known = ["'" + table.value + "'" for table in doc.find('Root.Table')] or [""]

    err("Did not find schema for table name '{}' Tables are: {}"
        .format(name, " ".join(known)))

    return found
237 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/example1-web.json:
--------------------------------------------------------------------------------
1 | {
2 | "declare": "http://assets.metatab.org/metatab-0.1.csv",
3 | "title": "Registered Voters, By County",
4 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
5 | "identifier": "cdph.ca.gov-hci-registered_voters-county",
6 | "version": "201404",
7 | "obsoletes": "cdph.ca.gov-hci-registered_voters-county-201304",
8 | "format": "excel",
9 | "spatial": "California <04000US06>",
10 | "time": "2002-2014",
11 | "spatialgrain": "County <05000US>",
12 | "datafile": [
13 | {
14 | "table": "registered_voters",
15 | "grain": "County",
16 | "title": "The First Example Data File",
17 | "url": "http://example.com/example1.csv"
18 | },
19 | {
20 | "table": "registered_voters",
21 | "grain": "Tract",
22 | "title": "The Second Example Data File",
23 | "url": "http://example.com/example2.csv"
24 | }
25 | ],
26 | "homepage": {
27 | "table": "Healthy Communities Data and Indicators Project (HCI)",
28 | "url": "https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx"
29 | },
30 | "documentation": [
31 | {
32 | "table": "Indicator Documentation for Voter Registration / Participation",
33 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",
34 | "url": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf"
35 | },
36 | {
37 | "title": "Data Bundles Packaging Specification",
38 | "url": "https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#"
39 | }
40 | ],
41 | "creator": {
42 | "email": "HCIOHE@cdph.ca.gov",
43 | "name": "Office of Health Equity"
44 | },
45 | "wrangler": {
46 | "email": "eric@civicknowledge.com",
47 | "name": "Eric Busboom"
48 | },
49 | "note": "This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details.",
50 | "table": {
51 | "description": "HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters",
52 | "column": [
53 | {
54 | "datatype": "int",
55 | "valuetype": "year range",
56 | "description": "Year or years that indicator was reported",
57 | "name": "reportyear"
58 | },
59 | {
60 | "datatype": "str",
61 | "valuetype": "dimension",
62 | "description": "Type of record",
63 | "name": "type"
64 | },
65 | {
66 | "datatype": "str",
67 | "valuetype": "gvid",
68 | "description": "GVid version of the geotype and geotypeval",
69 | "name": "gvid"
70 | },
71 | {
72 | "datatype": "str",
73 | "valuetype": "label for gvid",
74 | "description": "Census name of geographic area",
75 | "name": "geoname"
76 | },
77 | {
78 | "datatype": "str",
79 | "valuetype": "label",
80 | "description": "Code for type of geographic area",
81 | "name": "geotype"
82 | },
83 | {
84 | "datatype": "str",
85 | "valuetype": "census",
86 | "description": "Census geoid code",
87 | "name": "geotypevalue"
88 | },
89 | {
90 | "datatype": "str",
91 | "valuetype": "FIPS county code",
92 | "description": "County FIPS code",
93 | "name": "county_fips"
94 | },
95 | {
96 | "datatype": "str",
97 | "valuetype": "label for counrty_fips",
98 | "description": "County name",
99 | "name": "county_name"
100 | },
101 | {
102 | "datatype": "str",
103 | "valuetype": "census code",
104 | "description": "Numeric code of region",
105 | "name": "region_code"
106 | },
107 | {
108 | "datatype": "str",
109 | "valuetype": "label for region_code",
110 | "description": "Name of region",
111 | "name": "region_name"
112 | },
113 | {
114 | "datatype": "str",
115 | "valuetype": "raceth/civick",
116 | "description": "Civic Knowledge race / ethnicity code.",
117 | "name": "raceth"
118 | },
119 | {
120 | "datatype": "str",
121 | "valuetype": "label for raceeth",
122 | "description": "Race / Ethnicity Name",
123 | "name": "raceth_name"
124 | },
125 | {
126 | "datatype": "str",
127 | "valuetype": "raceth/hci",
128 | "description": "Race / ethnicity code",
129 | "name": "race_eth_code"
130 | },
131 | {
132 | "datatype": "str",
133 | "valuetype": "label for race_eth_code",
134 | "description": "Race / ethnicity name",
135 | "name": "race_eth_name"
136 | },
137 | {
138 | "datatype": "int",
139 | "valuetype": "count",
140 | "description": "Adults who are registered to vote, or who voted, depending on type of record",
141 | "name": "numerator"
142 | },
143 | {
144 | "datatype": "int",
145 | "valuetype": "count",
146 | "description": "Population of Adults, 18 years or older",
147 | "name": "denominator"
148 | },
149 | {
150 | "datatype": "float",
151 | "valuetype": "percent of numerator over denominator",
152 | "description": "Percent of adults who are registered to vote, or who voted, depending on type of record",
153 | "name": "percent"
154 | },
155 | {
156 | "datatype": "float",
157 | "valuetype": "ci95l for percent",
158 | "description": "Lower bound of 95% confidence interval",
159 | "name": "ll_95ci"
160 | },
161 | {
162 | "datatype": "float",
163 | "valuetype": "ci95u for percent",
164 | "description": "Upper bound of 95% confidence interval",
165 | "name": "ul_95ci"
166 | },
167 | {
168 | "datatype": "float",
169 | "valuetype": "se for percent",
170 | "description": "Standard error",
171 | "name": "se"
172 | },
173 | {
174 | "datatype": "float",
175 | "valuetype": "rse for percent",
176 | "description": "Relative standard error (se/percent * 100) expressed as a percent",
177 | "name": "rse"
178 | },
179 | {
180 | "datatype": "float",
181 | "valuetype": "decile",
182 | "description": "Statewide decile ranking",
183 | "name": "ca_decile"
184 | },
185 | {
186 | "datatype": "float",
187 | "valuetype": "ratio",
188 | "description": "Ratio of indicator to state average",
189 | "name": "ca_rr"
190 | },
191 | {
192 | "datatype": "float",
193 | "valuetype": "measure",
194 | "description": "Voter age population, from CA Department of Finance.",
195 | "name": "vap"
196 | },
197 | {
198 | "datatype": "str",
199 | "valuetype": "dimension",
200 | "name": "ind_id"
201 | },
202 | {
203 | "datatype": "str",
204 | "valuetype": "dimension",
205 | "name": "ind_definition"
206 | },
207 | {
208 | "datatype": "str",
209 | "valuetype": "other",
210 | "name": "version"
211 | }
212 | ],
213 | "name": "registered_voters"
214 | }
215 | }
--------------------------------------------------------------------------------
/metatab/test/outputs/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 | "sections": {
3 | "contacts": {
4 | "terms": [
5 | "Wrangler",
6 | "Wrangler.Email",
7 | "Wrangler.Name",
8 | "Creator",
9 | "Creator.Email",
10 | "Creator.Name",
11 | "Publisher",
12 | "Publisher.Email",
13 | "Publisher.Name"
14 | ],
15 | "args": [
16 | "email"
17 | ]
18 | },
19 | "declaredterms": {
20 | "terms": [
21 | "DeclareTerm"
22 | ],
23 | "args": []
24 | },
25 | "resources": {
26 | "terms": [
27 | "Datafile",
28 | "Datafile.Grain",
29 | "Datafile.Table",
30 | "Datafile.Title",
31 | "Datafile.Url",
32 | "Documentation",
33 | "Documentation.Description",
34 | "Documentation.Title",
35 | "Documentation.Url",
36 | "Homepage",
37 | "Homepage.Title",
38 | "Homepage.Url"
39 | ],
40 | "args": [
41 | "table",
42 | "grain",
43 | "title"
44 | ]
45 | },
46 | "declaredsections": {
47 | "terms": [
48 | "DeclareSection"
49 | ],
50 | "args": []
51 | },
52 | "schema": {
53 | "terms": [
54 | "Table",
55 | "Table.Column",
56 | "Table.Description",
57 | "Table.Name",
58 | "Column",
59 | "Column.Datatype",
60 | "Column.Description",
61 | "Column.Name",
62 | "Column.Valuetype"
63 | ],
64 | "args": [
65 | "datatype",
66 | "valuetype",
67 | "description"
68 | ]
69 | },
70 | "root": {
71 | "terms": [
72 | "Declare",
73 | "Include",
74 | "Section",
75 | "Synonym",
76 | "Title",
77 | "Description",
78 | "Time",
79 | "Version",
80 | "Format",
81 | "Identifier",
82 | "Note",
83 | "Obsoletes",
84 | "Spatial",
85 | "SpatialGrain"
86 | ],
87 | "args": []
88 | }
89 | },
90 | "terms": {
91 | ".include": {
92 | "term_name": "Include",
93 | "section": "root"
94 | },
95 | ".declare": {
96 | "term_name": "Declare",
97 | "section": "root"
98 | },
99 | ".title": {
100 | "term_name": "Title",
101 | "section": "root"
102 | },
103 | "datafile.grain": {
104 | "term_name": "Datafile.Grain",
105 | "section": "resources"
106 | },
107 | "homepage.url": {
108 | "term_name": "Homepage.Url",
109 | "section": "resources"
110 | },
111 | ".synonym": {
112 | "term_name": "Synonym",
113 | "childpropertytype": "sequence",
114 | "termvaluename": "term",
115 | "section": "root"
116 | },
117 | "homepage.title": {
118 | "term_name": "Homepage.Title",
119 | "section": "resources"
120 | },
121 | ".datafile": {
122 | "term_name": "Datafile",
123 | "termvaluename": "url",
124 | "section": "resources"
125 | },
126 | ".obsoletes": {
127 | "term_name": "Obsoletes",
128 | "section": "root"
129 | },
130 | "documentation.url": {
131 | "term_name": "Documentation.Url",
132 | "section": "resources"
133 | },
134 | "table.description": {
135 | "term_name": "Table.Description",
136 | "section": "schema"
137 | },
138 | ".table": {
139 | "term_name": "Table",
140 | "termvaluename": "name",
141 | "section": "schema"
142 | },
143 | "documentation.description": {
144 | "term_name": "Documentation.Description",
145 | "section": "resources"
146 | },
147 | ".publisher": {
148 | "term_name": "Publisher",
149 | "termvaluename": "name",
150 | "section": "contacts"
151 | },
152 | "wrangler.email": {
153 | "term_name": "Wrangler.Email",
154 | "section": "contacts"
155 | },
156 | "publisher.name": {
157 | "term_name": "Publisher.Name",
158 | "section": "contacts"
159 | },
160 | ".note": {
161 | "term_name": "Note",
162 | "section": "root"
163 | },
164 | ".description": {
165 | "term_name": "Description",
166 | "section": "root"
167 | },
168 | "creator.email": {
169 | "term_name": "Creator.Email",
170 | "section": "contacts"
171 | },
172 | "column.valuetype": {
173 | "term_name": "Column.Valuetype",
174 | "section": "schema"
175 | },
176 | ".declareterm": {
177 | "term_name": "DeclareTerm",
178 | "termvaluename": "term",
179 | "section": "DeclaredTerms"
180 | },
181 | "datafile.table": {
182 | "term_name": "Datafile.Table",
183 | "section": "resources"
184 | },
185 | "table.column": {
186 | "term_name": "Table.Column",
187 | "childpropertytype": "sequence",
188 | "termvaluename": "name",
189 | "section": "schema"
190 | },
191 | ".documentation": {
192 | "term_name": "Documentation",
193 | "section": "resources"
194 | },
195 | "wrangler.name": {
196 | "term_name": "Wrangler.Name",
197 | "section": "contacts"
198 | },
199 | "column.description": {
200 | "term_name": "Column.Description",
201 | "section": "schema"
202 | },
203 | "documentation.title": {
204 | "term_name": "Documentation.Title",
205 | "section": "resources"
206 | },
207 | ".column": {
208 | "term_name": "Column",
209 | "termvaluename": "name",
210 | "synonym": "Table.Column",
211 | "section": "schema"
212 | },
213 | ".identifier": {
214 | "term_name": "Identifier",
215 | "section": "root"
216 | },
217 | "column.datatype": {
218 | "term_name": "Column.Datatype",
219 | "section": "schema"
220 | },
221 | "creator.name": {
222 | "term_name": "Creator.Name",
223 | "section": "contacts"
224 | },
225 | "column.name": {
226 | "term_name": "Column.Name",
227 | "section": "schema"
228 | },
229 | ".format": {
230 | "term_name": "Format",
231 | "section": "root"
232 | },
233 | ".spatialgrain": {
234 | "term_name": "SpatialGrain",
235 | "section": "root"
236 | },
237 | ".section": {
238 | "term_name": "Section",
239 | "childpropertytype": "sequence",
240 | "termvaluename": "name",
241 | "section": "root"
242 | },
243 | ".declaresection": {
244 | "term_name": "DeclareSection",
245 | "childpropertytype": "sequence",
246 | "termvaluename": "section",
247 | "section": "DeclaredSections"
248 | },
249 | "datafile.url": {
250 | "term_name": "Datafile.Url",
251 | "section": "resources"
252 | },
253 | "table.name": {
254 | "term_name": "Table.Name",
255 | "section": "schema"
256 | },
257 | ".time": {
258 | "term_name": "Time",
259 | "section": "root"
260 | },
261 | "datafile.title": {
262 | "term_name": "Datafile.Title",
263 | "section": "resources"
264 | },
265 | ".creator": {
266 | "term_name": "Creator",
267 | "termvaluename": "name",
268 | "section": "contacts"
269 | },
270 | ".homepage": {
271 | "term_name": "Homepage",
272 | "termvaluename": "url",
273 | "section": "resources"
274 | },
275 | ".spatial": {
276 | "term_name": "Spatial",
277 | "section": "root"
278 | },
279 | ".wrangler": {
280 | "term_name": "Wrangler",
281 | "termvaluename": "name",
282 | "section": "contacts"
283 | },
284 | "publisher.email": {
285 | "term_name": "Publisher.Email",
286 | "section": "contacts"
287 | },
288 | ".version": {
289 | "term_name": "Version",
290 | "section": "root"
291 | }
292 | }
293 | }
294 |
--------------------------------------------------------------------------------
/metatab/test/test-data/json/example1.json:
--------------------------------------------------------------------------------
1 | {
2 | "declare": "metatab-latest",
3 | "title": "Registered Voters, By County",
4 | "name": "cdph.ca.gov-hci-registered_voters-county",
5 | "description": "Percent of the eligible population registered to vote and the percent who voted in statewide elections.",
6 | "identifier": "cdph.ca.gov-hci-registered_voters-county",
7 | "version": "201404",
8 | "obsoletes": "cdph.ca.gov-hci-registered_voters-county-201304",
9 | "dataset": "voters",
10 | "origin": "example.com",
11 | "space": "Ca",
12 | "time": "2002-2014",
13 | "grain": "County",
14 | "format": "excel",
15 | "datafile": [
16 | {
17 | "name": "example1",
18 | "schema": "registered_voters",
19 | "grain": "County",
20 | "title": "The First Example Data File",
21 | "url": "http://example.com/example1.csv"
22 | },
23 | {
24 | "name": "example2",
25 | "schema": "registered_voters",
26 | "grain": "Tract",
27 | "title": "The Second Example Data File",
28 | "url": "http://example.com/example2.csv"
29 | }
30 | ],
31 | "homepage": [
32 | {
33 | "schema": "Healthy Communities Data and Indicators Project (HCI)",
34 | "url": "https://www.cdph.ca.gov/programs/pages/healthycommunityindicators.aspx"
35 | }
36 | ],
37 | "documentation": [
38 | {
39 | "schema": "Indicator Documentation for Voter Registration / Participation",
40 | "description": "Voter Registration/Participation: Percent of the eligible population registered to vote and the percent who voted in statewide elections",
41 | "url": "https://www.cdph.ca.gov/programs/Documents/HCI_RegisteredVoters_653_Narrative_and_examples_6-2-14.pdf"
42 | },
43 | {
44 | "title": "Data Bundles Packaging Specification",
45 | "url": "https://docs.google.com/document/d/16tb7x73AyF8pJ6e6IBcaIJAioEZCNBGDEksKYTXfdfg/edit#"
46 | }
47 | ],
48 | "creator": [
49 | {
50 | "email": "HCIOHE@cdph.ca.gov",
51 | "name": "Office of Health Equity"
52 | }
53 | ],
54 | "wrangler": [
55 | {
56 | "email": "eric@civicknowledge.com",
57 | "name": "Eric Busboom"
58 | }
59 | ],
60 | "note": [
61 | "This file is an example of a data bundle, a simple format for linking data to metadata using spreadsheets. See the specification for more details."
62 | ],
63 | "table": [
64 | {
65 | "description": "HCI Indicator 653.0: Percent of adults age 18 years and older who are registered voters",
66 | "column": [
67 | {
68 | "datatype": "int",
69 | "valuetype": "year range",
70 | "description": "Year or years that indicator was reported",
71 | "name": "reportyear"
72 | },
73 | {
74 | "datatype": "str",
75 | "valuetype": "dimension",
76 | "description": "Type of record",
77 | "name": "type"
78 | },
79 | {
80 | "datatype": "str",
81 | "valuetype": "gvid",
82 | "description": "GVid version of the geotype and geotypeval",
83 | "name": "gvid"
84 | },
85 | {
86 | "datatype": "str",
87 | "valuetype": "label for gvid",
88 | "description": "Census name of geographic area",
89 | "name": "geoname"
90 | },
91 | {
92 | "datatype": "str",
93 | "valuetype": "label",
94 | "description": "Code for type of geographic area",
95 | "name": "geotype"
96 | },
97 | {
98 | "datatype": "str",
99 | "valuetype": "census",
100 | "description": "Census geoid code",
101 | "name": "geotypevalue"
102 | },
103 | {
104 | "datatype": "str",
105 | "valuetype": "FIPS county code",
106 | "description": "County FIPS code",
107 | "name": "county_fips"
108 | },
109 | {
110 | "datatype": "str",
111 | "valuetype": "label for counrty_fips",
112 | "description": "County name",
113 | "name": "county_name"
114 | },
115 | {
116 | "datatype": "str",
117 | "valuetype": "census code",
118 | "description": "Numeric code of region",
119 | "name": "region_code"
120 | },
121 | {
122 | "datatype": "str",
123 | "valuetype": "label for region_code",
124 | "description": "Name of region",
125 | "name": "region_name"
126 | },
127 | {
128 | "datatype": "str",
129 | "valuetype": "raceth/civick",
130 | "description": "Civic Knowledge race / ethnicity code.",
131 | "name": "raceth"
132 | },
133 | {
134 | "datatype": "str",
135 | "valuetype": "label for raceeth",
136 | "description": "Race / Ethnicity Name",
137 | "name": "raceth_name"
138 | },
139 | {
140 | "datatype": "str",
141 | "valuetype": "raceth/hci",
142 | "description": "Race / ethnicity code",
143 | "name": "race_eth_code"
144 | },
145 | {
146 | "datatype": "str",
147 | "valuetype": "label for race_eth_code",
148 | "description": "Race / ethnicity name",
149 | "name": "race_eth_name"
150 | },
151 | {
152 | "datatype": "int",
153 | "valuetype": "count",
154 | "description": "Adults who are registered to vote, or who voted, depending on type of record",
155 | "name": "numerator"
156 | },
157 | {
158 | "datatype": "int",
159 | "valuetype": "count",
160 | "description": "Population of Adults, 18 years or older",
161 | "name": "denominator"
162 | },
163 | {
164 | "datatype": "float",
165 | "valuetype": "percent of numerator over denominator",
166 | "description": "Percent of adults who are registered to vote, or who voted, depending on type of record",
167 | "name": "percent"
168 | },
169 | {
170 | "datatype": "float",
171 | "valuetype": "ci95l for percent",
172 | "description": "Lower bound of 95% confidence interval",
173 | "name": "ll_95ci"
174 | },
175 | {
176 | "datatype": "float",
177 | "valuetype": "ci95u for percent",
178 | "description": "Upper bound of 95% confidence interval",
179 | "name": "ul_95ci"
180 | },
181 | {
182 | "datatype": "float",
183 | "valuetype": "se for percent",
184 | "description": "Standard error",
185 | "name": "se"
186 | },
187 | {
188 | "datatype": "float",
189 | "valuetype": "rse for percent",
190 | "description": "Relative standard error (se/percent * 100) expressed as a percent",
191 | "name": "rse"
192 | },
193 | {
194 | "datatype": "float",
195 | "valuetype": "decile",
196 | "description": "Statewide decile ranking",
197 | "name": "ca_decile"
198 | },
199 | {
200 | "datatype": "float",
201 | "valuetype": "ratio",
202 | "description": "Ratio of indicator to state average",
203 | "name": "ca_rr"
204 | },
205 | {
206 | "datatype": "float",
207 | "valuetype": "measure",
208 | "description": "Voter age population, from CA Department of Finance.",
209 | "name": "vap"
210 | },
211 | {
212 | "datatype": "str",
213 | "valuetype": "dimension",
214 | "name": "ind_id"
215 | },
216 | {
217 | "datatype": "str",
218 | "valuetype": "dimension",
219 | "name": "ind_definition"
220 | },
221 | {
222 | "datatype": "str",
223 | "valuetype": "other",
224 | "name": "version"
225 | }
226 | ],
227 | "name": "registered_voters"
228 | }
229 | ]
230 | }
--------------------------------------------------------------------------------
/examples/Pandas Reporter Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import pandasreporter as pr\n",
12 | "\n",
13 | "\n",
14 | "# B17001, Poverty Status by Sex by Age\n",
15 | "b17001 = pr.get_dataframe('B17001', '140', '05000US06073', cache=True).ct_columns\n",
16 | "# B17024, Age by Ratio of Income to Poverty Level\n",
17 | "b17024 = pr.get_dataframe('B17024', '140', '05000US06073', cache=True).ct_columns\n",
18 | "# B17017, Poverty Status by Household Type by Age of Householder\n",
19 | "b17017 = pr.get_dataframe('B17017', '140', '05000US06073', cache=True).ct_columns"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "# B17001 Poverty Status by Sex by Age\n",
27 | "\n",
28 | "For the [Poverty Status by Sex by Age](https://censusreporter.org/tables/B17001/) table, we'll select the columns for males and females, below poverty, 65 and older. \n",
29 | "\n",
30 | "**NOTE:** If you want to get seniors of a particular race, use tables `C17001a-g`, the condensed race iterations. The 'C' tables have fewer age ranges, but there is no 'C' table for all races: there is a `C17001a` for Whites, a condensed version of `B17001a`, but there is no `C17001` as a condensed version of `B17001`.\n",
31 | "\n"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "['B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 years',\n",
43 | " 'Margins for B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 years',\n",
44 | " 'B17001016 Total Income in the past 12 months below poverty level Male 75 years and over',\n",
45 | " 'Margins for B17001016 Total Income in the past 12 months below poverty level Male 75 years and over',\n",
46 | " 'B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 years',\n",
47 | " 'Margins for B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 years',\n",
48 | " 'B17001030 Total Income in the past 12 months below poverty level Female 75 years and over',\n",
49 | " 'Margins for B17001030 Total Income in the past 12 months below poverty level Female 75 years and over',\n",
50 | " 'B17001044 Total Income in the past 12 months at or above poverty level Male 65 to 74 years',\n",
51 | " 'Margins for B17001044 Total Income in the past 12 months at or above poverty level Male 65 to 74 years',\n",
52 | " 'B17001045 Total Income in the past 12 months at or above poverty level Male 75 years and over',\n",
53 | " 'Margins for B17001045 Total Income in the past 12 months at or above poverty level Male 75 years and over',\n",
54 | " 'B17001058 Total Income in the past 12 months at or above poverty level Female 65 to 74 years',\n",
55 | " 'Margins for B17001058 Total Income in the past 12 months at or above poverty level Female 65 to 74 years',\n",
56 | " 'B17001059 Total Income in the past 12 months at or above poverty level Female 75 years and over',\n",
57 | " 'Margins for B17001059 Total Income in the past 12 months at or above poverty level Female 75 years and over']"
58 | ]
59 | },
60 | "execution_count": 2,
61 | "output_type": "execute_result",
62 | "metadata": {}
63 | }
64 | ],
65 | "source": [
66 | "[e for e in b17001.columns if '65 to 74' in str(e) or '75 years' in str(e) ]"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 3,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/html": [
77 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " | \n",
82 | " geoid | \n",
83 | " B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 years | \n",
84 | " Margins for B17001015 Total Income in the past 12 months below poverty level Male 65 to 74 years | \n",
85 | " B17001016 Total Income in the past 12 months below poverty level Male 75 years and over | \n",
86 | " Margins for B17001016 Total Income in the past 12 months below poverty level Male 75 years and over | \n",
87 | " B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 years | \n",
88 | " Margins for B17001029 Total Income in the past 12 months below poverty level Female 65 to 74 years | \n",
89 | " B17001030 Total Income in the past 12 months below poverty level Female 75 years and over | \n",
90 | " Margins for B17001030 Total Income in the past 12 months below poverty level Female 75 years and over | \n",
91 | "
\n",
92 | " \n",
93 | " \n",
94 | " \n",
95 | " | 0 | \n",
96 | " 14000US06073004501 | \n",
97 | " 10.0 | \n",
98 | " 18.0 | \n",
99 | " 0.0 | \n",
100 | " 12.0 | \n",
101 | " 13.0 | \n",
102 | " 22.0 | \n",
103 | " 7.0 | \n",
104 | " 12.0 | \n",
105 | "
\n",
106 | " \n",
107 | " | 1 | \n",
108 | " 14000US06073019803 | \n",
109 | " 0.0 | \n",
110 | " 12.0 | \n",
111 | " 0.0 | \n",
112 | " 12.0 | \n",
113 | " 8.0 | \n",
114 | " 12.0 | \n",
115 | " 11.0 | \n",
116 | " 17.0 | \n",
117 | "
\n",
118 | " \n",
119 | " | 2 | \n",
120 | " 14000US06073006000 | \n",
121 | " 18.0 | \n",
122 | " 30.0 | \n",
123 | " 0.0 | \n",
124 | " 12.0 | \n",
125 | " 0.0 | \n",
126 | " 12.0 | \n",
127 | " 0.0 | \n",
128 | " 12.0 | \n",
129 | "
\n",
130 | " \n",
131 | " | 3 | \n",
132 | " 14000US06073008364 | \n",
133 | " 0.0 | \n",
134 | " 17.0 | \n",
135 | " 7.0 | \n",
136 | " 18.0 | \n",
137 | " 7.0 | \n",
138 | " 17.0 | \n",
139 | " 0.0 | \n",
140 | " 17.0 | \n",
141 | "
\n",
142 | " \n",
143 | " | 4 | \n",
144 | " 14000US06073008507 | \n",
145 | " 0.0 | \n",
146 | " 17.0 | \n",
147 | " 67.0 | \n",
148 | " 61.0 | \n",
149 | " 17.0 | \n",
150 | " 26.0 | \n",
151 | " 26.0 | \n",
152 | " 41.0 | \n",
153 | "
\n",
154 | " \n",
155 | "
\n",
156 | "
"
157 | ]
158 | },
159 | "output_type": "execute_result",
160 | "metadata": {}
161 | }
162 | ],
163 | "source": [
164 | "# Now create a subset dataframe with just the columns we need. \n",
165 | "b17001s = b17001[['geoid', 'B17001015', 'B17001016','B17001029','B17001030']]\n",
166 | "b17001s.head()"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## Senior poverty rates\n",
174 | "\n",
175 | "Summing the counts of seniors below the poverty level at the tract level is easy, but there is a *serious problem* with the results: the numbers are completely unstable. The minimum relative standard error (RSE) is 22%, and the median is about 60%. These are useless results. "
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 4,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "data": {
185 | "text/plain": [
186 | "count 576.000000\n",
187 | "mean 87.621218\n",
188 | "std 156.710591\n",
189 | "min 22.150407\n",
190 | "25% 43.645038\n",
191 | "50% 58.919310\n",
192 | "75% 82.136436\n",
193 | "max 1806.402183\n",
194 | "dtype: float64"
195 | ]
196 | },
197 | "execution_count": 4,
198 | "output_type": "execute_result",
199 | "metadata": {}
200 | }
201 | ],
202 | "source": [
203 | "b17001_65mf = pr.CensusDataFrame()\n",
204 | "b17001_65mf['geoid'] = b17001['geoid']\n",
205 | "b17001_65mf['poverty_65'], b17001_65mf['poverty_65_m90'] = b17001.sum_m('B17001015', 'B17001016','B17001029','B17001030')\n",
206 | "b17001_65mf.add_rse('poverty_65')\n",
207 | "b17001_65mf.poverty_65_rse.replace([np.inf, -np.inf], np.nan).dropna().describe()"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | ""
217 | ]
218 | }
219 | ],
220 | "metadata": {
221 | "kernelspec": {
222 | "display_name": "Python 3",
223 | "language": "python",
224 | "name": "python3"
225 | },
226 | "language_info": {
227 | "codemirror_mode": {
228 | "name": "ipython",
229 | "version": 3.0
230 | },
231 | "file_extension": ".py",
232 | "mimetype": "text/x-python",
233 | "name": "python",
234 | "nbconvert_exporter": "python",
235 | "pygments_lexer": "ipython3",
236 | "version": "3.5.0"
237 | }
238 | },
239 | "nbformat": 4,
240 | "nbformat_minor": 2
241 | }
--------------------------------------------------------------------------------