├── .flake8 ├── .gitignore ├── .pylintrc ├── .travis.yml ├── README.md ├── config.py ├── datausa ├── __init__.py ├── acs │ ├── __init__.py │ ├── abstract_models.py │ ├── automap_models.py │ ├── models.py │ └── stats_models.py ├── attrs │ ├── __init__.py │ ├── consts.py │ ├── models.py │ ├── search.py │ └── views.py ├── bea │ ├── __init__.py │ └── models.py ├── bls │ ├── __init__.py │ └── models.py ├── cbp │ ├── __init__.py │ ├── abstract_models.py │ └── models.py ├── chr │ ├── __init__.py │ └── models.py ├── core │ ├── __init__.py │ ├── api.py │ ├── attr_crosswalking.py │ ├── crosswalker.py │ ├── exceptions.py │ ├── join_api.py │ ├── models.py │ ├── registrar.py │ ├── streaming.py │ ├── table_manager.py │ └── views.py ├── dartmouth │ ├── __init__.py │ └── models.py ├── database.py ├── ed │ ├── __init__.py │ └── models.py ├── freight │ ├── __init__.py │ └── models.py ├── ipeds │ ├── __init__.py │ ├── abstract_models.py │ └── models.py ├── onet │ ├── __init__.py │ └── models.py ├── opiods │ ├── __init__.py │ └── models.py ├── pums │ ├── __init__.py │ ├── abstract_models.py │ ├── models.py │ └── models_5.py └── util │ ├── __init__.py │ ├── big_places.py │ └── inmem.py ├── requirements.txt ├── run.py ├── scripts ├── __init__.py ├── alt_fill_cache.py ├── build_search.py ├── cache_helper.applescript ├── fill_cache.py ├── fix_urlnames.py ├── flickr │ ├── __init__.py │ ├── analyze.py │ ├── flickr.py │ ├── grab.py │ └── short.py ├── gen_indicies.py ├── get_vnames.py ├── search │ ├── build_index.py │ ├── build_var_index.py │ ├── geo_aliases.csv │ ├── rebuild_index.py │ └── zip_lookup.sql ├── university_abbrev_gen.py ├── update_university_keywords.py └── url_names.py ├── search_index ├── MAIN_WRITELOCK ├── MAIN_hzur5fe2wkrq53me.seg └── _MAIN_1.toc ├── tests ├── __init__.py ├── test_joins.py └── test_search.py └── var_index ├── MAIN_WRITELOCK ├── MAIN_g1c93s1e37q8coxg.seg └── _MAIN_1.toc /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501,E402 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # OSX stuff 60 | .DS_Store 61 | 62 | # Ignore cache folder 63 | cache/ 64 | 65 | .env 66 | .envrc 67 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable=singleton-comparison 3 | generated-members=query 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | 5 | install: 6 | - pip install -r requirements.txt 7 | 8 | script: pytest tests/test_search.py 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataUSA API (Python) 2 | 3 | **⚠️ Deprecated ⚠️: The API that this documentation refers to (api.datausa.io) went into legacy mode in November 2018, and was replaced by a newer version of the API with access to new & expanded data: https://datausa.io/about/api/** 4 | 5 | To learn more about the API, visit the [DataUSA API Wiki page](https://github.com/DataUSA/datausa-api/wiki) or the [DataUSA Quick Start Guide](http://beta.datausa.io/about/api/). 6 | 7 | ## Installation 8 | 9 | **DataUSA** is a web platform built using Flask, an open-source Python web framework. This installation guide is written assuming a Linux or Linux-like environment. The following software must be installed locally in order to get DataUSA running: 10 | 11 | * Python 12 | * Postgres 13 | 14 | 1. Clone from GitHub 15 | git clone https://github.com/Datawheel/datausa-api.git 16 | 2. [optional] Create a virtual environment. We suggest installing [virtualenv](https://pypi.python.org/pypi/virtualenv) with [virtualenvwrapper](http://virtualenvwrapper.readthedocs.org/en/latest/), especially if the machine you are using hosts many other web projects. This allows Python libraries to be installed easily and specifically on a per-project basis. 17 | 18 | Once this is complete, run the following to initialize your datausa environment. 19 | 20 | mkvirtualenv datausa 21 | 22 | 3. Install prerequisites 23 | 24 | sudo apt-get install python-dev 25 | sudo apt-get install libpq-dev 26 | 27 | 4. Install Python modules 28 | 29 | pip install -r requirements.txt 30 | 31 | 5. Set environment variables 32 | 33 | export DATAUSA_DB_NAME=db_name 34 | export DATAUSA_DB_USER=postgres_user 35 | export DATAUSA_DB_PW=postgres_pw 36 | export DATAUSA_DB_HOST=127.0.0.1 37 | 38 | 6. 
Run the API 39 | 40 | python run.py 41 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | ''' Base directory where the site is held ''' 5 | basedir = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | ''' CSRF (cross-site request forgery) protection for signing POST requests to the server ''' 8 | CSRF_EN = True 9 | 10 | ''' Secret key should be set in an environment var ''' 11 | SECRET_KEY = os.environ.get("DATAVIVA_SECRET_KEY", "default-datausa-secret") 12 | 13 | ''' Default debugging to True ''' 14 | DEBUG = True 15 | SQLALCHEMY_ECHO = True 16 | SQLALCHEMY_TRACK_MODIFICATIONS = False 17 | SQLALCHEMY_POOL_SIZE = 15 18 | SQLALCHEMY_POOL_TIMEOUT = 180 19 | SQLALCHEMY_POOL_RECYCLE = 150 20 | SQLALCHEMY_DATABASE_URI = "postgres://{0}:{1}@{2}:{3}/{4}".format( 21 | os.environ.get("DATAUSA_DB_USER", "postgres"), 22 | os.environ.get("DATAUSA_DB_PW", ""), 23 | os.environ.get("DATAUSA_DB_HOST", "localhost"), 24 | os.environ.get("DATAUSA_DB_PORT", 5432), 25 | os.environ.get("DATAUSA_DB_NAME", "postgres")) 26 | 27 | ''' If an env var for production is set, turn off all debugging support ''' 28 | if "DATAUSA_PRODUCTION" in os.environ: 29 | SQLALCHEMY_ECHO = False 30 | DEBUG = False 31 | ERROR_EMAIL = True 32 | 33 | JSONIFY_PRETTYPRINT_REGULAR = False 34 | 35 | CACHE_TYPE = 'filesystem' 36 | CACHE_DIR = os.path.join(basedir, 'cache/') 37 | CACHE_DEFAULT_TIMEOUT = os.environ.get("CACHE_DEFAULT_TIMEOUT", 60 * 60 * 24 * 7 * 4) # 28 days 38 | CACHE_THRESHOLD = 5000 39 | 40 | FLICKR_DIR = os.environ.get("DATAUSA_FLICKR_DIR", os.path.join(basedir, '../datausa-site/datausa/static/img/splash')) 41 | SEARCH_INDEX_DIR = os.path.join(basedir, 'search_index/') 42 | VAR_INDEX_DIR = os.path.join(basedir, 'var_index/') 43 | -------------------------------------------------------------------------------- /datausa/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flask import Flask, jsonify 3 | from flask_compress import Compress 4 | # from flask.ext.cors import CORS 5 | from flask_cache import Cache 6 | 7 | app = Flask(__name__) 8 | app.config.from_object('config') 9 | Compress(app) 10 | cache = Cache(app) 11 | 12 | from datausa.attrs.views import mod as attrs_module 13 | from datausa.core.views import mod as core_module 14 | 15 | app.register_blueprint(attrs_module) 16 | app.register_blueprint(core_module) 17 | 18 | # CORS(app) 19 | 20 | @app.errorhandler(500) 21 | def error_page(err): 22 | return jsonify(error=str(err)), 500 23 | -------------------------------------------------------------------------------- /datausa/acs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/acs/__init__.py -------------------------------------------------------------------------------- /datausa/acs/abstract_models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.ext.declarative import declared_attr 2 | 3 | from datausa.database import db 4 | from datausa.attrs.models import Geo, AcsOcc, AcsInd, GeoContainment 5 | from datausa.attrs.models import AcsLanguage, Insurance, AgeBucket 6 | from datausa.core.models import BaseModel 7 | from datausa.attrs.consts import NATION, STATE, COUNTY 8 | from datausa.attrs.consts import PUMA, MSA, ALL, GEO 9 | from
datausa.attrs.consts import PLACE, TRACT 10 | from sqlalchemy.sql import func 11 | 12 | class BaseHealth(object): 13 | __virtual_schema__ = "acs_health" 14 | hc_pop = db.Column(db.Float) 15 | hc_pop_moe = db.Column(db.Float) 16 | hc_pop_rca = db.Column(db.Float) 17 | 18 | class AcsIndId(object): 19 | LEVELS = ["0", "1", "2", ALL] 20 | 21 | @classmethod 22 | def acs_ind_filter(cls, level): 23 | if level == ALL: 24 | return True 25 | else: 26 | target = (int(level) * 2) + 2 27 | return func.length(cls.acs_ind) == target 28 | 29 | @classmethod 30 | def get_supported_levels(cls): 31 | return {"acs_ind": AcsIndId.LEVELS} 32 | 33 | @declared_attr 34 | def acs_ind(cls): 35 | return db.Column(db.String(), db.ForeignKey(AcsInd.id), 36 | primary_key=True) 37 | 38 | class AcsOccId(object): 39 | LEVELS = ["0", "1", "2", "3", "4", ALL] 40 | 41 | @classmethod 42 | def get_supported_levels(cls): 43 | return {"acs_occ": AcsOccId.LEVELS} 44 | 45 | @declared_attr 46 | def acs_occ(cls): 47 | return db.Column(db.String(), db.ForeignKey(AcsOcc.id), 48 | primary_key=True) 49 | 50 | @classmethod 51 | def acs_occ_filter(cls, level): 52 | if level == ALL: 53 | return True 54 | else: 55 | target = (int(level) * 2) + 2 56 | return func.length(cls.acs_occ) == target 57 | 58 | 59 | class GeoId(object): 60 | LEVELS = [NATION, STATE, COUNTY, MSA, PUMA, PLACE, TRACT, ALL] 61 | LEVELS_1YR = [NATION, STATE, COUNTY, MSA, PLACE, ALL] 62 | LEVELS_5YR = LEVELS 63 | 64 | JOINED_FILTER = {"geo": { 65 | "triggers": [("tract", "160"), ("tract", "310"), 66 | ("tract", "050"), ("county", "310")], 67 | "table": GeoContainment.parent, 68 | "column": GeoContainment.parent_geoid, 69 | "id": GeoContainment.child_geoid, 70 | }} 71 | @classmethod 72 | def get_supported_levels(cls): 73 | return {GEO: GeoId.LEVELS} 74 | 75 | @classmethod 76 | def geo_filter(cls, level): 77 | if level == ALL: 78 | return True 79 | level_map = {NATION: "010", STATE: "040", 80 | PUMA: "795", MSA: "310", 81 | COUNTY: "050", PLACE: "160", TRACT: "140"} 82 | level_code = level_map[level] 83 | return cls.geo.startswith(level_code) 84 | 85 | @declared_attr 86 | def geo(cls): 87 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 88 | 89 | class GeoId1(GeoId): 90 | @classmethod 91 | def get_supported_levels(cls): 92 | return {GEO: GeoId.LEVELS_1YR} 93 | 94 | class GeoId5(GeoId): 95 | @classmethod 96 | def get_supported_levels(cls): 97 | return {GEO: GeoId.LEVELS_5YR} 98 | 99 | class BaseAcs5(db.Model, BaseModel): 100 | __abstract__ = True 101 | schema_name = 'acs_5yr' 102 | __table_args__ = {"schema": schema_name, "extend_existing": True} 103 | supported_levels = {} 104 | source_title = 'ACS 5-year Estimate' 105 | source_link = 'http://www.census.gov/programs-surveys/acs/' 106 | source_org = 'Census Bureau' 107 | CACHED_YEARS = [2013, 2014, 2015, 2016] 108 | 109 | @declared_attr 110 | def year(cls): 111 | return db.Column(db.Integer, primary_key=True) 112 | 113 | 114 | class BaseAcs3(db.Model, BaseModel): 115 | __abstract__ = True 116 | schema_name = 'acs_3year' 117 | __table_args__ = {"schema": schema_name} 118 | supported_levels = {} 119 | source_title = 'ACS 3-year Estimate' 120 | source_link = 'http://www.census.gov/programs-surveys/acs/' 121 | source_org = 'Census Bureau' 122 | 123 | @declared_attr 124 | def year(cls): 125 | return db.Column(db.Integer, primary_key=True) 126 | 127 | 128 | class BaseAcs1(db.Model, BaseModel): 129 | __abstract__ = True 130 | schema_name = 'acs_1yr' 131 | __table_args__ = {"schema": schema_name, 
"extend_existing": True} 132 | supported_levels = {} 133 | source_title = 'ACS 1-year Estimate' 134 | source_link = 'http://www.census.gov/programs-surveys/acs/' 135 | source_org = 'Census Bureau' 136 | CACHED_YEARS = [2013, 2014, 2015, 2016] 137 | 138 | @declared_attr 139 | def year(cls): 140 | return db.Column(db.Integer, primary_key=True) 141 | 142 | 143 | class Ygl_Speakers(object): 144 | median_moe = 2 145 | 146 | num_speakers = db.Column(db.Float) 147 | num_speakers_moe = db.Column(db.Float) 148 | num_speakers_rca = db.Column(db.Float) 149 | 150 | @declared_attr 151 | def language(cls): 152 | return db.Column(db.String(), db.ForeignKey(AcsLanguage.id), primary_key=True) 153 | -------------------------------------------------------------------------------- /datausa/acs/automap_models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa import cache 3 | from datausa.acs.abstract_models import BaseAcs1, BaseAcs5, GeoId, GeoId5, GeoId1 4 | from sqlalchemy.ext.automap import automap_base 5 | from sqlalchemy import MetaData 6 | 7 | metadata = cache.get("acs5_metadata") 8 | if not metadata: 9 | metadata = MetaData(schema=BaseAcs5.schema_name, bind=db.engine) 10 | metadata.reflect() 11 | cache.set("acs5_metadata", metadata) 12 | 13 | AutomapBase = automap_base(bind=db.engine, metadata=metadata) 14 | 15 | metadata_1yr = cache.get("acs1_metadata") 16 | if not metadata_1yr: 17 | metadata_1yr = MetaData(schema=BaseAcs1.schema_name, bind=db.engine) 18 | metadata_1yr.reflect() 19 | cache.set("acs1_metadata", metadata_1yr) 20 | 21 | AutomapBase_1yr = automap_base(bind=db.engine, metadata=metadata_1yr) 22 | 23 | # 1 year 24 | class Acs1_Yg_Income(AutomapBase_1yr, BaseAcs1, GeoId1): 25 | __tablename__ = "yg_income" 26 | median_moe = 1.2 27 | 28 | class Acs1_Yg_Poverty(AutomapBase_1yr, BaseAcs1, GeoId1): 29 | __tablename__ = 'yg_poverty' 30 | median_moe = 1.2 31 | 32 | class Acs1_Yg_Tenure(AutomapBase_1yr, BaseAcs1, GeoId1): 33 | __tablename__ = 'yg_tenure' 34 | median_moe = 1.2 35 | 36 | class Acs1_Yg(AutomapBase_1yr, BaseAcs1, GeoId1): 37 | __tablename__ = "yg" 38 | median_moe = 1.2 39 | 40 | class Acs1_Yg_IncDist(AutomapBase_1yr, BaseAcs1, GeoId1): 41 | __tablename__ = "yg_income_distribution" 42 | median_moe = 2.2 43 | 44 | class Acs1_Yg_PovertyRace(AutomapBase_1yr, BaseAcs1, GeoId1): 45 | __tablename__ = 'yg_poverty_race' 46 | median_moe = 2.2 47 | 48 | class Acs1_Yg_NatAge(AutomapBase_1yr, BaseAcs1, GeoId1): 49 | __tablename__ = 'yg_nativity_age' 50 | median_moe = 1.2 51 | 52 | class Acs1_Yg_Race(AutomapBase_1yr, BaseAcs1, GeoId1): 53 | __tablename__ = 'yg_race' 54 | median_moe = 1.2 55 | 56 | class Acs1_Yg_Conflict(AutomapBase_1yr, BaseAcs1, GeoId1): 57 | __tablename__ = "yg_conflict" 58 | median_moe = 2.2 59 | 60 | class Acs1_Yg_PropertyValue(AutomapBase_1yr, BaseAcs1, GeoId1): 61 | __tablename__ = 'yg_property_value' 62 | median_moe = 1.2 63 | 64 | class Acs1_Yg_PropertyTax(AutomapBase_1yr, BaseAcs1, GeoId1): 65 | __tablename__ = 'yg_property_tax' 66 | median_moe = 1.2 67 | 68 | class Acs1_Yg_Vehicles(AutomapBase_1yr, BaseAcs1, GeoId1): 69 | __tablename__ = 'yg_vehicles' 70 | median_moe = 1.2 71 | 72 | class Acs1_Yg_TravelTime(AutomapBase_1yr, BaseAcs1, GeoId1): 73 | __tablename__ = 'yg_travel_time' 74 | median_moe = 1.2 75 | 76 | class Acs1_Yg_Transport(AutomapBase_1yr, BaseAcs1, GeoId1): 77 | __tablename__ = 'yg_transport' 78 | median_moe = 1.2 79 | 80 | # 5 year 81 | 82 | class Acs5_Yg(AutomapBase, 
BaseAcs5, GeoId5): 83 | __tablename__ = "yg" 84 | median_moe = 1 85 | 86 | class Acs5_Yg_Conflict(AutomapBase, BaseAcs5, GeoId5): 87 | __tablename__ = "yg_conflict" 88 | median_moe = 2 89 | 90 | class Acs5_Yg_Income(AutomapBase, BaseAcs5, GeoId5): 91 | __tablename__ = "yg_income" 92 | median_moe = 1 93 | 94 | class Acs5_Yg_IncDist(AutomapBase, BaseAcs5, GeoId5): 95 | __tablename__ = "yg_income_distribution" 96 | median_moe = 2 97 | 98 | class Acs5_Yg_NatAge(AutomapBase, BaseAcs5, GeoId5): 99 | __tablename__ = 'yg_nativity_age' 100 | median_moe = 1 101 | 102 | 103 | class Acs5_Yg_Poverty(AutomapBase, BaseAcs5, GeoId5): 104 | __tablename__ = 'yg_poverty' 105 | median_moe = 1 106 | 107 | 108 | class Acs5_Yg_PropertyTax(AutomapBase, BaseAcs5, GeoId5): 109 | __tablename__ = 'yg_property_tax' 110 | median_moe = 1 111 | 112 | 113 | class Acs5_Yg_PropertyValue(AutomapBase, BaseAcs5, GeoId5): 114 | __tablename__ = 'yg_property_value' 115 | median_moe = 1 116 | 117 | 118 | class Acs5_Yg_Race(AutomapBase, BaseAcs5, GeoId5): 119 | __tablename__ = 'yg_race' 120 | median_moe = 1 121 | 122 | 123 | class Acs5_Yg_PovertyRace(AutomapBase, BaseAcs5, GeoId5): 124 | __tablename__ = 'yg_poverty_race' 125 | median_moe = 2 126 | 127 | 128 | class Acs5_Yg_Tenure(AutomapBase, BaseAcs5, GeoId5): 129 | __tablename__ = 'yg_tenure' 130 | median_moe = 1 131 | 132 | 133 | class Acs5_Yg_Transport(AutomapBase, BaseAcs5, GeoId5): 134 | __tablename__ = 'yg_transport' 135 | median_moe = 1 136 | 137 | 138 | class Acs5_Yg_TravelTime(AutomapBase, BaseAcs5, GeoId5): 139 | __tablename__ = 'yg_travel_time' 140 | median_moe = 1 141 | 142 | 143 | class Acs5_Yg_Vehicles(AutomapBase, BaseAcs5, GeoId5): 144 | __tablename__ = 'yg_vehicles' 145 | median_moe = 1 146 | 147 | AutomapBase_1yr.prepare(db.engine, reflect=False) 148 | AutomapBase.prepare(db.engine, reflect=False) 149 | -------------------------------------------------------------------------------- /datausa/acs/models.py: -------------------------------------------------------------------------------- 1 | from datausa.acs.abstract_models import GeoId, AcsOccId, db, AcsIndId 2 | from datausa.acs.abstract_models import BaseAcs1, BaseAcs3, BaseAcs5 3 | from datausa.acs.abstract_models import Ygl_Speakers, GeoId5, GeoId1, BaseHealth 4 | from datausa.attrs import consts 5 | from datausa.attrs.consts import NATION, STATE, MSA, PLACE, PUMA, COUNTY, ALL 6 | 7 | class Acs1_Ygi_Health(BaseAcs1, GeoId1, BaseHealth): 8 | __tablename__ = "ygi_health" 9 | median_moe = 2 10 | 11 | insurance = db.Column(db.String(), primary_key=True) 12 | 13 | @classmethod 14 | def get_supported_levels(cls): 15 | return { 16 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 17 | "insurance": [ALL] 18 | } 19 | 20 | class Acs1_Yga_Health(BaseAcs1, GeoId1, BaseHealth): 21 | __tablename__ = "yga_health" 22 | median_moe = 2 23 | 24 | age_bucket = db.Column(db.String(), primary_key=True) 25 | 26 | @classmethod 27 | def get_supported_levels(cls): 28 | return { 29 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 30 | "age_bucket": [ALL] 31 | } 32 | 33 | class Acs1_Ygai_Health(BaseAcs1, GeoId1, BaseHealth): 34 | __tablename__ = "ygai_health" 35 | median_moe = 3 36 | 37 | age_bucket = db.Column(db.String(), primary_key=True) 38 | insurance = db.Column(db.String(), primary_key=True) 39 | 40 | @classmethod 41 | def get_supported_levels(cls): 42 | return { 43 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 44 | "insurance": [ALL], 45 | "age_bucket": [ALL] 46 | } 47 | 48 | 49 | class 
Acs1_Ygis_Health(BaseAcs1, GeoId1, BaseHealth): 50 | __tablename__ = "ygis_health" 51 | median_moe = 3 52 | 53 | sex = db.Column(db.String(), primary_key=True) 54 | insurance = db.Column(db.String(), primary_key=True) 55 | 56 | @classmethod 57 | def get_supported_levels(cls): 58 | return { 59 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 60 | "insurance": [ALL], 61 | "sex": [ALL] 62 | } 63 | 64 | 65 | 66 | class Acs1_Ygas_Health(BaseAcs1, GeoId1, BaseHealth): 67 | __tablename__ = "ygas_health" 68 | median_moe = 3 69 | 70 | sex = db.Column(db.String(), primary_key=True) 71 | age_bucket = db.Column(db.String(), primary_key=True) 72 | 73 | @classmethod 74 | def get_supported_levels(cls): 75 | return { 76 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 77 | "sex": [ALL], 78 | "age_bucket": [ALL] 79 | } 80 | 81 | class Acs1_Ygs_Health(BaseAcs1, GeoId1, BaseHealth): 82 | __tablename__ = "ygs_health" 83 | median_moe = 2 84 | 85 | sex = db.Column(db.String(), primary_key=True) 86 | 87 | @classmethod 88 | def get_supported_levels(cls): 89 | return { 90 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 91 | "sex": [ALL] 92 | } 93 | 94 | 95 | class Acs1_Ygais_Health(BaseAcs1, GeoId1, BaseHealth): 96 | __tablename__ = "ygais_health" 97 | median_moe = 4 98 | 99 | sex = db.Column(db.String(), primary_key=True) 100 | age_bucket = db.Column(db.String(), primary_key=True) 101 | insurance = db.Column(db.String(), primary_key=True) 102 | 103 | @classmethod 104 | def get_supported_levels(cls): 105 | return { 106 | "geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 107 | "sex": [ALL], 108 | "insurance": [ALL], 109 | "age_bucket": [ALL] 110 | } 111 | 112 | 113 | class Acs1_Ygl_Speakers(BaseAcs1, GeoId1, Ygl_Speakers): 114 | __tablename__ = "ygl_speakers" 115 | median_moe = 2.2 116 | CACHED_YEARS = [2013, 2014, 2015] 117 | 118 | @classmethod 119 | def get_supported_levels(cls): 120 | return {"geo": GeoId.LEVELS_1YR, "language": [consts.ALL]} 121 | 122 | 123 | class Acs5_Ygl_Speakers(BaseAcs5, GeoId5, Ygl_Speakers): 124 | __tablename__ = "ygl_speakers" 125 | median_moe = 2 126 | CACHED_YEARS = [2013, 2014, 2015] 127 | 128 | @classmethod 129 | def get_supported_levels(cls): 130 | return {"geo": GeoId.LEVELS_5YR, "language": [consts.ALL]} 131 | 132 | 133 | class Acs3_Ygo_Num_Emp(BaseAcs3, GeoId, AcsOccId): 134 | __tablename__ = "ygo_num_emp" 135 | median_moe = 2 136 | 137 | num_emp = db.Column(db.Float) 138 | num_emp_moe = db.Column(db.Float) 139 | num_emp_rca = db.Column(db.Float) 140 | num_emp_male = db.Column(db.Float) 141 | num_emp_moe_male = db.Column(db.Float) 142 | num_emp_female = db.Column(db.Float) 143 | num_emp_moe_female = db.Column(db.Float) 144 | 145 | @classmethod 146 | def get_supported_levels(cls): 147 | return {"geo": [NATION, STATE, MSA, ALL], "acs_occ": AcsOccId.LEVELS} 148 | 149 | 150 | class Acs1_Ygo_Num_Emp(BaseAcs1, GeoId, AcsOccId): 151 | __tablename__ = "ygo_num_emp" 152 | median_moe = 2.5 153 | 154 | num_emp = db.Column(db.Float) 155 | num_emp_moe = db.Column(db.Float) 156 | num_emp_rca = db.Column(db.Float) 157 | num_emp_male = db.Column(db.Float) 158 | num_emp_moe_male = db.Column(db.Float) 159 | num_emp_female = db.Column(db.Float) 160 | num_emp_moe_female = db.Column(db.Float) 161 | 162 | @classmethod 163 | def get_supported_levels(cls): 164 | return {"geo": [NATION, COUNTY, MSA, PLACE, PUMA, ALL], "acs_occ": AcsOccId.LEVELS} 165 | 166 | 167 | class Acs1_Ygo_Earnings(BaseAcs1, GeoId, AcsOccId): 168 | __tablename__ = "ygo_med_earnings" 169 | median_moe = 
2.5 170 | 171 | med_earnings = db.Column(db.Float) 172 | med_earnings_male = db.Column(db.Float) 173 | med_earnings_female = db.Column(db.Float) 174 | med_earnings_moe = db.Column(db.Float) 175 | med_earnings_moe_female = db.Column(db.Float) 176 | med_earnings_moe_male = db.Column(db.Float) 177 | med_earnings_rca = db.Column(db.Float) 178 | 179 | @classmethod 180 | def get_supported_levels(cls): 181 | return {"geo": [NATION, STATE, COUNTY, MSA, PLACE, PUMA, ALL], "acs_occ": AcsOccId.LEVELS} 182 | 183 | 184 | class Acs1_Ygi_Num_Emp(BaseAcs1, AcsIndId, GeoId): 185 | __tablename__ = "ygi_num_emp" 186 | median_moe = 2.5 187 | 188 | num_emp = db.Column(db.Float) 189 | num_emp_moe = db.Column(db.Float) 190 | num_emp_rca = db.Column(db.Float) 191 | 192 | @classmethod 193 | def get_supported_levels(cls): 194 | return {"geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 195 | "acs_ind": ["0", "1", ALL]} 196 | 197 | 198 | class Acs5_Ygo_Num_Emp(BaseAcs5, GeoId, AcsOccId): 199 | __tablename__ = "ygo_num_emp" 200 | median_moe = 2 201 | 202 | num_emp = db.Column(db.Float) 203 | num_emp_moe = db.Column(db.Float) 204 | num_emp_rca = db.Column(db.Float) 205 | num_emp_male = db.Column(db.Float) 206 | num_emp_moe_male = db.Column(db.Float) 207 | num_emp_female = db.Column(db.Float) 208 | num_emp_moe_female = db.Column(db.Float) 209 | 210 | @classmethod 211 | def get_supported_levels(cls): 212 | return {"geo": [NATION, COUNTY, MSA, PLACE, PUMA, ALL], "acs_occ": AcsOccId.LEVELS} 213 | 214 | 215 | class Acs5_Ygo_Earnings(BaseAcs5, GeoId, AcsOccId): 216 | __tablename__ = "ygo_med_earnings" 217 | median_moe = 2 218 | 219 | med_earnings = db.Column(db.Float) 220 | med_earnings_male = db.Column(db.Float) 221 | med_earnings_female = db.Column(db.Float) 222 | med_earnings_moe = db.Column(db.Float) 223 | med_earnings_moe_female = db.Column(db.Float) 224 | med_earnings_moe_male = db.Column(db.Float) 225 | med_earnings_rca = db.Column(db.Float) 226 | 227 | @classmethod 228 | def get_supported_levels(cls): 229 | return {"geo": [NATION, STATE, COUNTY, MSA, PLACE, PUMA, ALL], "acs_occ": AcsOccId.LEVELS} 230 | 231 | 232 | class Acs3_Ygi_Num_Emp(BaseAcs3, AcsIndId, GeoId): 233 | __tablename__ = "ygi_num_emp" 234 | median_moe = 2 235 | 236 | num_emp = db.Column(db.Float) 237 | num_emp_moe = db.Column(db.Float) 238 | num_emp_rca = db.Column(db.Float) 239 | 240 | @classmethod 241 | def get_supported_levels(cls): 242 | return {"geo": [NATION, STATE, MSA, ALL], "acs_ind": AcsIndId.LEVELS} 243 | 244 | 245 | class Acs5_Ygi_Num_Emp(BaseAcs5, AcsIndId, GeoId): 246 | __tablename__ = "ygi_num_emp" 247 | median_moe = 1.9 248 | 249 | num_emp = db.Column(db.Float) 250 | num_emp_moe = db.Column(db.Float) 251 | num_emp_rca = db.Column(db.Float) 252 | 253 | @classmethod 254 | def get_supported_levels(cls): 255 | return {"geo": [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL], 256 | "acs_ind": ["0", "1", ALL]} 257 | 258 | 259 | class Acs3_Ygi_MedEarnings(BaseAcs3, AcsIndId, GeoId): 260 | __tablename__ = "ygi_med_earnings" 261 | median_moe = 2 262 | 263 | med_earnings = db.Column(db.Float) 264 | med_earnings_moe = db.Column(db.Float) 265 | 266 | @classmethod 267 | def get_supported_levels(cls): 268 | return {"geo": [NATION, STATE, COUNTY, MSA, PLACE, PUMA, ALL], "acs_ind": ["0", "1", "all"]} 269 | 270 | 271 | class Acs1_Yg_Num_Emp(BaseAcs1, GeoId): 272 | __tablename__ = "yg_num_emp" 273 | median_moe = 1.2 274 | 275 | civ_labor_force = db.Column(db.Float) 276 | civ_labor_force_moe = db.Column(db.Float) 277 | emp_survey_total = 
db.Column(db.Float) 278 | emp_survey_total_moe = db.Column(db.Float) 279 | labor_force = db.Column(db.Float) 280 | labor_force_moe = db.Column(db.Float) 281 | not_in_labor_force = db.Column(db.Float) 282 | not_in_labor_force_moe = db.Column(db.Float) 283 | num_armed_forces = db.Column(db.Float) 284 | num_armed_forces_moe = db.Column(db.Float) 285 | num_emp = db.Column(db.Float) 286 | num_emp_moe = db.Column(db.Float) 287 | num_unemp = db.Column(db.Float) 288 | num_unemp_moe = db.Column(db.Float) 289 | 290 | @classmethod 291 | def get_supported_levels(cls): 292 | return {"geo": [NATION, COUNTY, MSA, PLACE, PUMA, ALL], "acs_occ": AcsOccId.LEVELS} 293 | 294 | 295 | class Acs5_Yg_Num_Emp(BaseAcs5, GeoId): 296 | __tablename__ = "yg_num_emp" 297 | median_moe = 1 298 | 299 | civ_labor_force = db.Column(db.Float) 300 | civ_labor_force_moe = db.Column(db.Float) 301 | emp_survey_total = db.Column(db.Float) 302 | emp_survey_total_moe = db.Column(db.Float) 303 | labor_force = db.Column(db.Float) 304 | labor_force_moe = db.Column(db.Float) 305 | not_in_labor_force = db.Column(db.Float) 306 | not_in_labor_force_moe = db.Column(db.Float) 307 | num_armed_forces = db.Column(db.Float) 308 | num_armed_forces_moe = db.Column(db.Float) 309 | num_emp = db.Column(db.Float) 310 | num_emp_moe = db.Column(db.Float) 311 | num_unemp = db.Column(db.Float) 312 | num_unemp_moe = db.Column(db.Float) 313 | 314 | @classmethod 315 | def get_supported_levels(cls): 316 | return {"geo": [NATION, COUNTY, MSA, PLACE, PUMA, ALL], "acs_occ": AcsOccId.LEVELS} 317 | -------------------------------------------------------------------------------- /datausa/acs/stats_models.py: -------------------------------------------------------------------------------- 1 | from datausa.acs.abstract_models import GeoId, db 2 | from datausa.core.models import BaseModel 3 | from sqlalchemy.dialects import postgresql 4 | from datausa.attrs import consts 5 | 6 | 7 | class BaseStat(db.Model, BaseModel): 8 | __abstract__ = True 9 | __table_args__ = {"schema": "stats"} 10 | supported_levels = {} 11 | source_title = 'ACS 5-year Estimate' 12 | source_link = 'http://www.census.gov/programs-surveys/acs/' 13 | source_org = 'Census Bureau' 14 | 15 | 16 | class StateStats(BaseStat, GeoId): 17 | __tablename__ = "state" 18 | median_moe = 1.2 19 | 20 | year = db.Column(db.Integer, primary_key=True) 21 | state_rank = db.Column(db.Integer) 22 | top_places = db.Column(postgresql.ARRAY(db.String)) 23 | top_counties = db.Column(postgresql.ARRAY(db.String)) 24 | state_neighbors = db.Column(postgresql.ARRAY(db.String)) 25 | 26 | @classmethod 27 | def get_supported_levels(cls): 28 | return {"geo": [consts.STATE]} 29 | 30 | 31 | class CountyStats(BaseStat, GeoId): 32 | __tablename__ = "counties" 33 | median_moe = 1.2 34 | 35 | year = db.Column(db.Integer, primary_key=True) 36 | county_state_rank = db.Column(db.Integer) 37 | places_in_county = db.Column(db.Integer) 38 | top_places = db.Column(postgresql.ARRAY(db.String)) 39 | county_neighbors = db.Column(postgresql.ARRAY(db.String)) 40 | 41 | @classmethod 42 | def get_supported_levels(cls): 43 | return {"geo": [consts.COUNTY]} 44 | 45 | 46 | class MSAStats(BaseStat, GeoId): 47 | __tablename__ = "msa" 48 | median_moe = 1.2 49 | 50 | top_counties = db.Column(postgresql.ARRAY(db.String)) 51 | top_places = db.Column(postgresql.ARRAY(db.String)) 52 | 53 | @classmethod 54 | def get_supported_levels(cls): 55 | return {"geo": [consts.MSA]} 56 | 57 | 58 | class PlaceStats(BaseStat, GeoId): 59 | __tablename__ = "place" 60 | median_moe 
= 1.2 61 | 62 | parent_counties = db.Column(postgresql.ARRAY(db.String)) 63 | places_neighbors = db.Column(postgresql.ARRAY(db.String)) 64 | 65 | @classmethod 66 | def get_supported_levels(cls): 67 | return {"geo": [consts.PLACE]} 68 | 69 | 70 | class PUMAStats(BaseStat, GeoId): 71 | __tablename__ = "puma" 72 | median_moe = 1.2 73 | 74 | puma_state_rank = db.Column(db.Integer) 75 | pumas_in_state = db.Column(db.Integer) 76 | puma_neighbors = db.Column(postgresql.ARRAY(db.String)) 77 | 78 | @classmethod 79 | def get_supported_levels(cls): 80 | return {"geo": [consts.PUMA]} 81 | -------------------------------------------------------------------------------- /datausa/attrs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/attrs/__init__.py -------------------------------------------------------------------------------- /datausa/attrs/consts.py: -------------------------------------------------------------------------------- 1 | GEO = 'geo' 2 | PUMA = 'puma' 3 | MSA = 'msa' 4 | COUNTY = 'county' 5 | STATE = 'state' 6 | NATION = 'nation' 7 | MSA = 'msa' 8 | TRACT = 'tract' 9 | PLACE = 'place' 10 | PUMA = 'puma' 11 | 12 | ALL = 'all' 13 | OR = "," 14 | YEAR = 'year' 15 | LATEST = 'latest' 16 | OLDEST = 'oldest' 17 | GEO_LEVEL_MAP = {NATION: "010", STATE: "040", COUNTY: "050", 18 | PUMA: "795", MSA: "310", PLACE: "160", TRACT: "140"} 19 | LEVEL_TO_GEO = {v: k for k,v in GEO_LEVEL_MAP.items()} 20 | 21 | POP_THRESHOLD = 250000 22 | NO_VALUE_ADDED = 'no_value_added' 23 | -------------------------------------------------------------------------------- /datausa/attrs/search.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | from whoosh.qparser import QueryParser 5 | from whoosh import index, sorting, qparser, scoring, query 6 | from config import SEARCH_INDEX_DIR, VAR_INDEX_DIR 7 | from whoosh.lang.porter import stem 8 | from whoosh.analysis import RegexTokenizer 9 | 10 | class SimpleWeighter(scoring.BM25F): 11 | use_final = True 12 | 13 | def __init__(self, fullterm, *args, **kwargs): 14 | self.fullterm = fullterm.lower().strip() 15 | super(SimpleWeighter, self).__init__(*args, **kwargs) 16 | 17 | def final(self, searcher, docnum, score_me): 18 | name = searcher.stored_fields(docnum).get("name") 19 | zvalue = searcher.stored_fields(docnum).get("zvalue") 20 | zscore = zvalue * .15 21 | 22 | if name == self.fullterm: 23 | return score_me * 30 + (25 * abs(zscore)) 24 | elif name.startswith(self.fullterm): 25 | if zvalue > 0: 26 | return (score_me * 5.75) + (25 * zscore) 27 | else: 28 | return score_me * 5.75 + (1 - abs(zscore) * 25) 29 | elif self.fullterm.startswith(name[:10]): 30 | return score_me * 3 + abs(zscore) 31 | elif self.fullterm.startswith(name[:5]): 32 | return score_me * 1.5 + abs(zscore) 33 | # return (score_me * 1.75) + (10 * zvalue) 34 | return (score_me * 0.75) + (zscore * 0.31) 35 | 36 | 37 | vars_ix = index.open_dir(VAR_INDEX_DIR) 38 | vars_qp = QueryParser("name", schema=vars_ix.schema, group=qparser.OrGroup) 39 | 40 | 41 | ix = index.open_dir(SEARCH_INDEX_DIR) 42 | qp = QueryParser("name", schema=ix.schema, group=qparser.OrGroup) 43 | facet = sorting.FieldFacet("zvalue", reverse=True) 44 | scores = sorting.ScoreFacet() 45 | 46 | 47 | def nationwide_results(data, my_vars, attr_score, var_score, usr_query): 48 | '''given attribute search results and variable search 
results, determine 49 | if we should inject the US page into the data''' 50 | attr_ids = [row[0] for row in data] 51 | usa = '01000US' 52 | var_names = [my_var["description"][0].title() for my_var in my_vars] if my_vars else [] 53 | var_names = ", ".join(var_names[:-1]) + " and {}".format(var_names[-1]) if len(var_names) > 1 else "".join(var_names) 54 | name = "{} in United States".format(var_names) if my_vars else None 55 | 56 | put_us_first = False 57 | 58 | pos = 0 59 | for row in data[:3]: 60 | raw_name = row[1].lower() if data else "" 61 | first_name = raw_name.split(" ")[0] 62 | put_us_first = not (usr_query.startswith(first_name) or 63 | usr_query.endswith(first_name) or 64 | raw_name[:6] in usr_query or 65 | first_name.startswith(usr_query)) 66 | if put_us_first: 67 | break 68 | else: 69 | pos +=1 70 | if my_vars and var_score and var_score * 20 > attr_score: 71 | data.insert(pos, [usa, name, 10, "geo", name, "010", "united-states"]) 72 | elif my_vars and usa not in attr_ids and len(data) < 10: 73 | data.insert(pos, [usa, name, 10, "geo", name, "010", "united-states"]) 74 | 75 | return data 76 | 77 | 78 | 79 | def do_search(txt, sumlevel=None, kind=None, tries=0, limit=10, is_stem=None, my_vars=None): 80 | txt = txt.replace(",", "") 81 | 82 | my_filter = None 83 | 84 | if kind and sumlevel: 85 | kf = query.Term("kind", kind) 86 | sf = query.Term("sumlevel", sumlevel) 87 | my_filter = query.And([kf, sf]) 88 | elif kind: 89 | my_filter = query.Term("kind", kind) 90 | elif sumlevel: 91 | my_filter = query.Term("sumlevel", sumlevel) 92 | if is_stem and is_stem > 0 and my_filter is not None: 93 | my_filter = my_filter & query.NumericRange("is_stem", 1, is_stem) 94 | elif is_stem and is_stem > 0 and my_filter is None: 95 | my_filter = query.NumericRange("is_stem", 1, is_stem) 96 | 97 | if tries > 2: 98 | return [], [], [], [] 99 | q = qp.parse(txt) 100 | 101 | rext = RegexTokenizer() 102 | var_txt = u" ".join([stem(token.text) if len(token.text) > 3 else token.text for token in rext(unicode(txt))]) 103 | 104 | var_q = vars_qp.parse(var_txt) 105 | var_keywords = {} 106 | vars_max_score = None 107 | # search for variables in query 108 | if not my_vars: 109 | # my_vars can save original vars detected before autocorrecting for spelling, 110 | # so we'll only do var searches that haven't yet been autocorrected 111 | with vars_ix.searcher() as s: 112 | # s = vars_ix.searcher() 113 | results = s.search(var_q) 114 | # raise Exception(list(results)[0]) 115 | vscores = [r.score for r in results] 116 | vars_max_score = max(vscores) if vscores else None 117 | 118 | my_vars = [{"matched_on": r.highlights("name"), 119 | "name": r["name"], 120 | "description": r["description"].split(","), 121 | "section": r["section"], 122 | "section_title": r["section_title"], 123 | "related_attrs": r["related_attrs"].split(","), 124 | "related_vars": r["related_vars"].split(","), 125 | "params": json.loads(r["params"]) if 'params' in r else None} for r in results] 126 | if my_vars: 127 | already_seen = [] 128 | filtered_my_vars = [] 129 | for my_var in my_vars: 130 | if my_var["related_vars"] not in already_seen: 131 | filtered_my_vars.append(my_var) 132 | already_seen.append(my_var["related_vars"]) 133 | highlight_txt = my_var["matched_on"] 134 | 135 | if highlight_txt: 136 | matches = re.findall(r'([^>]+)', highlight_txt) 137 | if matches: 138 | for matched_txt in matches: 139 | var_keywords[matched_txt] = True 140 | my_vars = filtered_my_vars 141 | 142 | try: 143 | for term in q: 144 | for keyword in 
var_keywords.keys(): 145 | if term.text == 'in' and " in " in txt: 146 | term.boost = -1 147 | elif term.text in keyword or keyword in term.text: 148 | term.boost = -0.5 149 | except NotImplementedError: 150 | for keyword in var_keywords.keys(): 151 | if q.text == 'in' and " in " in txt: 152 | q.boost = -1 153 | elif q.text in keyword or keyword in q.text: 154 | q.boost = -0.5 155 | 156 | weighter = SimpleWeighter(txt, B=.6, content_B=1.0, K1=2.75) 157 | with ix.searcher(weighting=weighter) as s: 158 | if len(txt) > 2: 159 | corrector = s.corrector("display") 160 | suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3) 161 | else: 162 | suggs = [] 163 | results = s.search_page(q, 1, sortedby=[scores], pagelen=20, filter=my_filter) 164 | data = [[r["id"], r["name"], r["zvalue"], 165 | r["kind"], r["display"], 166 | r["sumlevel"] if "sumlevel" in r else "", 167 | r["is_stem"] if "is_stem" in r else False, 168 | r["url_name"] if "url_name" in r else None] 169 | for r in results] 170 | 171 | if not data and suggs: 172 | return do_search(suggs[0], sumlevel, kind, tries=tries+1, limit=limit, is_stem=is_stem, 173 | my_vars=my_vars) 174 | 175 | ascores = [r.score for r in results] 176 | attr_max_score = max(ascores) if ascores else 0 177 | # raise Exception(attr_max_score, vars_max_score) 178 | # insert nationwide linkage 179 | data = nationwide_results(data, my_vars, attr_max_score, vars_max_score, txt) 180 | 181 | return data, suggs, tries, my_vars 182 | -------------------------------------------------------------------------------- /datausa/attrs/views.py: -------------------------------------------------------------------------------- 1 | import re 2 | from flask import Blueprint, request, jsonify, abort 3 | 4 | mod = Blueprint('attrs', __name__, url_prefix='/attrs') 5 | from datausa.attrs.models import Cip, Naics, University, Soc, Degree 6 | from datausa.attrs.models import Race, Search, ZipLookup, GeoNeighbors 7 | from datausa.attrs.models import OccCrosswalk, IndCrosswalk, ProductCrosswalk 8 | from datausa.attrs.models import Skill, Sector, Geo, AcsInd, PumsIoCrosswalk 9 | from datausa.attrs.models import PumsDegree, PumsNaics, PumsRace, PumsSoc 10 | from datausa.attrs.models import PumsWage, PumsSex, PumsBirthplace 11 | from datausa.attrs.models import LStudy, EnrollmentStatus, LivingArrangement 12 | from datausa.attrs.models import IoCode, AcsOcc, AcsRace, AcsLanguage, Conflict 13 | from datausa.attrs.models import Insurance, Cohort, Sctg, Napcs, IPedsRace 14 | from datausa.attrs.models import IncomeRange, IPedsOcc, AcademicRank 15 | from datausa.attrs.models import IPedsToPumsCrosswalk, Carnegie, IPedsExpense 16 | from datausa.attrs.models import Opeid, SchoolType, EthnicCode, ProgramLength 17 | from datausa.attrs.models import SimilarUniversities, RateType 18 | from datausa.attrs.consts import GEO, GEO_LEVEL_MAP 19 | from datausa.attrs.search import do_search 20 | from datausa.database import db 21 | 22 | 23 | def to_bool(x): 24 | return x and x.lower() == "true" 25 | 26 | 27 | attr_map = {"soc": PumsSoc, "naics": PumsNaics, "cip": Cip, 28 | "geo": Geo, "university": University, "degree": Degree, 29 | "skill": Skill, "sector": Sector, 30 | "pums_degree": PumsDegree, 31 | "pums_race": PumsRace, "sex": PumsSex, 32 | "birthplace": PumsBirthplace, 33 | "wage_bin": PumsWage, "iocode": IoCode, 34 | "race": Race, "acs_race": AcsRace, 35 | "acs_occ": AcsOcc, "conflict": Conflict, "acs_ind": AcsInd, 36 | "language": AcsLanguage, 37 | "bls_soc": Soc, "bls_naics": Naics, 38 | "insurance": 
Insurance, "cohort": Cohort, 39 | "sctg": Sctg, "napcs": Napcs, "opeid": Opeid, 40 | "ethnic_code": EthnicCode, "program_length": ProgramLength, 41 | "school_type": SchoolType, 42 | "lstudy": LStudy, "enrollment_status": EnrollmentStatus, 43 | "ipeds_race": IPedsRace, 44 | "living_arrangement": LivingArrangement, 45 | "income_range": IncomeRange, 46 | "academic_rank": AcademicRank, 47 | "ipeds_occ": IPedsOcc, 48 | "ipeds_expense": IPedsExpense, 49 | "carnegie": Carnegie, 50 | "rate_type": RateType} 51 | 52 | 53 | def show_attrs(attr_obj, sumlevels=None): 54 | if sumlevels is not None: 55 | if attr_obj is Geo: 56 | sumlevels = [GEO_LEVEL_MAP[lvl] if lvl in GEO_LEVEL_MAP else lvl for lvl in sumlevels] 57 | attrs = attr_obj.query.filter(attr_obj.sumlevel.in_(sumlevels)).all() 58 | else: 59 | attrs = attr_obj.query.filter(attr_obj.level.in_(sumlevels)).all() 60 | elif attr_obj is Geo: 61 | # exclude census tracts and ZIPs 62 | attrs = attr_obj.query.filter(~Geo.id.startswith("140"), ~Geo.id.startswith("860")).all() 63 | else: 64 | attrs = attr_obj.query.all() 65 | 66 | data = [] 67 | headers = [] 68 | for a in attrs: 69 | obj = a.serialize() 70 | data.append(obj.values()) 71 | if not headers: 72 | headers = obj.keys() 73 | return jsonify(data=data, headers=headers) 74 | 75 | 76 | @mod.route("/pums//") 77 | def pums_attrs(kind): 78 | return attrs("pums_{}".format(kind)) 79 | 80 | 81 | @mod.route("/pums///") 82 | def pums_attr_id(kind, pums_attr_id): 83 | return attrs_by_id("pums_{}".format(kind), pums_attr_id) 84 | 85 | 86 | @mod.route("//") 87 | def attrs(kind): 88 | 89 | if kind in attr_map: 90 | attr_obj = attr_map[kind] 91 | sumlevel = request.args.get("sumlevel", None) 92 | sumlevels = sumlevel.split(",") if sumlevel else None 93 | return show_attrs(attr_obj, sumlevels=sumlevels) 94 | raise Exception("Invalid attribute type.") 95 | 96 | 97 | @mod.route("///") 98 | def attrs_by_id(kind, attr_id): 99 | 100 | if kind in attr_map: 101 | attr_obj = attr_map[kind] 102 | if kind in ["naics", "soc"]: 103 | aid_obj = attr_obj.query.filter_by(id=attr_id).order_by(attr_obj.level.asc()).first() 104 | else: 105 | aid_obj = attr_obj.query.get(attr_id) 106 | tmp = aid_obj.serialize() 107 | return jsonify(data=[tmp.values()], headers=tmp.keys()) 108 | raise Exception("Invalid attribute type.") 109 | 110 | 111 | @mod.route("/list/") 112 | def attrs_list(): 113 | return jsonify(data=attr_map.keys()) 114 | 115 | 116 | @mod.route("///parents/") 117 | def get_parents(kind, attr_id): 118 | if kind in attr_map: 119 | attr_obj = attr_map[kind] 120 | data, headers = attr_obj.parents(attr_id) 121 | return jsonify(data=data, headers=headers) 122 | raise Exception("Invalid attribute type.") 123 | 124 | 125 | @mod.route("///children/") 126 | def get_children(kind, attr_id): 127 | if kind in attr_map: 128 | attr_obj = attr_map[kind] 129 | kwargs = request.args 130 | data, headers = attr_obj.children(attr_id, **kwargs) 131 | return jsonify(data=data, headers=headers) 132 | raise Exception("Invalid attribute type.") 133 | 134 | 135 | @mod.route("/search/") 136 | def search(): 137 | offset = request.args.get("offset", None) 138 | limit = int(request.args.get("limit", 10)) 139 | kind = request.args.get("kind", None) 140 | sumlevel = request.args.get("sumlevel", None) 141 | txt = request.args.get("q", '').lower() 142 | is_stem = int(request.args.get("is_stem", 0)) 143 | 144 | if txt and re.match('^[0-9]{1,5}$', txt): 145 | return zip_search(txt, limit=limit) 146 | elif not txt or len(txt) <= 1: 147 | return search_old() 148 
| 149 | data, suggs, tries, my_vars = do_search(txt, sumlevel, kind, limit=limit, is_stem=is_stem) 150 | headers = ["id", "name", "zvalue", "kind", "display", "sumlevel", "is_stem", "url_name"] 151 | autocorrected = tries > 0 152 | suggs = [x for x in suggs if x != txt] 153 | return jsonify(data=data, headers=headers, suggestions=suggs, autocorrected=autocorrected, related_vars=my_vars) 154 | 155 | 156 | @mod.route("/search_old/") 157 | def search_old(): 158 | q = request.args.get("q", '') 159 | q = q.lower() 160 | offset = request.args.get("offset", None) 161 | limit = request.args.get("limit", 100) 162 | kind = request.args.get("kind", None) 163 | sumlevel = request.args.get("sumlevel", None) 164 | is_stem = int(request.args.get("is_stem", 0)) 165 | filters = [Search.name.like("%{}%".format(q))] 166 | if kind: 167 | filters.append(Search.kind == kind) 168 | if sumlevel: 169 | filters.append(Search.sumlevel == sumlevel) 170 | if is_stem == 1: 171 | filters.append(Search.is_stem == is_stem) 172 | elif is_stem == 2: 173 | filters.append(Search.is_stem >= 1) 174 | qry = Search.query.filter(*filters).order_by(Search.zvalue.desc()) 175 | if limit: 176 | qry = qry.limit(int(limit)) 177 | if offset: 178 | qry = qry.offset(int(offset)) 179 | qry = qry.all() 180 | 181 | data = [[a.id, a.name, a.zvalue, a.kind, a.display, a.sumlevel, a.is_stem, a.url_name] for a in qry] 182 | 183 | headers = ["id", "name", "zvalue", "kind", "display", "sumlevel", "is_stem", "url_name"] 184 | return jsonify(data=data, headers=headers) 185 | 186 | 187 | @mod.route("/ranks/") 188 | def ranks(): 189 | attr_sumlvls = { 190 | "soc": {"0": 6, "1": 17, "2": 24, "3": 478}, 191 | "naics": {"0": 14, "1": 21, "2": 266}, 192 | "cip": {"2": 38, "4": 368, "6": 1416}, 193 | "geo": {"nation": 1, 194 | "state": 52, 195 | "county": 3221, 196 | "msa": 929, 197 | "place": 29509, 198 | "puma": 2378} 199 | } 200 | return jsonify(data=attr_sumlvls) 201 | 202 | 203 | def zip_search(zc, limit=10): 204 | if len(zc) != 5: 205 | zc += "%" 206 | zc = "86000US" + zc 207 | 208 | filters = [ 209 | ZipLookup.child_geoid.like(zc), 210 | ZipLookup.percent_covered >= 90, 211 | Search.id == ZipLookup.parent_geoid 212 | ] 213 | 214 | qry = Search.query.join(ZipLookup).filter(*filters) 215 | qry = qry.order_by(ZipLookup.parent_area.asc()) 216 | 217 | qry = qry.with_entities(Search.id, Search.name, Search.zvalue, Search.kind, 218 | Search.display, Search.sumlevel, ZipLookup.child_geoid, Search.is_stem, Search.url_name) 219 | qry = qry.limit(limit) 220 | data = [list(row) for row in qry] 221 | headers = ["id", "name", "zvalue", "kind", "display", "sumlevel", "zipcode", "is_stem", "url_name"] 222 | return jsonify(data=data, headers=headers, zip_search=True) 223 | 224 | 225 | @mod.route("/geo//neighbors/") 226 | def neighbors(geo_id): 227 | results = GeoNeighbors.query.filter_by(geo=geo_id).all() 228 | headers = ["geo", "neighbor"] 229 | data = [[result.geo, result.neighbor] for result in results] 230 | return jsonify(data=data, headers=headers) 231 | 232 | 233 | @mod.route("/geo//ipeds/") 234 | def has_ipeds_data(attr_id): 235 | from datausa.util import inmem 236 | # first check, do I have any data 237 | data, headers = Geo.parents(attr_id) 238 | id_idx = headers.index("id") 239 | ipeds_places = inmem.ipeds_place_map() 240 | if attr_id in ipeds_places: 241 | return jsonify(data=[], headers=[]) 242 | data.reverse() 243 | for row in data: 244 | geo_id = row[id_idx] 245 | if geo_id in ipeds_places: 246 | return jsonify(data=[geo_id], headers=[GEO]) 247 | 248 | 
249 | @mod.route("/crosswalk///") 250 | def crosswalk_acs(attr_kind, attr_id): 251 | if attr_kind not in ["acs_occ", "acs_ind", "iocode", "sctg", "ipeds_occ"]: 252 | return abort(404) 253 | if attr_kind == "sctg": 254 | results = ProductCrosswalk.query.filter(ProductCrosswalk.sctg == attr_id) 255 | results = [[item.napcs, "napcs"] for item in results] 256 | elif attr_kind == "ipeds_occ": 257 | results = IPedsToPumsCrosswalk.query.filter(IPedsToPumsCrosswalk.ipeds_occ == attr_id).all() 258 | results = [[item.pums_soc, "soc"] for item in results] 259 | elif attr_kind == "iocode": 260 | results = PumsIoCrosswalk.query.filter(PumsIoCrosswalk.iocode == attr_id).all() 261 | results = [[item.pums_naics, "naics"] for item in results] 262 | else: 263 | attr_obj = {"acs_occ": OccCrosswalk, "acs_ind": IndCrosswalk}[attr_kind] 264 | header_name = {"acs_occ": "soc", "acs_ind": "naics"}[attr_kind] 265 | col_name = "pums_{}".format(header_name) 266 | results = attr_obj.query.filter(getattr(attr_obj, attr_kind) == attr_id).with_entities(col_name).all() 267 | results = [[getattr(item, col_name), header_name] for item in results] 268 | return jsonify(data=results, headers=["attr_id", "attr_kind"]) 269 | 270 | 271 | @mod.route("/nearby/university/") 272 | def nearby_university(university_id): 273 | limit = int(request.args.get("limit", 5)) 274 | univ = University.query.get(university_id) 275 | query_str = """SELECT id, name 276 | FROM attrs.university 277 | where carnegie = :carnegie AND status != 'D' and id != :uid 278 | ORDER BY ST_MakePoint(:lat, :lng) <-> st_makepoint(lat, lng) 279 | LIMIT :limit; 280 | """ 281 | res = db.session.execute(query_str, {"lat": univ.lat, "lng": univ.lng, "carnegie": univ.carnegie, "limit": limit, "uid": university_id}) 282 | data = [map(unicode, x) for x in res] 283 | headers = ["id", "name"] 284 | return jsonify(data=data, headers=headers) 285 | 286 | 287 | @mod.route("/similar/university/") 288 | def similar_universities(university_id): 289 | limit = int(request.args.get("limit", 5)) 290 | univ = SimilarUniversities.query.get(university_id) 291 | query_str = """SELECT id, name 292 | FROM attrs.similar_universities 293 | where id != :uid 294 | AND carnegie_parent = :carnegie_parent 295 | ORDER BY ST_MakePoint(:x, :y) <-> st_makepoint(x, y) 296 | LIMIT :limit; 297 | """ 298 | res = db.session.execute(query_str, {"x": univ.x, "y": univ.y, "carnegie_parent": univ.carnegie_parent, "limit": limit, "uid": university_id}) 299 | data = [map(unicode, x) for x in res] 300 | headers = ["id", "name"] 301 | return jsonify(data=data, headers=headers) 302 | -------------------------------------------------------------------------------- /datausa/bea/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/bea/__init__.py -------------------------------------------------------------------------------- /datausa/bea/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs.models import IoCode 3 | from datausa.core.models import BaseModel 4 | from datausa.attrs.consts import ALL 5 | from datausa.attrs.consts import NO_VALUE_ADDED 6 | # from sqlalchemy import and_ 7 | 8 | class BeaUse(db.Model, BaseModel): 9 | __table_args__ = {"schema": "bea"} 10 | __tablename__ = 'use' 11 | source_title = 'Use Tables' 12 | source_link = 'http://bea.gov' 13 | source_org = 'Bureau 
of Economic Analysis' 14 | 15 | median_moe = 2 16 | to_filter = ["TOTCOMOUT", "HS", "ORE", "GFGD", "G", "TOTII", "GFGN", "GSLE", 17 | "GFE", "GSLG", "Other", "Used", "TOTFU", "TOTVA", "TOTINDOUT"] 18 | no_value_added = to_filter + ["V001", "V002", "V003", "F010", "F020", "F030", 19 | "F040", "F050", "F100"] 20 | year = db.Column(db.Integer, primary_key=True) 21 | industry_iocode = db.Column(db.String, db.ForeignKey(IoCode.id), primary_key=True) 22 | commodity_iocode = db.Column(db.String, db.ForeignKey(IoCode.id), primary_key=True) 23 | 24 | value_millions = db.Column(db.Integer) 25 | industry_level = db.Column(db.Integer) 26 | 27 | @classmethod 28 | def get_supported_levels(cls): 29 | return { 30 | "industry_iocode": [ALL, "0", "1", "naics", NO_VALUE_ADDED], 31 | "commodity_iocode": [ALL, "naics", NO_VALUE_ADDED], 32 | } 33 | 34 | @classmethod 35 | def industry_iocode_filter(cls, level): 36 | if level == ALL: 37 | return True 38 | elif level == "naics": 39 | return ~cls.industry_iocode.in_(cls.to_filter) 40 | elif level == NO_VALUE_ADDED: 41 | return ~cls.industry_iocode.in_(cls.no_value_added) 42 | target_len = int(level) 43 | return cls.industry_level == target_len 44 | 45 | @classmethod 46 | def commodity_iocode_filter(cls, level): 47 | if level == ALL: 48 | return True 49 | elif level == NO_VALUE_ADDED: 50 | return ~cls.commodity_iocode.in_(cls.no_value_added) 51 | return ~cls.commodity_iocode.in_(cls.to_filter) 52 | -------------------------------------------------------------------------------- /datausa/bls/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/bls/__init__.py -------------------------------------------------------------------------------- /datausa/bls/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs.models import Geo, Soc, Naics 3 | from datausa.attrs.models import PumsSoc, PumsNaics 4 | from datausa.core.models import BaseModel 5 | from datausa.attrs.consts import NATION, STATE, MSA, ALL 6 | from sqlalchemy.orm import column_property 7 | from sqlalchemy.ext.declarative import declared_attr 8 | 9 | 10 | class Bls(BaseModel): 11 | source_title = 'Growth' 12 | source_org = 'Bureau of Labor Statistics' 13 | 14 | __table_args__ = {"schema": "bls"} 15 | source_link = 'http://bls.gov' 16 | 17 | 18 | class SocCrosswalk(db.Model, Bls): 19 | __tablename__ = 'soc_crosswalk' 20 | soc = db.Column("pums_soc", db.String(), db.ForeignKey(PumsSoc.id), primary_key=True) 21 | bls_soc = db.Column(db.String(), db.ForeignKey(Soc.id), primary_key=True) 22 | 23 | 24 | class BlsSoc(object): 25 | @declared_attr 26 | def bls_soc(cls): 27 | return db.Column(db.String(), primary_key=True) 28 | 29 | @declared_attr 30 | def soc(cls): 31 | return column_property(SocCrosswalk.soc) 32 | 33 | @classmethod 34 | def crosswalk_join(cls, qry): 35 | cond = SocCrosswalk.bls_soc == cls.bls_soc 36 | return qry.join(SocCrosswalk, cond, full=True) 37 | 38 | 39 | class GrowthO(db.Model, Bls, BlsSoc): 40 | source_title = 'Employment Projections' 41 | __tablename__ = 'growth_o' 42 | median_moe = 1 43 | 44 | emp_2014_thousands = db.Column(db.Float) 45 | emp_2024_thousands = db.Column(db.Float) 46 | emp_pct_2014 = db.Column(db.Float) 47 | emp_pct_2024 = db.Column(db.Float) 48 | change_thousands = db.Column(db.Float) 49 | pct_change = db.Column(db.Float) 50 | 
openings_thousands = db.Column(db.Float) 51 | 52 | @classmethod 53 | def get_supported_levels(cls): 54 | return { 55 | "soc": [ALL, "0", "1", "2", "3"], 56 | "bls_soc": [ALL, "0", "1", "2", "3"] 57 | } 58 | 59 | 60 | class GrowthO16(db.Model, Bls, BlsSoc): 61 | source_title = 'Employment Projections' 62 | __tablename__ = 'growth_o_2016' 63 | median_moe = 1 64 | 65 | emp_2016_thousands = db.Column(db.Float) 66 | emp_2026_thousands = db.Column(db.Float) 67 | emp_pct_2016 = db.Column(db.Float) 68 | emp_pct_2026 = db.Column(db.Float) 69 | change_thousands = db.Column(db.Float) 70 | pct_change = db.Column(db.Float) 71 | openings_thousands = db.Column(db.Float) 72 | 73 | @classmethod 74 | def get_supported_levels(cls): 75 | return { 76 | "soc": [ALL, "0", "1", "2", "3"], 77 | "bls_soc": [ALL, "0", "1", "2", "3"] 78 | } 79 | 80 | 81 | class GrowthI(db.Model, Bls): 82 | source_title = 'Industry Projections' 83 | __tablename__ = 'growth_i' 84 | median_moe = 2 85 | 86 | naics = db.Column(db.String, primary_key=True) 87 | title = db.Column(db.String) 88 | emp_2004_thousands = db.Column(db.Float) 89 | emp_2014_thousands = db.Column(db.Float) 90 | emp_2024_thousands = db.Column(db.Float) 91 | emp_change_2004_2014 = db.Column(db.Float) 92 | emp_change_2014_2024 = db.Column(db.Float) 93 | output_2004 = db.Column(db.Float) 94 | output_2014 = db.Column(db.Float) 95 | output_2024 = db.Column(db.Float) 96 | output_carc_2004_2014 = db.Column(db.Float) 97 | output_carc_2014_2024 = db.Column(db.Float) 98 | emp_carc_2004_2014 = db.Column(db.Float) 99 | emp_carc_2014_2024 = db.Column(db.Float) 100 | 101 | @classmethod 102 | def get_supported_levels(cls): 103 | return { 104 | "naics": [ALL, "0", "1", "2", "3", "4"] 105 | } 106 | 107 | 108 | class GrowthI16(db.Model, Bls): 109 | source_title = 'Industry Projections' 110 | __tablename__ = 'growth_i_2016' 111 | median_moe = 2 112 | 113 | naics = db.Column(db.String, primary_key=True) 114 | title = db.Column(db.String) 115 | emp_2006_thousands = db.Column(db.Float) 116 | emp_2016_thousands = db.Column(db.Float) 117 | emp_2026_thousands = db.Column(db.Float) 118 | emp_change_2006_2016 = db.Column(db.Float) 119 | emp_change_2016_2026 = db.Column(db.Float) 120 | output_2006 = db.Column(db.Float) 121 | output_2016 = db.Column(db.Float) 122 | output_2026 = db.Column(db.Float) 123 | output_carc_2006_2016 = db.Column(db.Float) 124 | output_carc_2016_2026 = db.Column(db.Float) 125 | emp_carc_2006_2016 = db.Column(db.Float) 126 | emp_carc_2016_2026 = db.Column(db.Float) 127 | 128 | @classmethod 129 | def get_supported_levels(cls): 130 | return { 131 | "naics": [ALL, "0", "1", "2", "3", "4"] 132 | } 133 | 134 | 135 | class BlsCrosswalk(db.Model, Bls): 136 | __tablename__ = 'bls_crosswalk' 137 | pums_naics = db.Column(db.String, db.ForeignKey(PumsNaics.id), 138 | primary_key=True) 139 | bls_naics = db.Column(db.String, primary_key=True) 140 | 141 | 142 | class GrowthILookup(db.Model, Bls): 143 | __tablename__ = 'growth_i_lookup' 144 | pums_naics = db.Column(db.String, db.ForeignKey(PumsNaics.id), primary_key=True) 145 | bls_naics = db.Column(db.String, primary_key=True) 146 | 147 | 148 | class OesYgo(db.Model, Bls, BlsSoc): 149 | __tablename__ = 'oes_ygo' 150 | 151 | median_moe = 2 152 | 153 | year = db.Column(db.Integer, primary_key=True) 154 | geo = db.Column(db.String, db.ForeignKey(Geo.id), primary_key=True) 155 | # soc = db.Column(db.String, db.ForeignKey(Soc.id), primary_key=True) 156 | 157 | tot_emp = db.Column(db.Integer) 158 | tot_emp_prse = db.Column(db.Float) 159 | 
avg_wage = db.Column(db.Float) 160 | avg_wage_prse = db.Column(db.Float) 161 | tot_emp_rca = db.Column(db.Float) 162 | 163 | @classmethod 164 | def get_supported_levels(cls): 165 | return { 166 | "geo": [ALL, NATION, STATE, MSA], 167 | "bls_soc": [ALL, "0", "1", "2", "3"], 168 | "soc": [ALL, "0", "1", "2", "3"] 169 | } 170 | 171 | @classmethod 172 | def geo_filter(cls, level): 173 | if level == ALL: 174 | return True 175 | level_map = {NATION: "010", STATE: "040", MSA: "050"} 176 | level_code = level_map[level] 177 | return cls.geo.startswith(level_code) 178 | 179 | 180 | class CesYi(db.Model, Bls): 181 | source_title = 'Current Employment Statistics' 182 | __tablename__ = 'ces_yi' 183 | median_moe = 1.5 184 | 185 | JOINED_FILTER = {"naics": { 186 | "table": Naics, 187 | "column": Naics.level, 188 | "id": Naics.id}} 189 | 190 | year = db.Column(db.Integer, primary_key=True) 191 | naics = db.Column(db.String, db.ForeignKey(Naics.id), primary_key=True) 192 | 193 | avg_hrly_earnings = db.Column(db.Float) 194 | avg_wkly_hrs = db.Column(db.Float) 195 | employees_thousands = db.Column(db.Float) 196 | 197 | @classmethod 198 | def get_supported_levels(cls): 199 | return { 200 | "naics": [ALL, "0", "1", "2", "3", "4"] 201 | } 202 | 203 | 204 | class QcewYgi(db.Model, Bls): 205 | __tablename__ = 'qcew_ygi' 206 | median_moe = 2 207 | 208 | year = db.Column(db.Integer, primary_key=True) 209 | geo = db.Column(db.String, db.ForeignKey(Geo.id), primary_key=True) 210 | naics = db.Column(db.String, db.ForeignKey(Naics.id), primary_key=True) 211 | 212 | naics_level = db.Column(db.Integer) 213 | avg_annual_pay = db.Column(db.Float) 214 | total_annual_wages = db.Column(db.Float) 215 | annual_contributions = db.Column(db.Float) 216 | annual_avg_emplvl = db.Column(db.Float) 217 | total_annual_wages_rca = db.Column(db.Float) 218 | annual_avg_estabs = db.Column(db.Float) 219 | taxable_annual_wages = db.Column(db.Float) 220 | annual_avg_wkly_wage = db.Column(db.Float) 221 | 222 | @classmethod 223 | def get_supported_levels(cls): 224 | return { 225 | "geo": [ALL, NATION, STATE, MSA], 226 | "naics": [ALL, "0", "1", "2", "3", "4"] 227 | } 228 | 229 | @classmethod 230 | def geo_filter(cls, level): 231 | if level == ALL: 232 | return True 233 | level_map = {NATION: "010", STATE: "040", MSA: "050"} 234 | level_code = level_map[level] 235 | return cls.geo.startswith(level_code) 236 | 237 | @classmethod 238 | def naics_filter(cls, level): 239 | if level == ALL: 240 | return True 241 | return cls.naics_level == level 242 | -------------------------------------------------------------------------------- /datausa/cbp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/cbp/__init__.py -------------------------------------------------------------------------------- /datausa/cbp/abstract_models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs import consts 3 | from datausa.attrs.models import Naics, Geo 4 | from sqlalchemy.orm import column_property 5 | from datausa.core.models import BaseModel 6 | from sqlalchemy.ext.declarative import declared_attr 7 | from sqlalchemy.sql import func 8 | from datausa.attrs.consts import NATION, STATE, COUNTY, MSA, ALL 9 | 10 | class BaseCbp(db.Model, BaseModel): 11 | __abstract__ = True 12 | __table_args__ = {"schema": "cbp"} 13 | source_title = 'County 
Business Patterns' 14 | source_link = 'http://www.census.gov/econ/cbp/' 15 | source_org = 'Census Bureau' 16 | 17 | est = db.Column(db.Integer()) 18 | 19 | emp = db.Column(db.Integer()) 20 | emp_nf = db.Column(db.String()) 21 | empflag = db.Column(db.String()) 22 | 23 | ap = db.Column(db.Float()) 24 | ap_nf = db.Column(db.String()) 25 | 26 | n1_4 = db.Column(db.Integer()) 27 | n5_9 = db.Column(db.Integer()) 28 | n20_49 = db.Column(db.Integer()) 29 | n50_99 = db.Column(db.Integer()) 30 | n100_249 = db.Column(db.Integer()) 31 | n250_499 = db.Column(db.Integer()) 32 | n500_999 = db.Column(db.Integer()) 33 | n1000 = db.Column(db.Integer()) 34 | n1000_1 = db.Column(db.Integer()) 35 | n1000_2 = db.Column(db.Integer()) 36 | n1000_3 = db.Column(db.Integer()) 37 | n1000_4 = db.Column(db.Integer()) 38 | 39 | @classmethod 40 | def geo_filter(cls, level): 41 | if level == ALL: 42 | return True 43 | level_map = {NATION: "010", STATE: "040", MSA: "310", COUNTY: "050"} 44 | level_code = level_map[level] 45 | return cls.geo.startswith(level_code) 46 | -------------------------------------------------------------------------------- /datausa/cbp/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs.models import Geo, Soc 3 | from datausa.core.models import BaseModel 4 | from datausa.attrs.consts import NATION, STATE, MSA, ALL, GEO 5 | 6 | from datausa.cbp.abstract_models import BaseCbp 7 | from datausa.attrs.consts import NATION, STATE, COUNTY, MSA 8 | from sqlalchemy.sql import func 9 | 10 | class CbpYgi(BaseCbp): 11 | __tablename__ = "ygi" 12 | median_moe = 2 13 | 14 | year = db.Column(db.Integer(), primary_key=True) 15 | geo = db.Column(db.String(), primary_key=True) 16 | naics = db.Column(db.String(), primary_key=True) 17 | 18 | @classmethod 19 | def get_supported_levels(cls): 20 | return { 21 | GEO: [ALL, NATION, STATE, MSA, COUNTY], 22 | "naics": [ALL, "0", "1", "2", "3", "4"] 23 | } 24 | 25 | @classmethod 26 | def naics_filter(cls, level): 27 | if level == ALL: 28 | return True 29 | target_len = int(level) + 2 30 | return func.length(cls.naics) == target_len 31 | 32 | class CbpYg(BaseCbp): 33 | __tablename__ = "yg" 34 | median_moe = 1 35 | 36 | year = db.Column(db.Integer(), primary_key=True) 37 | geo = db.Column(db.String(), primary_key=True) 38 | 39 | @classmethod 40 | def get_supported_levels(cls): 41 | return { 42 | GEO: [ALL, NATION, STATE, MSA, COUNTY], 43 | } 44 | -------------------------------------------------------------------------------- /datausa/chr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/chr/__init__.py -------------------------------------------------------------------------------- /datausa/chr/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs import consts 3 | from datausa.attrs.models import University 4 | from datausa import cache 5 | 6 | from datausa.core.models import BaseModel 7 | from datausa.attrs.consts import NATION, STATE, COUNTY, MSA, ALL 8 | from sqlalchemy.ext.automap import automap_base 9 | from sqlalchemy import MetaData 10 | 11 | metadata = cache.get("chr_metadata") 12 | if not metadata: 13 | metadata = MetaData(schema='chr', bind=db.engine) 14 | metadata.reflect() 15 | cache.set("chr_metadata", metadata) 16 | 17 | 
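# --- editor's example (sketch) ----------------------------------------------
# The CBP naics_filter (datausa/cbp/models.py above) compares code length: a
# level-N NAICS id is N + 2 characters long (level "0" ids are two characters),
# so sumlevel "2" keeps 4-character codes. Assumes an app context.
from datausa.cbp.models import CbpYgi

four_char = CbpYgi.query.filter(CbpYgi.naics_filter("2"))  # func.length(naics) == 4
# -----------------------------------------------------------------------------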
AutomapBase = automap_base(bind=db.engine, metadata=metadata) 18 | 19 | 20 | class HealthYg(AutomapBase, db.Model, BaseModel): 21 | __table_args__ = {"schema": "chr", "extend_existing": True} 22 | source_title = 'County Health Rankings' 23 | source_link = 'http://www.countyhealthrankings.org/' 24 | source_org = 'University of Wisconsin' 25 | __tablename__ = 'yg' 26 | median_moe = 1 27 | 28 | year = db.Column(db.Integer, primary_key=True) 29 | geo = db.Column(db.String(), primary_key=True) 30 | 31 | @classmethod 32 | def get_supported_levels(cls): 33 | return {"geo": [ALL, STATE, COUNTY]} 34 | 35 | @classmethod 36 | def geo_filter(cls, level): 37 | if level == ALL: 38 | return True 39 | level_map = {STATE: "040", COUNTY: "050"} 40 | level_code = level_map[level] 41 | return cls.geo.startswith(level_code) 42 | 43 | AutomapBase.prepare(db.engine, reflect=False) 44 | -------------------------------------------------------------------------------- /datausa/core/__init__.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm.attributes import InstrumentedAttribute 2 | 3 | def get_columns(tbl): 4 | cols = tbl.__mapper__.attrs 5 | return [getattr(tbl, col.key) for col in cols] 6 | 7 | # possible_variables = [col.key for t in registered_models for col in t.__table__.columns] 8 | # def attribute_names(cls): 9 | # return [prop.key for prop in class_mapper(cls).iterate_properties 10 | # if isinstance(prop, ColumnProperty)] 11 | 12 | # def get_columns(tbl): 13 | # cols = [] 14 | # for item,val in tbl.__dict__.items(): 15 | # if isinstance(val, InstrumentedAttribute) and not item.startswith("_"): 16 | # cols.append(val) 17 | # # print tbl.__table__.columns 18 | # return cols 19 | -------------------------------------------------------------------------------- /datausa/core/api.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import and_ 2 | from flask import Response 3 | import simplejson 4 | 5 | from datausa.core import get_columns 6 | from datausa.core.table_manager import TableManager, table_name 7 | from datausa.attrs import consts 8 | from datausa.attrs.views import attr_map 9 | from sqlalchemy.orm import aliased 10 | from datausa.util.inmem import splitter 11 | from datausa.core.exceptions import DataUSAException 12 | from sqlalchemy import desc, asc, func 13 | 14 | MAX_LIMIT = 4 15 | 16 | 17 | def use_attr_names(table, qry, cols): 18 | new_cols = [] 19 | joins = {} 20 | for col in cols: 21 | col_str = col if isinstance(col, basestring) else col.key 22 | orig_str = col_str 23 | col_str = "iocode" if "_iocode" in col_str else col_str 24 | col_str = "geo" if col_str.endswith("_geo") else col_str 25 | col_str = "pums_degree" if "pums" in table.__table_args__["schema"] and col_str == "degree" else col_str 26 | if table.__table_args__["schema"] == 'bls' and col_str in ['naics', 'soc']: 27 | col_str = "bls_{}".format(col_str) 28 | if col_str in attr_map: 29 | attr_obj = attr_map[col_str] 30 | attr_alias = aliased(attr_obj) 31 | joins[orig_str] = [attr_alias, getattr(table, orig_str) == attr_alias.id] 32 | new_cols.append(attr_alias.name.label(orig_str + "_name")) 33 | 34 | new_cols.append(col) 35 | for col_str, j in joins.items(): 36 | qry = qry.join(*j, isouter=True) 37 | return qry, new_cols 38 | 39 | 40 | def stream_format(table, cols, qry, api_obj): 41 | def generate(): 42 | yield ','.join([col if isinstance(col, basestring) else col.key for col in cols]) + '\n' 43 | for row in qry: 44 | row 
= [u'"{}"'.format(x) if isinstance(x, basestring) else str(x) for x in list(row)] 45 | yield u','.join(row) + u'\n' 46 | return Response(generate(), mimetype='text/csv') 47 | 48 | def simple_format(table, cols, data, api_obj): 49 | ''' Based on https://github.com/al4/orlo/blob/1b3930bae4aa37eb51aed33a97c088e576cb5a99/orlo/route_api.py#L285-L311''' 50 | def generate(table): 51 | headers = [col if isinstance(col, basestring) else col.key for col in cols] 52 | inf = float('inf') 53 | 54 | """ 55 | A lagging generator to stream JSON so we don't have to hold everything in memory 56 | This is a little tricky, as we need to omit the last comma to make valid JSON, 57 | thus we use a lagging generator, similar to http://stackoverflow.com/questions/1630320/ 58 | """ 59 | yield u'{' 60 | 61 | rows = data.__iter__() 62 | try: 63 | prev_row = next(rows) # get first result 64 | except StopIteration: 65 | # StopIteration here means the length was zero, so yield a valid releases doc and stop 66 | yield u'''"data": [], 67 | "headers": {}, 68 | "source": {}, 69 | "subs": {}, 70 | "logic": {} 71 | '''.format(simplejson.dumps(list(headers)), simplejson.dumps(table.info(api_obj)), simplejson.dumps(api_obj.subs), 72 | simplejson.dumps([table.info(api_obj) for table in api_obj.table_list])) + u'}' 73 | raise StopIteration 74 | 75 | # We have some releases. First, yield the opening json 76 | yield u'"data": [' 77 | 78 | # Iterate over the releases 79 | for row in rows: 80 | yield simplejson.dumps([x if x != inf else None for x in prev_row]) + u', ' 81 | prev_row = row 82 | 83 | # Now yield the last iteration without comma 84 | yield simplejson.dumps([x if x != inf else None for x in prev_row]) 85 | 86 | yield u'''], "headers": {}, 87 | "source": {}, 88 | "subs": {}, 89 | "logic": {} 90 | '''.format(simplejson.dumps(list(headers)), simplejson.dumps(table.info(api_obj)), simplejson.dumps(api_obj.subs), 91 | simplejson.dumps([table.info(api_obj) for table in api_obj.table_list])) + u'}' 92 | 93 | return Response(generate(table), content_type='application/json') 94 | 95 | # def simple_format(table, cols, data, api_obj): 96 | # headers = [col if isinstance(col, basestring) else col.key for col in cols] 97 | # inf = float('inf') 98 | # data = { 99 | # "headers": list(headers), 100 | # "source": table.info(api_obj), 101 | # "subs": api_obj.subs, 102 | # "logic": [table.info(api_obj) for table in api_obj.table_list], 103 | # "data": [ [x if x != inf else None for x in row] for row in data], 104 | # } 105 | # 106 | # return flask.jsonify(data) 107 | 108 | def parse_method_and_val(cond): 109 | if cond.startswith("^"): 110 | return "startswith", cond[1:], False 111 | elif cond.startswith("~^"): 112 | return "startswith", cond[2:], True 113 | elif cond.endswith("~$"): 114 | return "endswith", cond[:-2], True 115 | elif cond.endswith("$"): 116 | return "endswith", cond[:-1], False 117 | elif cond.startswith("str!"): 118 | return "ne", str(cond[4:]), False 119 | elif cond.startswith("!"): 120 | return "ne", int(cond[1:]), False 121 | elif cond.startswith(">"): 122 | return "gt", int(cond[1:]), False 123 | elif cond.startswith("<"): 124 | return "lt", int(cond[1:]), False 125 | elif cond.startswith("R<"): 126 | return "rt", float(cond[2:]), False 127 | elif cond.startswith("R>"): 128 | return "rg", float(cond[2:]), False 129 | else: 130 | return "like", cond, False 131 | 132 | def where_filters(table, where_str): 133 | if not where_str: 134 | return [] 135 | filts = [] 136 | 137 | wheres = splitter(where_str) 138 | for where in 
wheres: 139 | colname, cond = where.split(":") 140 | cols = None 141 | if "/" in colname: 142 | cols = [getattr(table, c) for c in colname.split("/")] 143 | else: 144 | col = getattr(table, colname) 145 | method, value, negate = parse_method_and_val(cond) 146 | if method == "ne": 147 | expr = col != value 148 | elif method == "gt": 149 | expr = col > value 150 | elif method == "lt": 151 | expr = col < value 152 | elif method == "rt": 153 | expr = and_(cols[1] != 0, cols[0] / cols[1] < value) 154 | elif method == "rg": 155 | expr = and_(cols[1] != 0, cols[0] / cols[1] > value) 156 | else: 157 | expr = getattr(col, method)(value) 158 | if negate: 159 | expr = ~expr 160 | filts.append(expr) 161 | return filts 162 | 163 | def sumlevel_filtering(table, api_obj): 164 | shows_and_levels = api_obj.shows_and_levels 165 | filters = [] 166 | for col, level in shows_and_levels.items(): 167 | args = (table, "{}_filter".format(col)) 168 | if hasattr(*args): 169 | func = getattr(*args) 170 | filters.append(func(level)) 171 | 172 | # raise Exception(filters) 173 | return filters 174 | 175 | def process_value_filters(table, vars_and_vals, api_obj): 176 | filts = [] 177 | for var, val in vars_and_vals.items(): 178 | if var == consts.YEAR and val in [consts.LATEST, consts.OLDEST]: 179 | years = TableManager.table_years[table_name(table)] 180 | my_year = years[val] 181 | filt = table.year == my_year 182 | api_obj.set_year(my_year) 183 | elif consts.OR in val: 184 | filt = getattr(table, var).in_(splitter(val)) 185 | else: 186 | filt = getattr(table, var) == val 187 | if var == consts.YEAR and val == consts.ALL: 188 | pass # do nothing, show all years 189 | else: 190 | filts.append(filt) 191 | return filts 192 | 193 | def remove_filters(filters, table, col, api_obj): 194 | new_filts = [] 195 | for filt in filters: 196 | if hasattr(filt, "left") and hasattr(filt, "right"): 197 | if filt.left.key == col and isinstance(filt.right.value, basestring): 198 | if api_obj.vars_and_vals[col] == filt.right.value: 199 | continue 200 | new_filts.append(filt) 201 | return new_filts 202 | 203 | 204 | def copy_where_literals(api_obj): 205 | if hasattr(api_obj, "where") and api_obj.where: 206 | wheres = splitter(api_obj.where) 207 | for where in wheres: 208 | colname, cond = where.split(":") 209 | if colname not in api_obj.vars_and_vals: 210 | api_obj.vars_and_vals[colname] = cond 211 | return api_obj 212 | 213 | 214 | def handle_join(qry, filters, table, api_obj): 215 | joins = [] 216 | joined_filt = table.JOINED_FILTER 217 | # see if we need to copy over which variables are involved 218 | api_obj = copy_where_literals(api_obj) 219 | for col, level in api_obj.shows_and_levels.items(): 220 | if level != consts.ALL: 221 | if col in joined_filt: 222 | if not "triggers" in joined_filt[col]: 223 | joins.append(joined_filt[col]["table"]) 224 | filters.append(joined_filt[col]["column"] == level) 225 | filters.append(joined_filt[col]["id"] == getattr(table, col)) 226 | else: 227 | triggers = joined_filt[col]["triggers"] 228 | for target_lvl, starting in triggers: 229 | if col in api_obj.vars_and_vals: 230 | if api_obj.vars_and_vals[col].startswith(starting) and level == target_lvl: 231 | joins.append(joined_filt[col]["table"]) 232 | filters = remove_filters(filters, table, col, api_obj) 233 | filters.append(joined_filt[col]["id"] == getattr(table, col)) 234 | filters.append(joined_filt[col]["column"] == api_obj.vars_and_vals[col]) 235 | qry = qry.join(*joins) 236 | return qry, filters 237 | 238 | 239 | def query(table, api_obj, 
stream=False): 240 | vars_and_vals = api_obj.vars_and_vals 241 | values = api_obj.values 242 | exclude = api_obj.exclude 243 | 244 | filters = process_value_filters(table, vars_and_vals, api_obj) 245 | filters += where_filters(table, api_obj.where) 246 | filters += sumlevel_filtering(table, api_obj) 247 | 248 | if values: 249 | pk = [col for col in table.__table__.columns if col.primary_key and col.key not in values] 250 | cols = pk + [getattr(table, col_name) for col_name in values] 251 | else: 252 | cols = get_columns(table) 253 | 254 | if exclude: 255 | cols = [col for col in cols 256 | if (isinstance(col, basestring) and col not in exclude) or col.key not in exclude] 257 | 258 | # qry = table.query.with_entities(*cols) 259 | qry = table.query 260 | 261 | if hasattr(table, "crosswalk_join"): 262 | qry = table.crosswalk_join(qry) 263 | 264 | if stream or api_obj.display_names: 265 | qry, cols = use_attr_names(table, qry, cols) 266 | qry = qry.with_entities(*cols) 267 | 268 | if hasattr(table, "JOINED_FILTER"): 269 | qry, filters = handle_join(qry, filters, table, api_obj) 270 | 271 | qry = qry.filter(*filters) 272 | 273 | if api_obj.order: 274 | sort = desc if api_obj.sort == "desc" else asc 275 | if api_obj.order not in TableManager.possible_variables: 276 | if api_obj.order == 'abs(pct_change)': 277 | pass # allow this 278 | else: 279 | raise DataUSAException("Bad order parameter", api_obj.order) 280 | # sort_stmt = text("{} {} NULLS LAST".format(api_obj.order, sort)) 281 | if api_obj.order == 'abs(pct_change)': 282 | target_col = func.abs(table.pct_change) 283 | else: 284 | target_col = getattr(table, api_obj.order) 285 | 286 | qry = qry.order_by(sort(target_col).nullslast()) 287 | if api_obj.limit: 288 | qry = qry.limit(api_obj.limit) 289 | 290 | if stream: 291 | return stream_format(table, cols, qry, api_obj) 292 | 293 | return simple_format(table, cols, qry, api_obj) 294 | -------------------------------------------------------------------------------- /datausa/core/attr_crosswalking.py: -------------------------------------------------------------------------------- 1 | '''Attribute crosswalker for join API''' 2 | from sqlalchemy.orm import aliased 3 | from sqlalchemy import and_, or_, func 4 | 5 | from datausa.bls.models import BlsCrosswalk, GrowthILookup, SocCrosswalk 6 | from datausa.attrs.models import GeoCrosswalker 7 | 8 | def geo_crosswalk_join(tbl1, tbl2, col): 9 | my_joins = [] 10 | gc_alias = aliased(GeoCrosswalker) 11 | j1 = [ 12 | gc_alias, or_(gc_alias.geo_a == tbl1.geo, 13 | gc_alias.geo_b == tbl1.geo) 14 | ] 15 | j1 = [j1, {"full": False, "isouter": False}] 16 | my_joins.append(j1) 17 | 18 | j2_cond = or_( 19 | and_(gc_alias.geo_a == tbl1.geo, gc_alias.geo_b == tbl2.geo), 20 | and_(gc_alias.geo_b == tbl1.geo, gc_alias.geo_a == tbl2.geo) 21 | ) 22 | j2 = [tbl2, j2_cond] 23 | j2 = [j2, {"full": False, "isouter": False}] 24 | my_joins.append(j2) 25 | 26 | return my_joins 27 | 28 | def naics_crosswalk_join(tbl1, tbl2, col, already_joined): 29 | my_joins = [] 30 | bls_table = None 31 | pums_table = None 32 | 33 | if tbl1.get_schema_name() == "bls": 34 | bls_table = tbl1 35 | pums_table = tbl2 36 | if tbl2.get_schema_name() == "bls": 37 | bls_table = tbl2 38 | pums_table = tbl1 39 | 40 | cond1 = BlsCrosswalk.pums_naics == pums_table.naics if tbl1 is pums_table else BlsCrosswalk.bls_naics == bls_table.naics 41 | 42 | if not BlsCrosswalk.full_name() in already_joined: 43 | j1 = [[BlsCrosswalk, cond1], {"full": False, "isouter": False}] 44 | my_joins.append(j1) 45 | 
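# --- editor's example (sketch) ----------------------------------------------
# Each helper in this module returns joins shaped as
# [[target, onclause], {"full": ..., "isouter": ...}]. The join API
# (datausa/core/join_api.py, not shown here) presumably unpacks them roughly
# like this; CbpYg and DefaultsYg are two registered models with a geo column.
from datausa.cbp.models import CbpYg
from datausa.ed.models import DefaultsYg

qry = CbpYg.query
for (target, onclause), kwargs in geo_crosswalk_join(CbpYg, DefaultsYg, "geo"):
    qry = qry.join(target, onclause, **kwargs)
# -----------------------------------------------------------------------------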
already_joined[BlsCrosswalk.full_name()] = True 46 | j2_cond = and_(BlsCrosswalk.pums_naics == pums_table.naics, 47 | BlsCrosswalk.bls_naics == bls_table.naics) 48 | j2 = [tbl2, j2_cond] 49 | j2 = [j2, {"full": False, "isouter": False}] 50 | my_joins.append(j2) 51 | return my_joins 52 | 53 | def soc_crosswalk_join(tbl1, tbl2, col): 54 | my_joins = [] 55 | cond1 = True 56 | pums_table = None 57 | bls_table = None 58 | 59 | if tbl1.get_schema_name() == "bls": 60 | bls_table = tbl1 61 | elif tbl2.get_schema_name() == "bls": 62 | bls_table = tbl2 63 | if tbl1.get_schema_name().startswith("pums"): 64 | pums_table = tbl1 65 | elif tbl2.get_schema_name().startswith("pums"): 66 | pums_table = tbl2 67 | 68 | if pums_table and bls_table: 69 | AliasedSocCrosswalk = aliased(SocCrosswalk) 70 | cond1 = AliasedSocCrosswalk.pums_soc == pums_table.soc if tbl1 is pums_table else AliasedSocCrosswalk.bls_soc == bls_table.soc 71 | j1 = [[AliasedSocCrosswalk, cond1], {"full": False, "isouter": False}] 72 | my_joins.append(j1) 73 | j2_cond = and_(AliasedSocCrosswalk.pums_soc == pums_table.soc, 74 | AliasedSocCrosswalk.bls_soc == bls_table.soc) 75 | j2 = [[tbl2, j2_cond], {"full": False, "isouter": False}] 76 | my_joins.append(j2) 77 | else: 78 | onet_table = tbl1 if tbl1.get_schema_name() == 'onet' else tbl2 79 | other_table = pums_table or bls_table 80 | j2_cond = or_(onet_table.soc == other_table.soc, 81 | onet_table.soc == func.left(other_table.soc, 2) + '0000', 82 | onet_table.soc == func.left(other_table.soc, 3) + '000', 83 | onet_table.soc == func.left(other_table.soc, 3) + '100', 84 | onet_table.soc == func.left(other_table.soc, 5) + '0') 85 | my_joins.append([[tbl2, j2_cond], {}]) 86 | return my_joins 87 | 88 | def cip_crosswalk_join(tbl1, tbl2, col): 89 | if tbl1.get_schema_name().startswith('pums'): 90 | pums_table = tbl1 91 | elif tbl2.get_schema_name().startswith('pums'): 92 | pums_table = tbl2 93 | if tbl1.get_schema_name() == 'ipeds': 94 | deeper_table = tbl1 95 | elif tbl2.get_schema_name() == 'ipeds': 96 | deeper_table = tbl2 97 | if tbl1.get_schema_name() == 'onet': 98 | deeper_table = tbl1 99 | elif tbl2.get_schema_name() == 'onet': 100 | deeper_table = tbl2 101 | direct_join = getattr(pums_table, col) == func.left(getattr(deeper_table, col), 2) 102 | 103 | my_joins = [[[tbl2, direct_join], {"full": False, "isouter": False}]] 104 | return my_joins 105 | -------------------------------------------------------------------------------- /datausa/core/exceptions.py: -------------------------------------------------------------------------------- 1 | class DataUSAException(Exception): pass -------------------------------------------------------------------------------- /datausa/core/models.py: -------------------------------------------------------------------------------- 1 | from datausa.core.exceptions import DataUSAException 2 | from datausa.attrs.consts import ALL, OR 3 | 4 | 5 | class BaseModel(object): 6 | median_moe = None 7 | size = None 8 | source_title = '' 9 | source_link = '' 10 | source_org = '' 11 | 12 | # def __init__(levels, moe, size): 13 | # self.supported_levels = levels 14 | # self.median_moe = moe 15 | # self.size = size 16 | 17 | @classmethod 18 | def get_supported_levels(cls): 19 | return {} 20 | 21 | @classmethod 22 | def info(cls, api_obj=None): 23 | dataset = cls.source_title 24 | if api_obj and api_obj.get_year(): 25 | dataset = "{} {}".format(api_obj.get_year(), dataset) 26 | return { 27 | "dataset": dataset, 28 | "org": cls.source_org, 29 | "table": cls.full_name(), 30 | 
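# --- editor's example (sketch) ----------------------------------------------
# Quick sanity checks for the BaseModel helpers above, using the bls models
# from earlier in this dump (assumes consts.ALL == "all", as the views imply):
from datausa.bls.models import OesYgo

assert OesYgo.full_name() == "bls.oes_ygo"
assert OesYgo.get_schema_name() == "bls"
assert OesYgo.can_show("geo", "all")  # "geo" is supported and ALL is in its levels
# -----------------------------------------------------------------------------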
"link": cls.source_link, 31 | "supported_levels": cls.get_supported_levels(), 32 | } 33 | 34 | @classmethod 35 | def full_name(cls): 36 | table_name = cls.__tablename__ 37 | schema_name = cls.__table_args__["schema"] 38 | return "{}.{}".format(schema_name, table_name) 39 | 40 | @classmethod 41 | def get_schema_name(cls): 42 | return cls.__table_args__["schema"] 43 | 44 | @classmethod 45 | def col_strs(cls, short_name=False): 46 | results = [str(col) for col in cls.__table__.columns] 47 | if short_name: 48 | results = [col_name.split(".")[-1] for col_name in results] 49 | return results 50 | 51 | @classmethod 52 | def can_show(cls, attr, lvl): 53 | supported = cls.get_supported_levels() 54 | return attr in supported and lvl in supported[attr] 55 | 56 | class ApiObject(object): 57 | def __init__(self, **kwargs): 58 | allowed = ["vars_needed", "vars_and_vals", "values", 59 | "shows_and_levels", "force", "where", "order", 60 | "sort", "limit", "exclude", "auto_crosswalk", 61 | "display_names", "offset"] 62 | self._year = None 63 | self.auto_crosswalk = False 64 | self.display_names = False 65 | self.offset = None 66 | self.vars_and_vals = {} 67 | for keyword, value in kwargs.items(): 68 | if keyword in allowed: 69 | setattr(self, keyword, value) 70 | else: 71 | raise DataUSAException("Invalid ApiObject attribute") 72 | if self.limit: 73 | self.limit = int(self.limit) 74 | if self.offset: 75 | self.offset = int(self.offset) 76 | self.subs = {} 77 | self.table_list = [] 78 | self.warnings = [] 79 | if self.exclude: 80 | self.exclude = self.exclude.split(",") 81 | if hasattr(self, "year") and self.year != ALL: 82 | self._year = self.year 83 | self.force_schema = None 84 | self.auto_crosswalk = self.auto_crosswalk in [True, 'true', '1'] 85 | self.display_names = self.display_names in ['true', '1'] 86 | # if not "geo" in self.shows_and_levels and "geo" in self.vars_and_vals: 87 | # if self.vars_and_vals["geo"]: 88 | # prefix = self.vars_and_vals["geo"][:3] 89 | # lookup = {"010": "nation", "040": "state", "050": "county", "310":"msa", "795":"puma", "160":"place"} 90 | # if prefix in lookup: 91 | # self.shows_and_levels["geo"] = lookup[prefix] 92 | def set_year(self, yr): 93 | self._year = str(int(yr)) 94 | 95 | def get_year(self): 96 | return self._year 97 | 98 | def capture_logic(self, table_list): 99 | self.table_list = table_list 100 | 101 | def warn(self, msg): 102 | self.warnings.append(msg) 103 | 104 | def record_sub(self, tbl, col, orig_val, new_val): 105 | deltas = [{"original": ov, "replacement": nv} for ov, nv in zip(orig_val, new_val) if ov != nv] 106 | 107 | tbl_name = tbl.full_name() 108 | if tbl_name not in self.subs: 109 | self.subs[tbl_name] = {} 110 | if col not in self.subs[tbl_name]: 111 | self.subs[tbl_name][col] = {} 112 | self.subs[tbl_name][col] = deltas 113 | 114 | def where_vars(self): 115 | if not hasattr(self, "where") or not self.where: 116 | return [] 117 | # split by commas 118 | wheres = self.where.split(",") 119 | # then split by colons, and take the last item after period e.g. 
120 | var_names = [x.split(":")[0].split(".")[-1] for x in wheres] 121 | var_names = [x for x in var_names if x != 'sumlevel'] 122 | # so where=year:2014,grads_total.degree:5 => ['year', 'degree'] 123 | return var_names 124 | -------------------------------------------------------------------------------- /datausa/core/registrar.py: -------------------------------------------------------------------------------- 1 | from datausa.pums.models import * 2 | from datausa.pums.models_5 import * 3 | from datausa.ipeds.models import * 4 | from datausa.onet.models import * 5 | from datausa.chr.models import * 6 | from datausa.bls.models import * 7 | from datausa.cbp.models import * 8 | from datausa.bea.models import * 9 | from datausa.acs.models import * 10 | from datausa.acs.automap_models import * 11 | from datausa.acs.stats_models import * 12 | from datausa.dartmouth.models import * 13 | from datausa.freight.models import * 14 | from datausa.ed.models import DefaultsYg, DefaultsYu, DefaultsYur, DefaultsYure 15 | from datausa.attrs.models import UniversityCrosswalk 16 | from datausa.opiods.models import DrugOverdoseDeathRate, OpiodOverdoseDeathRate, NonMedUsePainMeds 17 | 18 | registered_models = [ 19 | # PUMS 20 | Yg, Ygd, Ygr, Ygi, Ygio, 21 | Yo, Yow, Yos, Yod, Yor, Yoas, 22 | Ygo, Ygw, Ygor, Ygs, Ygb, Ygos, 23 | 24 | Yc, Ygc, Yca, Ycd, Ycb, Yoc, Yic, Ycs, 25 | Yi, Yio, Yior, Yios, Yocd, Yid, Yir, Yis, 26 | Yiw, 27 | Ya, 28 | 29 | # PUMS 5-year tables 30 | Ygo5, Ygi5, Yoas5, Ygor5, Ygos5, Ygb5, 31 | 32 | # IPEDS 33 | TuitionYu, TuitionYc, TuitionYcu, TuitionYcs, TuitionYgs, 34 | GradsYu, GradsYcu, GradsYc, GradsYcd, GradsYgd, GradsYud, GradsYucd, 35 | GradsYg, GradsYgc, GradsYgu, GradsYgs, GradsYgcd, 36 | GradsPctYcu, 37 | UnivGeo, 38 | AdmissionsY, 39 | AdmissionsYu, 40 | EnrollmentEfaYusrle, 41 | 42 | EnrollmentEfaYus, 43 | EnrollmentEfaYue, 44 | EnrollmentEfaYul, 45 | EnrollmentEfaYur, 46 | LivingArrangementSfaYa, LivingArrangementSfaYu, LivingArrangementSfaYua, 47 | GradRateGrYu, GradRateGrYur, GradRateGrYus, GradRateGrYusr, 48 | FinancialsYu, 49 | AidSfaYui, UniversitySfaYu, 50 | FinancialsEndowmentQuintilesYu, 51 | RetentionEfdYu, 52 | NISSalariesYu, 53 | NISSalariesYuo, 54 | ISSalariesYu, 55 | ISSalariesYua, 56 | ISSalariesYus, 57 | ISSalariesYuas, 58 | ExpensesYu, 59 | ExpensesYue, 60 | ICLivingExpensesYua, ICMaxLivingExpensesYua, 61 | GradRateTimeframeYut, 62 | # ONET 63 | SkillByCip, SkillBySoc, 64 | 65 | # Dartmouth 66 | YgPrimaryCare, YgReimbursements, YgcPostDischarge, 67 | 68 | # County Health Rankings 69 | HealthYg, 70 | 71 | # Bureau of Labor Statistics 72 | OesYgo, QcewYgi, GrowthI16, GrowthI, GrowthO16, GrowthO, CesYi, 73 | 74 | # County Business Patterns 75 | CbpYgi, CbpYg, 76 | 77 | # BEA I/O Tables 78 | BeaUse, 79 | 80 | # ACS 1-year 81 | Acs1_Ygl_Speakers, 82 | Acs1_Ygo_Num_Emp, Acs1_Ygo_Earnings, Acs1_Ygi_Num_Emp, 83 | Acs1_Yg, Acs1_Yg_IncDist, Acs1_Yg_PovertyRace, 84 | Acs1_Yg_NatAge, Acs1_Yg_Race, Acs1_Yg_Conflict, 85 | Acs1_Yg_PropertyValue, Acs1_Yg_PropertyTax, Acs1_Yg_Vehicles, 86 | Acs1_Yg_TravelTime, Acs1_Yg_Transport, 87 | Acs1_Yg_Poverty, Acs1_Yg_Tenure, Acs1_Yg_Income, 88 | Acs1_Yg_Num_Emp, 89 | # ACS 90 | Acs5_Yg, Acs5_Yg_Income, Acs5_Yg_Conflict, Acs5_Yg_IncDist, 91 | Acs5_Ygo_Earnings, 92 | Acs5_Yg_NatAge, Acs5_Yg_Race, Acs5_Yg_Tenure, Acs5_Yg_Transport, 93 | Acs5_Yg_TravelTime, Acs5_Yg_Vehicles, Acs5_Yg_Poverty, 94 | Acs5_Yg_PropertyTax, Acs5_Yg_PropertyValue, Acs5_Ygl_Speakers, 95 | Acs5_Yg_PovertyRace, 96 | Acs5_Ygo_Num_Emp, 
Acs5_Ygi_Num_Emp, 97 | Acs5_Yg_Num_Emp, 98 | # ACS 3-year 99 | Acs3_Ygo_Num_Emp, Acs3_Ygi_Num_Emp, Acs3_Ygi_MedEarnings, 100 | 101 | # Stats 102 | StateStats, CountyStats, MSAStats, PlaceStats, PUMAStats, 103 | 104 | # ACS Health 105 | Acs1_Yga_Health, Acs1_Ygai_Health, Acs1_Ygais_Health, 106 | Acs1_Ygas_Health, Acs1_Ygi_Health, Acs1_Ygis_Health, Acs1_Ygs_Health, 107 | 108 | # Freight 109 | FAFYodmp, FAFYodp, FAFYomp, FAFYodm, FAFYop, FAFYdp, FAFYdm, FAFYom, FAFYod, 110 | 111 | # Loans 112 | DefaultsYu, DefaultsYg, DefaultsYur, DefaultsYure, UniversityCrosswalk, 113 | 114 | # Opiods 115 | DrugOverdoseDeathRate, OpiodOverdoseDeathRate, NonMedUsePainMeds 116 | ] 117 | 118 | 119 | def register(cls): 120 | registered_models.append(cls) 121 | -------------------------------------------------------------------------------- /datausa/core/streaming.py: -------------------------------------------------------------------------------- 1 | '''Module to provide streaming of sqlalchemy queries back to client''' 2 | import simplejson 3 | from flask import Response 4 | 5 | def stream_qry_csv(cols, qry, api_obj): 6 | def generate(): 7 | yield ','.join([col if isinstance(col, basestring) else col.key for col in cols]) + '\n' 8 | for row in qry: 9 | row = [u'"{}"'.format(x) if isinstance(x, basestring) else str(x) for x in list(row)] 10 | yield u','.join(row) + u'\n' 11 | return Response(generate(), mimetype='text/csv') 12 | 13 | def stream_qry(tables, cols, data, api_obj): 14 | ''' Based on https://github.com/al4/orlo/blob/1b3930bae4aa37eb51aed33a97c088e576cb5a99/orlo/route_api.py#L285-L311''' 15 | def generate(tables): 16 | headers = [col if isinstance(col, basestring) else col.key for col in cols] 17 | inf = float('inf') 18 | 19 | """ 20 | A lagging generator to stream JSON so we don't have to hold everything in memory 21 | This is a little tricky, as we need to omit the last comma to make valid JSON, 22 | thus we use a lagging generator, similar to http://stackoverflow.com/questions/1630320/ 23 | """ 24 | yield u'{' 25 | 26 | rows = data.__iter__() 27 | try: 28 | prev_row = next(rows) # get first result 29 | except StopIteration: 30 | # StopIteration here means the length was zero, so yield a valid releases doc and stop 31 | yield u'''"data": [], 32 | "headers": {}, 33 | "source": {}, 34 | "subs": {}, 35 | "limit": {}, 36 | "warnings": {} 37 | '''.format(simplejson.dumps(list(headers)), simplejson.dumps([table.info(api_obj) for table in tables]), simplejson.dumps(api_obj.subs), 38 | api_obj.limit, 39 | simplejson.dumps(api_obj.warnings)) + u'}' 40 | raise StopIteration 41 | 42 | # We have some releases. 
First, yield the opening json 43 | yield u'"data": [' 44 | 45 | # Iterate over the releases 46 | for row in rows: 47 | yield simplejson.dumps([x if x != inf else None for x in prev_row]) + u', ' 48 | prev_row = row 49 | 50 | # Now yield the last iteration without comma 51 | yield simplejson.dumps([x if x != inf else None for x in prev_row]) 52 | 53 | yield u'''], "headers": {}, 54 | "source": {}, 55 | "subs": {}, 56 | "limit": {}, 57 | "warnings": {} 58 | '''.format(simplejson.dumps(list(headers)), simplejson.dumps([table.info(api_obj) for table in tables]), simplejson.dumps(api_obj.subs), 59 | api_obj.limit, 60 | simplejson.dumps(api_obj.warnings)) + u'}' 61 | 62 | return Response(generate(tables), content_type='application/json') 63 | -------------------------------------------------------------------------------- /datausa/core/views.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint, request, jsonify 2 | from datausa.attrs.models import Cip, Naics, University 3 | from datausa.core import table_manager 4 | from datausa.core import api, join_api 5 | from datausa.core.models import ApiObject 6 | from datausa.core.crosswalker import crosswalk 7 | from datausa.util.big_places import is_big_geo 8 | from datausa.core.exceptions import DataUSAException 9 | 10 | 11 | mod = Blueprint('core', __name__, url_prefix='/api') 12 | 13 | manager = table_manager.TableManager() 14 | 15 | def show_attrs(attr_obj): 16 | attrs = attr_obj.query.all() 17 | data = [a.serialize() for a in attrs] 18 | return jsonify(data=data) 19 | 20 | def build_api_obj(default_limit=None): 21 | show = request.args.get("show", "") 22 | sumlevel = request.args.get("sumlevel", "").lower() 23 | required = request.args.get("required", "") 24 | force = request.args.get("force", "") 25 | where = request.args.get("where", "") 26 | order = request.args.get("order", "") 27 | sort = request.args.get("sort", "") 28 | limit = request.args.get("limit", default_limit) 29 | offset = request.args.get("offset", None) 30 | exclude = request.args.get("exclude", None) 31 | auto_crosswalk = request.args.get("auto_crosswalk", False) 32 | display_names = request.args.get("display_names", False) 33 | 34 | shows = show.split(",") 35 | sumlevels = sumlevel.split(",") 36 | if shows and not sumlevel: 37 | sumlevels = ["all" for show in shows] 38 | values = required.split(",") if required else [] 39 | 40 | shows_and_levels = {val:sumlevels[idx] for idx, val in enumerate(shows)} 41 | 42 | variables = manager.possible_variables 43 | vars_and_vals = {var:request.args.get(var, None) for var in variables} 44 | vars_and_vals = {k:v for k,v in vars_and_vals.items() if v} 45 | 46 | 47 | vars_needed = vars_and_vals.keys() + shows + values 48 | api_obj = ApiObject(vars_needed=vars_needed, vars_and_vals=vars_and_vals, 49 | shows_and_levels=shows_and_levels, values=values, 50 | where=where, force=force, order=order, 51 | sort=sort, limit=limit, exclude=exclude, 52 | auto_crosswalk=auto_crosswalk, 53 | display_names=display_names, 54 | offset=offset) 55 | return api_obj 56 | 57 | @mod.route("/") 58 | @mod.route("/v1/") 59 | @mod.route("/csv/", defaults={'csv': True}) 60 | def api_view(csv=None): 61 | api_obj = build_api_obj() 62 | api_obj = manager.force_1yr_for_big_places(api_obj) 63 | api_obj = manager.schema_selector(api_obj) 64 | table_list = manager.all_tables(api_obj) 65 | table = manager.select_best(table_list, api_obj) 66 | api_obj.capture_logic(table_list) 67 | api_obj = manager.crosswalk(table, 
api_obj) 68 | data = api.query(table, api_obj, stream=csv) 69 | return data 70 | 71 | @mod.route("/join/") 72 | @mod.route("/join/csv/", defaults={'csv': True}) 73 | def api_join_view(csv=None): 74 | api_obj = build_api_obj(default_limit=500) 75 | if api_obj.limit and api_obj.limit > 80000: 76 | raise DataUSAException("Limit parameter must be less than 80,000") 77 | tables = manager.required_tables(api_obj) 78 | data = join_api.joinable_query(tables, api_obj, manager.table_years, csv_format=csv) 79 | return data 80 | 81 | 82 | @mod.route("/logic/") 83 | def logic_view(): 84 | api_obj = build_api_obj() 85 | table_list = manager.all_tables(api_obj) 86 | return jsonify(tables=[table.info(api_obj) for table in table_list]) 87 | 88 | @mod.route("/variables/") 89 | def view_variables(): 90 | '''show available data tables and contained variables''' 91 | shows = request.args.get("show", "").split(",") 92 | sumlevels = request.args.get("sumlevel", "").split(",") 93 | list_all = sumlevels == [""] and shows == [""] 94 | if sumlevels == [""]: 95 | sumlevels = ["all"] * len(shows) 96 | combos = zip(shows, sumlevels) 97 | results = {table.full_name(): table.col_strs(short_name=True) for table in table_manager.registered_models 98 | if list_all or all([table.can_show(show, sumlevel) for show,sumlevel in combos])} 99 | return jsonify(metadata=results) 100 | 101 | 102 | @mod.route('/table/variables/') 103 | def all_table_vars(): 104 | '''show all available data tables and contained variables''' 105 | results = {table.full_name(): table.col_strs(short_name=True) for table in table_manager.registered_models} 106 | return jsonify(metadata=results) 107 | 108 | @mod.route("/years/") 109 | def years_view(): 110 | years_data = manager.table_years_set 111 | return jsonify(data=years_data) 112 | -------------------------------------------------------------------------------- /datausa/dartmouth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/dartmouth/__init__.py -------------------------------------------------------------------------------- /datausa/dartmouth/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs.models import Geo 3 | from datausa import cache 4 | 5 | from datausa.core.models import BaseModel 6 | from datausa.attrs.consts import NATION, STATE, COUNTY, ALL 7 | from sqlalchemy.ext.declarative import declared_attr 8 | from sqlalchemy.ext.automap import automap_base 9 | from sqlalchemy import MetaData 10 | 11 | SCHEMA_NAME = 'dartmouth' 12 | CACHE_KEY = '{}_metadata'.format(SCHEMA_NAME) 13 | metadata = cache.get(CACHE_KEY) 14 | if not metadata: 15 | metadata = MetaData(schema=SCHEMA_NAME, bind=db.engine) 16 | metadata.reflect() 17 | cache.set(CACHE_KEY, metadata) 18 | 19 | AutomapBase = automap_base(bind=db.engine, metadata=metadata) 20 | 21 | 22 | class DartmouthBase(db.Model, BaseModel): 23 | __abstract__ = True 24 | __table_args__ = {"schema": SCHEMA_NAME, "extend_existing": True} 25 | source_title = 'Dartmouth Atlas of Health Care' 26 | source_link = 'http://www.dartmouthatlas.org' 27 | source_org = 'Dartmouth College' 28 | 29 | @declared_attr 30 | def year(cls): 31 | return db.Column(db.Integer(), primary_key=True) 32 | 33 | @declared_attr 34 | def geo(cls): 35 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 36 | 37 | @classmethod 
38 | def get_supported_levels(cls): 39 | return {"geo": [ALL, NATION, STATE, COUNTY]} 40 | 41 | @classmethod 42 | def geo_filter(cls, level): 43 | if level == ALL: 44 | return True 45 | level_map = {STATE: "040", COUNTY: "050", NATION: "010"} 46 | level_code = level_map[level] 47 | return cls.geo.startswith(level_code) 48 | 49 | 50 | class YgcPostDischarge(AutomapBase, DartmouthBase): 51 | __tablename__ = 'ygc_post_discharge' 52 | median_moe = 2 53 | 54 | cohort = db.Column(db.String(), primary_key=True) 55 | 56 | @classmethod 57 | def get_supported_levels(cls): 58 | return {"geo": [ALL, NATION, STATE, COUNTY], "cohort": [ALL]} 59 | 60 | class YgPrimaryCare(AutomapBase, DartmouthBase): 61 | __tablename__ = 'yg_prim_care_access' 62 | median_moe = 1 63 | 64 | 65 | class YgReimbursements(AutomapBase, DartmouthBase): 66 | __tablename__ = 'yg_reimbursements' 67 | median_moe = 1 68 | 69 | 70 | AutomapBase.prepare(db.engine, reflect=False) 71 | -------------------------------------------------------------------------------- /datausa/database.py: -------------------------------------------------------------------------------- 1 | from datausa import app 2 | from flask_sqlalchemy import SQLAlchemy 3 | db = SQLAlchemy(app) 4 | -------------------------------------------------------------------------------- /datausa/ed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/ed/__init__.py -------------------------------------------------------------------------------- /datausa/ed/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.core.models import BaseModel 3 | from sqlalchemy.ext.declarative import declared_attr 4 | from datausa.attrs.consts import ALL 5 | from datausa.attrs.models import UniversityCrosswalk, Geo 6 | from sqlalchemy.orm import column_property 7 | from datausa.attrs import consts 8 | 9 | 10 | class GeoId(object): 11 | LEVELS = [consts.NATION, consts.STATE, consts.COUNTY, consts.ALL] 12 | 13 | @declared_attr 14 | def geo(cls): 15 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 16 | 17 | @classmethod 18 | def get_supported_levels(cls): 19 | return {consts.GEO: GeoId.LEVELS} 20 | 21 | @classmethod 22 | def geo_filter(cls, level): 23 | if level == ALL: 24 | return True 25 | level_map = { 26 | consts.NATION: "010", 27 | consts.STATE: "040", 28 | consts.COUNTY: "050", 29 | consts.MSA: "310" 30 | } 31 | level_code = level_map[level] 32 | return cls.geo.startswith(level_code) 33 | 34 | 35 | class BaseEd(db.Model, BaseModel): 36 | __abstract__ = True 37 | __table_args__ = {"schema": "ed"} 38 | supported_levels = {"year": [ALL]} 39 | source_title = 'Official Cohort Default Rates for Schools' 40 | source_link = 'https://www2.ed.gov/offices/OSFAP/defaultmanagement/cdr.html' 41 | source_org = 'Department of Education' 42 | 43 | default_rate = db.Column(db.Float) 44 | num_defaults = db.Column(db.Integer) 45 | num_borrowers = db.Column(db.Integer) 46 | 47 | 48 | class UniversityCols(object): 49 | @declared_attr 50 | def opeid(cls): 51 | return db.Column(db.String(), primary_key=True) 52 | 53 | @declared_attr 54 | def university(cls): 55 | return column_property(UniversityCrosswalk.university) 56 | 57 | @classmethod 58 | def crosswalk_join(cls, qry): 59 | cond = UniversityCrosswalk.opeid6 == cls.opeid 60 | return qry.join(UniversityCrosswalk, 
cond) 61 | 62 | 63 | class DefaultsYu(BaseEd, UniversityCols): 64 | __tablename__ = "yu_defaults" 65 | median_moe = 1 66 | 67 | year = db.Column(db.Integer(), primary_key=True) 68 | rate_type = db.Column(db.String()) 69 | default_rate = db.Column(db.Float) 70 | num_defaults = db.Column(db.Integer) 71 | num_borrowers = db.Column(db.Integer) 72 | 73 | @classmethod 74 | def get_supported_levels(cls): 75 | return { 76 | "year": [ALL], 77 | "university": [ALL], 78 | "opeid": [ALL], 79 | } 80 | 81 | 82 | class DefaultsYg(BaseEd, GeoId): 83 | __tablename__ = "yg_defaults" 84 | median_moe = 1.1 85 | 86 | year = db.Column(db.Integer(), primary_key=True) 87 | rate_type = db.Column(db.String()) 88 | default_rate = db.Column(db.Float) 89 | num_defaults = db.Column(db.Integer) 90 | num_borrowers = db.Column(db.Integer) 91 | 92 | @classmethod 93 | def get_supported_levels(cls): 94 | return { 95 | "year": [ALL], 96 | "geo": GeoId.LEVELS 97 | } 98 | 99 | 100 | class DefaultsYur(BaseEd, UniversityCols): 101 | __tablename__ = "yur_defaults" 102 | median_moe = 2 103 | 104 | year = db.Column(db.Integer(), primary_key=True) 105 | rate_type = db.Column(db.String(), primary_key=True) 106 | default_rate = db.Column(db.Float) 107 | num_defaults = db.Column(db.Integer) 108 | num_borrowers = db.Column(db.Integer) 109 | 110 | @classmethod 111 | def get_supported_levels(cls): 112 | return { 113 | "year": [ALL], 114 | "university": [ALL], 115 | "opeid": [ALL], 116 | "rate_type": [ALL] 117 | } 118 | 119 | 120 | class DefaultsYure(BaseEd, UniversityCols): 121 | __tablename__ = "yure_defaults" 122 | median_moe = 3 123 | 124 | year = db.Column(db.Integer(), primary_key=True) 125 | rate_type = db.Column(db.String(), primary_key=True) 126 | ethnic_code = db.Column(db.Integer(), primary_key=True) 127 | 128 | @classmethod 129 | def get_supported_levels(cls): 130 | return { 131 | "year": [ALL], 132 | "university": [ALL], 133 | "opeid": [ALL], 134 | "rate_type": [ALL], 135 | "ethnic_code": [ALL], 136 | } 137 | -------------------------------------------------------------------------------- /datausa/freight/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/freight/__init__.py -------------------------------------------------------------------------------- /datausa/freight/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.core.models import BaseModel 3 | from sqlalchemy.ext.declarative import declared_attr 4 | from datausa.attrs.consts import STATE, COUNTY, ALL 5 | from datausa.attrs.models import Geo, Sctg, ProductCrosswalk 6 | from sqlalchemy.orm import column_property 7 | 8 | class BaseFreight(db.Model, BaseModel): 9 | __abstract__ = True 10 | __table_args__ = {"schema": "freight"} 11 | supported_levels = {} 12 | source_title = 'Freight Analysis Framework' 13 | source_link = 'https://www.rita.dot.gov/bts/sites/rita.dot.gov.bts/files/subject_areas/freight_transportation/faf' 14 | source_org = 'Bureau of Transportation Statistics' 15 | tons = db.Column(db.Float) 16 | millions_of_2012_dollars = db.Column(db.Float) 17 | 18 | 19 | class Product(object): 20 | @declared_attr 21 | def sctg(cls): 22 | return db.Column(db.String(), db.ForeignKey(Sctg.id), primary_key=True) 23 | 24 | @declared_attr 25 | def napcs(cls): 26 | return column_property(ProductCrosswalk.napcs) 27 | 28 | @classmethod 29 | def 
crosswalk_join(cls, qry): 30 | cond = ProductCrosswalk.sctg == cls.sctg 31 | return qry.join(ProductCrosswalk, cond) 32 | 33 | class OriginGeo(object): 34 | @declared_attr 35 | def origin_geo(cls): 36 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 37 | 38 | @classmethod 39 | def origin_geo_filter(cls, level): 40 | if level == ALL: 41 | return True 42 | level_map = {STATE: "040", COUNTY: "050"} 43 | level_code = level_map[level] 44 | return cls.origin_geo.startswith(level_code) 45 | 46 | class DestGeo(object): 47 | @declared_attr 48 | def destination_geo(cls): 49 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 50 | 51 | @classmethod 52 | def destination_geo_filter(cls, level): 53 | if level == ALL: 54 | return True 55 | level_map = {STATE: "040", COUNTY: "050"} 56 | level_code = level_map[level] 57 | return cls.destination_geo.startswith(level_code) 58 | 59 | 60 | class FAFYodmp(BaseFreight, OriginGeo, DestGeo, Product): 61 | __tablename__ = "yodmp_faf" 62 | median_moe = 4 63 | year = db.Column(db.Integer(), primary_key=True) 64 | transportation_mode = db.Column(db.String(), primary_key=True) 65 | 66 | @classmethod 67 | def get_supported_levels(cls): 68 | return { 69 | "origin_geo": [STATE, COUNTY, ALL], 70 | "destination_geo": [STATE, COUNTY, ALL], 71 | "transportation_mode": [ALL], 72 | "sctg": [ALL], 73 | "napcs": [ALL] 74 | } 75 | 76 | class FAFYodm(BaseFreight, OriginGeo, DestGeo): 77 | __tablename__ = "yodm_faf" 78 | median_moe = 3 79 | year = db.Column(db.Integer(), primary_key=True) 80 | transportation_mode = db.Column(db.String(), primary_key=True) 81 | 82 | @classmethod 83 | def get_supported_levels(cls): 84 | return { 85 | "origin_geo": [STATE, COUNTY, ALL], 86 | "destination_geo": [STATE, COUNTY, ALL], 87 | "transportation_mode": [ALL] 88 | } 89 | 90 | class FAFYod(BaseFreight, OriginGeo, DestGeo): 91 | __tablename__ = "yod_faf" 92 | median_moe = 2 93 | year = db.Column(db.Integer(), primary_key=True) 94 | 95 | @classmethod 96 | def get_supported_levels(cls): 97 | return { 98 | "origin_geo": [STATE, COUNTY, ALL], 99 | "destination_geo": [STATE, COUNTY, ALL] 100 | } 101 | 102 | 103 | class FAFYodp(BaseFreight, OriginGeo, DestGeo, Product): 104 | __tablename__ = "yodp_faf" 105 | median_moe = 3 106 | year = db.Column(db.Integer(), primary_key=True) 107 | 108 | @classmethod 109 | def get_supported_levels(cls): 110 | return { 111 | "origin_geo": [STATE, COUNTY, ALL], 112 | "destination_geo": [STATE, COUNTY, ALL], 113 | "sctg": [ALL], 114 | "napcs": [ALL] 115 | } 116 | 117 | class FAFYomp(BaseFreight, OriginGeo, Product): 118 | __tablename__ = "yomp_faf" 119 | median_moe = 3 120 | year = db.Column(db.Integer(), primary_key=True) 121 | transportation_mode = db.Column(db.String(), primary_key=True) 122 | 123 | @classmethod 124 | def get_supported_levels(cls): 125 | return { 126 | "origin_geo": [STATE, COUNTY, ALL], 127 | "transportation_mode": [ALL], 128 | "sctg": [ALL], 129 | "napcs": [ALL] 130 | } 131 | 132 | class FAFYop(BaseFreight, OriginGeo, Product): 133 | __tablename__ = "yop_faf" 134 | median_moe = 2 135 | year = db.Column(db.Integer(), primary_key=True) 136 | 137 | @classmethod 138 | def get_supported_levels(cls): 139 | return { 140 | "origin_geo": [STATE, COUNTY, ALL], 141 | "sctg": [ALL], 142 | "napcs": [ALL] 143 | } 144 | 145 | class FAFYdp(BaseFreight, DestGeo, Product): 146 | __tablename__ = "ydp_faf" 147 | median_moe = 2 148 | year = db.Column(db.Integer(), primary_key=True) 149 | 150 | @classmethod 151 | def 
get_supported_levels(cls): 152 | return { 153 | "destination_geo": [STATE, COUNTY, ALL], 154 | "sctg": [ALL], 155 | "napcs": [ALL] 156 | } 157 | 158 | class FAFYdm(BaseFreight, DestGeo): 159 | __tablename__ = "ydm_faf" 160 | median_moe = 2 161 | year = db.Column(db.Integer(), primary_key=True) 162 | transportation_mode = db.Column(db.String(), primary_key=True) 163 | 164 | @classmethod 165 | def get_supported_levels(cls): 166 | return { 167 | "destination_geo": [STATE, COUNTY, ALL], 168 | 169 | "transportation_mode": [ALL] 170 | } 171 | 172 | class FAFYom(BaseFreight, OriginGeo): 173 | __tablename__ = "yom_faf" 174 | median_moe = 2 175 | year = db.Column(db.Integer(), primary_key=True) 176 | transportation_mode = db.Column(db.String(), primary_key=True) 177 | 178 | @classmethod 179 | def get_supported_levels(cls): 180 | return { 181 | "origin_geo": [STATE, COUNTY, ALL], 182 | 183 | "transportation_mode": [ALL] 184 | } 185 | -------------------------------------------------------------------------------- /datausa/ipeds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/ipeds/__init__.py -------------------------------------------------------------------------------- /datausa/ipeds/abstract_models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs.models import University, Cip, Geo, EnrollmentStatus 3 | from datausa.attrs.models import Degree, Sector, LStudy, IPedsRace, IPedsOcc 4 | from datausa.attrs.models import LivingArrangement, IncomeRange, AcademicRank 5 | from datausa.attrs.models import IPedsExpense 6 | from datausa.core.models import BaseModel 7 | from datausa.attrs.consts import NATION, STATE, COUNTY, MSA 8 | from datausa.attrs.consts import PUMA, PLACE, ALL, GEO 9 | 10 | from sqlalchemy.ext.declarative import declared_attr 11 | from sqlalchemy.sql import func 12 | 13 | 14 | class BaseIpeds(db.Model, BaseModel): 15 | __abstract__ = True 16 | __table_args__ = {"schema": "ipeds"} 17 | supported_levels = {} 18 | source_title = 'NCES IPEDS' 19 | source_link = 'http://nces.ed.gov/ipeds/' 20 | source_org = 'Department of Education' 21 | 22 | 23 | class Enrollment(BaseIpeds): 24 | __abstract__ = True 25 | 26 | enrolled_total = db.Column(db.Integer()) 27 | enrolled_men = db.Column(db.Integer()) 28 | enrolled_women = db.Column(db.Integer()) 29 | enrolled_black = db.Column(db.Integer()) 30 | enrolled_asian = db.Column(db.Integer()) 31 | enrolled_native = db.Column(db.Integer()) 32 | enrolled_unknown = db.Column(db.Integer()) 33 | 34 | 35 | class Tuition(BaseIpeds): 36 | __abstract__ = True 37 | 38 | oos_tuition = db.Column(db.Integer()) 39 | state_tuition = db.Column(db.Integer()) 40 | district_tuition = db.Column(db.Integer()) 41 | 42 | oos_fee = db.Column(db.Integer()) 43 | state_fee = db.Column(db.Integer()) 44 | district_fee = db.Column(db.Integer()) 45 | 46 | oos_tuition_grads = db.Column(db.Integer()) 47 | state_tuition_grads = db.Column(db.Integer()) 48 | district_tuition_grads = db.Column(db.Integer()) 49 | 50 | oos_fee_grads = db.Column(db.Integer()) 51 | state_fee_grads = db.Column(db.Integer()) 52 | district_fee_grads = db.Column(db.Integer()) 53 | 54 | class GradsPct(BaseIpeds): 55 | __abstract__ = True 56 | pct_total = db.Column(db.Float()) 57 | pct_men = db.Column(db.Float()) 58 | pct_women = 
db.Column(db.Float()) 59 | 60 | 61 | class Grads(BaseIpeds): 62 | __abstract__ = True 63 | grads_total = db.Column(db.Integer()) 64 | grads_men = db.Column(db.Integer()) 65 | grads_women = db.Column(db.Integer()) 66 | grads_native = db.Column(db.Integer()) 67 | grads_native_men = db.Column(db.Integer()) 68 | grads_native_women = db.Column(db.Integer()) 69 | grads_asian = db.Column(db.Integer()) 70 | grads_asian_men = db.Column(db.Integer()) 71 | grads_asian_women = db.Column(db.Integer()) 72 | grads_black = db.Column(db.Integer()) 73 | grads_black_men = db.Column(db.Integer()) 74 | grads_black_women = db.Column(db.Integer()) 75 | grads_hispanic = db.Column(db.Integer()) 76 | grads_hispanic_men = db.Column(db.Integer()) 77 | grads_hispanic_women = db.Column(db.Integer()) 78 | grads_hawaiian = db.Column(db.Integer()) 79 | grads_hawaiian_men = db.Column(db.Integer()) 80 | grads_hawaiian_women = db.Column(db.Integer()) 81 | grads_white = db.Column(db.Integer()) 82 | grads_white_men = db.Column(db.Integer()) 83 | grads_white_women = db.Column(db.Integer()) 84 | grads_multi = db.Column(db.Integer()) 85 | grads_multi_men = db.Column(db.Integer()) 86 | grads_multi_women = db.Column(db.Integer()) 87 | grads_unknown = db.Column(db.Integer()) 88 | grads_unknown_men = db.Column(db.Integer()) 89 | grads_unknown_women = db.Column(db.Integer()) 90 | grads_nonresident = db.Column(db.Integer()) 91 | grads_nonresident_men = db.Column(db.Integer()) 92 | grads_nonresident_women = db.Column(db.Integer()) 93 | 94 | 95 | class GeoId(object): 96 | LEVELS = [NATION, STATE, COUNTY, PLACE, MSA, PUMA, ALL] 97 | 98 | @declared_attr 99 | def geo(cls): 100 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 101 | 102 | @classmethod 103 | def get_supported_levels(cls): 104 | return {GEO: GeoId.LEVELS} 105 | 106 | @classmethod 107 | def geo_filter(cls, level): 108 | if level == ALL: 109 | return True 110 | level_map = {NATION: "010", STATE: "040", PUMA: "795", 111 | COUNTY: "050", MSA: "310", PLACE: "160"} 112 | level_code = level_map[level] 113 | return cls.geo.startswith(level_code) 114 | 115 | 116 | class CipId(object): 117 | LEVELS = ["2", "4", "6", "all"] 118 | 119 | @declared_attr 120 | def cip(cls): 121 | return db.Column(db.String(), db.ForeignKey(Cip.id), primary_key=True) 122 | 123 | @classmethod 124 | def get_supported_levels(cls): 125 | return {"cip": ["all", "2", "4", "6"]} 126 | 127 | @classmethod 128 | def cip_filter(cls, level): 129 | if level == 'all': 130 | return True 131 | return func.length(cls.cip) == level 132 | 133 | 134 | class UniversityId(object): 135 | LEVELS = ["all", "0", "1", "2"] 136 | # TODO add university level filter ... 
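# --- editor's example (sketch) ----------------------------------------------
# The CipId/UniversityId filters above in use: cip_filter compares code
# length, university_filter a stored university_level column. GradsYc is
# assumed (per datausa/core/registrar.py) to be a concrete model mixing in
# CipId; requires an app context.
from datausa.ipeds.models import GradsYc

two_digit = GradsYc.query.filter(GradsYc.cip_filter("2"))  # func.length(cip) == "2"
# -----------------------------------------------------------------------------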
137 | 138 | @declared_attr 139 | def university(cls): 140 | return db.Column(db.String(), db.ForeignKey(University.id), primary_key=True) 141 | 142 | @declared_attr 143 | def university_level(cls): 144 | return db.Column(db.Integer()) 145 | 146 | @classmethod 147 | def get_supported_levels(cls): 148 | return {"university": UniversityId.LEVELS} 149 | 150 | @classmethod 151 | def university_filter(cls, level): 152 | if level == 'all': 153 | return True 154 | return cls.university_level == level 155 | 156 | 157 | class LStudyId(object): 158 | @declared_attr 159 | def lstudy(cls): 160 | return db.Column(db.String(), db.ForeignKey(LStudy.id), primary_key=True) 161 | 162 | 163 | class EnrollmentStatusId(object): 164 | @declared_attr 165 | def enrollment_status(cls): 166 | return db.Column(db.String(), db.ForeignKey(EnrollmentStatus.id), primary_key=True) 167 | 168 | 169 | class DegreeId(object): 170 | @declared_attr 171 | def degree(cls): 172 | return db.Column(db.String(), db.ForeignKey(Degree.id), primary_key=True) 173 | 174 | 175 | class SectorId(object): 176 | @declared_attr 177 | def sector(cls): 178 | return db.Column(db.String(), db.ForeignKey(Sector.id), primary_key=True) 179 | 180 | @classmethod 181 | def get_supported_levels(cls): 182 | return {"sector": ["all"]} 183 | 184 | 185 | class Admissions(BaseIpeds): 186 | __abstract__ = True 187 | applicants_total = db.Column(db.Float) 188 | applicants_men = db.Column(db.Float) 189 | applicants_women = db.Column(db.Float) 190 | admissions_total = db.Column(db.Float) 191 | admissions_men = db.Column(db.Float) 192 | admissions_women = db.Column(db.Float) 193 | admissions_enrolled_total = db.Column(db.Float) 194 | admissions_enrolled_men = db.Column(db.Float) 195 | admissions_enrolled_women = db.Column(db.Float) 196 | admissions_enrolled_ft_total = db.Column(db.Float) 197 | admissions_enrolled_ft_men = db.Column(db.Float) 198 | admissions_enrolled_ft_women = db.Column(db.Float) 199 | admissions_enrolled_pt_total = db.Column(db.Float) 200 | admissions_enrolled_pt_men = db.Column(db.Float) 201 | admissions_enrolled_pt_women = db.Column(db.Float) 202 | sub_sat_scores_num = db.Column(db.Float) 203 | sub_act_scores_num = db.Column(db.Float) 204 | sub_sat_scores_pct = db.Column(db.Float) 205 | sub_act_scores_pct = db.Column(db.Float) 206 | sat_cr_25 = db.Column(db.Float) 207 | sat_cr_75 = db.Column(db.Float) 208 | sat_math_25 = db.Column(db.Float) 209 | sat_math_75 = db.Column(db.Float) 210 | sat_writing_25 = db.Column(db.Float) 211 | sat_writing_75 = db.Column(db.Float) 212 | act_composite_25 = db.Column(db.Float) 213 | act_composite_75 = db.Column(db.Float) 214 | act_english_25 = db.Column(db.Float) 215 | act_english_75 = db.Column(db.Float) 216 | act_math_25 = db.Column(db.Float) 217 | act_math_75 = db.Column(db.Float) 218 | act_writing_25 = db.Column(db.Float) 219 | act_writing_75 = db.Column(db.Float) 220 | yield_total = db.Column(db.Float) 221 | yield_men = db.Column(db.Float) 222 | yield_women = db.Column(db.Float) 223 | 224 | 225 | class IPedsRaceId(object): 226 | @declared_attr 227 | def ipeds_race(cls): 228 | return db.Column(db.String(), db.ForeignKey(IPedsRace.id), primary_key=True) 229 | 230 | 231 | class EnrollmentEfa(BaseIpeds): 232 | __abstract__ = True 233 | num_enrolled = db.Column(db.Float) 234 | 235 | 236 | class LivingArrangementId(object): 237 | @declared_attr 238 | def living_arrangement(cls): 239 | return db.Column(db.String(), db.ForeignKey(LivingArrangement.id), primary_key=True) 240 | 241 | 242 | class IncomeRangeId(object): 
243 | @declared_attr 244 | def income_range(cls): 245 | return db.Column(db.String(), db.ForeignKey(IncomeRange.id), primary_key=True) 246 | 247 | 248 | class SfaLivingBase(BaseIpeds): 249 | __abstract__ = True 250 | num_in_living_arrangement = db.Column(db.Float) 251 | 252 | 253 | class GradRateBase(BaseIpeds): 254 | __abstract__ = True 255 | grad_rate = db.Column(db.Float) 256 | cohort_size = db.Column(db.Float) 257 | num_finishers = db.Column(db.Float) 258 | 259 | 260 | class FinancialsBase(BaseIpeds): 261 | __abstract__ = True 262 | endowment_value_fiscal_year_end = db.Column(db.Float) 263 | federal_grants_and_contracts = db.Column(db.Float) 264 | investment_income = db.Column(db.Float) 265 | local_grants = db.Column(db.Float) 266 | local_grants_and_contracts = db.Column(db.Float) 267 | other_federal_grants = db.Column(db.Float) 268 | pell_grants = db.Column(db.Float) 269 | private_grants = db.Column(db.Float) 270 | research_rank = db.Column(db.Float) 271 | research_rank_carnegie = db.Column(db.Float) 272 | research_rank_carnegie_pct = db.Column(db.Float) 273 | research_rank_pct = db.Column(db.Float) 274 | research_total = db.Column(db.Float) 275 | state_grants = db.Column(db.Float) 276 | state_grants_and_contracts = db.Column(db.Float) 277 | total_expenses = db.Column(db.Float) 278 | tuition_and_fees = db.Column(db.Float) 279 | total_salaries = db.Column(db.Float) 280 | 281 | 282 | class ExpensesBase(BaseIpeds): 283 | __abstract__ = True 284 | benefits_expense = db.Column(db.Float) 285 | dep_expense = db.Column(db.Float) 286 | interest_expense = db.Column(db.Float) 287 | ops_expense = db.Column(db.Float) 288 | other_expense = db.Column(db.Float) 289 | salaries_expense = db.Column(db.Float) 290 | 291 | 292 | class NISSalariesBase(BaseIpeds): 293 | __abstract__ = True 294 | num_noninstructional_staff = db.Column(db.Float) 295 | outlays_noninstructional_staff = db.Column(db.Float) 296 | 297 | 298 | class ISSalariesBase(BaseIpeds): 299 | __abstract__ = True 300 | num_instructional_staff = db.Column(db.Float) 301 | outlays_instructional_staff = db.Column(db.Float) 302 | months_covered_instructional_staff = db.Column(db.Float) 303 | 304 | 305 | class IPedsOccId(object): 306 | @declared_attr 307 | def ipeds_occ(cls): 308 | return db.Column(db.String(), db.ForeignKey(IPedsOcc.id), primary_key=True) 309 | 310 | 311 | class AcademicRankId(object): 312 | @declared_attr 313 | def academic_rank(cls): 314 | return db.Column(db.String(), db.ForeignKey(AcademicRank.id), primary_key=True) 315 | 316 | 317 | class IPedsExpenseId(object): 318 | @declared_attr 319 | def ipeds_expense(cls): 320 | return db.Column(db.String(), db.ForeignKey(IPedsExpense.id), primary_key=True) 321 | -------------------------------------------------------------------------------- /datausa/onet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/onet/__init__.py -------------------------------------------------------------------------------- /datausa/onet/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.attrs.models import Skill 3 | from datausa.ipeds.abstract_models import CipId 4 | from datausa.core.models import BaseModel 5 | from sqlalchemy.ext.declarative import declared_attr 6 | 7 | class BaseOnet(db.Model, BaseModel): 8 | __abstract__ = True 9 | __table_args__ = {"schema": "onet"} 10 
| supported_levels = {} 11 | source_title = 'O*NET' 12 | source_link = 'http://www.onetonline.org/' 13 | source_org = 'Department of Labor' 14 | 15 | class SkillId(object): 16 | @declared_attr 17 | def skill(cls): 18 | return db.Column(db.String(), db.ForeignKey(Skill.id), primary_key=True) 19 | 20 | @classmethod 21 | def get_supported_levels(cls): 22 | return {"cip": ["2", "4", "6"]} 23 | 24 | class SkillByCip(BaseOnet, SkillId, CipId): 25 | __tablename__ = "skills_by_cip" 26 | median_moe = 1 27 | 28 | value = db.Column(db.Float) 29 | value_rca = db.Column(db.Float) 30 | 31 | @classmethod 32 | def get_supported_levels(cls): 33 | return {"cip": ["2", "4", "6", "all"], "skill": ["all"]} 34 | 35 | class SkillBySoc(BaseOnet, SkillId): 36 | __tablename__ = "skills_by_soc" 37 | median_moe = 1 38 | 39 | soc = db.Column(db.String(), primary_key=True) 40 | value = db.Column(db.Float) 41 | value_rca = db.Column(db.Float) 42 | 43 | @classmethod 44 | def get_supported_levels(cls): 45 | return {"soc": ["all"], "skill": ["all"]} 46 | -------------------------------------------------------------------------------- /datausa/opiods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/opiods/__init__.py -------------------------------------------------------------------------------- /datausa/opiods/models.py: -------------------------------------------------------------------------------- 1 | from datausa.database import db 2 | from datausa.core.models import BaseModel 3 | from datausa.attrs.consts import NATION, STATE, ALL 4 | from datausa.attrs.models import Geo 5 | from sqlalchemy.ext.declarative import declared_attr 6 | 7 | 8 | class BaseOpiods(db.Model, BaseModel): 9 | __abstract__ = True 10 | __table_args__ = {"schema": "opiods"} 11 | supported_levels = {"year": [ALL]} 12 | source_title = 'Kaiser Family Foundation analysis of Centers for Disease Control and Prevention (CDC), National Center for Health Statistics' 13 | source_link = 'https://www.kff.org/other/state-indicator/opioid-overdose-death-rates/' 14 | source_org = 'Kaiser Family Foundation State Health Facts' 15 | 16 | @classmethod 17 | def get_supported_levels(cls): 18 | return { 19 | "year": [ALL], 20 | "geo": [ALL, NATION, STATE] 21 | } 22 | 23 | @classmethod 24 | def geo_filter(cls, level): 25 | if level == ALL: 26 | return True 27 | level_map = {STATE: "040", NATION: "010"} 28 | level_code = level_map[level] 29 | return cls.geo.startswith(level_code) 30 | 31 | @declared_attr 32 | def geo(cls): 33 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 34 | 35 | 36 | class DrugOverdoseDeathRate(BaseOpiods): 37 | __tablename__ = "drug_overdose_deathrate" 38 | median_moe = 1 39 | source_link = 'https://www.kff.org/other/state-indicator/opioid-overdose-death-rates/?currentTimeframe=0&sortModel=%7B%22colId%22:%22Location%22,%22sort%22:%22asc%22%7D' 40 | year = db.Column(db.Integer(), primary_key=True) 41 | drug_overdose_ageadjusted = db.Column(db.String()) 42 | 43 | 44 | class OpiodOverdoseDeathRate(BaseOpiods): 45 | __tablename__ = "opioid_overdose_deathrate" 46 | median_moe = 1 47 | source_link = 'https://www.kff.org/other/state-indicator/opioid-overdose-death-rates/?currentTimeframe=0&sortModel=%7B%22colId%22:%22Location%22,%22sort%22:%22asc%22%7D' 48 | year = db.Column(db.Integer(), primary_key=True) 49 | opioid_overdose_deathrate_ageadjusted = db.Column(db.String()) 50 | 51 | 52 | 
class NonMedUsePainMeds(BaseOpiods): 53 | __tablename__ = "non_medical_use_of_pain_releivers" 54 | median_moe = 1 55 | source_title = 'National Survey on Drug Use and Health' 56 | source_org = 'SAMHSA, Center for Behavioral Health Statistics and Quality' 57 | source_link = 'https://nsduhweb.rti.org/respweb/homepage.cfm' 58 | start_year = db.Column(db.Integer(), primary_key=True) 59 | year = db.Column(db.Integer(), primary_key=True) 60 | 61 | non_medical_use_of_pain_relievers = db.Column(db.String()) 62 | non_medical_use_of_pain_relievers_lci = db.Column(db.String()) 63 | non_medical_use_of_pain_relievers_uci = db.Column(db.String()) 64 | 65 | @classmethod 66 | def get_supported_levels(cls): 67 | return { 68 | "year": [ALL], 69 | "start_year": [ALL], 70 | "geo": [ALL, NATION, STATE] 71 | } 72 | -------------------------------------------------------------------------------- /datausa/pums/__init__.py: -------------------------------------------------------------------------------- 1 | # __init__.py -------------------------------------------------------------------------------- /datausa/pums/abstract_models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import MetaData 2 | from sqlalchemy.ext.declarative import declared_attr 3 | from datausa.core.exceptions import DataUSAException 4 | 5 | from datausa.database import db 6 | from datausa.attrs import consts 7 | from datausa.core.models import BaseModel 8 | from datausa.attrs.models import * 9 | from datausa.attrs.consts import NATION, STATE, PUMA, ALL, GEO, COUNTY 10 | 11 | 12 | class BasePums(db.Model, BaseModel): 13 | __abstract__ = True 14 | __table_args__ = {"schema": "pums_1yr"} 15 | source_title = 'ACS PUMS 1-year Estimate' 16 | source_link = 'http://census.gov/programs-surveys/acs/technical-documentation/pums.html' 17 | source_org = 'Census Bureau' 18 | num_records = db.Column(db.Integer) 19 | CACHED_YEARS = [2014, 2015, 2016] 20 | 21 | 22 | def __repr__(self): 23 | return '<{}>'.format(self.__class__) 24 | 25 | 26 | class BasePums5(BasePums): 27 | __abstract__ = True 28 | __table_args__ = {"schema": "pums_5yr"} 29 | source_title = 'ACS PUMS 5-year Estimate' 30 | source_link = 'http://census.gov/programs-surveys/acs/technical-documentation/pums.html' 31 | source_org = 'Census Bureau' 32 | CACHED_YEARS = [2014, 2015, 2016] 33 | 34 | class PersonalOver5(object): 35 | avg_age = db.Column(db.Float) 36 | avg_wage = db.Column(db.Float) 37 | avg_age_moe = db.Column(db.Float) 38 | avg_wage_moe = db.Column(db.Float) 39 | num_ppl = db.Column(db.Integer) 40 | num_ppl_moe = db.Column(db.Float) 41 | 42 | class Personal(object): 43 | avg_age = db.Column(db.Float) 44 | avg_wage = db.Column(db.Float) 45 | num_ppl = db.Column(db.Integer) 46 | avg_age_moe = db.Column(db.Float) 47 | avg_wage_moe = db.Column(db.Float) 48 | num_ppl_moe = db.Column(db.Float) 49 | 50 | class Employees(Personal): 51 | avg_age_ft = db.Column(db.Float) 52 | avg_age_pt = db.Column(db.Float) 53 | avg_wage_ft = db.Column(db.Float) 54 | avg_wage_pt = db.Column(db.Float) 55 | num_ppl_ft = db.Column(db.Integer) 56 | num_ppl_pt = db.Column(db.Integer) 57 | 58 | avg_age_ft_moe = db.Column(db.Float) 59 | avg_age_pt_moe = db.Column(db.Float) 60 | avg_wage_ft_moe = db.Column(db.Float) 61 | avg_wage_pt_moe = db.Column(db.Float) 62 | num_ppl_ft_moe = db.Column(db.Float) 63 | num_ppl_pt_moe = db.Column(db.Float) 64 | 65 | avg_hrs = db.Column(db.Float) 66 | avg_hrs_ft = db.Column(db.Float) 67 | avg_hrs_pt = db.Column(db.Float) 68 | 
avg_hrs_moe = db.Column(db.Float) 69 | avg_hrs_ft_moe = db.Column(db.Float) 70 | avg_hrs_pt_moe = db.Column(db.Float) 71 | 72 | class EmployeesGini(object): 73 | gini = db.Column(db.Float) 74 | gini_ft = db.Column(db.Float) 75 | gini_pt = db.Column(db.Float) 76 | 77 | class EmployeesRca(object): 78 | num_ppl_rca = db.Column(db.Float) 79 | num_ppl_pt_rca = db.Column(db.Float) 80 | num_ppl_ft_rca = db.Column(db.Float) 81 | 82 | class EmployeesWithAge(Personal): 83 | avg_wage_ft = db.Column(db.Float) 84 | avg_wage_pt = db.Column(db.Float) 85 | num_ppl_ft = db.Column(db.Integer) 86 | num_ppl_pt = db.Column(db.Integer) 87 | 88 | avg_wage_ft_moe = db.Column(db.Float) 89 | avg_wage_pt_moe = db.Column(db.Float) 90 | num_ppl_ft_moe = db.Column(db.Float) 91 | num_ppl_pt_moe = db.Column(db.Float) 92 | 93 | class PersonalWithAge(object): 94 | avg_wage = db.Column(db.Float) 95 | num_ppl = db.Column(db.Integer) 96 | avg_wage_moe = db.Column(db.Float) 97 | num_ppl_moe = db.Column(db.Float) 98 | 99 | 100 | class Year(object): 101 | @declared_attr 102 | def year(cls): 103 | return db.Column(db.Integer(), primary_key=True) 104 | 105 | class GeoId(object): 106 | LEVELS = [NATION, STATE, PUMA, ALL] 107 | @classmethod 108 | def get_supported_levels(cls): 109 | return {GEO: GeoId.LEVELS} 110 | 111 | @classmethod 112 | def geo_filter(cls, level): 113 | if level == ALL: 114 | return True 115 | level_map = {NATION: "010", STATE: "040", PUMA: "795"} 116 | level_code = level_map[level] 117 | return cls.geo.startswith(level_code) 118 | 119 | @declared_attr 120 | def geo(cls): 121 | return db.Column(db.String(), db.ForeignKey(Geo.id), primary_key=True) 122 | 123 | class CipId(object): 124 | @declared_attr 125 | def cip(cls): 126 | return db.Column(db.String(), db.ForeignKey(Cip.id), primary_key=True) 127 | 128 | class DegreeId(object): 129 | @declared_attr 130 | def degree(cls): 131 | return db.Column(db.String(), db.ForeignKey(PumsDegree.id), primary_key=True) 132 | 133 | class NaicsId(object): 134 | LEVELS = ["0", "1", "2", "all"] 135 | naics_level = db.Column(db.Integer()) 136 | 137 | @declared_attr 138 | def naics(cls): 139 | return db.Column(db.String(), db.ForeignKey(PumsNaics.id), primary_key=True) 140 | 141 | @classmethod 142 | def naics_filter(cls, level): 143 | if level == consts.ALL: 144 | return True 145 | return cls.naics_level == level 146 | 147 | class SocId(object): 148 | LEVELS = ["0", "1", "2", "3", "all"] 149 | soc_level = db.Column(db.Integer()) 150 | 151 | @declared_attr 152 | def soc(cls): 153 | return db.Column(db.String(), db.ForeignKey(PumsSoc.id), primary_key=True) 154 | 155 | @classmethod 156 | def soc_filter(cls, level): 157 | if level == consts.ALL: 158 | return True 159 | return cls.soc_level == level 160 | 161 | class WageId(object): 162 | @declared_attr 163 | def wage_bin(cls): 164 | return db.Column(db.String(), db.ForeignKey(PumsWage.id), primary_key=True) 165 | 166 | class RaceId(object): 167 | @declared_attr 168 | def race(cls): 169 | return db.Column(db.String(), db.ForeignKey(PumsRace.id), primary_key=True) 170 | 171 | class SexId(object): 172 | @declared_attr 173 | def sex(cls): 174 | return db.Column(db.String(), db.ForeignKey(PumsSex.id), primary_key=True) 175 | 176 | class BirthplaceId(object): 177 | @declared_attr 178 | def birthplace(cls): 179 | return db.Column(db.String(), db.ForeignKey(PumsBirthplace.id), primary_key=True) 180 | -------------------------------------------------------------------------------- /datausa/pums/models.py: 
-------------------------------------------------------------------------------- 1 | from datausa.pums.abstract_models import * 2 | from datausa.attrs.consts import ALL 3 | 4 | class Ya(BasePums, EmployeesWithAge, Year): 5 | __tablename__ = "ya" 6 | median_moe = 0.5 7 | 8 | age = db.Column(db.String(), primary_key=True) 9 | 10 | @classmethod 11 | def get_supported_levels(cls): 12 | return {"age": [ALL]} 13 | 14 | class Yc(BasePums, Employees, Year, CipId): 15 | __tablename__ = "yc" 16 | median_moe = 1 17 | 18 | avg_wage_rank = db.Column(db.Integer) 19 | 20 | @classmethod 21 | def get_supported_levels(cls): 22 | return {"cip": ["2", ALL]} 23 | 24 | class Ycs(BasePums, Employees, Year, CipId, SexId): 25 | __tablename__ = "ycs" 26 | median_moe = 2 27 | 28 | @classmethod 29 | def get_supported_levels(cls): 30 | return {"cip": ["2", ALL], "sex": [ALL]} 31 | 32 | class Yca(BasePums, EmployeesWithAge, Year, CipId): 33 | __tablename__ = "yca" 34 | median_moe = 2 35 | 36 | age = db.Column(db.String(), primary_key=True) 37 | 38 | @classmethod 39 | def get_supported_levels(cls): 40 | return {"cip": ["2", ALL], "age": [ALL]} 41 | 42 | class Ycb(BasePums, Employees, Year, CipId, BirthplaceId, EmployeesRca): 43 | __tablename__ = "ycb" 44 | median_moe = 2 45 | 46 | @classmethod 47 | def get_supported_levels(cls): 48 | return {"cip": ["2", ALL], "birthplace": [ALL]} # TODO support in/out of US? 49 | 50 | class Ycd(BasePums, Employees, Year, CipId, DegreeId): 51 | __tablename__ = "ycd" 52 | median_moe = 2 53 | 54 | @classmethod 55 | def get_supported_levels(cls): 56 | return {"cip": ["2", ALL], "degree": [ALL]} 57 | 58 | class Yg(BasePums, Employees, Year, GeoId, EmployeesGini): 59 | __tablename__ = "yg" 60 | median_moe = 1 61 | 62 | 63 | class Ygd(BasePums, Employees, Year, GeoId, DegreeId): 64 | __tablename__ = "ygd" 65 | median_moe = 2 66 | 67 | @classmethod 68 | def get_supported_levels(cls): 69 | return {"geo": GeoId.LEVELS, "degree": [ALL]} 70 | 71 | 72 | class Ygi(BasePums, Employees, Year, GeoId, NaicsId, EmployeesRca): 73 | __tablename__ = "ygi" 74 | median_moe = 2 75 | 76 | @classmethod 77 | def get_supported_levels(cls): 78 | return {"geo": GeoId.LEVELS, "naics": NaicsId.LEVELS} 79 | 80 | class Ygio(BasePums, Employees, Year, GeoId, NaicsId, SocId): 81 | __tablename__ = "ygio" 82 | median_moe = 5 83 | @classmethod 84 | def get_supported_levels(cls): 85 | return {"geo": GeoId.LEVELS, 86 | "soc": SocId.LEVELS, 87 | "naics": NaicsId.LEVELS} 88 | 89 | # class Ygmd(BasePums, Personal, Year, GeoId, MajorId, DegreeId): 90 | # __tablename__ = "ygmd" 91 | # median_moe = 3 92 | 93 | class Ygc(BasePums, Employees, Year, GeoId, CipId, EmployeesRca): 94 | __tablename__ = "ygc" 95 | median_moe = 2 96 | 97 | @classmethod 98 | def get_supported_levels(cls): 99 | return {"geo": GeoId.LEVELS, "cip": ["2", ALL]} 100 | 101 | class Yo(BasePums, Employees, Year, SocId, EmployeesGini): 102 | __tablename__ = "yo" 103 | median_moe = 1 104 | 105 | avg_wage_rank = db.Column(db.Integer) 106 | num_ppl_rank = db.Column(db.Integer) 107 | 108 | @classmethod 109 | def get_supported_levels(cls): 110 | return {"soc": SocId.LEVELS} 111 | 112 | class Yow(BasePums, Employees, Year, SocId, WageId): 113 | __tablename__ = "yow" 114 | median_moe = 1.5 115 | 116 | @classmethod 117 | def get_supported_levels(cls): 118 | return {"soc": SocId.LEVELS, "wage_bin": [ALL]} 119 | 120 | 121 | class Yiw(BasePums, Employees, Year, NaicsId, WageId): 122 | __tablename__ = "yiw" 123 | median_moe = 1.5 124 | 125 | @classmethod 126 | def 
get_supported_levels(cls): 127 | return {"naics": NaicsId.LEVELS, "wage_bin": [ALL]} 128 | 129 | 130 | class Ygo(BasePums, Employees, Year, GeoId, SocId, EmployeesRca): 131 | __tablename__ = "ygo" 132 | median_moe = 2 133 | 134 | @classmethod 135 | def get_supported_levels(cls): 136 | return {"geo": GeoId.LEVELS, "soc": SocId.LEVELS} 137 | 138 | class Ygw(BasePums, Employees, Year, GeoId, WageId): 139 | __tablename__ = "ygw" 140 | median_moe = 2 141 | 142 | @classmethod 143 | def get_supported_levels(cls): 144 | return {"geo": GeoId.LEVELS, "wage_bin": [ALL]} 145 | 146 | 147 | class Yor(BasePums, Employees, Year, SocId, RaceId): 148 | __tablename__ = "yor" 149 | median_moe = 2 150 | 151 | @classmethod 152 | def get_supported_levels(cls): 153 | return {"soc": SocId.LEVELS, 154 | "race": [ALL]} 155 | 156 | 157 | class Ygor(BasePums, Employees, Year, GeoId, SocId, RaceId): 158 | __tablename__ = "ygor" 159 | median_moe = 3 160 | 161 | @classmethod 162 | def get_supported_levels(cls): 163 | return {"geo": GeoId.LEVELS, "soc": SocId.LEVELS, 164 | "race": [ALL]} 165 | 166 | class Ygs(BasePums, Employees, Year, GeoId, SexId): 167 | __tablename__ = "ygs" 168 | median_moe = 2 169 | 170 | @classmethod 171 | def get_supported_levels(cls): 172 | return {"geo": GeoId.LEVELS, "sex": [ALL]} 173 | 174 | class Ygr(BasePums, Employees, Year, GeoId, RaceId): 175 | __tablename__ = "ygr" 176 | median_moe = 2 177 | 178 | @classmethod 179 | def get_supported_levels(cls): 180 | return {"geo": GeoId.LEVELS, "race": [ALL]} 181 | 182 | class Ygos(BasePums, Employees, Year, GeoId, SocId, SexId): 183 | __tablename__ = "ygos" 184 | median_moe = 3 185 | 186 | @classmethod 187 | def get_supported_levels(cls): 188 | return {"geo": GeoId.LEVELS, "soc": SocId.LEVELS, 189 | "sex": [ALL]} 190 | 191 | class Yoc(BasePums, Employees, Year, SocId, CipId, EmployeesRca): 192 | __tablename__ = "yoc" 193 | median_moe = 2 194 | 195 | @classmethod 196 | def get_supported_levels(cls): 197 | return {"cip": ["2", ALL], "soc": SocId.LEVELS} 198 | 199 | class Yic(BasePums, Employees, Year, NaicsId, CipId): 200 | __tablename__ = "yic" 201 | median_moe = 2 202 | @classmethod 203 | def get_supported_levels(cls): 204 | return {"cip": ["2", ALL], "naics": NaicsId.LEVELS} 205 | 206 | class Yio(BasePums, Employees, Year, NaicsId, SocId, EmployeesRca): 207 | __tablename__ = "yio" 208 | median_moe = 2 209 | 210 | @classmethod 211 | def get_supported_levels(cls): 212 | return {"soc": SocId.LEVELS, "naics": NaicsId.LEVELS} 213 | 214 | 215 | class Yir(BasePums, Employees, Year, NaicsId, RaceId, EmployeesRca): 216 | __tablename__ = "yir" 217 | median_moe = 2 218 | 219 | @classmethod 220 | def get_supported_levels(cls): 221 | return {"naics": NaicsId.LEVELS, "race": [ALL]} 222 | 223 | 224 | class Yior(BasePums, Employees, Year, NaicsId, SocId, RaceId): 225 | __tablename__ = "yior" 226 | median_moe = 3 227 | 228 | @classmethod 229 | def get_supported_levels(cls): 230 | return {"soc": SocId.LEVELS, "naics": NaicsId.LEVELS, "race": [ALL]} 231 | 232 | 233 | class Yos(BasePums, Employees, Year, SocId, SexId): 234 | __tablename__ = "yos" 235 | median_moe = 2 236 | 237 | @classmethod 238 | def get_supported_levels(cls): 239 | return {"soc": SocId.LEVELS, "sex": [ALL]} 240 | 241 | 242 | class Yoas(BasePums, EmployeesWithAge, Year, SocId, SexId): 243 | __tablename__ = "yoas" 244 | median_moe = 3 245 | age = db.Column(db.String(), primary_key=True) 246 | 247 | @classmethod 248 | def get_supported_levels(cls): 249 | return {"soc": SocId.LEVELS, "sex": [ALL], "age": 
[ALL]} 250 | 251 | 252 | class Yod(BasePums, Employees, Year, SocId, DegreeId): 253 | __tablename__ = "yod" 254 | median_moe = 2 255 | 256 | @classmethod 257 | def get_supported_levels(cls): 258 | return {"soc": SocId.LEVELS, "degree": [ALL]} 259 | 260 | 261 | class Yid(BasePums, Employees, Year, NaicsId, DegreeId): 262 | __tablename__ = "yid" 263 | median_moe = 2 264 | 265 | @classmethod 266 | def get_supported_levels(cls): 267 | return {"naics": NaicsId.LEVELS, "degree": [ALL]} 268 | 269 | 270 | class Yi(BasePums, Employees, Year, NaicsId, EmployeesGini): 271 | __tablename__ = "yi" 272 | median_moe = 1 273 | 274 | avg_wage_rank = db.Column(db.Integer) 275 | num_ppl_rank = db.Column(db.Integer) 276 | 277 | @classmethod 278 | def get_supported_levels(cls): 279 | return {"naics": NaicsId.LEVELS} 280 | 281 | 282 | class Yis(BasePums, Employees, Year, NaicsId, SexId, EmployeesRca): 283 | __tablename__ = "yis" 284 | median_moe = 2 285 | 286 | @classmethod 287 | def get_supported_levels(cls): 288 | return {"naics": NaicsId.LEVELS, "sex": [ALL]} 289 | 290 | 291 | class Yios(BasePums, Employees, Year, NaicsId, SocId, SexId): 292 | __tablename__ = "yios" 293 | median_moe = 3 294 | 295 | @classmethod 296 | def get_supported_levels(cls): 297 | return {"soc": SocId.LEVELS, "naics": NaicsId.LEVELS, "sex": [ALL]} 298 | 299 | class Yocd(BasePums, Employees, Year, SocId, CipId, DegreeId, EmployeesRca): 300 | __tablename__ = "yocd" 301 | median_moe = 3 302 | 303 | @classmethod 304 | def get_supported_levels(cls): 305 | return {"cip": ["2", ALL], "soc": SocId.LEVELS, "degree": [ALL]} 306 | 307 | 308 | class Ygb(BasePums, PersonalOver5, Year, GeoId, BirthplaceId): 309 | __tablename__ = "ygb_v2" 310 | median_moe = 2.1 311 | num_over5 = db.Column(db.Float) 312 | num_over5_moe = db.Column(db.Float) 313 | num_over5_rca = db.Column(db.Float) 314 | 315 | @classmethod 316 | def get_supported_levels(cls): 317 | return {"geo": GeoId.LEVELS, "birthplace": [ALL]} 318 | -------------------------------------------------------------------------------- /datausa/pums/models_5.py: -------------------------------------------------------------------------------- 1 | from datausa.pums.abstract_models import * 2 | from datausa.attrs.consts import ALL 3 | 4 | 5 | class Ygi5(BasePums5, Employees, Year, GeoId, NaicsId, EmployeesRca): 6 | __tablename__ = "ygi" 7 | median_moe = 1.9 8 | 9 | @classmethod 10 | def get_supported_levels(cls): 11 | return {"geo": GeoId.LEVELS, "naics": NaicsId.LEVELS} 12 | 13 | 14 | class Ygo5(BasePums5, Employees, Year, GeoId, SocId, EmployeesRca): 15 | __tablename__ = "ygo" 16 | median_moe = 1.9 17 | 18 | @classmethod 19 | def get_supported_levels(cls): 20 | return {"geo": GeoId.LEVELS, "soc": SocId.LEVELS} 21 | 22 | 23 | class Yoas5(BasePums5, EmployeesWithAge, Year, SocId, SexId): 24 | __tablename__ = "yoas" 25 | median_moe = 2.9 26 | age = db.Column(db.String(), primary_key=True) 27 | 28 | @classmethod 29 | def get_supported_levels(cls): 30 | return {"soc": SocId.LEVELS, "sex": [ALL], "age": [ALL]} 31 | 32 | 33 | class Ygor5(BasePums5, Employees, Year, GeoId, SocId, RaceId): 34 | __tablename__ = "ygor" 35 | median_moe = 2.9 36 | 37 | @classmethod 38 | def get_supported_levels(cls): 39 | return {"geo": GeoId.LEVELS, "soc": SocId.LEVELS, 40 | "race": [ALL]} 41 | 42 | class Ygos5(BasePums5, Employees, Year, GeoId, SocId, SexId): 43 | __tablename__ = "ygos" 44 | median_moe = 2.9 45 | 46 | @classmethod 47 | def get_supported_levels(cls): 48 | return {"geo": GeoId.LEVELS, "soc": SocId.LEVELS, 49 | "sex": 
[ALL]} 50 | 51 | class Ygb5(BasePums5, PersonalOver5, Year, GeoId, BirthplaceId): 52 | __tablename__ = "ygb_v2" 53 | median_moe = 2 54 | num_over5 = db.Column(db.Float) 55 | num_over5_moe = db.Column(db.Float) 56 | num_over5_rca = db.Column(db.Float) 57 | 58 | @classmethod 59 | def get_supported_levels(cls): 60 | return {"geo": GeoId.LEVELS, "birthplace": [ALL]} 61 | -------------------------------------------------------------------------------- /datausa/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/datausa/util/__init__.py -------------------------------------------------------------------------------- /datausa/util/big_places.py: -------------------------------------------------------------------------------- 1 | from datausa import cache 2 | from sqlalchemy import or_, and_ 3 | from sqlalchemy import distinct 4 | from datausa.attrs.consts import POP_THRESHOLD 5 | from datausa.acs.automap_models import Acs1_Yg 6 | 7 | @cache.memoize() 8 | def get_big_geos(): 9 | conds = [ 10 | Acs1_Yg.geo.startswith("010"), 11 | Acs1_Yg.geo.startswith("040"), 12 | Acs1_Yg.geo.startswith("050"), 13 | Acs1_Yg.geo.startswith("160"), 14 | Acs1_Yg.geo.startswith("310"), 15 | ] 16 | cond = and_(or_(*conds), Acs1_Yg.pop > POP_THRESHOLD) 17 | geos = Acs1_Yg.query.with_entities(distinct(Acs1_Yg.geo)).filter(cond).all() 18 | return set([g for g, in geos]) # faster lookup with set 19 | 20 | def is_big_geo(geo_id): 21 | # for sufficiently large places, we can also rely on 1-year estimate 22 | return geo_id in big_geos 23 | 24 | big_geos = get_big_geos() 25 | -------------------------------------------------------------------------------- /datausa/util/inmem.py: -------------------------------------------------------------------------------- 1 | from datausa import cache 2 | from datausa.ipeds.models import GradsYgc 3 | from datausa.onet.models import SkillBySoc, SkillByCip 4 | import re 5 | 6 | def splitter(x): 7 | return re.split(",(?! 
)", x) 8 | 9 | @cache.memoize() 10 | def ipeds_place_map(): 11 | qry = GradsYgc.query.with_entities(GradsYgc.geo.distinct()).all() 12 | return {item: True for item, in qry} 13 | 14 | 15 | @cache.memoize() 16 | def onet_socs(): 17 | qry = SkillBySoc.query.with_entities(SkillBySoc.soc.distinct()).all() 18 | return {item: True for item, in qry} 19 | 20 | 21 | @cache.memoize() 22 | def onet_cips(): 23 | qry = SkillByCip.query.with_entities(SkillByCip.cip.distinct()).all() 24 | return {item: True for item, in qry} 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10.1 2 | Flask-SQLAlchemy==2.1 3 | SQLAlchemy==1.1.14 4 | psycopg2==2.7.3.2 5 | Flask-Compress==1.3.0 6 | simplejson==3.8.2 7 | Flask-Cors==2.1.2 8 | Flask-Cache==0.13.1 9 | Pillow==3.1.1 10 | Whoosh==2.7.2 11 | Unidecode==0.4.19 12 | gunicorn==19.4.5 13 | click==6.3 14 | pytest==3.0.3 15 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from datausa import app 2 | 3 | if __name__ == '__main__': 4 | app.debug = True 5 | app.run('0.0.0.0') 6 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/alt_fill_cache.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | from requests.auth import HTTPBasicAuth 4 | import click 5 | import signal 6 | import sys 7 | 8 | errors = [] 9 | 10 | 11 | def url_to_json(url): 12 | print url 13 | result = requests.get(url).json() 14 | if 'data' in result: 15 | return result['data'], result['headers'] 16 | raise Exception("No data!") 17 | 18 | 19 | def crawl_page(site_url, moi): 20 | display_id, attr_kind = moi 21 | if not display_id: 22 | print "skipping", display_id, attr_kind 23 | page = u'{}/profile/{}/{}/'.format(site_url, attr_kind, display_id) 24 | print page, "getting..." 
25 | r = requests.get(page, auth=HTTPBasicAuth('sunbird', os.environ.get('DATAUSA_WEB_PW', ''))) 26 | if r.status_code != 200: 27 | if r.status_code == 401: 28 | raise Exception("You may have forgotten to set DATAUSA_WEB_PW \ 29 | environment variable (or provided a bad PW).\nWe need this because \ 30 | the site is password protected") 31 | print "PAGE ERROR", page, r.status_code 32 | errors.append(page) 33 | 34 | 35 | def crawl_attr(api_url, site_url, attr_kind, offset, sumlevel): 36 | sumlevel = "" if not sumlevel else "sumlevel={}".format(sumlevel) 37 | url_str = '{}/attrs/search?q=&kind={}&limit=110000&offset={}&{}' 38 | data, headers = url_to_json(url_str.format(api_url, attr_kind, offset, sumlevel)) 39 | data = sorted(data, key=lambda obj: obj[headers.index('zvalue')], reverse=True) 40 | if attr_kind != 'geo': 41 | mydata = [[country[headers.index('id')], attr_kind] for country in data] 42 | else: 43 | mydata = [[country[headers.index('url_name')], attr_kind] for country in data] 44 | 45 | for x in mydata: 46 | crawl_page(site_url, x) 47 | 48 | 49 | def fix_url(my_url): 50 | if not my_url.startswith('http://'): 51 | my_url = 'http://' + my_url 52 | if my_url.endswith('/'): 53 | my_url = my_url[:-1] 54 | return my_url 55 | 56 | 57 | def signal_handler(signal, frame): 58 | print "Pages with Errors" 59 | print errors 60 | print "Number of errors:", len(errors) 61 | sys.exit(0) 62 | 63 | 64 | @click.command() 65 | @click.option('--api_url', default="http://db.datausa.io", help='API Url') 66 | @click.option('--site_url', default="http://beta.datausa.io", help='Site Url') 67 | @click.option('--attr', default="geo", help="attr kind") 68 | @click.option('--offset', default=0, help="offset in list") 69 | @click.option('--sumlevel', default=None, help="attr sumlevel") 70 | def main(api_url, site_url, attr, offset, sumlevel): 71 | api_url = fix_url(api_url) 72 | site_url = fix_url(site_url) 73 | attrs = attr.split(",") 74 | print "Waiting for crawl to complete..." 75 | for attr in attrs: 76 | crawl_attr(api_url, site_url, attr, offset, sumlevel) 77 | print "Crawl complete!"
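    # Report every page collected in the module-level `errors` list, so a
    # failed crawl can be re-run for just the profiles that returned a
    # non-200 status.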
78 | print "Pages with Errors" 79 | print errors 80 | print "Number of errors:", len(errors) 81 | 82 | 83 | if __name__ == "__main__": 84 | signal.signal(signal.SIGINT, signal_handler) 85 | main() 86 | -------------------------------------------------------------------------------- /scripts/build_search.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script used to generate the query that makes up the search table 3 | ''' 4 | from datausa.pums.abstract_models import BasePums 5 | 6 | pums_schema_name = BasePums.get_schema_name() 7 | 8 | # Industry and Occupation Z-scoring 9 | attrs = [("soc", "{}.yo".format(pums_schema_name), "avg_wage", [0, 1, 2, 3]), 10 | ("naics", "{}.yi".format(pums_schema_name), "num_ppl", [0, 1, 2])] 11 | 12 | qry = '''SELECT g.{0} as id, (g.{2} - stats.average) / stats.st AS zvalue, '{0}' as kind , lower(a.name) as name, a.name as display, a.level::text as sumlevel, -1 as is_stem, a.url_name as url_name, a.keywords as keywords 13 | FROM {1} g 14 | LEFT JOIN pums_attrs.pums_{0} a ON (a.id = g.{0} and a.level = g.{0}_level) 15 | CROSS JOIN 16 | (select STDDEV({2}) as st, AVG({2}) as average FROM {1} WHERE {0}_level={3} AND year=2015) stats 17 | WHERE g.{0}_level = {3} 18 | AND g.year = 2015''' 19 | 20 | queries = [] 21 | for attr, table, metric, levels in attrs: 22 | for level in levels: 23 | queries.append(qry.format(attr, table, metric, level)) 24 | #print queries[0] 25 | 26 | 27 | 28 | # CIP codes 29 | cip_qry = '''SELECT g.{0}, (g.{2} - stats.average) / stats.st AS zvalue, '{0}' as kind , lower(a.name) as name, a.name as display, a.level::text as sumlevel, a.is_stem as is_stem, a.url_name as url_name, a.keywords as keywords 30 | FROM {1} g 31 | LEFT JOIN attrs.course a ON (a.id = g.{0}) 32 | CROSS JOIN 33 | (select STDDEV({2}) as st, AVG({2}) as average FROM {1} WHERE char_length({0}) = {3} AND year=2015) stats 34 | WHERE char_length({0}) = {3} 35 | AND g.year = 2015''' 36 | 37 | for level in [2, 4, 6]: 38 | queries.append(cip_qry.format("cip", "ipeds.grads_yc", "grads_total", level)) 39 | 40 | # GEO codes 41 | geo_qry = '''SELECT g.{0}, (g.{2} - stats.average) / stats.st AS zvalue, '{0}' as kind , lower(a.name) as name, a.display_name as display, a.sumlevel::text as sumlevel, -1 as is_stem, a.url_name as url_name, a.keywords as keywords 42 | FROM {1} g 43 | LEFT JOIN attrs.geo_names a ON (a.id = g.{0}) 44 | CROSS JOIN 45 | (select STDDEV({2}) as st, AVG({2}) as average FROM {1} WHERE {0} LIKE '{3}%' AND year=2015) stats 46 | WHERE g.{0} LIKE '{3}%' 47 | AND g.year = 2015''' 48 | 49 | for level in ['040', '050', '160', '310', '795']: 50 | queries.append(geo_qry.format("geo", "acs_5yr.yg", "pop", level)) 51 | 52 | queries.append("SELECT '01000US', 150, 'geo', 'united states', 'United States', '010', -1, 'united-states', '{usa, us, america}'") 53 | 54 | 55 | # UNIVERSITIES 56 | university_qry = '''SELECT g.{0}, (g.{2} - stats.average) / stats.st AS zvalue, '{0}' as kind , lower(a.name) as name, a.display_name as display, a.university_level::text as sumlevel, a.is_stem as is_stem, a.url_name as url_name, a.keywords as keywords 57 | FROM {1} g 58 | LEFT JOIN attrs.university a ON (a.id = g.{0}) 59 | CROSS JOIN 60 | (select STDDEV({2}) as st, AVG({2}) as average FROM {1} WHERE year=2015) stats 61 | WHERE g.year = 2015 and a.status != 'D' ''' 62 | 63 | queries.append(university_qry.format("university", "ipeds.grads_yu", "grads_total")) 64 | 65 | tail_qrys = ["({})".format(q) if i != 0 else q for i, q in enumerate(queries)] 
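# Every SELECT after the first was parenthesized above so that the UNIONed
# statement assembled below parses unambiguously as a single query.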
66 | final_q = "\n UNION \n".join(tail_qrys) 67 | print(final_q) 68 | -------------------------------------------------------------------------------- /scripts/cache_helper.applescript: -------------------------------------------------------------------------------- 1 | tell application "Terminal" 2 | do script ". ~/Virtualenvs/datausa-api/bin/activate; python ~/Projects/datausa-api/scripts/fill_cache.py http://db.datausa.io geo" 3 | activate 4 | end tell 5 | 6 | tell application "Terminal" 7 | do script ". ~/Virtualenvs/datausa-api/bin/activate; python ~/Projects/datausa-api/scripts/fill_cache.py http://db.datausa.io cip" 8 | activate 9 | end tell 10 | 11 | tell application "Terminal" 12 | do script ". ~/Virtualenvs/datausa-api/bin/activate; python ~/Projects/datausa-api/scripts/fill_cache.py http://db.datausa.io soc" 13 | activate 14 | end tell 15 | 16 | tell application "Terminal" 17 | do script ". ~/Virtualenvs/datausa-api/bin/activate; python ~/Projects/datausa-api/scripts/fill_cache.py http://db.datausa.io naics" 18 | activate 19 | end tell 20 | -------------------------------------------------------------------------------- /scripts/fill_cache.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | import threading 5 | import os 6 | from multiprocessing import Pool 7 | from requests.auth import HTTPBasicAuth 8 | 9 | 10 | def url_to_json(url): 11 | print url 12 | result = requests.get(url).json() 13 | if 'data' in result: 14 | return result['data'], result['headers'] 15 | raise Exception("No data!") 16 | 17 | def crawl_page(moi): 18 | display_id,attr_kind = moi 19 | if not display_id: 20 | print "skipping", display_id, attr_kind; return 21 | page = u'http://beta.datausa.io/profile/{}/{}/'.format( attr_kind, display_id) 22 | print page, "getting..." 23 | r = requests.get(page, auth=HTTPBasicAuth('datausa', os.environ.get('DATAUSA_WEB_PW', ''))) 24 | if r.status_code != 200: 25 | if r.status_code == 401: 26 | raise Exception("You may have forgotten to set DATAUSA_WEB_PW env var (or provided a bad PW).\nWe need this because the site is password protected") 27 | print "PAGE ERROR", page, r.status_code 28 | 29 | def crawl_attr(base_url, attr_kind='country'): 30 | data, headers = url_to_json('{}/attrs/search?q=&kind={}&limit=100000'.format(base_url, attr_kind)) 31 | data = sorted(data, key=lambda obj: obj[headers.index('zvalue')], reverse=True) 32 | if attr_kind != 'geo': 33 | mydata = [[country[headers.index('id')], attr_kind] for country in data] 34 | else: 35 | mydata = [[country[headers.index('url_name')], attr_kind] for country in data] 36 | #pool = Pool(5) 37 | #pool.map(crawl_page, mydata) 38 | for x in mydata: 39 | crawl_page(x) 40 | 41 | 42 | 43 | def main(base_url="http://db.datausa.io", attr="geo"): 44 | if not base_url.startswith('http://'): 45 | base_url = 'http://' + base_url 46 | if base_url.endswith('/'): 47 | base_url = base_url[:-1] 48 | attrs = attr.split(",") 49 | print "Waiting for crawl to complete..." 50 | for attr in attrs: 51 | crawl_attr(base_url, attr) 52 | print "Crawl complete!"
53 | 54 | if __name__ == "__main__": 55 | import sys 56 | if len(sys.argv) < 2: 57 | main() 58 | else: 59 | attr = sys.argv[2] if len(sys.argv) >= 3 else "geo,naics,soc,cip" 60 | main(sys.argv[1], attr) 61 | 62 | # EXAMPLE: python fill_cache.py db.datausa.io naics 63 | # python fill_cache.py db.datausa.io naics,soc 64 | # python fill_cache.py db.datausa.io geo 65 | -------------------------------------------------------------------------------- /scripts/fix_urlnames.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | from whoosh import index 4 | from whoosh.fields import Schema, ID, TEXT, NUMERIC, KEYWORD, NGRAM, NGRAMWORDS 5 | from whoosh.fields import BOOLEAN 6 | from config import SEARCH_INDEX_DIR 7 | from datausa.database import db 8 | 9 | from datausa.attrs.models import Geo 10 | from datausa.acs.automap_models import Acs5_Yg 11 | geos = Geo.query.filter(Geo.id.like('160%')).all() 12 | 13 | url_map = {} 14 | 15 | for g in geos: 16 | if g.url_name and g.url_name not in url_map: 17 | url_map[g.url_name] = [] 18 | if g.url_name: 19 | url_map[g.url_name].append(g) 20 | 21 | # now we have a list of all g's 22 | url_map = {k:v for k,v in url_map.items() if v and len(v) > 1} 23 | 24 | # get first ... 25 | for url_name, glist in url_map.items(): 26 | if url_name.endswith("-pr") or url_name == 'chevy-chase-md': 27 | print "skipping pr for now..." 28 | continue 29 | print "working on", url_name 30 | if len(glist) == 2: 31 | data = [] 32 | has_ran = False 33 | for g in glist: 34 | moi = Acs5_Yg.query.filter(Acs5_Yg.year == 2014, Acs5_Yg.geo == g.id).first() 35 | parents, headers = Geo.parents(g.id) 36 | county = None 37 | for p in parents: 38 | print p, "TEST" 39 | if p[0][:3] == '050': 40 | county = p[2].split("-county-")[0].lower() 41 | if not moi: 42 | continue 43 | pop = moi.pop 44 | data.append([g.url_name, g.id, pop, county, g]) 45 | has_ran = True 46 | if not has_ran: 47 | print "skipping", url_name 48 | continue 49 | # select the place with less pop 50 | from operator import attrgetter 51 | min_pl = min(data, key=lambda x: x[2]) 52 | print data 53 | print min_pl 54 | print "RENAMING!!!!" 55 | geo_obj = min_pl[-1] 56 | print geo_obj.name, "|", geo_obj.display_name , "|", geo_obj.url_name 57 | newc = u", {} County".format(min_pl[-2].title()) 58 | new_name = geo_obj.name.strip() + newc 59 | new_disp = geo_obj.display_name.replace(geo_obj.name, new_name) 60 | print "min_pl-2=",min_pl[-2] 61 | new_url = geo_obj.url_name[:-3] + u'-{}-county'.format(min_pl[-2]) + geo_obj.url_name[-3:] 62 | print "=========" 63 | print "GEOid", geo_obj.id 64 | print "original", geo_obj.name 65 | print "original", geo_obj.display_name 66 | print "original", geo_obj.url_name 67 | print "The new name", new_name 68 | print "The new disp", new_disp 69 | print "The new url", new_url 70 | geo_obj.name = new_name 71 | geo_obj.display_name = new_disp 72 | geo_obj.url_name = new_url 73 | user_ok = raw_input("DO I HAVE THE OK? 
") 74 | if user_ok == "AOK": 75 | db.session.add(geo_obj) 76 | db.session.commit() 77 | else: 78 | print "url_name has more than 2!!!!!!", url_name 79 | -------------------------------------------------------------------------------- /scripts/flickr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/scripts/flickr/__init__.py -------------------------------------------------------------------------------- /scripts/flickr/analyze.py: -------------------------------------------------------------------------------- 1 | import csv, flickr, short, sys 2 | 3 | MAX_SIDE = 1400 4 | LICENSES = ["4", "5", "7", "8", "9", "10"] 5 | 6 | def read_csv(): 7 | 8 | if len(sys.argv) < 3: 9 | print "------------------------------------------" 10 | print "ERROR: Script requires 2 variables, an attribute type and a filename." 11 | print "Example: python grab.py cip file.csv" 12 | print "------------------------------------------" 13 | return 14 | 15 | attr_type = sys.argv[1] 16 | csvReader = csv.DictReader(open(sys.argv[2])) 17 | input_file = list(csvReader) 18 | images = [] 19 | 20 | print "Analyzing {} images".format(attr_type.upper()) 21 | for index, row in enumerate(input_file): 22 | sys.stdout.write("\r{} of {}".format(index, len(input_file))) 23 | sys.stdout.flush() 24 | uid = row["id"] 25 | 26 | if "image_link" in row and row["image_link"] != "": 27 | 28 | image = row["image_link"] 29 | if "photolist" in image: 30 | image = image.split("/in/photolist")[0] 31 | 32 | pid = image.split("/")[-1] 33 | if "flic.kr" not in image: 34 | image = "http://flic.kr/p/{}".format(short.encode(pid)) 35 | 36 | 37 | image = {"id": uid, "url": image, "small": False, "removed": False} 38 | row["error"] = "" 39 | 40 | photo = flickr.Photo(pid) 41 | try: 42 | photo._load_properties() 43 | except: 44 | image["removed"] = True 45 | row["error"] = "removed" 46 | 47 | if photo._Photo__license: 48 | image["license"] = photo._Photo__license 49 | if image["license"] in LICENSES: 50 | if len([p for p in photo.getSizes() if p["width"] >= MAX_SIDE]) == 0: 51 | image["small"] = True 52 | row["error"] = "resolution" 53 | else: 54 | row["error"] = "license-{}".format(image["license"]) 55 | 56 | images.append(image) 57 | 58 | print "\n" 59 | print "Outputing to CSV..." 
60 | with open(sys.argv[2].replace(".csv", "-update.csv"), 'wb') as f: 61 | w = csv.DictWriter(f, None) 62 | 63 | w.fieldnames = csvReader.fieldnames 64 | w.writerow(dict((h, h) for h in csvReader.fieldnames)) 65 | 66 | for row in input_file: 67 | w.writerow(row) 68 | 69 | print "\n" 70 | num_images = float(len(images)) 71 | print "{} images have been analyzed".format(int(num_images)) 72 | bads = sum(1 for image in images if "license" in image and image["license"] not in LICENSES) 73 | print "{} ({}%) have bad licenses".format(bads, round((bads / num_images) * 100)) 74 | smalls = sum(1 for image in images if image["small"]) 75 | print "{} ({}%) are too small".format(smalls, round((smalls / num_images) * 100)) 76 | missing = sum(1 for image in images if image["removed"]) 77 | print "{} ({}%) have been removed from Flickr".format(missing, round((missing / num_images) * 100)) 78 | 79 | if __name__ == '__main__': 80 | read_csv() 81 | -------------------------------------------------------------------------------- /scripts/flickr/grab.py: -------------------------------------------------------------------------------- 1 | import csv, flickr, os, short, sys, urllib 2 | from config import FLICKR_DIR 3 | from datausa.database import db 4 | from datausa.attrs.views import attr_map 5 | from PIL import Image as pillow 6 | 7 | MAX_SIDE = 1400 8 | LICENSES = ["4", "5", "7", "8", "9", "10"] 9 | 10 | def read_csv(): 11 | 12 | thumb_side = 425 13 | quality = 90 14 | 15 | if len(sys.argv) < 3: 16 | print "------------------------------------------" 17 | print "ERROR: Script requires 2 variables, an attribute type and a filename." 18 | print "Example: python grab.py cip file.csv" 19 | print "------------------------------------------" 20 | return 21 | 22 | attr_type = sys.argv[1] 23 | if attr_type not in attr_map: 24 | print "------------------------------------------" 25 | print "ERROR: Invalid attribute type."
26 | print "Allowed keys: {}".format(", ".join(attr_map.keys())) 27 | print "------------------------------------------" 28 | return 29 | else: 30 | table = attr_map[attr_type] 31 | 32 | csvFilename = sys.argv[2] 33 | csvReader = csv.DictReader(open(csvFilename)) 34 | input_file = list(csvReader) 35 | imgdir = os.path.join(FLICKR_DIR, attr_type) 36 | thumbdir = imgdir.replace("splash", "thumb") 37 | badImages = [] 38 | smallImages = [] 39 | goodImages = [] 40 | removedImages = [] 41 | deletedImages = [] 42 | 43 | # skip = True 44 | 45 | if not os.path.exists(imgdir): 46 | os.makedirs(imgdir) 47 | 48 | if not os.path.exists(thumbdir): 49 | os.makedirs(thumbdir) 50 | 51 | for row in input_file: 52 | 53 | update = False 54 | 55 | uid = row["id"] 56 | imgpath = os.path.join(imgdir, "{}.jpg".format(uid)) 57 | thumbpath = os.path.join(thumbdir, "{}.jpg".format(uid)) 58 | 59 | image_only = attr_type == "geo" 60 | 61 | if "level" in row: 62 | attr = table.query.filter_by(id=uid,level=row["level"]).first() 63 | else: 64 | attr = table.query.get(uid) 65 | 66 | if attr and "image_link" in row: 67 | image = row["image_link"] 68 | 69 | if not image and attr.image_link: 70 | 71 | attr.image_meta = None 72 | attr.image_link = None 73 | attr.image_author = None 74 | update = True 75 | deletedImages.append(uid) 76 | row["error"] = "" 77 | os.remove(imgpath) 78 | os.remove(thumbpath) 79 | 80 | elif image and attr.image_link != image: 81 | 82 | if "photolist" in image: 83 | image = image.split("/in/photolist")[0] 84 | 85 | pid = image.split("/")[-1] 86 | if "flic.kr" not in image: 87 | image = "http://flic.kr/p/{}".format(short.encode(pid)) 88 | 89 | photo = flickr.Photo(pid) 90 | try: 91 | photo._load_properties() 92 | except: 93 | row["error"] = "removed" 94 | removedImages.append(uid) 95 | continue 96 | 97 | image = {"id": uid, "url": image, "license": photo._Photo__license} 98 | 99 | if image["license"] not in LICENSES: 100 | badImages.append(image) 101 | row["error"] = "license-{}".format(image["license"]) 102 | else: 103 | sizes = [p for p in photo.getSizes() if p["width"] >= MAX_SIDE] 104 | if len(sizes) == 0: 105 | smallImages.append(image) 106 | row["error"] = "resolution" 107 | else: 108 | download_url = min(sizes, key=lambda item: item["width"])["source"] 109 | 110 | urllib.urlretrieve(download_url, imgpath) 111 | 112 | img = pillow.open(imgpath).convert("RGB") 113 | 114 | img.thumbnail((MAX_SIDE, MAX_SIDE), pillow.ANTIALIAS) 115 | img.save(imgpath, "JPEG", quality=quality) 116 | 117 | img.thumbnail((thumb_side, thumb_side), pillow.ANTIALIAS) 118 | img.save(thumbpath, "JPEG", quality=quality) 119 | 120 | author = photo._Photo__owner 121 | author = author.realname if author.realname else author.username 122 | image["author"] = author.replace("'", "\\'") 123 | goodImages.append(image) 124 | 125 | attr.image_link = image["url"] 126 | attr.image_author = image["author"] 127 | update = True 128 | 129 | # set False to True to force thumbnails 130 | elif False and image: 131 | 132 | imgpath = os.path.join(imgdir, "{}.jpg".format(uid)) 133 | thumbpath = os.path.join(thumbdir, "{}.jpg".format(uid)) 134 | 135 | img = pillow.open(imgpath).convert("RGB") 136 | 137 | img.thumbnail((thumb_side, thumb_side), pillow.ANTIALIAS) 138 | img.save(thumbpath, "JPEG", quality=quality) 139 | 140 | if not image_only: 141 | name = row["name"] 142 | if attr and name and attr.name != name: 143 | attr.name = name 144 | update = True 145 | 146 | if "image_meta" in row: 147 | meta = row["image_meta"] 148 | if attr and meta and 
attr.image_meta != meta: 149 | attr.image_meta = meta 150 | update = True 151 | 152 | if update: 153 | db.session.add(attr) 154 | db.session.commit() 155 | 156 | # break 157 | 158 | 159 | 160 | print "\n" 161 | print "Outputting to CSV..." 162 | with open(csvFilename.replace(".csv", "-update.csv"), 'wb') as f: 163 | w = csv.DictWriter(f, None) 164 | 165 | w.fieldnames = csvReader.fieldnames 166 | w.writerow(dict((h, h) for h in csvReader.fieldnames)) 167 | 168 | for row in input_file: 169 | w.writerow(row) 170 | 171 | print "\n" 172 | print "{} new images have been processed.".format(len(goodImages)) 173 | if len(badImages) > 0: 174 | print "The following images have bad licenses: {}".format(", ".join([i["id"] for i in badImages])) 175 | if len(smallImages) > 0: 176 | print "The following images are too small: {}".format(", ".join([i["id"] for i in smallImages])) 177 | if len(removedImages) > 0: 178 | print "The following images have been removed from Flickr: {}".format(", ".join([i for i in removedImages])) 179 | if len(deletedImages) > 0: 180 | print "The following images have been deleted: {}".format(", ".join([i for i in deletedImages])) 181 | 182 | 183 | 184 | if __name__ == '__main__': 185 | read_csv() 186 | -------------------------------------------------------------------------------- /scripts/flickr/short.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is taken from the flickrapi project 3 | See https://github.com/rfaulkner/flickrapi/blob/master/flickrapi/shorturl.py 4 | and http://stuvel.eu/flickrapi 5 | ''' 6 | 7 | ALPHABET = u'123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ' 8 | ALPHALEN = len(ALPHABET) 9 | SHORT_URL = u'http://flic.kr/p/%s' 10 | 11 | def encode(photo_id): 12 | '''encode(photo_id) -> short id 13 | 14 | >>> encode(u'4325695128') 15 | '7Afjsu' 16 | >>> encode(u'2811466321') 17 | '5hruZg' 18 | ''' 19 | 20 | photo_id = int(photo_id) 21 | 22 | encoded = u'' 23 | while photo_id >= ALPHALEN: 24 | div, mod = divmod(photo_id, ALPHALEN) 25 | encoded = ALPHABET[mod] + encoded 26 | photo_id = int(div) 27 | 28 | encoded = ALPHABET[photo_id] + encoded 29 | 30 | return encoded 31 | -------------------------------------------------------------------------------- /scripts/gen_indicies.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script used to add indexes for PUMS tables 3 | ''' 4 | import itertools 5 | 6 | lookup = { 7 | "a": "age", 8 | "b": "birthplace", 9 | "c": "cip", 10 | "d": "degree", 11 | # "s": "sector",  (removed: this duplicate key was shadowed by the "s": "sex" entry below) 12 | "g": "geo", 13 | "i": "naics", 14 | "o": "soc", 15 | "r": "race", 16 | "s": "sex", 17 | "w": "wage_bin", 18 | "y": "year", 19 | } 20 | 21 | tables = [ 22 | 'ya', 23 | 'yc', 24 | 'yca', 25 | 'ycb', 26 | 'ycd', 27 | 'ycs', 28 | 'yg', 29 | 'ygb', 30 | 'ygc', 31 | 'ygd', 32 | 'ygi', 33 | 'ygio', 34 | 'ygo', 35 | 'ygor', 36 | 'ygos', 37 | 'ygr', 38 | 'ygs', 39 | 'ygw', 40 | 'yi', 41 | 'yic', 42 | 'yid', 43 | 'yio', 44 | 'yior', 45 | 'yios', 46 | 'yir', 47 | 'yis', 48 | 'yiw', 49 | 'yo', 50 | 'yoas', 51 | 'yoc', 52 | 'yocd', 53 | 'yod', 54 | 'yor', 55 | 'yos', 56 | 'yow', 57 | ] 58 | schema = 'pums_1yr' 59 | 60 | def has_prefix(indexes, index): 61 | for ix in indexes: 62 | if ix.startswith(index): 63 | return True 64 | return False 65 | 66 | def gen_index(table, idx_id, is_pk=False): 67 | cols = [lookup[l] for l in idx_id] 68 | if is_pk: 69 | if "i" in table: 70 | cols.append("naics_level") 71 | if "o" in table: 72 | cols.append("soc_level") 73 | cols 
= ",".join(cols) 74 | unq = "" if not is_pk else "UNIQUE" 75 | qry = "CREATE {4} INDEX {1}_{2}_idx ON {0}.{1} ({3});".format(schema, table, idx_id, cols, unq) 76 | return qry 77 | 78 | for table in tables: 79 | indexes = [] 80 | sizes = range(1, len(table) + 1) 81 | sizes.reverse() 82 | for size in sizes: 83 | tmp = list(itertools.combinations(table, size)) 84 | indexes += [''.join(x) for x in tmp if not has_prefix(indexes, ''.join(x))] 85 | 86 | # indexes to create 87 | for index in indexes: 88 | print gen_index(table, index, len(index) == len(table)) 89 | -------------------------------------------------------------------------------- /scripts/get_vnames.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from datausa.core import registrar 3 | from datausa.database import db 4 | 5 | data={} 6 | 7 | for tbl in registrar.registered_models: 8 | data[tbl.full_name()] = [c.key for c in tbl.__table__.columns] 9 | 10 | pp = pprint.PrettyPrinter(indent=4) 11 | pp.pprint(data) 12 | -------------------------------------------------------------------------------- /scripts/search/build_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | from whoosh import index 4 | from whoosh.fields import Schema, ID, TEXT, NUMERIC, KEYWORD, NGRAMWORDS 5 | from config import SEARCH_INDEX_DIR 6 | from unidecode import unidecode 7 | 8 | 9 | def manual_add(writer, name, display, orig_id, is_stem=False, url_name=None, zoverride=None, kind=u"geo"): 10 | from datausa.attrs.models import Search 11 | kind = unicode(kind) 12 | doc_obj = Search.query.filter_by(id=orig_id, kind=kind).first() 13 | zval = doc_obj.zvalue * 1.5 if not zoverride else zoverride 14 | is_stem = is_stem or doc_obj.is_stem 15 | if not url_name: 16 | url_name = doc_obj.url_name 17 | writer.add_document(id=doc_obj.id, name=name, 18 | display=display, zvalue=zval, 19 | kind=kind, sumlevel=doc_obj.sumlevel, 20 | is_stem=is_stem, url_name=url_name) 21 | 22 | 23 | def get_schema(): 24 | return Schema(id=ID(stored=True), 25 | name=NGRAMWORDS(stored=True, minsize=2, maxsize=12, at='start', queryor=True), 26 | display=TEXT(stored=True), 27 | zvalue=NUMERIC(stored=True), 28 | kind=KEYWORD(stored=True), 29 | sumlevel=KEYWORD(stored=True), 30 | is_stem=NUMERIC(stored=True), 31 | url_name=TEXT(stored=True)) 32 | 33 | 34 | if __name__ == '__main__': 35 | print "got here!" 36 | print SEARCH_INDEX_DIR 37 | if not os.path.exists(SEARCH_INDEX_DIR): 38 | print "got here2" 39 | os.mkdir(SEARCH_INDEX_DIR) 40 | ix = index.create_in(SEARCH_INDEX_DIR, get_schema()) 41 | print "Creating attr index..." 
42 | 43 | ix = index.open_dir(SEARCH_INDEX_DIR) 44 | writer = ix.writer() 45 | from datausa.attrs.models import Search 46 | all_objs = Search.query.all() 47 | for obj in all_objs: 48 | dname = obj.display 49 | stem = False if not hasattr(obj, "is_stem") else obj.is_stem 50 | if dname: 51 | dname = unicode(dname) 52 | dname = unidecode(dname) 53 | dname = unicode(dname) 54 | dname = dname.lower().replace(",", "") 55 | dname = dname.replace(".", "") 56 | writer.add_document(id=obj.id, name=dname, 57 | display=obj.display, zvalue=obj.zvalue, 58 | kind=obj.kind, sumlevel=obj.sumlevel, 59 | is_stem=stem, url_name=obj.url_name) 60 | 61 | if obj.keywords: 62 | for keyword in obj.keywords: 63 | writer.add_document(id=obj.id, name=keyword, 64 | display=obj.display, zvalue=obj.zvalue, 65 | kind=obj.kind, sumlevel=obj.sumlevel, 66 | is_stem=stem, url_name=obj.url_name) 67 | # Custom synonyms to help with search 68 | import pandas as pd 69 | attrs_with_aliases = ["geo"] 70 | for kind in attrs_with_aliases: 71 | target_path = os.path.join(SEARCH_INDEX_DIR, "..", "scripts", "search", "{}_aliases.csv".format(kind)) 72 | df = pd.read_csv(target_path) 73 | for geo, name, short, zval in df.values: 74 | for alias in short.split(","): 75 | alias = alias.strip() 76 | manual_add(writer, unicode(alias), unicode(name), unicode(geo), zoverride=zval, kind=kind) 77 | 78 | # -- 79 | manual_add(writer, u'garbagemen', u'Garbagemen', '537081', kind=u'soc') 80 | manual_add(writer, u'doctors', u'Doctors', '291060', kind=u'soc') 81 | manual_add(writer, u'manhattan', u'Manhattan, NY', '05000US36061', kind=u'geo') 82 | manual_add(writer, u'meteorologists', u'Meteorologists', '192021', kind=u'soc') 83 | manual_add(writer, u'film', u'Motion Pictures & Video Industries', '5121', kind=u'naics') 84 | manual_add(writer, u'movies', u'Motion Pictures & Video Industries', '5121', kind=u'naics') 85 | 86 | writer.commit() 87 | -------------------------------------------------------------------------------- /scripts/search/build_var_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import json 4 | 5 | from whoosh import index 6 | from whoosh.fields import Schema, TEXT, NGRAMWORDS 7 | from config import VAR_INDEX_DIR 8 | 9 | 10 | def get_schema(): 11 | return Schema(related_vars=TEXT(stored=True), 12 | name=NGRAMWORDS(stored=True, minsize=3, maxsize=12, at='start', queryor=True), 13 | description=TEXT(stored=True), 14 | section=TEXT(stored=True), 15 | section_title=TEXT(stored=True), 16 | related_attrs=TEXT(stored=True), 17 | params=TEXT(stored=True)) 18 | 19 | if __name__ == '__main__': 20 | print("Building index...") 21 | if not os.path.exists(VAR_INDEX_DIR): 22 | os.mkdir(VAR_INDEX_DIR) 23 | ix = index.create_in(VAR_INDEX_DIR, get_schema()) 24 | print("Creating variables index...") 25 | 26 | ix = index.open_dir(VAR_INDEX_DIR) 27 | writer = ix.writer() 28 | 29 | all_vars = [ 30 | [u'adult_obesity,diabetes', u'obesity', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None], 31 | [u'adult_obesity,diabetes', u'diabetes', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None], 32 | [u'adult_obesity,diabetes', u'healthcare', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None], 33 | [u'motor_vehicle_crash_deaths', u'car crashes', u'Motor Vehicle Crash Deaths', u'risky', u'Crime', u'geo', None], 34 | [u'motor_vehicle_crash_deaths', 
u'accidents', u'Motor Vehicle Crash Deaths', u'risky', u'Crime', u'geo', None], 35 | 36 | [u'adult_smoking', u'smokers', u'Adult Smoking Prevalence', u'risky', u'Healthcare', u'geo', None], 37 | [u'adult_smoking', u'cigarettes', u'Adult Smoking Prevalence', u'risky', u'Healthcare', u'geo', None], 38 | 39 | # [u'infant_mortality', u'infant mortality', u'Infant mortality', u'health', u'geo'], 40 | # [u'teen_births', u'teen births', u'Teen births', u'health', u'geo'], 41 | [u'mean_commute_minutes', u'commuters', u'Average Travel Time', u'commute_time', u'Transportation', u'geo', None], 42 | [u'mean_commute_minutes', u'transit', u'Average Travel Time', u'commute_time', u'Transportation', u'geo', None], 43 | [u'mean_commute_minutes', u'transportation', u'Average Travel Time', u'commute_time', u'Transportation', u'geo', None], 44 | [u'mean_commute_minutes', u'travel time', u'Average Travel Time', u'commute_time', u'Transportation', u'geo', None], 45 | 46 | [u'conflict_total', u'veterans', u'Number of Veterans', u'veterans', u'Military', u'geo', None], 47 | [u'conflict_total', u'war', u'Number of Veterans', u'veterans', u'Military', u'geo', None], 48 | 49 | [u'violent_crime', u'crime', u'Violent Crimes', u'crime', u'Crime', u'geo', None], 50 | [u'homicide_rate', u'murder', u'Homicide Deaths', u'crime', u'Crime', u'geo', None], 51 | [u'homicide_rate', u'homicide', u'Homicide Deaths', u'crime', u'Crime', u'geo', None], 52 | 53 | [u'pop,age', u'population', u'Population,Median Age', u'demographics', u'Demographics', u'geo', None], 54 | [u'pop,age', u'people', u'Population,Median Age', u'demographics', u'Demographics', u'geo', None], 55 | [u'age', u'age', u'Median Age', u'demographics', u'Demographics', u'geo', None], 56 | [u'income', u'income', u'Median Yearly Income', u'economy', u'Economy', u'geo', None], 57 | [u'avg_wage', u'salaries', u'Average Salary', u'economy', u'Economy', u'geo,soc,naics,cip', None], 58 | [u'avg_wage', u'wage', u'Average Salary', u'economy', u'Economy', u'geo,soc,naics,cip', None], 59 | [u'income,age,pop', u'economics', u'Median Yearly Income,Age,Population', u'economy', u'Economy', u'geo', None], 60 | # [u'avg_wage', u'wages', u'Wages', u'income_distro', u'geo', None], 61 | [u'median_property_value', u'property value', u'Median Property Value', u'economy', u'Economy', u'geo', None], 62 | [u'median_property_value', u'home value', u'Median Property Value', u'economy', u'Economy', u'geo', None], 63 | [u'median_property_value', u'housing cost', u'Median Property Value', u'economy', u'Economy', u'geo', None], 64 | 65 | [u'income_below_poverty', u'poverti', u'Population in Poverty', u'poverty_age_gender', u'Wages', u'geo', None], 66 | [u'income_below_poverty', u'poor', u'Population in Poverty', u'poverty_age_gender', u'Wages', u'geo', None], 67 | 68 | [u'households_renter_occupied,households_owner_occupied,households', u'renters', u'Renter occupied households', u'rent_own', u'Housing', u'geo', None], 69 | [u'grads_total', u'graduates', u'Degrees Awarded', u'education', u'Housing', u'geo', None], 70 | [u'grads_total', u'grads', u'Degrees Awarded', u'education', u'Housing', u'geo', None], 71 | [u'grads_total', u'students', u'Degrees Awarded', u'education', u'Housing', u'geo', None], 72 | 73 | [u'nativity_foreign,nativity_us', u'foreign born', u'Foreign Born,Native Born', u'demographics', u'Demographics', u'geo', None], 74 | [u'nativity_foreign,nativity_us', u'native born', u'Foreign Born,Native Born', u'demographics', u'Demographics', u'geo', None], 75 | 76 | 
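# Each row above and below unpacks (see the writer loop at the bottom) as:
# related_vars, search keyword, description, section, section_title,
# related_attrs, params (JSON string or None).
# NOTE: the grads_total rows above carry section_title u'Housing' even though
# their section is u'education'; that looks like a copy/paste slip.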
[u'pop_black,pop_latino,pop_white,pop_asian', u'race ethnicity', u'Black Population,Hispanic Population,White Population,Asian Population', u'ethnicity', u'Heritage', u'geo', None], 77 | [u'us_citizens', u'citizen', u'Citizenship', u'citizenship', u'Heritage', u'geo', None], 78 | [u'gini', u'gini', u'Gini', u'income_distro', u'Wages', u'geo', None], 79 | [u'gini', u'inequality', u'Gini', u'income_distro', u'Wages', u'geo', None], 80 | [u'pop_latino', u'hispanic', u'Hispanic Population', u'ethnicity', u'Heritage', u'geo', None], 81 | [u'pop_latino', u'latino', u'Hispanic Population', u'ethnicity', u'Heritage', u'geo', None], 82 | [u'pop_black', u'black', u'Black Population', u'ethnicity', u'Heritage', u'geo', None], 83 | [u'pop_white', u'white', u'White Population', u'ethnicity', u'Heritage', u'geo', None], 84 | [u'pop_asian', u'asian', u'Asian Population', u'ethnicity', u'Heritage', u'geo', None], 85 | [u'transport_bicycle', u'bicycle', u'Bicycle to Work', u'mode_transport', u'Transportation', u'geo', None], 86 | [u'transport_bicycle', u'bikers', u'Bicycle to Work', u'mode_transport', u'Transportation', u'geo', None], 87 | [u'transport_bicycle', u'cyclist', u'Bicycle to Work', u'mode_transport', u'Transportation', u'geo', None], 88 | [u'transport_carpooled', u'carpool', u'Carpool to Work', u'mode_transport', u'Transportation', u'geo', None], 89 | [u'transport_publictrans', u'public transit', u'Public Transit to Work', u'mode_transport', u'Transportation', u'geo', None], 90 | [u'transport_drove', u'drive', u'Drove Alone to Work', u'mode_transport', u'Transportation', u'geo', None], 91 | [u'transport_drove', u'cars', u'Drove Alone to Work', u'mode_transport', u'Transportation', u'geo', None], 92 | [u'transport_drove', u'drivers', u'Drove Alone to Work', u'mode_transport', u'Transportation', u'geo', None], 93 | [u'transport_taxi', u'taxi', u'Taxi to Work', u'mode_transport', u'Transportation', u'geo', None], 94 | [u'transport_motorcycle', u'motorcycle', u'Motorcycled to Work', u'mode_transport', u'Transportation', u'geo', None], 95 | [u'transport_walked', u'walk', u'Walked to Work', u'mode_transport', u'Transportation', u'geo', None], 96 | 97 | ] 98 | 99 | from datausa.attrs.models import AcsLanguage, PumsBirthplace 100 | 101 | for lang in AcsLanguage.query.all(): 102 | my_params = { 103 | "year": "latest", 104 | "language": lang.id 105 | } 106 | my_var = [u'num_speakers', u'{}'.format(lang.name.lower()), 107 | u'{} Speakers'.format(lang.name), u'languages', u'Heritage', u'geo', unicode(json.dumps(my_params))] 108 | print my_var 109 | all_vars.append(my_var) 110 | 111 | 112 | for birthplace in PumsBirthplace.query.filter(~PumsBirthplace.id.startswith("XX"), 113 | ~PumsBirthplace.id.startswith("040")): 114 | if birthplace.id in ["161", "344"]: # skip Georgia and the Central African Republic: "georgia" collides with the US state and "car" with the vehicle 115 | continue 116 | my_params = { 117 | "year": "latest", 118 | "birthplace": birthplace.id 119 | } 120 | b_keyword = birthplace.demonym or birthplace.name 121 | b_keyword = b_keyword.lower().strip() 122 | b_keyword = " ".join([k for k in b_keyword.split(" ") if len(k) > 3])  # drop tokens of 3 or fewer characters 123 | my_var = [u'num_over5', u'{}'.format(b_keyword), 124 | u'People Born in {}'.format(birthplace.name.title()), u'heritage', u'Heritage', u'geo', unicode(json.dumps(my_params))] 125 | print my_var 126 | all_vars.append(my_var) 127 | 128 | for related_vars, name, description, section, section_title, related_attrs, params in all_vars: 129 | # print '|{}|{}|{}|'.format(name, description, related_vars) 130 | writer.add_document(related_vars=related_vars, 
name=name, 131 | description=description, section=section, 132 | section_title=section_title, 133 | related_attrs=related_attrs, params=params) 134 | writer.commit() 135 | -------------------------------------------------------------------------------- /scripts/search/geo_aliases.csv: -------------------------------------------------------------------------------- 1 | geo,name,short,zval 2 | 01000US,United States,"united states, us, usa",150 3 | 16000US3651000,"New York, NY",nyc,124.7253778 4 | 05000US36047,"Kings County (Brooklyn), NY",brooklyn,15 5 | 16000US0644000,"Los Angeles, CA",la,57.66425865 6 | 16000US4260000,"Philadelphia, PA",philly,23.08155342 7 | 05000US06059,"Orange County, CA",oc,19.442103262 8 | 16000US1150000,"Washington, DC",dc,19.231709584 9 | 16000US3240000,"Las Vegas, NV",vegas,18.810854347 10 | 16000US4055000,"Oklahoma City, OK",okc,18.803290277 11 | 16000US2255000,"New Orleans, LA",nola,15.270642743 12 | 04000US06,California,"Cali, CA",14.616604426 13 | 16000US3915000,"Cincinnati, OH",cinci,14.366834419 14 | 04000US48,Texas,tx,13.860417732 15 | 04000US36,New York,ny,12.961516308 16 | 04000US12,Florida,fl,12.903672701 17 | 04000US17,Illinois,il,10 18 | 04000US42,Pennsylvania,pa,10 19 | 79500US2000500,"Kansas City PUMA, KS",kc,1 20 | 04000US39,Ohio,oh,10 21 | 04000US26,Michigan,mi,10 22 | 04000US13,Georgia,ga,10 23 | 04000US37,North Carolina,nc,10 24 | 04000US34,New Jersey,nj,10 25 | 04000US51,Virginia,va,10 26 | 04000US53,Washington,wa,10 27 | 04000US25,Massachusetts,ma,10 28 | 04000US25,Massachusetts,mass,10 29 | 04000US18,Indiana,in,10 30 | 04000US04,Arizona,az,10 31 | 04000US47,Tennessee,tn,10 32 | 04000US29,Missouri,mo,10 33 | 16000US0931270,"Glastonbury Center, CT",gbury,0.5 34 | 04000US24,Maryland,md,10 35 | 04000US55,Wisconsin,wi,10 36 | 04000US27,Minnesota,mn,10 37 | 04000US08,Colorado,co,10 38 | 04000US01,Alabama,al,10 39 | 04000US45,South Carolina,sc,10 40 | 04000US22,Louisiana,la,10 41 | 04000US21,Kentucky,ky,10 42 | 04000US41,Oregon,or,10 43 | 04000US40,Oklahoma,ok,10 44 | 04000US72,Puerto Rico,pr,10 45 | 04000US09,Connecticut,ct,10 46 | 04000US19,Iowa,ia,10 47 | 04000US28,Mississippi,ms,10 48 | 04000US05,Arkansas,ar,10 49 | 04000US20,Kansas,ks,10 50 | 04000US49,Utah,ut,10 51 | 04000US32,Nevada,nv,10 52 | 04000US35,New Mexico,nm,10 53 | 04000US54,West Virginia,wv,10 54 | 04000US31,Nebraska,ne,10 55 | 04000US16,Idaho,id,10 56 | 04000US15,Hawaii,hi,10 57 | 04000US23,Maine,me,10 58 | 04000US33,New Hampshire,nh,10 59 | 04000US44,Rhode Island,ri,10 60 | 04000US30,Montana,mt,10 61 | 04000US10,Delaware,de,10 62 | 04000US46,South Dakota,sd,10 63 | 04000US02,Alaska,ak,10 64 | 04000US38,North Dakota,nd,10 65 | 04000US50,Vermont,vt,10 66 | 04000US11,District of Columbia,dc,10 67 | 04000US56,Wyoming,wy,10 68 | -------------------------------------------------------------------------------- /scripts/search/rebuild_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from config import SEARCH_INDEX_DIR, SQLALCHEMY_DATABASE_URI 4 | from sqlalchemy import create_engine 5 | 6 | print("Step 1. Delete old index") 7 | try: 8 | shutil.rmtree(SEARCH_INDEX_DIR) 9 | except OSError: 10 | print("No directory found...continuing...") 11 | 12 | print("Step 2. Refresh Materialized View") 13 | engine = create_engine(SQLALCHEMY_DATABASE_URI) 14 | with engine.begin() as connection: 15 | result = connection.execute("REFRESH MATERIALIZED VIEW attrs.search_v8") 16 | print("Result", result) 17 | 18 | print("Step 3. 
Rebuild Index") 19 | build_index = os.path.join(SEARCH_INDEX_DIR.replace("search_index/", ""), "scripts", "search", "build_index.py") 20 | result = os.system("python {}".format(build_index)) 21 | print("Result", result) 22 | -------------------------------------------------------------------------------- /scripts/search/zip_lookup.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '86000US' || zcta5.geoid10 AS child_geoid, 3 | '31000US' || cbsa.geoid AS parent_geoid, 4 | ST_Area(ST_Intersection(zcta5.geom,cbsa.geom))/ST_Area(zcta5.geom)*100 as percent_covered, 5 | ST_Area(cbsa.geom) as parent_area 6 | FROM tiger2013.zcta5 7 | JOIN tiger2013.cbsa ON ST_Intersects(zcta5.geom, cbsa.geom) 8 | WHERE 9 | ST_Area(ST_Intersection(zcta5.geom,cbsa.geom))/ST_Area(zcta5.geom) > 0 10 | UNION 11 | (SELECT 12 | '86000US' || zcta5.geoid10 AS child_geoid, 13 | '16000US' || place.geoid AS parent_geoid, 14 | ST_Area(ST_Intersection(zcta5.geom,place.geom))/ST_Area(zcta5.geom)*100 as percent_covered, 15 | ST_Area(place.geom) as parent_area 16 | FROM tiger2013.zcta5 17 | JOIN tiger2013.place ON ST_Intersects(zcta5.geom, place.geom) 18 | WHERE 19 | ST_Area(ST_Intersection(zcta5.geom,place.geom))/ST_Area(zcta5.geom) > 0 20 | and ST_IsValid(zcta5.geom)) 21 | UNION 22 | (SELECT 23 | '86000US' || zcta5.geoid10 AS child_geoid, 24 | '05000US' || county.geoid AS parent_geoid, 25 | ST_Area(ST_Intersection(zcta5.geom,county.geom))/ST_Area(zcta5.geom)*100 as percent_covered, 26 | ST_Area(county.geom) as parent_area 27 | FROM tiger2013.zcta5 28 | JOIN tiger2013.county ON ST_Intersects(zcta5.geom, county.geom) 29 | WHERE 30 | ST_Area(ST_Intersection(zcta5.geom, county.geom))/ST_Area(zcta5.geom) > 0 31 | and ST_IsValid(zcta5.geom)) 32 | UNION 33 | (SELECT 34 | '86000US' || zcta5.geoid10 AS child_geoid, 35 | '79500US' || puma.geoid10 AS parent_geoid, 36 | ST_Area(ST_Intersection(zcta5.geom,puma.geom))/ST_Area(zcta5.geom)*100 as percent_covered, 37 | ST_Area(puma.geom) as parent_area 38 | FROM tiger2013.zcta5 39 | JOIN tiger2013.puma ON ST_Intersects(zcta5.geom, puma.geom) 40 | WHERE 41 | ST_Area(ST_Intersection(zcta5.geom, puma.geom))/ST_Area(zcta5.geom) > 0 42 | and ST_IsValid(zcta5.geom)) -------------------------------------------------------------------------------- /scripts/university_abbrev_gen.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | from nltk.tokenize import word_tokenize 4 | from nltk.corpus import stopwords 5 | 6 | df = pd.read_csv("schools.csv") 7 | stop_words = set(stopwords.words('english')).union(set(["&", "-"])) 8 | 9 | 10 | def abbrev(x): 11 | x = x.replace("-", " ") 12 | x = re.sub(r"('|:)", "", x) 13 | tokens = word_tokenize(x) 14 | filtered = [w[0] for w in tokens if w not in stop_words] 15 | return "".join(filtered).lower() 16 | 17 | 18 | df['abbrev'] = df.name.apply(abbrev) 19 | df.to_csv("school_abrevs.csv") 20 | -------------------------------------------------------------------------------- /scripts/update_university_keywords.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pandas as pd 3 | from datausa.attrs.models import University 4 | from datausa.database import db 5 | 6 | 7 | @click.command() 8 | @click.option('--sheet_url', prompt='Spreadsheet URL', 9 | help='The spreadsheet containing the university abbreviation mappings.') 10 | def update_keywords(sheet_url): 11 | abbr_df = pd.read_csv(sheet_url) 
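# Assumed sheet layout (inferred from the unpacking below): three columns per
# row -- university id, university name, comma-separated abbreviations.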
12 | univs_by_id = {u.id: u for u in University.query} 13 | for univ_id, name, abbrev in abbr_df.values: 14 | univ_obj = univs_by_id[univ_id] 15 | abbrevs = abbrev.split(",") 16 | if univ_obj.keywords and set(abbrevs) == set(univ_obj.keywords): 17 | # no update required! 18 | pass 19 | else: 20 | univ_obj.keywords = abbrevs 21 | db.session.add(univ_obj) 22 | db.session.commit() 23 | 24 | 25 | if __name__ == '__main__': 26 | update_keywords() 27 | -------------------------------------------------------------------------------- /scripts/url_names.py: -------------------------------------------------------------------------------- 1 | 2 | from datausa.attrs.models import Geo 3 | from datausa.database import db 4 | 5 | def hyphenate(x): 6 | ctr = {ord(c):u'-' for c in [',', ' ', '-']} 7 | tmp = unicode(x).translate(ctr) 8 | return tmp.replace('--', '-') 9 | 10 | sumlevels = ['160'] 11 | 12 | count = 1 13 | for sumlevel in sumlevels: 14 | filters = [Geo.id.startswith(sumlevel)] 15 | objs = Geo.query.filter(*filters).all() 16 | for o in objs: 17 | o.url_name = hyphenate(o.display_name) 18 | print o.url_name 19 | db.session.add(o) 20 | count += 1  # advance the batch counter; without this the periodic commit below never fires 21 | if count > 10000: 22 | db.session.commit() 23 | count = 1 24 | 25 | db.session.commit() 26 | 27 | 28 | -------------------------------------------------------------------------------- /search_index/MAIN_WRITELOCK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/search_index/MAIN_WRITELOCK -------------------------------------------------------------------------------- /search_index/MAIN_hzur5fe2wkrq53me.seg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/search_index/MAIN_hzur5fe2wkrq53me.seg -------------------------------------------------------------------------------- /search_index/_MAIN_1.toc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/search_index/_MAIN_1.toc -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_joins.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | 4 | import datausa 5 | 6 | 7 | class TestJoinAPI(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.app = datausa.app.test_client() 11 | 12 | def get_data(self, url): 13 | req = self.app.get(url) 14 | result = json.loads(req.data) 15 | assert 'data' in result 16 | data = result['data'] 17 | headers = result['headers'] 18 | return data, headers 19 | 20 | def test_geo_crosswalk(self): 21 | req = self.app.get('/api/join/?required=adult_obesity,income&sumlevel=all&show=geo&where=income.geo:16000US2507000,adult_obesity.sumlevel:county&year=latest&auto_crosswalk=1') 22 | result = json.loads(req.data) 23 | assert 'data' in result 24 | data = result['data'] 25 | headers = result['headers'] 26 | target_index = headers.index('chr.yg.adult_obesity') 27 | chr_geo_index = headers.index('chr.yg.geo')
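# Join API headers are fully qualified as "schema.table.column"
# (e.g. 'chr.yg.adult_obesity'), hence the headers.index() lookups here.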
28 | first_row = data[0] 29 | assert len(data) == 1 30 | assert first_row[target_index] 31 | assert first_row[chr_geo_index] == '05000US25025' 32 | 33 | def test_join_but_no_geo_crosswalk(self): 34 | req = self.app.get('/api/join/?required=pop_black,pop_white,income&sumlevel=all&show=geo&where=income.geo:16000US2511000&year=latest') 35 | result = json.loads(req.data) 36 | assert 'data' in result 37 | data = result['data'] 38 | headers = result['headers'] 39 | target_index = headers.index('acs_5yr.yg_race.pop_black') 40 | yg_race_geo_index = headers.index('acs_5yr.yg_race.geo') 41 | first_row = data[0] 42 | assert len(data) == 1 43 | assert first_row[target_index] 44 | assert first_row[yg_race_geo_index] == '16000US2511000' 45 | 46 | def test_display_names(self): 47 | req = self.app.get('/api/join/?required=adult_obesity,income&sumlevel=all&show=geo&where=adult_obesity.geo:04000US25&display_names=1') 48 | result = json.loads(req.data) 49 | assert 'data' in result 50 | data = result['data'] 51 | headers = result['headers'] 52 | target_index = headers.index('chr.yg.geo_name') 53 | assert target_index >= 0 54 | first_row = data[0] 55 | assert first_row[target_index] == 'Massachusetts' 56 | 57 | def test_limit(self): 58 | url = '/api/join/?required=grads_total&sumlevel=all&show=geo&limit=3' 59 | data, _ = self.get_data(url) 60 | assert len(data) == 3 61 | 62 | def test_geos_crosswalk_3vars(self): 63 | url = '/api/join/?required=adult_obesity,avg_wage,income&sumlevel=all&show=geo&where=income.geo:16000US2507000,adult_obesity.sumlevel:county,grads_total.sumlevel:county&year=latest&auto_crosswalk=1' 64 | data, _ = self.get_data(url) 65 | assert len(data) >= 1 66 | 67 | def test_cip_value_join(self): 68 | url = '/api/join/?required=avg_wage,value&sumlevel=all&show=cip&where=value.cip:010000' 69 | data, _ = self.get_data(url) 70 | assert len(data) >= 1 71 | 72 | def test_geos_2vars_latest(self): 73 | url = '/api/join/?required=adult_obesity,income&sumlevel=all&show=geo&where=income.geo:04000US25,adult_obesity.geo:04000US25&year=latest' 74 | data, _ = self.get_data(url) 75 | assert len(data) == 1 76 | 77 | def test_ipeds_acs_geo_join(self): 78 | url = '/api/join/?required=grads_total,income&sumlevel=all&show=geo&where=income.geo:16000US2507000,grads_total.sumlevel:state&year=latest&auto_crosswalk=1' 79 | data, _ = self.get_data(url) 80 | assert len(data) == 1 81 | 82 | def test_puma_to_state(self): 83 | url = '/api/join/?required=avg_wage,grads_total,income&show=geo&where=avg_wage.sumlevel:puma,grads_total.geo:04000US25,avg_wage.geo:79500US2500100,income.sumlevel:state&year=latest&auto_crosswalk=1' 84 | data, _ = self.get_data(url) 85 | assert len(data) == 1 86 | 87 | def test_puma_to_state_and_county(self): 88 | url = '/api/join/?required=avg_wage,grads_total,income&show=geo&where=avg_wage.geo:79500US2500506,grads_total.sumlevel:state,income.sumlevel:county&year=latest&auto_crosswalk=1' 89 | data, _ = self.get_data(url) 90 | assert len(data) == 1 91 | 92 | def test_bug(self): 93 | url = '/api/join/?required=grads_total,adult_obesity&sumlevel=all&show=geo&where=grads_total.geo:16000US2511000,adult_obesity.sumlevel:state&year=latest&auto_crosswalk=1' 94 | data, _ = self.get_data(url) 95 | assert len(data) == 1 96 | 97 | def test_bug2(self): 98 | url = '/api/join/?required=avg_wage,income&show=geo&where=avg_wage.geo:79500US2500506,income.sumlevel:state&year=latest&auto_crosswalk=1' 99 | data, _ = self.get_data(url) 100 | assert len(data) == 1 101 | 102 | def test_national_containment(self):
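# (Assumption) exercises parent/child containment: county-level adult_obesity
# rows are joined under the national geo 01000US via auto_crosswalk=1.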
103 | url = '/api/join/?required=grads_total,adult_obesity&sumlevel=all&show=geo&limit=5&where=grads_total.geo:01000US,adult_obesity.sumlevel:county&auto_crosswalk=1' 104 | data, _ = self.get_data(url) 105 | assert len(data) >= 1 106 | 107 | def test_geo_non_crosswalk(self): 108 | url = '/api/join/?required=grads_total,adult_obesity&show=geo&limit=1&where=grads_total.geo:16000US2511000&auto_crosswalk=0' 109 | data, headers = self.get_data(url) 110 | target_index = headers.index('chr.yg.geo') 111 | first_row = data[0] 112 | assert first_row[target_index] is None 113 | 114 | def test_cip_crosswalk(self): 115 | url = '/api/join/?required=avg_wage,grads_total&show=cip&limit=1&where=grads_total.cip:090401&auto_crosswalk=1' 116 | data, headers = self.get_data(url) 117 | target_index = headers.index('pums_1yr.yc.cip') 118 | first_row = data[0] 119 | assert first_row[target_index] == '09' 120 | 121 | def test_cip_no_crosswalk(self): 122 | url = '/api/join/?required=avg_wage,grads_total&show=cip&limit=1&where=grads_total.cip:090401&auto_crosswalk=0' 123 | data, headers = self.get_data(url) 124 | target_index = headers.index('pums_1yr.yc.cip') 125 | first_row = data[0] 126 | assert first_row[target_index] is None 127 | 128 | def test_onet_soc_crosswalk(self): 129 | url = '/api/join/?required=avg_wage,value&sumlevel=all&show=soc&limit=5&auto_crosswalk=1&where=avg_wage.soc:1110XX' 130 | data, headers = self.get_data(url) 131 | onet_index = headers.index('onet.skills_by_soc.soc') 132 | pums_index = headers.index('pums_1yr.yo.soc') 133 | first_row = data[0] 134 | assert first_row[onet_index] in ['111000', '110000'] 135 | assert first_row[pums_index] == '1110XX' 136 | 137 | def test_onet_soc_no_crosswalk(self): 138 | url = '/api/join/?required=avg_wage,value&sumlevel=all&show=soc&limit=5&auto_crosswalk=0&where=avg_wage.soc:1110XX' 139 | data, headers = self.get_data(url) 140 | onet_index = headers.index('onet.skills_by_soc.soc') 141 | pums_index = headers.index('pums_1yr.yo.soc') 142 | first_row = data[0] 143 | assert first_row[onet_index] is None 144 | assert first_row[pums_index] == '1110XX' 145 | 146 | def where_bug(self):  # NOTE: lacks the test_ prefix, so unittest never collects this case 147 | url = '/api/join/?required=income,grads_total&sumlevel=county&show=geo&where=grads_total.degree:5&limit=5' 148 | data, headers = self.get_data(url) 149 | assert len(data) == 5 150 | 151 | def test_naics_xwalk(self): 152 | url = '/api/join/?required=employees_thousands,num_ppl,avg_wage&sumlevel=0&show=naics&limit=5&naics=23&year=latest' 153 | data, headers = self.get_data(url) 154 | bls_index = headers.index('bls.ces_yi.naics') 155 | pums_index = headers.index('pums_1yr.yi.naics') 156 | first_row = data[0] 157 | assert len(data) == 1 158 | assert first_row[bls_index] is not None 159 | assert first_row[pums_index] is not None 160 | 161 | def test_naics_auto_xwalk(self): 162 | url = '/api/join/?required=employees_thousands,num_ppl&show=naics&auto_crosswalk=1&where=num_ppl.naics:71-72&year=latest' 163 | data, headers = self.get_data(url) 164 | bls_index = headers.index('bls.ces_yi.naics') 165 | pums_index = headers.index('pums_1yr.yi.naics') 166 | first_row = data[0] 167 | assert len(data) == 1 168 | assert first_row[bls_index] == '71' 169 | assert first_row[pums_index] == '71-72' 170 | 171 | 172 | def test_pums_names(self): 173 | url = '/api/join/?required=num_ppl&sumlevel=all&show=naics&naics=23&display_names=1' 174 | data, headers = self.get_data(url) 175 | pums_index = headers.index('pums_1yr.yi.naics_name') 176 | first_row = data[0] 177 | assert first_row[pums_index] == 'Construction' 178 |
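# display_names=1 asks the join API for companion "<column>_name" headers
# (e.g. pums_1yr.yi.naics_name), which the tests below rely on.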
179 | def test_pums_degree_name(self): 180 | url = '/api/join/?required=num_ppl&sumlevel=all&show=degree&naics=54&display_names=1&degree=21' 181 | data, headers = self.get_data(url) 182 | pums_index = headers.index('pums_1yr.yid.degree_name') 183 | first_row = data[0] 184 | assert first_row[pums_index] == "Bachelor's degree" 185 | 186 | def test_bls_names(self): 187 | url = '/api/join/?required=employees_thousands&sumlevel=all&show=naics&naics=54&display_names=1' 188 | data, headers = self.get_data(url) 189 | pums_index = headers.index('bls.ces_yi.naics_name') 190 | first_row = data[0] 191 | assert first_row[pums_index] == "Professional, Scientific, and Technical Services" 192 | 193 | def test_offset_sort(self): 194 | url = '/api/join/?required=pop&sumlevel=state&show=geo&limit=1&offset=1&display_names=1&year=2014&sort=desc&order=pop' 195 | data, headers = self.get_data(url) 196 | pums_index = headers.index('acs_5yr.yg.geo_name') 197 | first_row = data[0] 198 | assert first_row[pums_index] == "Texas" 199 | 200 | def test_ed_crosswalk_join(self): 201 | url = '/api/join/?show=university&sumlevel=all&required=grads_total,default_rate&limit=1&university=100654' 202 | data, headers = self.get_data(url) 203 | ed_id = headers.index('ed.yu_defaults.university') 204 | ipeds_id = headers.index('ipeds.grads_yu.university') 205 | opeid = headers.index('ed.yu_defaults.opeid') 206 | 207 | first_row = data[0] 208 | assert first_row[ed_id] == first_row[ipeds_id] and first_row[opeid] == "001002" 209 | 210 | if __name__ == '__main__': 211 | unittest.main() 212 | -------------------------------------------------------------------------------- /tests/test_search.py: -------------------------------------------------------------------------------- 1 | from whoosh.qparser import QueryParser 2 | from whoosh import index, sorting, scoring 3 | from whoosh import qparser, query 4 | from config import SEARCH_INDEX_DIR 5 | import math 6 | import unittest 7 | from datausa.attrs.search import do_search 8 | 9 | ix = index.open_dir(SEARCH_INDEX_DIR) 10 | qp = QueryParser("name", schema=ix.schema, group=qparser.OrGroup) 11 | 12 | facet = sorting.FieldFacet("zvalue", reverse=True) 13 | scores = sorting.ScoreFacet() 14 | 15 | class TestSearch(unittest.TestCase): 16 | NY_IDS = ['31000US35620', '05000US36061', '04000US36', '16000US3651000'] 17 | 18 | def test_extra_word(self): 19 | data,suggs,tries,my_vars = do_search("new york economy") 20 | self.assertTrue(data[0][0] in self.NY_IDS) 21 | 22 | def test_manhattan(self): 23 | data,suggs,tries,my_vars = do_search("manhattan") 24 | self.assertEqual(data[0][0], "05000US36061") 25 | 26 | def test_exact_match_begin(self): 27 | data,suggs,tries,my_vars = do_search("nome") 28 | self.assertEqual(data[0][0], '16000US0254920') 29 | 30 | def test_ny(self): 31 | data,suggs,tries,my_vars = do_search("new york") 32 | self.assertTrue(data[0][0] in self.NY_IDS) 33 | 34 | def test_doc(self): 35 | data,suggs,tries,my_vars = do_search("doctor") 36 | self.assertEqual(data[0][0], '291060') 37 | 38 | def test_stl(self): 39 | data,suggs,tries,my_vars = do_search("st louis") 40 | self.assertEqual(data[0][0], '16000US2965000') 41 | 42 | def test_fortla(self): 43 | data,suggs,tries,my_vars = do_search("fort lau") 44 | self.assertEqual(data[0][0], '16000US1224000') 45 | 46 | def test_bad_spelling(self): 47 | data,suggs,tries,my_vars = do_search("massachusitt") 48 | self.assertEqual(data[0][0], '04000US25') 49 | 50 | def test_econ(self): 51 | econs = ['193011', '450601', '01000US'] 52 |
data,suggs,tries,my_vars = do_search("econ") 53 | self.assertTrue(data[0][0] in econs) 54 | 55 | def test_milford(self): 56 | data,suggs,tries,my_vars = do_search("milford nh") 57 | self.assertEqual(data[0][0], '16000US3347940') 58 | 59 | def test_bevhills(self): 60 | data,suggs,tries,my_vars = do_search("beverly hills") 61 | self.assertEqual(data[0][0], '16000US0606308') 62 | 63 | def test_kind_naics(self): 64 | data,suggs,tries,my_vars = do_search("educat", kind="naics") 65 | self.assertTrue(data[0][0]) 66 | 67 | def test_ma(self): 68 | data,suggs,tries,my_vars = do_search("ma") 69 | self.assertEqual(data[0][0], '04000US25') 70 | 71 | def test_ak(self): 72 | data,suggs,tries,my_vars = do_search("ak") 73 | self.assertEqual(data[0][0], '04000US02') 74 | 75 | def test_pa(self): 76 | data,suggs,tries,my_vars = do_search("pa") 77 | self.assertEqual(data[0][0], '04000US42') 78 | 79 | def test_al(self): 80 | data,suggs,tries,my_vars = do_search("al") 81 | self.assertEqual(data[0][0], '04000US01') 82 | 83 | def test_dc(self): 84 | data,suggs,tries,my_vars = do_search("dc") 85 | self.assertEqual(data[0][0], '16000US1150000') 86 | 87 | def test_rny(self): 88 | data,suggs,tries,my_vars = do_search("rochester, ny") 89 | self.assertEqual(data[0][0], '16000US3663000') 90 | 91 | def test_cpmd(self): 92 | data,suggs,tries,my_vars = do_search("college park, md") 93 | self.assertEqual(data[0][0], '16000US2418750') 94 | 95 | def test_moco(self): 96 | data,suggs,tries,my_vars = do_search("montgomery county") 97 | self.assertEqual(data[0][0], '05000US24031') 98 | 99 | def test_pgc(self): 100 | data,suggs,tries,my_vars = do_search("prince georges county") 101 | self.assertEqual(data[0][0], '05000US24033') 102 | 103 | def test_travel_time(self): 104 | data,suggs,tries,my_vars = do_search("travel time") 105 | self.assertEqual(data[0][0], '01000US') 106 | 107 | def test_commute_time(self): 108 | data,suggs,tries,my_vars = do_search("commute time") 109 | self.assertEqual(data[0][0], '01000US') 110 | 111 | def test_boston_travel_time(self): 112 | data,suggs,tries,my_vars = do_search("boston travel time") 113 | self.assertEqual(data[0][0], '16000US2507000') 114 | 115 | def test_nj_economy(self): 116 | data,suggs,tries,my_vars = do_search("economy in new jersey") 117 | ids = [row[0] for row in data[:3]] 118 | self.assertTrue('04000US34' in ids) 119 | self.assertEqual(ids[0], '16000US1820152') 120 | 121 | def test_obesity_geo(self): 122 | data,suggs,tries,my_vars = do_search("obesity") 123 | self.assertEqual(data[0][0], '01000US') 124 | 125 | def test_vietnamese_wyoming(self): 126 | data,suggs,tries,my_vars = do_search("vietnamese speakers in wyoming") 127 | ids = [row[0] for row in data] 128 | self.assertTrue('04000US56' in ids[:2]) 129 | 130 | def test_polish_chicago(self): 131 | data,suggs,tries,my_vars = do_search("polish speakers in chicago") 132 | self.assertEqual(data[0][0], '16000US1714000') 133 | 134 | def test_native_cambr(self): 135 | data,suggs,tries,my_vars = do_search("native born in cambridge") 136 | self.assertEqual(data[0][0], '16000US2511000') 137 | 138 | def test_fr_cambr(self): 139 | data,suggs,tries,my_vars = do_search("french in cambridge") 140 | self.assertEqual(data[0][0], '16000US2511000') 141 | 142 | def test_chil_nm(self): 143 | data,suggs,tries,my_vars = do_search("chileans in new mexico") 144 | self.assertEqual(data[0][0], '04000US35') 145 | 146 | def test_swiss_nj(self): 147 | data,suggs,tries,my_vars = do_search("swiss in new jersey") 148 | self.assertEqual(data[0][0], '04000US34') 149 |
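# do_search returns a 4-tuple (data, suggs, tries, my_vars); each data row
# leads with the attribute id, hence the data[0][0] assertions throughout.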
150 | def test_cuba_montana(self): 151 | data,suggs,tries,my_vars = do_search("cubans in montana") 152 | self.assertEqual(data[0][0], '04000US30') 153 | 154 | def test_il_fl(self): 155 | data,suggs,tries,my_vars = do_search("israelis in florida") 156 | self.assertEqual(data[0][0], '04000US12') 157 | 158 | def test_citizenship_fla(self): 159 | data,suggs,tries,my_vars = do_search("citizenship in florida") 160 | self.assertEqual(data[0][0], '04000US12') 161 | 162 | def test_ga(self): 163 | data,suggs,tries,my_vars = do_search("georgia") 164 | self.assertEqual(data[0][0], '04000US13') 165 | 166 | def test_age(self): 167 | data,suggs,tries,my_vars = do_search("age in chicago") 168 | self.assertEqual(data[0][0], '16000US1714000') 169 | self.assertTrue(len(my_vars) > 0) 170 | self.assertEqual(my_vars[0]['name'], 'age') 171 | 172 | def test_healthcare(self): 173 | data,suggs,tries,my_vars = do_search("healthcare") 174 | self.assertEqual(data[0][0], "01000US") 175 | self.assertEqual(my_vars[0]["name"], "healthcare") 176 | self.assertEqual(my_vars[0]["section"], "conditions_diseases") 177 | 178 | def test_obesity(self): 179 | data,suggs,tries,my_vars = do_search("obesity") 180 | self.assertEqual(data[0][0], "01000US") 181 | self.assertEqual(my_vars[0]["name"], "obesity") 182 | self.assertEqual(my_vars[0]["section"], "conditions_diseases") 183 | 184 | def test_umd(self): 185 | data, suggs, tries, my_vars = do_search("umd") 186 | self.assertEqual(data[0][0], "163286") 187 | 188 | def test_boston_university(self): 189 | data, suggs, tries, my_vars = do_search("boston university") 190 | self.assertEqual(data[0][0], "164988") 191 | 192 | def test_neu(self): 193 | data, suggs, tries, my_vars = do_search("neu") 194 | self.assertEqual(data[0][0], "167358") 195 | 196 | 197 | if __name__ == '__main__': 198 | unittest.main() 199 | -------------------------------------------------------------------------------- /var_index/MAIN_WRITELOCK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/var_index/MAIN_WRITELOCK -------------------------------------------------------------------------------- /var_index/MAIN_g1c93s1e37q8coxg.seg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/var_index/MAIN_g1c93s1e37q8coxg.seg -------------------------------------------------------------------------------- /var_index/_MAIN_1.toc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataUSA/datausa-api/7288dede082eda07b61e11cf6dc801fe692f6334/var_index/_MAIN_1.toc --------------------------------------------------------------------------------