├── .gitattributes ├── nsfg.hdf5 ├── data ├── GSS.dat.gz ├── 2013_2015_FemPregData.dat.gz ├── GSS.do ├── GSS.dct └── 2013_2015_FemPregSetup.dct ├── gss_eda ├── GSS.dat.gz ├── GSS.dct └── GSS.do ├── README.md ├── LLCP2017.ASC.gz ├── environment.yml ├── LICENSE ├── .gitignore ├── distribution.py ├── utils.py ├── eds01_gss_clean.ipynb └── gss_validate.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | LLCP2017.ASC.gz filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /nsfg.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/nsfg.hdf5 -------------------------------------------------------------------------------- /data/GSS.dat.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/data/GSS.dat.gz -------------------------------------------------------------------------------- /gss_eda/GSS.dat.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/gss_eda/GSS.dat.gz -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ExploratoryDataAnalysis 2 | Repository for an online class on Exploratory Data Analysis in Python 3 | -------------------------------------------------------------------------------- /data/2013_2015_FemPregData.dat.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/data/2013_2015_FemPregData.dat.gz -------------------------------------------------------------------------------- /LLCP2017.ASC.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b13013ed06f8a69eb58b6c352208b4f5d8cfd0780896873ca39d81efcfb97a4c 3 | size 69310674 4 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ExploratoryDataAnalysis 2 | 3 | dependencies: 4 | - python=3.7 5 | - jupyter 6 | - numpy 7 | - matplotlib 8 | - seaborn 9 | - pandas 10 | - pytables 11 | - scipy 12 | - scikit-learn 13 | - pip 14 | - pip: 15 | - empiricaldist 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Allen Downey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /distribution.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def underride(d, **options): 8 | """Add key-value pairs to d only if key is not in d. 9 | 10 | d: dictionary 11 | options: keyword args to add to d 12 | """ 13 | for key, val in options.items(): 14 | d.setdefault(key, val) 15 | 16 | return d 17 | 18 | 19 | class Pmf(pd.Series): 20 | 21 | def __init__(self, seq, name='Pmf', normalize=True): 22 | series = pd.Series(seq).value_counts().sort_index() 23 | super().__init__(series, name=name) 24 | if normalize: 25 | self.normalize() 26 | 27 | @property 28 | def qs(self): 29 | return self.index.values 30 | 31 | @property 32 | def ps(self): 33 | return self.values 34 | 35 | def __call__(self, qs): 36 | return self.get(qs, 0) 37 | 38 | def normalize(self): 39 | self /= self.sum() 40 | 41 | def bar(self, **options): 42 | underride(options, label=self.name) 43 | plt.bar(self.index, self.values, **options) 44 | 45 | def plot(self, **options): 46 | underride(options, label=self.name) 47 | plt.plot(self.index, self.values, **options) 48 | 49 | 50 | 51 | from scipy.interpolate import interp1d 52 | 53 | class Cdf(pd.Series): 54 | 55 | def __init__(self, seq, name='Cdf'): 56 | pmf = Pmf(seq) 57 | super().__init__(pmf.cumsum(), name=name) 58 | 59 | @property 60 | def qs(self): 61 | return self.index 62 | 63 | @property 64 | def ps(self): 65 | return self.values 66 | 67 | @property 68 | def forward(self): 69 | return interp1d(self.qs, self.ps, 70 | kind='previous', 71 | assume_sorted=True, 72 | bounds_error=False, 73 | fill_value=(0,1)) 74 | 75 | @property 76 | def inverse(self): 77 | return interp1d(self.ps, self.qs, 78 | kind='next', 79 | assume_sorted=True, 80 | bounds_error=False, 81 | fill_value=(self.qs[0], np.nan)) 82 | 83 | def __call__(self, qs): 84 | return self.forward(qs) 85 | 86 | def percentile_rank(self, qs): 87 | return self.forward(qs) * 100 88 | 89 | def percentile(self, percentile_ranks): 90 | return self.inverse(percentile_ranks / 100) 91 | 92 | def step(self, **options): 93 | underride(options, label=self.name, where='post') 94 | plt.step(self.index, self.values, **options) 95 | 96 | def plot(self, **options): 97 | underride(options, label=self.name) 98 | plt.plot(self.index, self.values, **options) -------------------------------------------------------------------------------- /data/GSS.do: -------------------------------------------------------------------------------- 1 | #delimit ; 2 | 3 | infix 4 | year 1 - 20 5 | gun 21 - 40 6 | gunage 41 - 60 7 | gunnum 61 - 80 8 | owngun 81 - 100 9 | rowngun 101 - 120 10 | realinc 121 - 140 11 | conrinc 141 - 160 12 | hispanic 161 - 180 13 | cohort 181 - 200 14 | ballot 201 - 220 15 | gunlaw 221 - 240 16 | cappun 241 - 260 17 | id_ 261 - 280 18 | age 281 - 300 19 | educ 301 - 320 20 | sex 321 - 340 21 | race 341 - 360 22 | income 361 - 380 23 | rincome 381 - 400 24 | srcbelt 401 - 420 25 | polviews 421 - 440 26 | natcrime 441 - 460 27 | wtssall 461 - 480 28 | using GSS.dat; 29 | 30 | label variable year "Gss year for this respondent "; 31 | label variable gun "Ever threatened with gun or shot at"; 32 | label variable gunage "Threatened or shot at as child or adult"; 33 | label variable gunnum "If threatened or shot at--how many times"; 34 | label variable owngun "Have gun in home"; 35 | label variable rowngun "Does gun belong to r"; 36 | label variable realinc "Family income in constant $"; 37 | label variable conrinc "Respondent income in constant dollars"; 38 | label variable hispanic "Hispanic specified"; 39 | label variable cohort "Year of birth"; 40 | label variable ballot "Ballot used for interview"; 41 | label variable gunlaw "Favor or oppose gun permits"; 42 | label variable cappun "Favor or oppose death penalty for murder"; 43 | label variable id_ "Respondent id number"; 44 | label variable age "Age of respondent"; 45 | label variable educ "Highest year of school completed"; 46 | label variable sex "Respondents sex"; 47 | label variable race "Race of respondent"; 48 | label variable income "Total family income"; 49 | label variable rincome "Respondents income"; 50 | label variable srcbelt "Src beltcode"; 51 | label variable polviews "Think of self as liberal or conservative"; 52 | label variable natcrime "Halting rising crime rate"; 53 | label variable wtssall "Weight variable"; 54 | 55 | 56 | label define gsp001x 57 | 9 "No answer" 58 | 8 "Don't know" 59 | 2 "No" 60 | 1 "Yes" 61 | 0 "Not applicable" 62 | ; 63 | label define gsp002x 64 | 9 "No answer" 65 | 8 "Don't know" 66 | 3 "Both" 67 | 2 "Adult" 68 | 1 "Child" 69 | 0 "Not applicable" 70 | ; 71 | label define gsp003x 72 | 9 "No answer" 73 | 8 "Not sure" 74 | 3 "4+ times" 75 | 2 "2-3 times" 76 | 1 "Once" 77 | 0 "Not applicable" 78 | ; 79 | label define gsp004x 80 | 9 "No answer" 81 | 8 "Don't know" 82 | 3 "Refused" 83 | 2 "No" 84 | 1 "Yes" 85 | 0 "Not applicable" 86 | ; 87 | label define gsp005x 88 | 9 "No answer" 89 | 8 "Don't know" 90 | 3 "Refused" 91 | 2 "No" 92 | 1 "Yes" 93 | 0 "Not applicable" 94 | ; 95 | label define gsp006x 96 | 999999 "No answer" 97 | 999998 "Dont know" 98 | 0 "Not applicable" 99 | ; 100 | label define gsp007x 101 | 999999 "No answer" 102 | 999998 "Dont know" 103 | 0 "Not applicable" 104 | ; 105 | label define gsp008x 106 | 99 "No answer" 107 | 98 "Don't know" 108 | 50 "Other, not specified" 109 | 47 "Hispanic" 110 | 46 "Latino/a" 111 | 45 "Latin" 112 | 41 "South american" 113 | 40 "Latin american" 114 | 35 "Filipino/a" 115 | 31 "Basque" 116 | 30 "Spanish" 117 | 25 "Chilean" 118 | 24 "Argentinian" 119 | 23 "Venezuelan" 120 | 22 "Columbian" 121 | 21 "Equadorian" 122 | 20 "Peruvian" 123 | 16 "West indian" 124 | 15 "Dominican" 125 | 11 "Honduran" 126 | 10 "Central american" 127 | 9 "Costa rican" 128 | 8 "Nicaraguan" 129 | 7 "Panamanian" 130 | 6 "Guatemalan" 131 | 5 "Salvadorian" 132 | 4 "Cuban" 133 | 3 "Puerto rican" 134 | 2 "Mexican, mexican american, chicano/a" 135 | 1 "Not hispanic" 136 | 0 "Not applicable" 137 | ; 138 | label define gsp009x 139 | 9999 "No answer" 140 | 0 "Not applicable" 141 | ; 142 | label define gsp010x 143 | 4 "Ballot d" 144 | 3 "Ballot c" 145 | 2 "Ballot b" 146 | 1 "Ballot a" 147 | 0 "Not applicable" 148 | ; 149 | label define gsp011x 150 | 9 "No answer" 151 | 8 "Don't know" 152 | 2 "Oppose" 153 | 1 "Favor" 154 | 0 "Not applicable" 155 | ; 156 | label define gsp012x 157 | 9 "No answer" 158 | 8 "Don't know" 159 | 2 "Oppose" 160 | 1 "Favor" 161 | 0 "Not applicable" 162 | ; 163 | label define gsp013x 164 | 99 "No answer" 165 | 98 "Don't know" 166 | 89 "89 or older" 167 | ; 168 | label define gsp014x 169 | 99 "No answer" 170 | 98 "Don't know" 171 | 97 "Not applicable" 172 | ; 173 | label define gsp015x 174 | 2 "Female" 175 | 1 "Male" 176 | ; 177 | label define gsp016x 178 | 3 "Other" 179 | 2 "Black" 180 | 1 "White" 181 | 0 "Not applicable" 182 | ; 183 | label define gsp017x 184 | 99 "No answer" 185 | 98 "Don't know" 186 | 13 "Refused" 187 | 12 "$25000 or more" 188 | 11 "$20000 - 24999" 189 | 10 "$15000 - 19999" 190 | 9 "$10000 - 14999" 191 | 8 "$8000 to 9999" 192 | 7 "$7000 to 7999" 193 | 6 "$6000 to 6999" 194 | 5 "$5000 to 5999" 195 | 4 "$4000 to 4999" 196 | 3 "$3000 to 3999" 197 | 2 "$1000 to 2999" 198 | 1 "Lt $1000" 199 | 0 "Not applicable" 200 | ; 201 | label define gsp018x 202 | 99 "No answer" 203 | 98 "Don't know" 204 | 13 "Refused" 205 | 12 "$25000 or more" 206 | 11 "$20000 - 24999" 207 | 10 "$15000 - 19999" 208 | 9 "$10000 - 14999" 209 | 8 "$8000 to 9999" 210 | 7 "$7000 to 7999" 211 | 6 "$6000 to 6999" 212 | 5 "$5000 to 5999" 213 | 4 "$4000 to 4999" 214 | 3 "$3000 to 3999" 215 | 2 "$1000 to 2999" 216 | 1 "Lt $1000" 217 | 0 "Not applicable" 218 | ; 219 | label define gsp019x 220 | 6 "Other rural" 221 | 5 "Other urban" 222 | 4 "Suburb, 13-100" 223 | 3 "Suburb, 12 lrgst" 224 | 2 "Smsa's 13-100" 225 | 1 "12 lrgst smsa's" 226 | 0 "Not assigned" 227 | ; 228 | label define gsp020x 229 | 9 "No answer" 230 | 8 "Don't know" 231 | 7 "Extrmly conservative" 232 | 6 "Conservative" 233 | 5 "Slghtly conservative" 234 | 4 "Moderate" 235 | 3 "Slightly liberal" 236 | 2 "Liberal" 237 | 1 "Extremely liberal" 238 | 0 "Not applicable" 239 | ; 240 | label define gsp021x 241 | 9 "No answer" 242 | 8 "Don't know" 243 | 3 "Too much" 244 | 2 "About right" 245 | 1 "Too little" 246 | 0 "Not applicable" 247 | ; 248 | 249 | 250 | label values gun gsp001x; 251 | label values gunage gsp002x; 252 | label values gunnum gsp003x; 253 | label values owngun gsp004x; 254 | label values rowngun gsp005x; 255 | label values realinc gsp006x; 256 | label values conrinc gsp007x; 257 | label values hispanic gsp008x; 258 | label values cohort gsp009x; 259 | label values ballot gsp010x; 260 | label values gunlaw gsp011x; 261 | label values cappun gsp012x; 262 | label values age gsp013x; 263 | label values educ gsp014x; 264 | label values sex gsp015x; 265 | label values race gsp016x; 266 | label values income gsp017x; 267 | label values rincome gsp018x; 268 | label values srcbelt gsp019x; 269 | label values polviews gsp020x; 270 | label values natcrime gsp021x; 271 | 272 | 273 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | import re 6 | 7 | class FixedWidthVariables(object): 8 | """Represents a set of variables in a fixed width file.""" 9 | 10 | def __init__(self, variables, index_base=0): 11 | """Initializes. 12 | 13 | variables: DataFrame 14 | index_base: are the indices 0 or 1 based? 15 | 16 | Attributes: 17 | colspecs: list of (start, end) index tuples 18 | names: list of string variable names 19 | """ 20 | self.variables = variables 21 | 22 | # note: by default, subtract 1 from colspecs 23 | self.colspecs = variables[['start', 'end']] - index_base 24 | 25 | # convert colspecs to a list of pair of int 26 | self.colspecs = self.colspecs.astype(np.int).values.tolist() 27 | self.names = variables['name'] 28 | 29 | def read_fixed_width(self, filename, **options): 30 | """Reads a fixed width ASCII file. 31 | 32 | filename: string filename 33 | 34 | returns: DataFrame 35 | """ 36 | df = pd.read_fwf(filename, 37 | colspecs=self.colspecs, 38 | names=self.names, 39 | **options) 40 | return df 41 | 42 | 43 | def read_stata_dict(dct_file, **options): 44 | """Reads a Stata dictionary file. 45 | 46 | dct_file: string filename 47 | options: dict of options passed to open() 48 | 49 | returns: FixedWidthVariables object 50 | """ 51 | type_map = dict(byte=int, int=int, long=int, float=float, 52 | double=float, numeric=float) 53 | 54 | var_info = [] 55 | with open(dct_file, **options) as f: 56 | for line in f: 57 | match = re.search( r'_column\(([^)]*)\)', line) 58 | if not match: 59 | continue 60 | start = int(match.group(1)) 61 | t = line.split() 62 | vtype, name, fstring = t[1:4] 63 | name = name.lower() 64 | if vtype.startswith('str'): 65 | vtype = str 66 | else: 67 | vtype = type_map[vtype] 68 | long_desc = ' '.join(t[4:]).strip('"') 69 | var_info.append((start, vtype, name, fstring, long_desc)) 70 | 71 | columns = ['start', 'type', 'name', 'fstring', 'desc'] 72 | variables = pd.DataFrame(var_info, columns=columns) 73 | 74 | # fill in the end column by shifting the start column 75 | variables['end'] = variables.start.shift(-1) 76 | variables.loc[len(variables)-1, 'end'] = 0 77 | 78 | dct = FixedWidthVariables(variables, index_base=1) 79 | return dct 80 | 81 | 82 | def read_stata(dct_name, dat_name, **options): 83 | """Reads Stata files from the given directory. 84 | 85 | dirname: string 86 | 87 | returns: DataFrame 88 | """ 89 | dct = read_stata_dict(dct_name) 90 | df = dct.read_fixed_width(dat_name, **options) 91 | return df 92 | 93 | 94 | def sample_rows(df, nrows, replace=False): 95 | """Choose a sample of rows from a DataFrame. 96 | 97 | df: DataFrame 98 | nrows: number of rows 99 | replace: whether to sample with replacement 100 | 101 | returns: DataDf 102 | """ 103 | indices = np.random.choice(df.index, nrows, replace=replace) 104 | sample = df.loc[indices] 105 | return sample 106 | 107 | 108 | def resample_rows(df): 109 | """Resamples rows from a DataFrame. 110 | 111 | df: DataFrame 112 | 113 | returns: DataFrame 114 | """ 115 | return sample_rows(df, len(df), replace=True) 116 | 117 | 118 | def resample_rows_weighted(df, column='finalwgt'): 119 | """Resamples a DataFrame using probabilities proportional to given column. 120 | 121 | df: DataFrame 122 | column: string column name to use as weights 123 | 124 | returns: DataFrame 125 | """ 126 | weights = df[column].copy() 127 | weights /= sum(weights) 128 | indices = np.random.choice(df.index, len(df), replace=True, p=weights) 129 | sample = df.loc[indices] 130 | return sample 131 | 132 | 133 | def resample_by_year(df, column='wtssall'): 134 | """Resample rows within each year. 135 | 136 | df: DataFrame 137 | column: string name of weight variable 138 | 139 | returns DataFrame 140 | """ 141 | grouped = df.groupby('year') 142 | samples = [resample_rows_weighted(group, column) 143 | for _, group in grouped] 144 | sample = pd.concat(samples, ignore_index=True) 145 | return sample 146 | 147 | 148 | def values(df, varname): 149 | """Values and counts in index order. 150 | 151 | df: DataFrame 152 | varname: strign column name 153 | 154 | returns: Series that maps from value to frequency 155 | """ 156 | return df[varname].value_counts().sort_index() 157 | 158 | def count_by_year(gss, varname): 159 | """Groups by category and year and counts. 160 | 161 | gss: DataFrame 162 | varname: string variable to group by 163 | 164 | returns: DataFrame with one row per year, one column per category. 165 | """ 166 | grouped = gss.groupby([varname, 'year']) 167 | count = grouped[varname].count().unstack(level=0) 168 | 169 | # note: the following is not ideal, because it does not 170 | # distinguish 0 from NA, but in this dataset the only 171 | # zeros are during years when the question was not asked. 172 | count = count.replace(0, np.nan).dropna() 173 | return count 174 | 175 | def fill_missing(df, varname, badvals=[98, 99]): 176 | """Fill missing data with random values. 177 | 178 | df: DataFrame 179 | varname: string column name 180 | badvals: list of values to be replaced 181 | """ 182 | # replace badvals with NaN 183 | df[varname].replace(badvals, np.nan, inplace=True) 184 | 185 | # get the index of rows missing varname 186 | null = df[varname].isnull() 187 | n_missing = sum(null) 188 | 189 | # choose a random sample from the non-missing values 190 | fill = np.random.choice(df[varname].dropna(), n_missing, replace=True) 191 | 192 | # replace missing data with the samples 193 | df.loc[null, varname] = fill 194 | 195 | # return the number of missing values replaced 196 | return n_missing 197 | 198 | 199 | def round_into_bins(df, var, bin_width, high=None, low=0): 200 | """Rounds values down to the bin they belong in. 201 | 202 | df: DataFrame 203 | var: string variable name 204 | bin_width: number, width of the bins 205 | 206 | returns: array of bin values 207 | """ 208 | if high is None: 209 | high = df[var].max() 210 | 211 | bins = np.arange(low, high+bin_width, bin_width) 212 | indices = np.digitize(df[var], bins) 213 | return bins[indices-1] 214 | 215 | 216 | def underride(d, **options): 217 | """Add key-value pairs to d only if key is not in d. 218 | 219 | d: dictionary 220 | options: keyword args to add to d 221 | """ 222 | for key, val in options.items(): 223 | d.setdefault(key, val) 224 | 225 | return d 226 | 227 | 228 | def decorate(**options): 229 | """Decorate the current axes. 230 | Call decorate with keyword arguments like 231 | decorate(title='Title', 232 | xlabel='x', 233 | ylabel='y') 234 | The keyword arguments can be any of the axis properties 235 | https://matplotlib.org/api/axes_api.html 236 | In addition, you can use `legend=False` to suppress the legend. 237 | And you can use `loc` to indicate the location of the legend 238 | (the default value is 'best') 239 | """ 240 | loc = options.pop('loc', 'best') 241 | if options.pop('legend', True): 242 | legend(loc=loc) 243 | 244 | plt.gca().set(**options) 245 | plt.tight_layout() 246 | 247 | 248 | def legend(**options): 249 | """Draws a legend only if there is at least one labeled item. 250 | options are passed to plt.legend() 251 | https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html 252 | """ 253 | underride(options, loc='best') 254 | 255 | ax = plt.gca() 256 | handles, labels = ax.get_legend_handles_labels() 257 | #TODO: don't draw if there are none 258 | ax.legend(handles, labels, **options) 259 | 260 | from statsmodels.nonparametric.smoothers_lowess import lowess 261 | 262 | def make_lowess(series): 263 | """Use LOWESS to compute a smooth line. 264 | 265 | series: pd.Series 266 | 267 | returns: pd.Series 268 | """ 269 | endog = series.values 270 | exog = series.index.values 271 | 272 | smooth = lowess(endog, exog) 273 | index, data = np.transpose(smooth) 274 | 275 | return pd.Series(data, index=index) 276 | 277 | def plot_series_lowess(series, color): 278 | """Plots a series of data points and a smooth line. 279 | 280 | series: pd.Series 281 | color: string or tuple 282 | """ 283 | series.plot(lw=0, marker='o', color=color, alpha=0.5) 284 | smooth = make_lowess(series) 285 | smooth.plot(label='_', color=color) 286 | 287 | def plot_columns_lowess(df, columns, colors): 288 | """Plot the columns in a DataFrame. 289 | 290 | df: pd.DataFrame 291 | columns: list of column names, in the desired order 292 | colors: mapping from column names to colors 293 | """ 294 | for col in columns: 295 | series = df[col] 296 | plot_series_lowess(series, colors[col]) 297 | 298 | def anchor_legend(x, y): 299 | """Put the legend at the given locationself. 300 | 301 | x: axis coordinate 302 | y: axis coordinate 303 | """ 304 | plt.legend(bbox_to_anchor=(x, y), loc='upper left', ncol=1) 305 | -------------------------------------------------------------------------------- /data/GSS.dct: -------------------------------------------------------------------------------- 1 | infile dictionary { 2 | _column(1) numeric POSTLIFE %20f "Belief in life after death" 3 | _column(21) numeric LIFE %20f "Is life exciting or dull" 4 | _column(41) numeric HELPFUL %20f "People helpful or looking out for selves" 5 | _column(61) numeric FAIR %20f "People fair or try to take advantage" 6 | _column(81) numeric TRUST %20f "Can people be trusted" 7 | _column(101) numeric CONCLERG %20f "Confidence in organized religion" 8 | _column(121) numeric CONEDUC %20f "Confidence in education" 9 | _column(141) numeric CONFED %20f "Confid. in exec branch of fed govt" 10 | _column(161) numeric CONPRESS %20f "Confidence in press" 11 | _column(181) numeric CONJUDGE %20f "Confid. in united states supreme court" 12 | _column(201) numeric CONLEGIS %20f "Confidence in congress" 13 | _column(221) numeric HEALTH %20f "Condition of health" 14 | _column(241) numeric HAPMAR %20f "Happiness of marriage" 15 | _column(261) numeric PRAY %20f "How often does r pray" 16 | _column(281) numeric RELIG16 %20f "Religion in which raised" 17 | _column(301) numeric FUND16 %20f "How fundamentalist was r at age 16" 18 | _column(321) numeric SPREL16 %20f "Religion in which spouse raised" 19 | _column(341) numeric PRAYER %20f "Bible prayer in public schools" 20 | _column(361) numeric BIBLE %20f "Feelings about the bible" 21 | _column(381) numeric RACMAR %20f "Favor law against racial intermarriage" 22 | _column(401) numeric RACPRES %20f "Would vote for black president" 23 | _column(421) numeric AFFRMACT %20f "Favor preference in hiring blacks" 24 | _column(441) numeric HAPPY %20f "General happiness" 25 | _column(461) numeric CONARMY %20f "Confidence in military" 26 | _column(481) numeric SATJOB %20f "Job or housework" 27 | _column(501) numeric FEAR %20f "Afraid to walk at night in neighborhood" 28 | _column(521) numeric OWNGUN %20f "Have gun in home" 29 | _column(541) numeric PISTOL %20f "Pistol or revolver in home" 30 | _column(561) numeric HUNT %20f "Does r or spouse hunt" 31 | _column(581) numeric PHONE %20f "Does r have telephone" 32 | _column(601) numeric MEMCHURH %20f "Membership in church group" 33 | _column(621) float REALINC %20f "Family income in constant $" 34 | _column(641) numeric COHORT %20f "Year of birth" 35 | _column(661) numeric MARCOHRT %20f "Year of first marriage" 36 | _column(681) numeric BALLOT %20f "Ballot used for interview" 37 | _column(701) numeric SPANKING %20f "Favor spanking to discipline child" 38 | _column(721) numeric HOMOSEX %20f "Homosexual sex relations" 39 | _column(741) numeric CLASS_ %20f "Subjective class identification" 40 | _column(761) numeric SATFIN %20f "Satisfaction with financial situation" 41 | _column(781) numeric FINRELA %20f "Opinion of family income" 42 | _column(801) numeric UNION_ %20f "Does r or spouse belong to union" 43 | _column(821) numeric FEPOL %20f "Women not suited for politics" 44 | _column(841) numeric ABANY %20f "Abortion if woman wants for any reason" 45 | _column(861) numeric CHLDIDEL %20f "Ideal number of children" 46 | _column(881) numeric SEXEDUC %20f "Sex education in public schools" 47 | _column(901) numeric PREMARSX %20f "Sex before marriage" 48 | _column(921) numeric XMARSEX %20f "Sex with person other than spouse" 49 | _column(941) numeric WTSSALL %20f "Weight variable" 50 | _column(961) numeric RELITEN %20f "Strength of affiliation" 51 | _column(981) numeric YEAR %20f "Gss year for this respondent " 52 | _column(1001) numeric MADEG %20f "Mothers highest degree" 53 | _column(1021) numeric SPDEG %20f "Spouses highest degree" 54 | _column(1041) numeric SEX %20f "Respondents sex" 55 | _column(1061) numeric RACE %20f "Race of respondent" 56 | _column(1081) numeric RES16 %20f "Type of place lived in when 16 yrs old" 57 | _column(1101) numeric REG16 %20f "Region of residence, age 16" 58 | _column(1121) numeric SRCBELT %20f "Src beltcode" 59 | _column(1141) numeric PARTYID %20f "Political party affiliation" 60 | _column(1161) numeric PRES04 %20f "Vote for kerry, bush, nader" 61 | _column(1181) numeric PRES08 %20f "Vote obama or mccain" 62 | _column(1201) numeric PADEG %20f "Fathers highest degree" 63 | _column(1221) numeric DEGREE %20f "Rs highest degree" 64 | _column(1241) numeric ID_ %20f "Respondent id number" 65 | _column(1261) numeric AGEWED %20f "Age when first married" 66 | _column(1281) numeric DIVORCE %20f "Ever been divorced or separated" 67 | _column(1301) numeric SIBS %20f "Number of brothers and sisters" 68 | _column(1321) numeric CHILDS %20f "Number of children" 69 | _column(1341) numeric AGE %20f "Age of respondent" 70 | _column(1361) numeric EDUC %20f "Highest year of school completed" 71 | _column(1381) numeric PAEDUC %20f "Highest year school completed, father" 72 | _column(1401) numeric MAEDUC %20f "Highest year school completed, mother" 73 | _column(1421) numeric SPEDUC %20f "Highest year school completed, spouse" 74 | _column(1441) numeric PRES12 %20f "Vote obama or romney" 75 | _column(1461) numeric POLVIEWS %20f "Think of self as liberal or conservative" 76 | _column(1481) numeric COLATH %20f "Allow anti-religionist to teach" 77 | _column(1501) numeric LIBATH %20f "Allow anti-religious book in library" 78 | _column(1521) numeric SPKHOMO %20f "Allow homosexual to speak" 79 | _column(1541) numeric COLHOMO %20f "Allow homosexual to teach" 80 | _column(1561) numeric LIBHOMO %20f "Allow homosexuals book in library" 81 | _column(1581) numeric CAPPUN %20f "Favor or oppose death penalty for murder" 82 | _column(1601) numeric GUNLAW %20f "Favor or oppose gun permits" 83 | _column(1621) numeric GRASS %20f "Should marijuana be made legal" 84 | _column(1641) numeric RELIG %20f "Rs religious preference" 85 | _column(1661) numeric FUND %20f "How fundamentalist is r currently" 86 | _column(1681) numeric SPKATH %20f "Allow anti-religionist to speak" 87 | _column(1701) numeric NATFARE %20f "Welfare" 88 | _column(1721) numeric NATSPAC %20f "Space exploration program" 89 | _column(1741) numeric NATENVIR %20f "Improving & protecting environment" 90 | _column(1761) numeric NATHEAL %20f "Improving & protecting nations health" 91 | _column(1781) numeric NATCITY %20f "Solving problems of big cities" 92 | _column(1801) numeric NATCRIME %20f "Halting rising crime rate" 93 | _column(1821) numeric NATDRUG %20f "Dealing with drug addiction" 94 | _column(1841) numeric NATEDUC %20f "Improving nations education system" 95 | _column(1861) numeric NATRACE %20f "Improving the conditions of blacks" 96 | _column(1881) numeric NATARMS %20f "Military, armaments, and defense" 97 | _column(1901) numeric NATAID %20f "Foreign aid" 98 | _column(1921) numeric ATTEND %20f "How often r attends religious services" 99 | } 100 | -------------------------------------------------------------------------------- /gss_eda/GSS.dct: -------------------------------------------------------------------------------- 1 | infile dictionary { 2 | _column(1) numeric YEAR %20f "Gss year for this respondent " 3 | _column(21) numeric ID_ %20f "Respondent id number" 4 | _column(41) numeric AGEWED %20f "Age when first married" 5 | _column(61) numeric DIVORCE %20f "Ever been divorced or separated" 6 | _column(81) numeric SIBS %20f "Number of brothers and sisters" 7 | _column(101) numeric CHILDS %20f "Number of children" 8 | _column(121) numeric AGE %20f "Age of respondent" 9 | _column(141) numeric EDUC %20f "Highest year of school completed" 10 | _column(161) numeric PAEDUC %20f "Highest year school completed, father" 11 | _column(181) numeric MAEDUC %20f "Highest year school completed, mother" 12 | _column(201) numeric SPEDUC %20f "Highest year school completed, spouse" 13 | _column(221) numeric DEGREE %20f "Rs highest degree" 14 | _column(241) numeric PADEG %20f "Fathers highest degree" 15 | _column(261) numeric MADEG %20f "Mothers highest degree" 16 | _column(281) numeric SPDEG %20f "Spouses highest degree" 17 | _column(301) numeric SEX %20f "Respondents sex" 18 | _column(321) numeric RACE %20f "Race of respondent" 19 | _column(341) numeric RES16 %20f "Type of place lived in when 16 yrs old" 20 | _column(361) numeric REG16 %20f "Region of residence, age 16" 21 | _column(381) numeric SRCBELT %20f "Src beltcode" 22 | _column(401) numeric PARTYID %20f "Political party affiliation" 23 | _column(421) numeric PRES04 %20f "Vote for kerry, bush, nader" 24 | _column(441) numeric PRES08 %20f "Vote obama or mccain" 25 | _column(461) numeric PRES12 %20f "Vote obama or romney" 26 | _column(481) numeric POLVIEWS %20f "Think of self as liberal or conservative" 27 | _column(501) numeric NATSPAC %20f "Space exploration program" 28 | _column(521) numeric NATENVIR %20f "Improving & protecting environment" 29 | _column(541) numeric NATHEAL %20f "Improving & protecting nations health" 30 | _column(561) numeric NATCITY %20f "Solving problems of big cities" 31 | _column(581) numeric NATCRIME %20f "Halting rising crime rate" 32 | _column(601) numeric NATDRUG %20f "Dealing with drug addiction" 33 | _column(621) numeric NATEDUC %20f "Improving nations education system" 34 | _column(641) numeric NATRACE %20f "Improving the conditions of blacks" 35 | _column(661) numeric NATARMS %20f "Military, armaments, and defense" 36 | _column(681) numeric NATAID %20f "Foreign aid" 37 | _column(701) numeric NATFARE %20f "Welfare" 38 | _column(721) numeric SPKATH %20f "Allow anti-religionist to speak" 39 | _column(741) numeric COLATH %20f "Allow anti-religionist to teach" 40 | _column(761) numeric LIBATH %20f "Allow anti-religious book in library" 41 | _column(781) numeric SPKHOMO %20f "Allow homosexual to speak" 42 | _column(801) numeric COLHOMO %20f "Allow homosexual to teach" 43 | _column(821) numeric LIBHOMO %20f "Allow homosexuals book in library" 44 | _column(841) numeric CAPPUN %20f "Favor or oppose death penalty for murder" 45 | _column(861) numeric GUNLAW %20f "Favor or oppose gun permits" 46 | _column(881) numeric GRASS %20f "Should marijuana be made legal" 47 | _column(901) numeric RELIG %20f "Rs religious preference" 48 | _column(921) numeric FUND %20f "How fundamentalist is r currently" 49 | _column(941) numeric ATTEND %20f "How often r attends religious services" 50 | _column(961) numeric RELITEN %20f "Strength of affiliation" 51 | _column(981) numeric POSTLIFE %20f "Belief in life after death" 52 | _column(1001) numeric PRAY %20f "How often does r pray" 53 | _column(1021) numeric RELIG16 %20f "Religion in which raised" 54 | _column(1041) numeric FUND16 %20f "How fundamentalist was r at age 16" 55 | _column(1061) numeric SPREL16 %20f "Religion in which spouse raised" 56 | _column(1081) numeric PRAYER %20f "Bible prayer in public schools" 57 | _column(1101) numeric BIBLE %20f "Feelings about the bible" 58 | _column(1121) numeric RACMAR %20f "Favor law against racial intermarriage" 59 | _column(1141) numeric RACPRES %20f "Would vote for black president" 60 | _column(1161) numeric AFFRMACT %20f "Favor preference in hiring blacks" 61 | _column(1181) numeric HAPPY %20f "General happiness" 62 | _column(1201) numeric HAPMAR %20f "Happiness of marriage" 63 | _column(1221) numeric HEALTH %20f "Condition of health" 64 | _column(1241) numeric LIFE %20f "Is life exciting or dull" 65 | _column(1261) numeric HELPFUL %20f "People helpful or looking out for selves" 66 | _column(1281) numeric FAIR %20f "People fair or try to take advantage" 67 | _column(1301) numeric TRUST %20f "Can people be trusted" 68 | _column(1321) numeric CONCLERG %20f "Confidence in organized religion" 69 | _column(1341) numeric CONEDUC %20f "Confidence in education" 70 | _column(1361) numeric CONFED %20f "Confid. in exec branch of fed govt" 71 | _column(1381) numeric CONPRESS %20f "Confidence in press" 72 | _column(1401) numeric CONJUDGE %20f "Confid. in united states supreme court" 73 | _column(1421) numeric CONLEGIS %20f "Confidence in congress" 74 | _column(1441) numeric CONARMY %20f "Confidence in military" 75 | _column(1461) numeric SATJOB %20f "Job or housework" 76 | _column(1481) numeric CLASS_ %20f "Subjective class identification" 77 | _column(1501) numeric SATFIN %20f "Satisfaction with financial situation" 78 | _column(1521) numeric FINRELA %20f "Opinion of family income" 79 | _column(1541) numeric UNION_ %20f "Does r or spouse belong to union" 80 | _column(1561) numeric FEPOL %20f "Women not suited for politics" 81 | _column(1581) numeric ABANY %20f "Abortion if woman wants for any reason" 82 | _column(1601) numeric CHLDIDEL %20f "Ideal number of children" 83 | _column(1621) numeric SEXEDUC %20f "Sex education in public schools" 84 | _column(1641) numeric PREMARSX %20f "Sex before marriage" 85 | _column(1661) numeric XMARSEX %20f "Sex with person other than spouse" 86 | _column(1681) numeric HOMOSEX %20f "Homosexual sex relations" 87 | _column(1701) numeric SPANKING %20f "Favor spanking to discipline child" 88 | _column(1721) numeric FEAR %20f "Afraid to walk at night in neighborhood" 89 | _column(1741) numeric OWNGUN %20f "Have gun in home" 90 | _column(1761) numeric PISTOL %20f "Pistol or revolver in home" 91 | _column(1781) numeric HUNT %20f "Does r or spouse hunt" 92 | _column(1801) numeric PHONE %20f "Does r have telephone" 93 | _column(1821) numeric MEMCHURH %20f "Membership in church group" 94 | _column(1841) float REALINC %20f "Family income in constant $" 95 | _column(1861) numeric COHORT %20f "Year of birth" 96 | _column(1881) numeric MARCOHRT %20f "Year of first marriage" 97 | _column(1901) numeric BALLOT %20f "Ballot used for interview" 98 | _column(1921) numeric WTSSALL %20f "Weight variable" 99 | _column(1941) numeric ADULTS %20f "Household members 18 yrs and older" 100 | _column(1961) numeric COMPUSE %20f "R use computer" 101 | _column(1981) numeric DATABANK %20f "Computer data threat to individual privacy" 102 | _column(2001) numeric WTSSNR %20f "Weight variable" 103 | _column(2021) numeric SPKRAC %20f "Allow racist to speak" 104 | _column(2041) numeric SPKCOM %20f "Allow communist to speak" 105 | _column(2061) numeric SPKMIL %20f "Allow militarist to speak" 106 | _column(2081) numeric SPKMSLM %20f "Allow muslim clergymen preaching hatred of the us" 107 | } 108 | -------------------------------------------------------------------------------- /eds01_gss_clean.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EDS Case Study\n", 8 | "\n", 9 | "Load and resample GSS data\n", 10 | "\n", 11 | "Allen Downey\n", 12 | "\n", 13 | "[MIT License](https://en.wikipedia.org/wiki/MIT_License)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# If we're running in Colab, set up the environment\n", 23 | "\n", 24 | "import sys\n", 25 | "IN_COLAB = 'google.colab' in sys.modules\n", 26 | "\n", 27 | "if IN_COLAB:\n", 28 | " !pip install empiricaldist\n", 29 | " !git clone --depth 1 https://github.com/AllenDowney/ExploratoryDataAnalysis\n", 30 | " %cd ExploratoryDataAnalysis" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import numpy as np\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import seaborn as sns\n", 43 | "\n", 44 | "import utils" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Reading the extract\n", 52 | "\n", 53 | "https://gssdataexplorer.norc.org/projects/52787/extracts\n", 54 | "\n", 55 | "Currently Pandas is not able to read the files generated by GSS in any of the standard formats: Stata, SPSS, Excel.\n", 56 | "\n", 57 | "As a workaround, I wrote the following functions to read the Stata dictionary file and use the information there to read the Stata data file using `pd.read_fwf` which reads fixed-width files." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import re\n", 67 | "import os\n", 68 | "\n", 69 | "class FixedWidthVariables(object):\n", 70 | " \"\"\"Represents a set of variables in a fixed width file.\"\"\"\n", 71 | "\n", 72 | " def __init__(self, variables, index_base=0):\n", 73 | " \"\"\"Initializes.\n", 74 | "\n", 75 | " variables: DataFrame\n", 76 | " index_base: are the indices 0 or 1 based?\n", 77 | "\n", 78 | " Attributes:\n", 79 | " colspecs: list of (start, end) index tuples\n", 80 | " names: list of string variable names\n", 81 | " \"\"\"\n", 82 | " self.variables = variables\n", 83 | "\n", 84 | " # note: by default, subtract 1 from colspecs\n", 85 | " self.colspecs = variables[['start', 'end']] - index_base\n", 86 | "\n", 87 | " # convert colspecs to a list of pair of int\n", 88 | " self.colspecs = self.colspecs.astype(np.int).values.tolist()\n", 89 | " self.names = variables['name']\n", 90 | "\n", 91 | " def ReadFixedWidth(self, filename, **options):\n", 92 | " \"\"\"Reads a fixed width ASCII file.\n", 93 | "\n", 94 | " filename: string filename\n", 95 | "\n", 96 | " returns: DataFrame\n", 97 | " \"\"\"\n", 98 | " df = pd.read_fwf(filename,\n", 99 | " colspecs=self.colspecs, \n", 100 | " names=self.names,\n", 101 | " **options)\n", 102 | " return df\n", 103 | "\n", 104 | "\n", 105 | "def ReadStataDct(dct_file, **options):\n", 106 | " \"\"\"Reads a Stata dictionary file.\n", 107 | "\n", 108 | " dct_file: string filename\n", 109 | " options: dict of options passed to open()\n", 110 | "\n", 111 | " returns: FixedWidthVariables object\n", 112 | " \"\"\"\n", 113 | " type_map = dict(byte=int, int=int, long=int, float=float, \n", 114 | " double=float, numeric=float)\n", 115 | "\n", 116 | " var_info = []\n", 117 | " with open(dct_file, **options) as f:\n", 118 | " for line in f:\n", 119 | " match = re.search( r'_column\\(([^)]*)\\)', line)\n", 120 | " if not match:\n", 121 | " continue\n", 122 | " start = int(match.group(1))\n", 123 | " t = line.split()\n", 124 | " vtype, name, fstring = t[1:4]\n", 125 | " name = name.lower()\n", 126 | " if vtype.startswith('str'):\n", 127 | " vtype = str\n", 128 | " else:\n", 129 | " vtype = type_map[vtype]\n", 130 | " long_desc = ' '.join(t[4:]).strip('\"')\n", 131 | " var_info.append((start, vtype, name, fstring, long_desc))\n", 132 | " \n", 133 | " columns = ['start', 'type', 'name', 'fstring', 'desc']\n", 134 | " variables = pd.DataFrame(var_info, columns=columns)\n", 135 | "\n", 136 | " # fill in the end column by shifting the start column\n", 137 | " variables['end'] = variables.start.shift(-1)\n", 138 | " variables.loc[len(variables)-1, 'end'] = 0\n", 139 | "\n", 140 | " dct = FixedWidthVariables(variables, index_base=1)\n", 141 | " return dct\n", 142 | "\n", 143 | "def read_gss(dirname):\n", 144 | " \"\"\"Reads GSS files from the given directory.\n", 145 | " \n", 146 | " dirname: string\n", 147 | " \n", 148 | " returns: DataFrame\n", 149 | " \"\"\"\n", 150 | " dct_file = os.path.join(dirname, 'GSS.dct')\n", 151 | " dct = ReadStataDct(dct_file)\n", 152 | " \n", 153 | " data_file = os.path.join(dirname, 'GSS.dat.gz')\n", 154 | " gss = dct.ReadFixedWidth(data_file, compression='gzip')\n", 155 | " \n", 156 | " return gss" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 4, 162 | "metadata": { 163 | "scrolled": true 164 | }, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "(64814, 105)\n" 171 | ] 172 | }, 173 | { 174 | "data": { 175 | "text/html": [ 176 | "
\n", 177 | "\n", 190 | "\n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | "
yearid_ageweddivorcesibschildsageeducpaeducmaeduc...ballotwtssalladultscompusedatabankwtssnrspkracspkcomspkmilspkmslm
019721003023161097...00.44461001.00100
11972221245701088...00.88932001.00200
21972320254481288...00.88932001.00200
3197242425027171612...00.88932001.00100
41972522222611288...00.88932001.00100
\n", 340 | "

5 rows × 105 columns

\n", 341 | "
" 342 | ], 343 | "text/plain": [ 344 | " year id_ agewed divorce sibs childs age educ paeduc maeduc ... \\\n", 345 | "0 1972 1 0 0 3 0 23 16 10 97 ... \n", 346 | "1 1972 2 21 2 4 5 70 10 8 8 ... \n", 347 | "2 1972 3 20 2 5 4 48 12 8 8 ... \n", 348 | "3 1972 4 24 2 5 0 27 17 16 12 ... \n", 349 | "4 1972 5 22 2 2 2 61 12 8 8 ... \n", 350 | "\n", 351 | " ballot wtssall adults compuse databank wtssnr spkrac spkcom spkmil \\\n", 352 | "0 0 0.4446 1 0 0 1.0 0 1 0 \n", 353 | "1 0 0.8893 2 0 0 1.0 0 2 0 \n", 354 | "2 0 0.8893 2 0 0 1.0 0 2 0 \n", 355 | "3 0 0.8893 2 0 0 1.0 0 1 0 \n", 356 | "4 0 0.8893 2 0 0 1.0 0 1 0 \n", 357 | "\n", 358 | " spkmslm \n", 359 | "0 0 \n", 360 | "1 0 \n", 361 | "2 0 \n", 362 | "3 0 \n", 363 | "4 0 \n", 364 | "\n", 365 | "[5 rows x 105 columns]" 366 | ] 367 | }, 368 | "execution_count": 4, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "gss = read_gss('gss_eda')\n", 375 | "print(gss.shape)\n", 376 | "gss.head()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "### Missing data\n", 384 | "\n", 385 | "For many variables, missing values are encoded with numbers, so we need to replace them before we do any analysis.\n", 386 | "\n", 387 | "For example, for `polviews`, the values 8, 9, and 0 represent \"Don't know\", \"No answer\", and \"Not applicable\".\n", 388 | "\n", 389 | "\"Not applicable\" usually means the respondent was not asked a particular question.\n", 390 | "\n", 391 | "To keep things simple, we'll treat all of these values as equivalent, but we should keep in mind that we lose some information by doing that. For example, if a respondent refuses to answer a question, that might suggest something about their answer. If so, treating their response as missing data might bias the results.\n", 392 | "\n", 393 | "Fortunately, for most questions the number of respondents who refused to answer is small." 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 5, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "def replace_invalid(df):\n", 403 | " \"\"\"Replace invalid data with NaN.\n", 404 | " \n", 405 | " df: DataFrame\n", 406 | " \"\"\"\n", 407 | " df.realinc.replace([0], np.nan, inplace=True) \n", 408 | " df.educ.replace([98, 99], np.nan, inplace=True)\n", 409 | " # 89 means 89 or older\n", 410 | " df.age.replace([98, 99], np.nan, inplace=True) \n", 411 | " df.cohort.replace([9999], np.nan, inplace=True)\n", 412 | " df.adults.replace([9], np.nan, inplace=True)\n", 413 | " df.colhomo.replace([0, 8, 9], np.nan, inplace=True)\n", 414 | " df.libhomo.replace([0, 8, 9], np.nan, inplace=True)\n", 415 | " df.cappun.replace([0, 8, 9], np.nan, inplace=True)\n", 416 | " df.gunlaw.replace([0, 8, 9], np.nan, inplace=True)\n", 417 | " df.grass.replace([0, 8, 9], np.nan, inplace=True)\n", 418 | " df.fepol.replace([0, 8, 9], np.nan, inplace=True)\n", 419 | " df.abany.replace([0, 8, 9], np.nan, inplace=True)\n", 420 | " df.prayer.replace([0, 8, 9], np.nan, inplace=True)\n", 421 | " df.sexeduc.replace([0, 8, 9], np.nan, inplace=True)\n", 422 | " df.premarsx.replace([0, 8, 9], np.nan, inplace=True)\n", 423 | " df.xmarsex.replace([0, 8, 9], np.nan, inplace=True)\n", 424 | " df.homosex.replace([0, 5, 8, 9], np.nan, inplace=True)\n", 425 | " df.racmar.replace([0, 8, 9], np.nan, inplace=True)\n", 426 | " df.spanking.replace([0, 8, 9], np.nan, inplace=True)\n", 427 | " df.racpres.replace([0, 8, 9], np.nan, inplace=True)\n", 428 | " df.fear.replace([0, 8, 9], np.nan, inplace=True)\n", 429 | " df.databank.replace([0, 8, 9], np.nan, inplace=True)\n", 430 | " df.affrmact.replace([0, 8, 9], np.nan, inplace=True)\n", 431 | " df.happy.replace([0, 8, 9], np.nan, inplace=True)\n", 432 | " df.hapmar.replace([0, 8, 9], np.nan, inplace=True)\n", 433 | " df.natspac.replace([0, 8, 9], np.nan, inplace=True)\n", 434 | " df.natenvir.replace([0, 8, 9], np.nan, inplace=True)\n", 435 | " df.natheal.replace([0, 8, 9], np.nan, inplace=True)\n", 436 | " df.natcity.replace([0, 8, 9], np.nan, inplace=True)\n", 437 | " df.natcrime.replace([0, 8, 9], np.nan, inplace=True)\n", 438 | " df.natdrug.replace([0, 8, 9], np.nan, inplace=True)\n", 439 | " df.nateduc.replace([0, 8, 9], np.nan, inplace=True)\n", 440 | " df.natrace.replace([0, 8, 9], np.nan, inplace=True)\n", 441 | " df.natarms.replace([0, 8, 9], np.nan, inplace=True)\n", 442 | " df.nataid.replace([0, 8, 9], np.nan, inplace=True)\n", 443 | " df.natfare.replace([0, 8, 9], np.nan, inplace=True)\n", 444 | " df.health.replace([0, 8, 9], np.nan, inplace=True)\n", 445 | " df.life.replace([0, 8, 9], np.nan, inplace=True)\n", 446 | " df.helpful.replace([0, 8, 9], np.nan, inplace=True)\n", 447 | " df.fair.replace([0, 8, 9], np.nan, inplace=True)\n", 448 | " df.trust.replace([0, 8, 9], np.nan, inplace=True)\n", 449 | " df.conclerg.replace([0, 8, 9], np.nan, inplace=True)\n", 450 | " df.coneduc.replace([0, 8, 9], np.nan, inplace=True)\n", 451 | " df.confed.replace([0, 8, 9], np.nan, inplace=True)\n", 452 | " df.conpress.replace([0, 8, 9], np.nan, inplace=True)\n", 453 | " df.conjudge.replace([0, 8, 9], np.nan, inplace=True)\n", 454 | " df.conlegis.replace([0, 8, 9], np.nan, inplace=True)\n", 455 | " df.conarmy.replace([0, 8, 9], np.nan, inplace=True)\n", 456 | " df.spkhomo.replace([0, 8, 9], np.nan, inplace=True)\n", 457 | " df.spkath.replace([0, 8, 9], np.nan, inplace=True)\n", 458 | " df.colath.replace([0, 8, 9], np.nan, inplace=True)\n", 459 | " df.libath.replace([0, 8, 9], np.nan, inplace=True)\n", 460 | " df.spkrac.replace([0, 8, 9], np.nan, inplace=True)\n", 461 | " df.spkcom.replace([0, 8, 9], np.nan, inplace=True)\n", 462 | " df.spkmil.replace([0, 8, 9], np.nan, inplace=True)\n", 463 | " df.satjob.replace([0, 8, 9], np.nan, inplace=True)\n", 464 | " df.satfin.replace([0, 8, 9], np.nan, inplace=True)\n", 465 | " df.finrela.replace([0, 8, 9], np.nan, inplace=True)\n", 466 | "\n", 467 | " df.union_.replace([0, 8, 9], np.nan, inplace=True)\n", 468 | " df.res16.replace([0, 8, 9], np.nan, inplace=True)\n", 469 | "\n", 470 | " df.fund.replace([0, 8, 9], np.nan, inplace=True)\n", 471 | " df.memchurh.replace([0, 8, 9], np.nan, inplace=True)\n", 472 | " df.fund16.replace([0, 8, 9], np.nan, inplace=True)\n", 473 | " df.reliten.replace([0, 8, 9], np.nan, inplace=True)\n", 474 | " df.postlife.replace([0, 8, 9], np.nan, inplace=True)\n", 475 | " df.pray.replace([0, 8, 9], np.nan, inplace=True)\n", 476 | " df.sprel16.replace([0, 8, 9], np.nan, inplace=True)\n", 477 | " df.hunt.replace([0, 8, 9], np.nan, inplace=True)\n", 478 | " df.polviews.replace([0, 8, 9], np.nan, inplace=True)\n", 479 | "\n", 480 | " df.compuse.replace([0, 8, 9], np.nan, inplace=True)\n", 481 | "\n", 482 | " df.degree.replace([8, 9], np.nan, inplace=True)\n", 483 | " df.padeg.replace([8, 9], np.nan, inplace=True)\n", 484 | " df.madeg.replace([8, 9], np.nan, inplace=True)\n", 485 | " df.spdeg.replace([8, 9], np.nan, inplace=True)\n", 486 | " df.partyid.replace([8, 9], np.nan, inplace=True)\n", 487 | "\n", 488 | " df.chldidel.replace([-1, 8, 9], np.nan, inplace=True)\n", 489 | "\n", 490 | " df.attend.replace([9], np.nan, inplace=True)\n", 491 | " df.childs.replace([9], np.nan, inplace=True)\n", 492 | " df.adults.replace([9], np.nan, inplace=True)\n", 493 | "\n", 494 | " df.divorce.replace([0, 8, 9], np.nan, inplace=True)\n", 495 | " df.agewed.replace([0, 98, 99], np.nan, inplace=True)\n", 496 | " df.relig.replace([0, 98, 99], np.nan, inplace=True)\n", 497 | " df.relig16.replace([0, 98, 99], np.nan, inplace=True)\n", 498 | " df.age.replace([0, 98, 99], np.nan, inplace=True)\n", 499 | " \n", 500 | " # note: sibs contains some unlikely numbers\n", 501 | " df.sibs.replace([-1, 98, 99], np.nan, inplace=True)\n", 502 | " df.educ.replace([97, 98, 99], np.nan, inplace=True)\n", 503 | " df.maeduc.replace([97, 98, 99], np.nan, inplace=True)\n", 504 | " df.paeduc.replace([97, 98, 99], np.nan, inplace=True)\n", 505 | " df.speduc.replace([97, 98, 99], np.nan, inplace=True)\n", 506 | "\n", 507 | " df.cohort.replace([0, 9999], np.nan, inplace=True)\n", 508 | " df.marcohrt.replace([0, 9999], np.nan, inplace=True)\n", 509 | "\n", 510 | " df.phone.replace([0, 2, 9], np.nan, inplace=True)\n", 511 | " df.owngun.replace([0, 3, 8, 9], np.nan, inplace=True)\n", 512 | " df.pistol.replace([0, 3, 8, 9], np.nan, inplace=True)\n", 513 | " df.class_.replace([0, 5, 8, 9], np.nan, inplace=True)\n", 514 | " df.pres04.replace([0, 8, 9], np.nan, inplace=True)\n", 515 | " df.pres08.replace([0, 8, 9], np.nan, inplace=True)\n", 516 | " df.pres12.replace([0, 8, 9], np.nan, inplace=True)\n", 517 | "\n", 518 | "replace_invalid(gss)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "### Resampling\n", 526 | "\n", 527 | "The GSS uses stratified sampling, which means that some groups are deliberately oversampled to help with statistical validity.\n", 528 | "\n", 529 | "As a result, each respondent has a sampling weight which is proportional to the number of people in the population represented by the respondent.\n", 530 | "\n", 531 | "Before running any analysis, we should compensate for stratified sampling by \"resampling\", that is, by drawing a random sample from the dataset, where each respondent's chance of appearing in the sample is proportional to their sampling weight.\n", 532 | "\n", 533 | "`utils` provides a function to do this resampling." 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 6, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "np.random.seed(19)\n", 543 | "sample = utils.resample_by_year(gss, 'wtssall')" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### Saving the results\n", 551 | "\n", 552 | "I'll save the results to an HDF5 file, which is a binary format that makes it much faster to read the data back." 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 7, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "!rm eds.gss.hdf5" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 8, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "for i in range(3):\n", 571 | " np.random.seed(i)\n", 572 | " sample = utils.resample_by_year(gss, 'wtssall')\n", 573 | "\n", 574 | " key = f'gss{i}'\n", 575 | " sample.to_hdf('eds.gss.hdf5', key)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 9, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "CPU times: user 9.86 ms, sys: 19.8 ms, total: 29.7 ms\n", 588 | "Wall time: 28.8 ms\n" 589 | ] 590 | }, 591 | { 592 | "data": { 593 | "text/plain": [ 594 | "(64814, 105)" 595 | ] 596 | }, 597 | "execution_count": 9, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | } 601 | ], 602 | "source": [ 603 | "%time gss = pd.read_hdf('eds.gss.hdf5', 'gss0')\n", 604 | "gss.shape" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [] 613 | } 614 | ], 615 | "metadata": { 616 | "kernelspec": { 617 | "display_name": "Python 3", 618 | "language": "python", 619 | "name": "python3" 620 | }, 621 | "language_info": { 622 | "codemirror_mode": { 623 | "name": "ipython", 624 | "version": 3 625 | }, 626 | "file_extension": ".py", 627 | "mimetype": "text/x-python", 628 | "name": "python", 629 | "nbconvert_exporter": "python", 630 | "pygments_lexer": "ipython3", 631 | "version": "3.7.3" 632 | } 633 | }, 634 | "nbformat": 4, 635 | "nbformat_minor": 1 636 | } 637 | -------------------------------------------------------------------------------- /data/2013_2015_FemPregSetup.dct: -------------------------------------------------------------------------------- 1 | infile dictionary { 2 | 3 | ***************************************************************** 4 | * NATIONAL SURVEY OF FAMILY GROWTH (NSFG), 2013-2015 5 | * STATA Pregnancy Data Dictionary 6 | * 7 | * Warning: Edit this file at your own risk 8 | * 9 | ***************************************************************** 10 | 11 | _lines(1) 12 | 13 | _line(1) 14 | 15 | _column(1) long CASEID %5f "Case identification number" 16 | _column(6) byte PREGORDR %2f "Pregnancy order (number)" 17 | _column(8) byte HOWPREG_N %2f "BB-2 # of Weeks or Months Currently Pregnant" 18 | _column(10) byte HOWPREG_P %1f "BB-2 Current pregnancy length reported in months or weeks" 19 | _column(11) byte MOSCURRP %1f "Number of Months Currently Pregnant" 20 | _column(12) byte NOWPRGDK %1f "BB-3 Which Trimester -- Current Pregnancy" 21 | _column(13) byte PREGEND1 %1f "BC-1 How Pregnancy Ended - 1st mention" 22 | _column(14) byte PREGEND2 %1f "BC-1 How Pregnancy Ended - 2nd mention" 23 | _column(15) byte HOWENDDK %1f "BC-1b Did pregnancy result in a baby/babies born alive or did it end in some other way?" 24 | _column(16) byte NBRNALIV %1f "BC-2 Number of babies born alive from this pregnancy" 25 | _column(17) byte MULTBRTH %1f "BC-3 Was this a multiple birth" 26 | _column(18) byte BORNALIV %1f "Number of babies born alive from this pregnancy" 27 | _column(19) byte DATPRGEN_M %2f "BC-4a Month when pregnancy ended (if nonlivebirth)" 28 | _column(21) int DATPRGEN_Y %4f "BC-4a Year when pregnancy ended (if nonlivebirth)" 29 | _column(25) int CMOTPREG %4f "CM for Pregnancy End Date (if nonlivebirth)" 30 | _column(29) byte AGEATEND %2f "BC-4b R's age at pregnancy's end date" 31 | _column(31) byte HPAGEEND %2f "BC-4c Father's age at pregnancy's end date" 32 | _column(33) byte GESTASUN_M %2f "BC-5 Gestational Length of Pregnancy in Months" 33 | _column(35) byte GESTASUN_W %2f "BC-5 Gestational Length of Pregnancy in Weeks" 34 | _column(37) byte WKSGEST %2f "Gestational length of completed pregnancy (in weeks)" 35 | _column(39) byte MOSGEST %2f "Gestational length of completed pregnancy (in months)" 36 | _column(41) byte DK1GEST %1f "BC-6 DK followup for gestational length of a stillbirth" 37 | _column(42) byte DK2GEST %1f "BC-7 DK followup for gestational length of a livebirth" 38 | _column(43) byte DK3GEST %1f "BC-8 DK followup for gestational length of a miscarr/abor/ectop" 39 | _column(44) byte BPA_BDSCHECK1 %1f "Whether 1st liveborn baby from this pregnancy was BPA or BDS" 40 | _column(45) byte BABYSEX1 %1f "BD-2 Sex of 1st Liveborn Baby from This Pregnancy" 41 | _column(46) byte BIRTHWGT_LB1 %2f "BD-3 Birthweight in Pounds - 1st baby from this pregnancy" 42 | _column(48) byte BIRTHWGT_OZ1 %2f "BD-3 Birthweight in Ounces - 1st baby from this pregnancy" 43 | _column(50) byte LOBTHWGT1 %1f "BD-4 Is Baby Low Birthweight- 1st baby from this pregnancy" 44 | _column(51) byte BABYSEX2 %1f "BD-2 Sex of 2nd Liveborn Baby from This Pregnancy" 45 | _column(52) byte BIRTHWGT_LB2 %2f "BD-3 Birthweight in Pounds - 2nd baby from this pregnancy" 46 | _column(54) byte BIRTHWGT_OZ2 %2f "BD-3 Birthweight in Ounces - 2nd baby from this pregnancy" 47 | _column(56) byte LOBTHWGT2 %1f "BD-4 Is Baby Low Birthweight- 2nd baby from this pregnancy" 48 | _column(57) byte BABYSEX3 %1f "BD-2 Sex of 3rd Liveborn Baby from This Pregnancy" 49 | _column(58) byte BIRTHWGT_LB3 %1f "BD-3 Birthweight in Pounds - 3rd baby from this pregnancy" 50 | _column(59) byte BIRTHWGT_OZ3 %2f "BD-3 Birthweight in Ounces - 3rd baby from this pregnancy" 51 | _column(61) byte LOBTHWGT3 %1f "BD-4 Is Baby Low Birthweight- 3rd baby from this pregnancy" 52 | _column(62) byte BABYDOB_M %2f "BD-5 Month of delivery for this pregnancy" 53 | _column(64) int BABYDOB_Y %4f "BD-5 Year of delivery for this pregnancy" 54 | _column(68) int CMBABDOB %4f "CM for baby's or babies' date of birth (delivery date)" 55 | _column(72) int KIDAGE %3f "Current Age (in mos) of R's child(ren) from this pregnancy" 56 | _column(75) byte HPAGELB %2f "BD-6 Father's age at time of child(ren) s birth" 57 | _column(77) byte BIRTHPLC %1f "BD-7 Place where R gave birth" 58 | _column(78) byte PAYBIRTH1 %1f "BD-8 Payment for delivery - 1st mention" 59 | _column(79) byte PAYBIRTH2 %1f "BD-8 Payment for delivery - 2nd mention" 60 | _column(80) byte PAYBIRTH3 %1f "BD-8 Payment for delivery - 3rd mention" 61 | _column(81) byte CSECPRIM %1f "BD-9 Is this R's first c-section?" 62 | _column(82) byte CSECMED1 %1f "BD-10 Medical reasons reported for this C-section - 1st mention" 63 | _column(83) byte CSECMED2 %1f "BD-10 Medical reasons reported for this C-section - 2nd mention" 64 | _column(84) byte CSECMED3 %1f "BD-10 Medical reasons reported for this C-section - 3rd mention" 65 | _column(85) byte CSECMED4 %1f "BD-10 Medical reasons reported for this C-section - 4th mention" 66 | _column(86) byte CSECPLAN %1f "BD-11 Was this c-section planned for by R?" 67 | _column(87) byte KNEWPREG %2f "BE-1 Weeks pregnant when R learned she was pregnant" 68 | _column(89) byte TRIMESTR %1f "BE-2a DK followup for KNEWPREG when gestation >= 6 mos" 69 | _column(90) byte LTRIMEST %1f "BE-2b DK followup for KNEWPREG when gestation < 6 mos" 70 | _column(91) byte PRIORSMK %1f "BE-3 Amount R smoked in 6 mos before R knew she was pregnant" 71 | _column(92) byte POSTSMKS %1f "BE-4 R smoked at all after R knew she was pregnant" 72 | _column(93) byte NPOSTSMK %1f "BE-5 Amount R smoked during pregnancy after R knew she was preg" 73 | _column(94) byte GETPRENA %1f "BE-6 Any prenatal care for this pregnancy" 74 | _column(95) byte BGNPRENA %2f "BE-7 Weeks pregnant at first prenatal care visit" 75 | _column(97) byte PNCTRIM %1f "BE-8a DK followup for BGNPRENA when gestation >= 6 mos" 76 | _column(98) byte LPNCTRI %1f "BE-8b DK followup for BGNPRENA when gestation < 6 mos" 77 | _column(99) byte LIVEHERE1 %1f "BG-1 Whether child lives with R - 1st from this pregnancy" 78 | _column(100) byte ALIVENOW1 %1f "BG-2 Whether child is still alive - 1st from this pregnancy" 79 | _column(101) byte WHENDIED_M1 %2f "BG-3 Month when child died - 1st from this pregnancy" 80 | _column(103) int WHENDIED_Y1 %4f "BG-3 Year when child died - 1st from this pregnancy" 81 | _column(107) int CMKIDIED1 %4f "CM for child's date of death - 1st from this pregnancy" 82 | _column(111) byte WHENLEFT_M1 %2f "BG-4 Month when child stopped living with R- 1st from this preg" 83 | _column(113) int WHENLEFT_Y1 %4f "BG-4 Year when child stopped living with R- 1st from this preg" 84 | _column(117) int CMKIDLFT1 %4f "CM for date child stopped living w/R - 1st from this pregnancy" 85 | _column(121) int LASTAGE1 %3f "Age (in mos) when child last lived w/R-1st from this pregnancy" 86 | _column(124) byte WHERENOW1 %1f "BG-5 Where child lives now - 1st from this pregnancy" 87 | _column(125) byte LEGAGREE1 %1f "BG-6 Legal agreement for where child lives - 1st from this preg" 88 | _column(126) byte PARENEND1 %1f "BG-7 Is R still legal mother of child - 1st from this pregnancy" 89 | _column(127) byte ANYNURSE1 %1f "BH-1 Whether R breastfed this child at all - 1st from this preg" 90 | _column(128) byte FEDSOLID1 %1f "BH-2 Has R fed this child anything other than breastmilk - 1st from this preg" 91 | _column(129) int FRSTEATD_N1 %3f "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 1st from this preg" 92 | _column(132) byte FRSTEATD_P1 %1f "BH-3 Units (mos/wks/days) for FRSTEATD_N - 1st from this preg" 93 | _column(133) byte FRSTEATD1 %2f "Age (in mos) when 1st fed non-breastmilk - 1st from this preg" 94 | _column(135) byte QUITNURS1 %1f "BH-4 Has R stopped breastfeeding child - 1st from this preg" 95 | _column(136) int AGEQTNUR_N1 %3f "BH-5 Age (mos/wks/day) when stopped breastfeeding - 1st from this preg" 96 | _column(139) byte AGEQTNUR_P1 %1f "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 1st from this preg" 97 | _column(140) byte AGEQTNUR1 %2f "Age (in mos) when R stopped nursing child - 1st from this preg" 98 | _column(142) byte LIVEHERE2 %1f "BG-1 Whether child lives with R - 2nd from this pregnancy" 99 | _column(143) byte ALIVENOW2 %1f "BG-2 Whether child is still alive - 2nd from this pregnancy" 100 | _column(144) byte WHENDIED_M2 %1f "BG-3 Month when child died - 2nd from this pregnancy" 101 | _column(145) int WHENDIED_Y2 %4f "BG-3 Year when child died - 2nd from this pregnancy" 102 | _column(149) int CMKIDIED2 %4f "CM for child's date of death - 2nd from this pregnancy" 103 | _column(153) byte WHENLEFT_M2 %1f "BG-4 Month when child stopped living with R - 2nd from this preg" 104 | _column(154) int WHENLEFT_Y2 %4f "BG-4 Year when child stopped living with R - 2nd from this preg" 105 | _column(158) int CMKIDLFT2 %4f "CM for date child stopped living w/R - 2nd from this pregnancy" 106 | _column(162) int LASTAGE2 %3f "Age (in mos) when child last lived w/R - 2nd from this pregnancy" 107 | _column(165) byte WHERENOW2 %1f "BG-5 Where child lives now - 2nd from this pregnancy" 108 | _column(166) byte LEGAGREE2 %1f "BG-6 Legal agreement for where child lives - 2nd from this preg" 109 | _column(167) byte PARENEND2 %1f "BG-7 Is R still legal mother of child - 2nd from this pregnancy" 110 | _column(168) byte ANYNURSE2 %1f "BH-1 Whether R breastfed this child at all - 2nd from this preg" 111 | _column(169) byte FEDSOLID2 %1f "BH-2 Has R fed this child anything other than breastmilk - 2nd from this preg" 112 | _column(170) byte FRSTEATD_N2 %1f "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 2nd from this preg" 113 | _column(171) byte FRSTEATD_P2 %1f "BH-3 Units (mos/wks/days) for FRSTEATD_N - 2nd from this preg" 114 | _column(172) byte FRSTEATD2 %1f "Age (in mos) when 1st fed non-breastmilk - 2nd from this preg" 115 | _column(173) byte QUITNURS2 %1f "BH-4 Has R stopped breastfeeding child - 2nd from this preg" 116 | _column(174) byte AGEQTNUR_N2 %2f "BH-5 Age (mos/wks/day) when stopped breastfeeding - 2nd from this preg" 117 | _column(176) byte AGEQTNUR_P2 %1f "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 2nd from this preg" 118 | _column(177) byte AGEQTNUR2 %2f "Age (in mos) when R stopped nursing child - 2nd from this preg" 119 | _column(179) byte LIVEHERE3 %1f "BG-1 Whether child lives with R - 3rd from this pregnancy" 120 | _column(180) byte ALIVENOW3 %1f "BG-2 Whether child is still alive - 3rd from this pregnancy" 121 | _column(181) byte WHENDIED_M3 %1f "BG-3 Month when child died - 3rd from this pregnancy" 122 | _column(182) byte WHENDIED_Y3 %1f "BG-3 Year when child died - 3rd from this pregnancy" 123 | _column(183) byte CMKIDIED3 %1f "CM for child's date of death - 3rd from this pregnancy" 124 | _column(184) byte WHENLEFT_M3 %1f "BG-4 Month when child stopped living with R - 3rd from this preg" 125 | _column(185) byte WHENLEFT_Y3 %1f "BG-4 Year when child stopped living with R - 3rd from this preg" 126 | _column(186) byte CMKIDLFT3 %1f "CM for date child stopped living w/R - 3rd from this pregnancy" 127 | _column(187) byte LASTAGE3 %1f "Age (in mos) when child last lived w/R - 3rd from this pregnancy" 128 | _column(188) byte WHERENOW3 %1f "BG-5 Where child lives now - 3rd from this pregnancy" 129 | _column(189) byte LEGAGREE3 %1f "BG-6 Legal agreement for where child lives - 3rd from this preg" 130 | _column(190) byte PARENEND3 %1f "BG-7 Is R still legal mother of child - 3rd from this pregnancy" 131 | _column(191) byte ANYNURSE3 %1f "BH-1 Whether R breastfed this child at all - 3rd from this preg" 132 | _column(192) byte FEDSOLID3 %1f "BH-2 Has R fed this child anything other than breastmilk - 3rd from this preg" 133 | _column(193) byte FRSTEATD_N3 %1f "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 3rd from this preg" 134 | _column(194) byte FRSTEATD_P3 %1f "BH-3 Units (mos/wks/days) for FRSTEATD_N - 3rd from this preg" 135 | _column(195) byte FRSTEATD3 %1f "Age (in mos) when 1st fed non-breastmilk - 3rd from this preg" 136 | _column(196) byte QUITNURS3 %1f "BH-4 Has R stopped breastfeeding child - 3rd from this preg" 137 | _column(197) byte AGEQTNUR_N3 %1f "BH-5 Age (mos/wks/day) when stopped breastfeeding - 3rd from this preg" 138 | _column(198) byte AGEQTNUR_P3 %1f "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 3rd from this preg" 139 | _column(199) byte AGEQTNUR3 %1f "Age (in mos) when R stopped nursing child - 3rd from this preg" 140 | _column(200) byte PRGOUTCOME %1f "Outcome of Pregnancy (based on priority ordering)" 141 | _column(201) byte OUTCOM_S %1f "Outcome of pregnancy (based on corrected/chron sorted data)" 142 | _column(202) byte NBRNLV_S %1f "# of babies born alive from this preg (based on CCSD)" 143 | _column(203) int CMPRGEND %4f "CM for Pregnancy End Date (regardless of outcome)" 144 | _column(207) int CMENDP_S %4f "CM date when pregnancy ended (based on CCSD)" 145 | _column(211) int CMPRGBEG %4f "CM for Pregnancy Start Date" 146 | _column(215) int CMPBEG_S %4f "CM date when pregnancy began (based on CCSD)" 147 | _column(219) int CMLASTLB %4f "CM for R's most recent live birth (based on CCSD)" 148 | _column(223) int CMLSTPRG %4f "CM for R's most recent completed pregnancy (based on CCSD)" 149 | _column(227) int CMFSTPRG %4f "CM for R's first completed pregnancy (based on CCSD)" 150 | _column(231) int CMPG1BEG %4f "CM date when R's 1st pregnancy began (based on CCSD)" 151 | _column(235) int CMINTSTR %4f "CM for date of beginning of pregnancy interval" 152 | _column(239) int CMINTFIN %4f "CM for date of end of pregnancy interval" 153 | _column(243) int CMINTSTROP %4f "Open interval: CM of date of beginning" 154 | _column(247) int CMINTFINOP %4f "Open interval: CM of date of end (mon of interview)" 155 | _column(251) int CMINTSTRCR %4f "Currently pregnant: CM of date of beginning of interval" 156 | _column(255) int CMINTFINCR %4f "Currently pregnant: CM of date of end of interval (mon of interview)" 157 | _column(259) byte ANYUSINT %1f "Any method use in pregnancy interval" 158 | _column(260) byte EVUSEINT %1f "EG-1 Use any method in pregnancy interval?" 159 | _column(261) byte STOPDUSE %1f "EG-2 Before you became preg, stop using all methods?" 160 | _column(262) byte WHYSTOPD %1f "EG-3 Stop using methods before preg because wanted preg?" 161 | _column(263) byte WHATMETH01 %2f "EG-4 Method(s) using when became preg - 1st mention" 162 | _column(265) byte WHATMETH02 %2f "EG-4 Method(s) using when became preg - 2nd mention" 163 | _column(267) byte WHATMETH03 %2f "EG-4 Method(s) using when became preg - 3rd mention" 164 | _column(269) byte WHATMETH04 %2f "EG-4 Method(s) using when became preg - 4th mention" 165 | _column(271) byte RESNOUSE %1f "EG-5 Reason not using/had stopped using method bec. wanted preg?" 166 | _column(272) byte WANTBOLD %1f "EG-6 Right bef preg, want to have baby at any time in future?" 167 | _column(273) byte PROBBABE %1f "EG-7 probably want baby at any time in future or not?" 168 | _column(274) byte CNFRMNO %1f "EG-8 Verify didn't want baby at any time in future" 169 | _column(275) byte WANTBLD2 %1f "EG-9 Right before preg, want to have baby at any time in future? (2nd asking)" 170 | _column(276) byte TIMINGOK %1f "EG-10 Become preg too soon, right time, or later than you wanted?" 171 | _column(277) int TOOSOON_N %3f "EG-11 How much sooner than wanted became preg (months or years)" 172 | _column(280) byte TOOSOON_P %1f "EG-11 Choose mons or yrs for how much sooner became preg than wanted" 173 | _column(281) byte WTHPART1 %1f "EG-12a Right before preg, want to have baby with that partner?" 174 | _column(282) byte WTHPART2 %1f "EG-12b Right bef. preg, think might ever want to have baby w/that partner?" 175 | _column(283) byte FEELINPG %2f "EG-13 How happy to be preg. scale (0-10)" 176 | _column(285) byte HPWNOLD %1f "EG-16 Right bef preg, did the father want R to have baby at any time in future?" 177 | _column(286) byte TIMOKHP %1f "EG-17 R became preg sooner, right time, or later than father of preg wanted" 178 | _column(287) byte COHPBEG %1f "EG-18a Was R living w/father of preg at beginning of preg" 179 | _column(288) byte COHPEND %1f "EG-18b Was R living w/father of preg when preg ended/baby was born" 180 | _column(289) byte TELLFATH %1f "EG-19 Did R tell father of preg that she was pregnant" 181 | _column(290) byte WHENTELL %1f "EG-20 When did R tell father of preg about pregnancy: during or after?" 182 | _column(291) byte TRYSCALE %2f "EG-21 How hard trying to get/avoid pregnancy (0-10 scale)" 183 | _column(293) byte WANTSCAL %2f "EG-22 How much wanted to get/avoid pregnancy (0-10 scale)" 184 | _column(295) byte WHYPRG1 %1f "EG-23 (unintended preg): method fail or R wasn't using properly-1st mention" 185 | _column(296) byte WHYPRG2 %1f "EG-23 (unintended preg): method fail or R wasn't using properly-2nd mention" 186 | _column(297) byte WHYNOUSE1 %2f "EG-24 (unintended preg) Reason didn't use contraceptn - 1st" 187 | _column(299) byte WHYNOUSE2 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 2nd" 188 | _column(300) byte WHYNOUSE3 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 3rd" 189 | _column(301) byte WHYNOUSE4 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 4th" 190 | _column(302) byte WHYNOUSE5 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 5th" 191 | _column(303) byte WHYNOPG1 %2f "EG-24aa open-ended response: reason didn't think could get preg - 1st" 192 | _column(305) byte WHYNOPG2 %2f "EG-24aa open-ended response: reason didn't think could get preg - 2nd" 193 | _column(307) byte MAINOUSE %2f "EG-24a (unintended preg) Main reason didn't use contraception" 194 | _column(309) byte PRGLNGTH %2f "Duration of completed pregnancy in weeks" 195 | _column(311) byte OUTCOME %1f "Pregnancy outcome" 196 | _column(312) byte BIRTHORD %2f "Birth order" 197 | _column(314) int DATEND %4f "CM date pregnancy ended" 198 | _column(318) int AGEPREG %4f "Age at pregnancy outcome" 199 | _column(322) int DATECON %4f "CM date of conception" 200 | _column(326) int AGECON %4f "Age at time of conception" 201 | _column(330) byte FMAROUT5 %1f "Formal marital status at pregnancy outcome" 202 | _column(331) byte PMARPREG %1f "Whether pregnancy ended before R's 1st marriage (premaritally)" 203 | _column(332) byte RMAROUT6 %1f "Informal marital status at pregnancy outcome - 6 categories" 204 | _column(333) byte FMARCON5 %1f "Formal marital status at conception - 5 categories" 205 | _column(334) byte RMARCON6 %1f "Informal marital status at conception - 6 categories" 206 | _column(335) byte LEARNPRG %2f "Number of weeks pregnant when R learned she was pregnant" 207 | _column(337) byte PNCAREWK %2f "Number of weeks pregnant at first prenatal care" 208 | _column(339) byte PAYDELIV %1f "Payment for delivery" 209 | _column(340) byte LBW1 %1f "Low birthweight - 1st baby from this preg" 210 | _column(341) byte LIVCHILD %1f "Living arrangements for 1st liveborn child from this pregnancy" 211 | _column(342) int BFEEDWKS %3f "Duration of breastfeeding in weeks" 212 | _column(345) byte OLDWANTR %1f "Wantedness of pregnancy - respondent - Cycle 4 version" 213 | _column(346) byte OLDWANTP %1f "Wantedness of preg - R's partner (father of pregnancy) - Cycle 4 version" 214 | _column(347) byte WANTRESP %1f "Wantedness of pregnancy -- Respondent (RECODE)" 215 | _column(348) byte WANTPART %1f "Wantedness of pregnancy -- R's partner (RECODE)" 216 | _column(349) int TOOSOON %3f "Number of months too soon pregnancy occurred" 217 | _column(352) byte NEWWANTR %1f "Detailed wantedness of pregnancy - respondent" 218 | _column(353) int CMBIRTH %4f "Century month of R's birth" 219 | _column(357) byte AGER %2f "Age at interview" 220 | _column(359) byte AGESCRN %2f "R's age at screener" 221 | _column(361) byte FMARITAL %1f "Formal marital status" 222 | _column(362) byte RMARITAL %1f "Informal Marital Status" 223 | _column(363) byte EDUCAT %2f "Education (completed years of schooling)" 224 | _column(365) byte HIEDUC %2f "Highest completed year of school or degree" 225 | _column(367) byte RACE %1f "Race" 226 | _column(368) byte HISPANIC %1f "Hispanic origin" 227 | _column(369) byte HISPRACE %1f "Race & Hispanic origin of respondent - 1977 OMB standards (respondent recode)" 228 | _column(370) byte HISPRACE2 %1f "Race & Hispanic origin of respondent - 1997 OMB standards (respondent recode)" 229 | _column(371) byte RCURPREG %1f "Pregnant at time of interview" 230 | _column(372) byte PREGNUM %2f "CAPI-based total number of pregnancies" 231 | _column(374) byte PARITY %2f "Total number of live births" 232 | _column(376) byte CURR_INS %1f "Current health insurance coverage" 233 | _column(377) byte PUBASSIS %1f "Whether R received public assistance in prior calendar year" 234 | _column(378) int POVERTY %3f "Poverty level income" 235 | _column(381) byte LABORFOR %1f "Labor force status" 236 | _column(382) byte RELIGION %1f "Current religious affiliation" 237 | _column(383) byte METRO %1f "Place of residence (Metropolitan / Nonmetropolitan)" 238 | _column(384) byte BRNOUT %1f "IB-8 R born outside of US" 239 | _column(385) int YRSTRUS %4f "Year R came to the United States" 240 | _column(389) byte PRGLNGTH_I %1f "PRGLNGTH Imputation Flag" 241 | _column(390) byte OUTCOME_I %1f "OUTCOME Imputation Flag" 242 | _column(391) byte BIRTHORD_I %1f "BIRTHORD Imputation Flag" 243 | _column(392) byte DATEND_I %1f "DATEND Imputation Flag" 244 | _column(393) byte AGEPREG_I %1f "AGEPREG Imputation Flag" 245 | _column(394) byte DATECON_I %1f "DATECON Imputation Flag" 246 | _column(395) byte AGECON_I %1f "AGECON Imputation Flag" 247 | _column(396) byte FMAROUT5_I %1f "FMAROUT5 Imputation Flag" 248 | _column(397) byte PMARPREG_I %1f "PMARPREG Imputation Flag" 249 | _column(398) byte RMAROUT6_I %1f "RMAROUT6 Imputation Flag" 250 | _column(399) byte FMARCON5_I %1f "FMARCON5 Imputation Flag" 251 | _column(400) byte RMARCON6_I %1f "RMARCON6 Imputation Flag" 252 | _column(401) byte LEARNPRG_I %1f "LEARNPRG Imputation Flag" 253 | _column(402) byte PNCAREWK_I %1f "PNCAREWK Imputation Flag" 254 | _column(403) byte PAYDELIV_I %1f "PAYDELIV Imputation Flag" 255 | _column(404) byte LBW1_I %1f "LBW1 Imputation Flag" 256 | _column(405) byte LIVCHILD_I %1f "LIVCHILD Imputation Flag" 257 | _column(406) byte BFEEDWKS_I %1f "BFEEDWKS Imputation Flag" 258 | _column(407) byte OLDWANTR_I %1f "OLDWANTR Imputation Flag" 259 | _column(408) byte OLDWANTP_I %1f "OLDWANTP Imputation Flag" 260 | _column(409) byte WANTRESP_I %1f "WANTRESP Imputation Flag" 261 | _column(410) byte WANTPART_I %1f "WANTPART Imputation Flag" 262 | _column(411) byte TOOSOON_I %1f "TOOSOON Imputation Flag" 263 | _column(412) byte NEWWANTR_I %1f "NEWWANTR Imputation Flag" 264 | _column(413) byte AGER_I %1f "AGER Imputation Flag" 265 | _column(414) byte FMARITAL_I %1f "FMARITAL Imputation Flag" 266 | _column(415) byte RMARITAL_I %1f "RMARITAL Imputation Flag" 267 | _column(416) byte EDUCAT_I %1f "EDUCAT Imputation Flag" 268 | _column(417) byte HIEDUC_I %1f "HIEDUC Imputation Flag" 269 | _column(418) byte RACE_I %1f "RACE Imputation Flag" 270 | _column(419) byte HISPANIC_I %1f "HISPANIC Imputation Flag" 271 | _column(420) byte HISPRACE_I %1f "HISPRACE Imputation Flag" 272 | _column(421) byte HISPRACE2_I %1f "HISPRACE2 Imputation Flag" 273 | _column(422) byte RCURPREG_I %1f "RCURPREG Imputation Flag" 274 | _column(423) byte PREGNUM_I %1f "PREGNUM Imputation Flag" 275 | _column(424) byte PARITY_I %1f "PARITY Imputation Flag" 276 | _column(425) byte CURR_INS_I %1f "CURR_INS Imputation Flag" 277 | _column(426) byte PUBASSIS_I %1f "PUBASSIS Imputation Flag" 278 | _column(427) byte POVERTY_I %1f "POVERTY Imputation Flag" 279 | _column(428) byte LABORFOR_I %1f "LABORFOR Imputation Flag" 280 | _column(429) byte RELIGION_I %1f "RELIGION Imputation Flag" 281 | _column(430) byte METRO_I %1f "METRO Imputation Flag" 282 | _column(431) double WGT2013_2015 %16f "Final weight for the 2013-2015 NSFG" 283 | _column(447) byte SECU %1f "Randomized version of the sampling error computational unit" 284 | _column(448) int SEST %3f "Randomized version of the stratum" 285 | _column(451) int CMINTVW %4f "Century month for date of interview (Computed in Flow Check A-1)" 286 | _column(455) int CMLSTYR %4f "Century month for month/year of interview minus one year (Computed in Flow Check A-1)" 287 | _column(459) int CMJAN3YR %4f "Century month of January Three Years Prior to Year of interview (Computed in Flow Check A-1)" 288 | _column(463) int CMJAN4YR %4f "Century month of January Four Years Prior to Year of Interview (Computed in Flow Check A-1)" 289 | _column(467) int CMJAN5YR %4f "Century month of January Five Years Prior to Year of Interview (Computed in Flow Check A-1)" 290 | _column(471) str2 QUARTER %2s "Quarter when case was sampled" 291 | _column(473) str1 PHASE %1s "Regular- or double-sample portion of the quarter" 292 | _column(474) str4 INTVWYEAR %4s "Calendar year when interview occurred" 293 | } 294 | -------------------------------------------------------------------------------- /gss_eda/GSS.do: -------------------------------------------------------------------------------- 1 | #delimit ; 2 | 3 | infix 4 | year 1 - 20 5 | id_ 21 - 40 6 | agewed 41 - 60 7 | divorce 61 - 80 8 | sibs 81 - 100 9 | childs 101 - 120 10 | age 121 - 140 11 | educ 141 - 160 12 | paeduc 161 - 180 13 | maeduc 181 - 200 14 | speduc 201 - 220 15 | degree 221 - 240 16 | padeg 241 - 260 17 | madeg 261 - 280 18 | spdeg 281 - 300 19 | sex 301 - 320 20 | race 321 - 340 21 | res16 341 - 360 22 | reg16 361 - 380 23 | srcbelt 381 - 400 24 | partyid 401 - 420 25 | pres04 421 - 440 26 | pres08 441 - 460 27 | pres12 461 - 480 28 | polviews 481 - 500 29 | natspac 501 - 520 30 | natenvir 521 - 540 31 | natheal 541 - 560 32 | natcity 561 - 580 33 | natcrime 581 - 600 34 | natdrug 601 - 620 35 | nateduc 621 - 640 36 | natrace 641 - 660 37 | natarms 661 - 680 38 | nataid 681 - 700 39 | natfare 701 - 720 40 | spkath 721 - 740 41 | colath 741 - 760 42 | libath 761 - 780 43 | spkhomo 781 - 800 44 | colhomo 801 - 820 45 | libhomo 821 - 840 46 | cappun 841 - 860 47 | gunlaw 861 - 880 48 | grass 881 - 900 49 | relig 901 - 920 50 | fund 921 - 940 51 | attend 941 - 960 52 | reliten 961 - 980 53 | postlife 981 - 1000 54 | pray 1001 - 1020 55 | relig16 1021 - 1040 56 | fund16 1041 - 1060 57 | sprel16 1061 - 1080 58 | prayer 1081 - 1100 59 | bible 1101 - 1120 60 | racmar 1121 - 1140 61 | racpres 1141 - 1160 62 | affrmact 1161 - 1180 63 | happy 1181 - 1200 64 | hapmar 1201 - 1220 65 | health 1221 - 1240 66 | life 1241 - 1260 67 | helpful 1261 - 1280 68 | fair 1281 - 1300 69 | trust 1301 - 1320 70 | conclerg 1321 - 1340 71 | coneduc 1341 - 1360 72 | confed 1361 - 1380 73 | conpress 1381 - 1400 74 | conjudge 1401 - 1420 75 | conlegis 1421 - 1440 76 | conarmy 1441 - 1460 77 | satjob 1461 - 1480 78 | class_ 1481 - 1500 79 | satfin 1501 - 1520 80 | finrela 1521 - 1540 81 | union_ 1541 - 1560 82 | fepol 1561 - 1580 83 | abany 1581 - 1600 84 | chldidel 1601 - 1620 85 | sexeduc 1621 - 1640 86 | premarsx 1641 - 1660 87 | xmarsex 1661 - 1680 88 | homosex 1681 - 1700 89 | spanking 1701 - 1720 90 | fear 1721 - 1740 91 | owngun 1741 - 1760 92 | pistol 1761 - 1780 93 | hunt 1781 - 1800 94 | phone 1801 - 1820 95 | memchurh 1821 - 1840 96 | realinc 1841 - 1860 97 | cohort 1861 - 1880 98 | marcohrt 1881 - 1900 99 | ballot 1901 - 1920 100 | wtssall 1921 - 1940 101 | adults 1941 - 1960 102 | compuse 1961 - 1980 103 | databank 1981 - 2000 104 | wtssnr 2001 - 2020 105 | spkrac 2021 - 2040 106 | spkcom 2041 - 2060 107 | spkmil 2061 - 2080 108 | spkmslm 2081 - 2100 109 | using GSS.dat; 110 | 111 | label variable year "Gss year for this respondent "; 112 | label variable id_ "Respondent id number"; 113 | label variable agewed "Age when first married"; 114 | label variable divorce "Ever been divorced or separated"; 115 | label variable sibs "Number of brothers and sisters"; 116 | label variable childs "Number of children"; 117 | label variable age "Age of respondent"; 118 | label variable educ "Highest year of school completed"; 119 | label variable paeduc "Highest year school completed, father"; 120 | label variable maeduc "Highest year school completed, mother"; 121 | label variable speduc "Highest year school completed, spouse"; 122 | label variable degree "Rs highest degree"; 123 | label variable padeg "Fathers highest degree"; 124 | label variable madeg "Mothers highest degree"; 125 | label variable spdeg "Spouses highest degree"; 126 | label variable sex "Respondents sex"; 127 | label variable race "Race of respondent"; 128 | label variable res16 "Type of place lived in when 16 yrs old"; 129 | label variable reg16 "Region of residence, age 16"; 130 | label variable srcbelt "Src beltcode"; 131 | label variable partyid "Political party affiliation"; 132 | label variable pres04 "Vote for kerry, bush, nader"; 133 | label variable pres08 "Vote obama or mccain"; 134 | label variable pres12 "Vote obama or romney"; 135 | label variable polviews "Think of self as liberal or conservative"; 136 | label variable natspac "Space exploration program"; 137 | label variable natenvir "Improving & protecting environment"; 138 | label variable natheal "Improving & protecting nations health"; 139 | label variable natcity "Solving problems of big cities"; 140 | label variable natcrime "Halting rising crime rate"; 141 | label variable natdrug "Dealing with drug addiction"; 142 | label variable nateduc "Improving nations education system"; 143 | label variable natrace "Improving the conditions of blacks"; 144 | label variable natarms "Military, armaments, and defense"; 145 | label variable nataid "Foreign aid"; 146 | label variable natfare "Welfare"; 147 | label variable spkath "Allow anti-religionist to speak"; 148 | label variable colath "Allow anti-religionist to teach"; 149 | label variable libath "Allow anti-religious book in library"; 150 | label variable spkhomo "Allow homosexual to speak"; 151 | label variable colhomo "Allow homosexual to teach"; 152 | label variable libhomo "Allow homosexuals book in library"; 153 | label variable cappun "Favor or oppose death penalty for murder"; 154 | label variable gunlaw "Favor or oppose gun permits"; 155 | label variable grass "Should marijuana be made legal"; 156 | label variable relig "Rs religious preference"; 157 | label variable fund "How fundamentalist is r currently"; 158 | label variable attend "How often r attends religious services"; 159 | label variable reliten "Strength of affiliation"; 160 | label variable postlife "Belief in life after death"; 161 | label variable pray "How often does r pray"; 162 | label variable relig16 "Religion in which raised"; 163 | label variable fund16 "How fundamentalist was r at age 16"; 164 | label variable sprel16 "Religion in which spouse raised"; 165 | label variable prayer "Bible prayer in public schools"; 166 | label variable bible "Feelings about the bible"; 167 | label variable racmar "Favor law against racial intermarriage"; 168 | label variable racpres "Would vote for black president"; 169 | label variable affrmact "Favor preference in hiring blacks"; 170 | label variable happy "General happiness"; 171 | label variable hapmar "Happiness of marriage"; 172 | label variable health "Condition of health"; 173 | label variable life "Is life exciting or dull"; 174 | label variable helpful "People helpful or looking out for selves"; 175 | label variable fair "People fair or try to take advantage"; 176 | label variable trust "Can people be trusted"; 177 | label variable conclerg "Confidence in organized religion"; 178 | label variable coneduc "Confidence in education"; 179 | label variable confed "Confid. in exec branch of fed govt"; 180 | label variable conpress "Confidence in press"; 181 | label variable conjudge "Confid. in united states supreme court"; 182 | label variable conlegis "Confidence in congress"; 183 | label variable conarmy "Confidence in military"; 184 | label variable satjob "Job or housework"; 185 | label variable class_ "Subjective class identification"; 186 | label variable satfin "Satisfaction with financial situation"; 187 | label variable finrela "Opinion of family income"; 188 | label variable union_ "Does r or spouse belong to union"; 189 | label variable fepol "Women not suited for politics"; 190 | label variable abany "Abortion if woman wants for any reason"; 191 | label variable chldidel "Ideal number of children"; 192 | label variable sexeduc "Sex education in public schools"; 193 | label variable premarsx "Sex before marriage"; 194 | label variable xmarsex "Sex with person other than spouse"; 195 | label variable homosex "Homosexual sex relations"; 196 | label variable spanking "Favor spanking to discipline child"; 197 | label variable fear "Afraid to walk at night in neighborhood"; 198 | label variable owngun "Have gun in home"; 199 | label variable pistol "Pistol or revolver in home"; 200 | label variable hunt "Does r or spouse hunt"; 201 | label variable phone "Does r have telephone"; 202 | label variable memchurh "Membership in church group"; 203 | label variable realinc "Family income in constant $"; 204 | label variable cohort "Year of birth"; 205 | label variable marcohrt "Year of first marriage"; 206 | label variable ballot "Ballot used for interview"; 207 | label variable wtssall "Weight variable"; 208 | label variable adults "Household members 18 yrs and older"; 209 | label variable compuse "R use computer"; 210 | label variable databank "Computer data threat to individual privacy"; 211 | label variable wtssnr "Weight variable"; 212 | label variable spkrac "Allow racist to speak"; 213 | label variable spkcom "Allow communist to speak"; 214 | label variable spkmil "Allow militarist to speak"; 215 | label variable spkmslm "Allow muslim clergymen preaching hatred of the us"; 216 | 217 | 218 | label define gsp001x 219 | 99 "No answer" 220 | 98 "Don't know" 221 | 0 "Not applicable" 222 | ; 223 | label define gsp002x 224 | 9 "No answer" 225 | 8 "Don't know" 226 | 2 "No" 227 | 1 "Yes" 228 | 0 "Not applicable" 229 | ; 230 | label define gsp003x 231 | 99 "No answer" 232 | 98 "Don't know" 233 | -1 "Not applicable" 234 | ; 235 | label define gsp004x 236 | 9 "Dk na" 237 | 8 "Eight or more" 238 | ; 239 | label define gsp005x 240 | 99 "No answer" 241 | 98 "Don't know" 242 | 89 "89 or older" 243 | ; 244 | label define gsp006x 245 | 99 "No answer" 246 | 98 "Don't know" 247 | 97 "Not applicable" 248 | ; 249 | label define gsp007x 250 | 99 "No answer" 251 | 98 "Don't know" 252 | 97 "Not applicable" 253 | ; 254 | label define gsp008x 255 | 99 "No answer" 256 | 98 "Don't know" 257 | 97 "Not applicable" 258 | ; 259 | label define gsp009x 260 | 99 "No answer" 261 | 98 "Don't know" 262 | 97 "Not applicable" 263 | ; 264 | label define gsp010x 265 | 9 "No answer" 266 | 8 "Don't know" 267 | 7 "Not applicable" 268 | 4 "Graduate" 269 | 3 "Bachelor" 270 | 2 "Junior college" 271 | 1 "High school" 272 | 0 "Lt high school" 273 | ; 274 | label define gsp011x 275 | 9 "No answer" 276 | 8 "Don't know" 277 | 7 "Not applicable" 278 | 4 "Graduate" 279 | 3 "Bachelor" 280 | 2 "Junior college" 281 | 1 "High school" 282 | 0 "Lt high school" 283 | ; 284 | label define gsp012x 285 | 9 "No answer" 286 | 8 "Don't know" 287 | 7 "Not applicable" 288 | 4 "Graduate" 289 | 3 "Bachelor" 290 | 2 "Junior college" 291 | 1 "High school" 292 | 0 "Lt high school" 293 | ; 294 | label define gsp013x 295 | 9 "No answer" 296 | 8 "Don't know" 297 | 7 "Not applicable" 298 | 4 "Graduate" 299 | 3 "Bachelor" 300 | 2 "Junior college" 301 | 1 "High school" 302 | 0 "Lt high school" 303 | ; 304 | label define gsp014x 305 | 2 "Female" 306 | 1 "Male" 307 | ; 308 | label define gsp015x 309 | 3 "Other" 310 | 2 "Black" 311 | 1 "White" 312 | 0 "Not applicable" 313 | ; 314 | label define gsp016x 315 | 9 "No answer" 316 | 8 "Don't know" 317 | 6 "City gt 250000" 318 | 5 "Big-city suburb" 319 | 4 "50000 to 250000" 320 | 3 "Town lt 50000" 321 | 2 "Farm" 322 | 1 "Country,nonfarm" 323 | 0 "Not applicable" 324 | ; 325 | label define gsp017x 326 | 9 "Pacific" 327 | 8 "Mountain" 328 | 7 "W. sou. central" 329 | 6 "E. sou. central" 330 | 5 "South atlantic" 331 | 4 "W. nor. central" 332 | 3 "E. nor. central" 333 | 2 "Middle atlantic" 334 | 1 "New england" 335 | 0 "Foreign" 336 | ; 337 | label define gsp018x 338 | 6 "Other rural" 339 | 5 "Other urban" 340 | 4 "Suburb, 13-100" 341 | 3 "Suburb, 12 lrgst" 342 | 2 "Smsa's 13-100" 343 | 1 "12 lrgst smsa's" 344 | 0 "Not assigned" 345 | ; 346 | label define gsp019x 347 | 9 "No answer" 348 | 8 "Don't know" 349 | 7 "Other party" 350 | 6 "Strong republican" 351 | 5 "Not str republican" 352 | 4 "Ind,near rep" 353 | 3 "Independent" 354 | 2 "Ind,near dem" 355 | 1 "Not str democrat" 356 | 0 "Strong democrat" 357 | ; 358 | label define gsp020x 359 | 9 "No answer" 360 | 8 "Dont know" 361 | 6 "Didnt vote" 362 | 4 "Other (specify)" 363 | 3 "Nader" 364 | 2 "Bush" 365 | 1 "Kerry" 366 | 0 "Not applicable" 367 | ; 368 | label define gsp021x 369 | 9 "No answer" 370 | 8 "Don't know" 371 | 4 "Didn't vote" 372 | 3 "Other candidate (specify)" 373 | 2 "Mccain" 374 | 1 "Obama" 375 | 0 "Not applicable" 376 | ; 377 | label define gsp022x 378 | 9 "No answer" 379 | 8 "Don't know" 380 | 4 "Didn't vote for president" 381 | 3 "Other candidate (specify)" 382 | 2 "Romney" 383 | 1 "Obama" 384 | 0 "Not applicable" 385 | ; 386 | label define gsp023x 387 | 9 "No answer" 388 | 8 "Don't know" 389 | 7 "Extrmly conservative" 390 | 6 "Conservative" 391 | 5 "Slghtly conservative" 392 | 4 "Moderate" 393 | 3 "Slightly liberal" 394 | 2 "Liberal" 395 | 1 "Extremely liberal" 396 | 0 "Not applicable" 397 | ; 398 | label define gsp024x 399 | 9 "No answer" 400 | 8 "Don't know" 401 | 3 "Too much" 402 | 2 "About right" 403 | 1 "Too little" 404 | 0 "Not applicable" 405 | ; 406 | label define gsp025x 407 | 9 "No answer" 408 | 8 "Don't know" 409 | 3 "Too much" 410 | 2 "About right" 411 | 1 "Too little" 412 | 0 "Not applicable" 413 | ; 414 | label define gsp026x 415 | 9 "No answer" 416 | 8 "Don't know" 417 | 3 "Too much" 418 | 2 "About right" 419 | 1 "Too little" 420 | 0 "Not applicable" 421 | ; 422 | label define gsp027x 423 | 9 "No answer" 424 | 8 "Don't know" 425 | 3 "Too much" 426 | 2 "About right" 427 | 1 "Too little" 428 | 0 "Not applicable" 429 | ; 430 | label define gsp028x 431 | 9 "No answer" 432 | 8 "Don't know" 433 | 3 "Too much" 434 | 2 "About right" 435 | 1 "Too little" 436 | 0 "Not applicable" 437 | ; 438 | label define gsp029x 439 | 9 "No answer" 440 | 8 "Don't know" 441 | 3 "Too much" 442 | 2 "About right" 443 | 1 "Too little" 444 | 0 "Not applicable" 445 | ; 446 | label define gsp030x 447 | 9 "No answer" 448 | 8 "Don't know" 449 | 3 "Too much" 450 | 2 "About right" 451 | 1 "Too little" 452 | 0 "Not applicable" 453 | ; 454 | label define gsp031x 455 | 9 "No answer" 456 | 8 "Don't know" 457 | 3 "Too much" 458 | 2 "About right" 459 | 1 "Too little" 460 | 0 "Not applicable" 461 | ; 462 | label define gsp032x 463 | 9 "No answer" 464 | 8 "Don't know" 465 | 3 "Too much" 466 | 2 "About right" 467 | 1 "Too little" 468 | 0 "Not applicable" 469 | ; 470 | label define gsp033x 471 | 9 "No answer" 472 | 8 "Don't know" 473 | 3 "Too much" 474 | 2 "About right" 475 | 1 "Too little" 476 | 0 "Not applicable" 477 | ; 478 | label define gsp034x 479 | 9 "No answer" 480 | 8 "Don't know" 481 | 3 "Too much" 482 | 2 "About right" 483 | 1 "Too little" 484 | 0 "Not applicable" 485 | ; 486 | label define gsp035x 487 | 9 "No answer" 488 | 8 "Don't know" 489 | 2 "Not allowed" 490 | 1 "Allowed" 491 | 0 "Not applicable" 492 | ; 493 | label define gsp036x 494 | 9 "No answer" 495 | 8 "Don't know" 496 | 5 "Not allowed" 497 | 4 "Allowed" 498 | 0 "Not applicable" 499 | ; 500 | label define gsp037x 501 | 9 "No answer" 502 | 8 "Don't know" 503 | 2 "Not remove" 504 | 1 "Remove" 505 | 0 "Not applicable" 506 | ; 507 | label define gsp038x 508 | 9 "No answer" 509 | 8 "Don't know" 510 | 2 "Not allowed" 511 | 1 "Allowed" 512 | 0 "Not applicable" 513 | ; 514 | label define gsp039x 515 | 9 "No answer" 516 | 8 "Don't know" 517 | 5 "Not allowed" 518 | 4 "Allowed" 519 | 0 "Not applicable" 520 | ; 521 | label define gsp040x 522 | 9 "No answer" 523 | 8 "Don't know" 524 | 2 "Not remove" 525 | 1 "Remove" 526 | 0 "Not applicable" 527 | ; 528 | label define gsp041x 529 | 9 "No answer" 530 | 8 "Don't know" 531 | 2 "Oppose" 532 | 1 "Favor" 533 | 0 "Not applicable" 534 | ; 535 | label define gsp042x 536 | 9 "No answer" 537 | 8 "Don't know" 538 | 2 "Oppose" 539 | 1 "Favor" 540 | 0 "Not applicable" 541 | ; 542 | label define gsp043x 543 | 9 "No answer" 544 | 8 "Don't know" 545 | 2 "Not legal" 546 | 1 "Legal" 547 | 0 "Not applicable" 548 | ; 549 | label define gsp044x 550 | 99 "No answer" 551 | 98 "Don't know" 552 | 13 "Inter-nondenominational" 553 | 12 "Native american" 554 | 11 "Christian" 555 | 10 "Orthodox-christian" 556 | 9 "Moslem/islam" 557 | 8 "Other eastern" 558 | 7 "Hinduism" 559 | 6 "Buddhism" 560 | 5 "Other" 561 | 4 "None" 562 | 3 "Jewish" 563 | 2 "Catholic" 564 | 1 "Protestant" 565 | 0 "Not applicable" 566 | ; 567 | label define gsp045x 568 | 9 "Na-excluded" 569 | 8 "Don't know" 570 | 3 "Liberal" 571 | 2 "Moderate" 572 | 1 "Fundamentalist" 573 | 0 "Not applicable" 574 | ; 575 | label define gsp046x 576 | 9 "Dk,na" 577 | 8 "More thn once wk" 578 | 7 "Every week" 579 | 6 "Nrly every week" 580 | 5 "2-3x a month" 581 | 4 "Once a month" 582 | 3 "Sevrl times a yr" 583 | 2 "Once a year" 584 | 1 "Lt once a year" 585 | 0 "Never" 586 | ; 587 | label define gsp047x 588 | 9 "No answer" 589 | 8 "Don't know" 590 | 4 "No religion" 591 | 3 "Somewhat strong" 592 | 2 "Not very strong" 593 | 1 "Strong" 594 | 0 "Not applicable" 595 | ; 596 | label define gsp048x 597 | 9 "No answer" 598 | 8 "Don't know" 599 | 2 "No" 600 | 1 "Yes" 601 | 0 "Not applicable" 602 | ; 603 | label define gsp049x 604 | 9 "No answer" 605 | 8 "Don't know" 606 | 6 "Never" 607 | 5 "Lt once a week" 608 | 4 "Once a week" 609 | 3 "Several times a week" 610 | 2 "Once a day" 611 | 1 "Several times a day" 612 | 0 "Not applicable" 613 | ; 614 | label define gsp050x 615 | 99 "No answer" 616 | 98 "Don't know" 617 | 13 "Inter-nondenominational" 618 | 12 "Native american" 619 | 11 "Christian" 620 | 10 "Orthodox-christian" 621 | 9 "Moslem/islam" 622 | 8 "Other eastern" 623 | 7 "Hinduism" 624 | 6 "Buddhism" 625 | 5 "Other" 626 | 4 "None" 627 | 3 "Jewish" 628 | 2 "Catholic" 629 | 1 "Protestant" 630 | 0 "Not applicable" 631 | ; 632 | label define gsp051x 633 | 9 "Na-excluded" 634 | 8 "Don't know" 635 | 3 "Liberal" 636 | 2 "Moderate" 637 | 1 "Fundamentalist" 638 | 0 "Not applicable" 639 | ; 640 | label define gsp052x 641 | 9 "No answer" 642 | 8 "Dont know" 643 | 5 "Other" 644 | 4 "None" 645 | 3 "Jewish" 646 | 2 "Catholic" 647 | 1 "Protestant" 648 | 0 "Not applicable" 649 | ; 650 | label define gsp053x 651 | 9 "No answer" 652 | 8 "Don't know" 653 | 2 "Disapprove" 654 | 1 "Approve" 655 | 0 "Not applicable" 656 | ; 657 | label define gsp054x 658 | 9 "No answer" 659 | 8 "Don't know" 660 | 4 "Other" 661 | 3 "Book of fables" 662 | 2 "Inspired word" 663 | 1 "Word of god" 664 | 0 "Not applicable" 665 | ; 666 | label define gsp055x 667 | 9 "No answer" 668 | 8 "Don't know" 669 | 2 "No" 670 | 1 "Yes" 671 | 0 "Not applicable" 672 | ; 673 | label define gsp056x 674 | 9 "No answer" 675 | 8 "Don't know" 676 | 2 "No" 677 | 1 "Yes" 678 | 0 "Not applicable" 679 | ; 680 | label define gsp057x 681 | 9 "No answer" 682 | 8 "Don't know" 683 | 4 "Strongly oppose pref" 684 | 3 "Oppose pref" 685 | 2 "Support pref" 686 | 1 "Strongly support pref" 687 | 0 "Not applicable" 688 | ; 689 | label define gsp058x 690 | 9 "No answer" 691 | 8 "Don't know" 692 | 3 "Not too happy" 693 | 2 "Pretty happy" 694 | 1 "Very happy" 695 | 0 "Not applicable" 696 | ; 697 | label define gsp059x 698 | 9 "No answer" 699 | 8 "Don't know" 700 | 3 "Not too happy" 701 | 2 "Pretty happy" 702 | 1 "Very happy" 703 | 0 "Not applicable" 704 | ; 705 | label define gsp060x 706 | 9 "No answer" 707 | 8 "Don't know" 708 | 4 "Poor" 709 | 3 "Fair" 710 | 2 "Good" 711 | 1 "Excellent" 712 | 0 "Not applicable" 713 | ; 714 | label define gsp061x 715 | 9 "No answer" 716 | 8 "Don't know" 717 | 3 "Dull" 718 | 2 "Routine" 719 | 1 "Exciting" 720 | 0 "Not applicable" 721 | ; 722 | label define gsp062x 723 | 9 "No answer" 724 | 8 "Don't know" 725 | 3 "Depends" 726 | 2 "Lookout for self" 727 | 1 "Helpful" 728 | 0 "Not applicable" 729 | ; 730 | label define gsp063x 731 | 9 "No answer" 732 | 8 "Don't know" 733 | 3 "Depends" 734 | 2 "Fair" 735 | 1 "Take advantage" 736 | 0 "Not applicable" 737 | ; 738 | label define gsp064x 739 | 9 "No answer" 740 | 8 "Don't know" 741 | 3 "Depends" 742 | 2 "Cannot trust" 743 | 1 "Can trust" 744 | 0 "Not applicable" 745 | ; 746 | label define gsp065x 747 | 9 "No answer" 748 | 8 "Don't know" 749 | 3 "Hardly any" 750 | 2 "Only some" 751 | 1 "A great deal" 752 | 0 "Not applicable" 753 | ; 754 | label define gsp066x 755 | 9 "No answer" 756 | 8 "Don't know" 757 | 3 "Hardly any" 758 | 2 "Only some" 759 | 1 "A great deal" 760 | 0 "Not applicable" 761 | ; 762 | label define gsp067x 763 | 9 "No answer" 764 | 8 "Don't know" 765 | 3 "Hardly any" 766 | 2 "Only some" 767 | 1 "A great deal" 768 | 0 "Not applicable" 769 | ; 770 | label define gsp068x 771 | 9 "No answer" 772 | 8 "Don't know" 773 | 3 "Hardly any" 774 | 2 "Only some" 775 | 1 "A great deal" 776 | 0 "Not applicable" 777 | ; 778 | label define gsp069x 779 | 9 "No answer" 780 | 8 "Don't know" 781 | 3 "Hardly any" 782 | 2 "Only some" 783 | 1 "A great deal" 784 | 0 "Not applicable" 785 | ; 786 | label define gsp070x 787 | 9 "No answer" 788 | 8 "Don't know" 789 | 3 "Hardly any" 790 | 2 "Only some" 791 | 1 "A great deal" 792 | 0 "Not applicable" 793 | ; 794 | label define gsp071x 795 | 9 "No answer" 796 | 8 "Don't know" 797 | 3 "Hardly any" 798 | 2 "Only some" 799 | 1 "A great deal" 800 | 0 "Not applicable" 801 | ; 802 | label define gsp072x 803 | 9 "No answer" 804 | 8 "Don't know" 805 | 4 "Very dissatisfied" 806 | 3 "A little dissat" 807 | 2 "Mod. satisfied" 808 | 1 "Very satisfied" 809 | 0 "Not applicable" 810 | ; 811 | label define gsp073x 812 | 9 "No answer" 813 | 8 "Don't know" 814 | 5 "No class" 815 | 4 "Upper class" 816 | 3 "Middle class" 817 | 2 "Working class" 818 | 1 "Lower class" 819 | 0 "Not applicable" 820 | ; 821 | label define gsp074x 822 | 9 "No answer" 823 | 8 "Don't know" 824 | 3 "Not at all sat" 825 | 2 "More or less" 826 | 1 "Satisfied" 827 | 0 "Not applicable" 828 | ; 829 | label define gsp075x 830 | 9 "No answer" 831 | 8 "Don't know" 832 | 5 "Far above average" 833 | 4 "Above average" 834 | 3 "Average" 835 | 2 "Below average" 836 | 1 "Far below average" 837 | 0 "Not applicable" 838 | ; 839 | label define gsp076x 840 | 9 "No answer" 841 | 8 "Don't know" 842 | 4 "Neither belongs" 843 | 3 "R and spouse belong" 844 | 2 "Spouse belongs" 845 | 1 "R belongs" 846 | 0 "Not applicable" 847 | ; 848 | label define gsp077x 849 | 9 "No answer" 850 | 8 "Not sure" 851 | 2 "Disagree" 852 | 1 "Agree" 853 | 0 "Not applicable" 854 | ; 855 | label define gsp078x 856 | 9 "No answer" 857 | 8 "Don't know" 858 | 2 "No" 859 | 1 "Yes" 860 | 0 "Not applicable" 861 | ; 862 | label define gsp079x 863 | 9 "Dk,na" 864 | 8 "As many as want" 865 | 7 "Seven+" 866 | -1 "Not applicable" 867 | ; 868 | label define gsp080x 869 | 9 "No answer" 870 | 8 "Don't know" 871 | 3 "Depends" 872 | 2 "Oppose" 873 | 1 "Favor" 874 | 0 "Not applicable" 875 | ; 876 | label define gsp081x 877 | 9 "No answer" 878 | 8 "Don't know" 879 | 5 "Other" 880 | 4 "Not wrong at all" 881 | 3 "Sometimes wrong" 882 | 2 "Almst always wrg" 883 | 1 "Always wrong" 884 | 0 "Not applicable" 885 | ; 886 | label define gsp082x 887 | 9 "No answer" 888 | 8 "Don't know" 889 | 5 "Other" 890 | 4 "Not wrong at all" 891 | 3 "Sometimes wrong" 892 | 2 "Almst always wrg" 893 | 1 "Always wrong" 894 | 0 "Not applicable" 895 | ; 896 | label define gsp083x 897 | 9 "No answer" 898 | 8 "Don't know" 899 | 5 "Other" 900 | 4 "Not wrong at all" 901 | 3 "Sometimes wrong" 902 | 2 "Almst always wrg" 903 | 1 "Always wrong" 904 | 0 "Not applicable" 905 | ; 906 | label define gsp084x 907 | 9 "No answer" 908 | 8 "Don't know" 909 | 4 "Strongly disagree" 910 | 3 "Disagree" 911 | 2 "Agree" 912 | 1 "Strongly agree" 913 | 0 "Not applicable" 914 | ; 915 | label define gsp085x 916 | 9 "No answer" 917 | 8 "Don't know" 918 | 2 "No" 919 | 1 "Yes" 920 | 0 "Not applicable" 921 | ; 922 | label define gsp086x 923 | 9 "No answer" 924 | 8 "Don't know" 925 | 3 "Refused" 926 | 2 "No" 927 | 1 "Yes" 928 | 0 "Not applicable" 929 | ; 930 | label define gsp087x 931 | 9 "No answer" 932 | 8 "Don't know" 933 | 3 "Refused" 934 | 2 "No" 935 | 1 "Yes" 936 | 0 "Not applicable" 937 | ; 938 | label define gsp088x 939 | 9 "No answer" 940 | 8 "Don't know" 941 | 4 "Neither" 942 | 3 "Both" 943 | 2 "Spouse" 944 | 1 "Resp" 945 | 0 "Not applicable" 946 | ; 947 | label define gsp089x 948 | 9 "No answer" 949 | 6 "Cellphone" 950 | 5 "Phone,dk where" 951 | 4 "Phone elsewhere" 952 | 3 "Phone in home" 953 | 2 "Refused" 954 | 1 "No phone" 955 | 0 "Not applicable" 956 | ; 957 | label define gsp090x 958 | 9 "No answer" 959 | 8 "Don't know" 960 | 2 "No" 961 | 1 "Yes" 962 | 0 "Not applicable" 963 | ; 964 | label define gsp091x 965 | 999999 "No answer" 966 | 999998 "Dont know" 967 | 0 "Not applicable" 968 | ; 969 | label define gsp092x 970 | 9999 "No answer" 971 | 0 "Not applicable" 972 | ; 973 | label define gsp093x 974 | 9999 "No answer" 975 | 0 "Not applicable" 976 | ; 977 | label define gsp094x 978 | 4 "Ballot d" 979 | 3 "Ballot c" 980 | 2 "Ballot b" 981 | 1 "Ballot a" 982 | 0 "Not applicable" 983 | ; 984 | label define gsp095x 985 | 9 "No answer" 986 | 8 "8 or more" 987 | ; 988 | label define gsp096x 989 | 9 "No answer" 990 | 8 "Don't know" 991 | 2 "No" 992 | 1 "Yes" 993 | 0 "Not applicable" 994 | ; 995 | label define gsp097x 996 | 9 "No answer" 997 | 8 "Cant choose" 998 | 4 "Not a threat at all" 999 | 3 "Not serious" 1000 | 2 "Fairly serious" 1001 | 1 "Very serious threat" 1002 | 0 "Not applicable" 1003 | ; 1004 | label define gsp098x 1005 | 9 "No answer" 1006 | 8 "Don't know" 1007 | 2 "Not allowed" 1008 | 1 "Allowed" 1009 | 0 "Not applicable" 1010 | ; 1011 | label define gsp099x 1012 | 9 "No answer" 1013 | 8 "Don't know" 1014 | 2 "Not allowed" 1015 | 1 "Allowed" 1016 | 0 "Not applicable" 1017 | ; 1018 | label define gsp100x 1019 | 9 "No answer" 1020 | 8 "Don't know" 1021 | 2 "Not allowed" 1022 | 1 "Allowed" 1023 | 0 "Not applicable" 1024 | ; 1025 | label define gsp101x 1026 | 9 "No answer" 1027 | 8 "Dont know" 1028 | 2 "Not allowed" 1029 | 1 "Yes, allowed" 1030 | 0 "Not applicable" 1031 | ; 1032 | 1033 | 1034 | label values agewed gsp001x; 1035 | label values divorce gsp002x; 1036 | label values sibs gsp003x; 1037 | label values childs gsp004x; 1038 | label values age gsp005x; 1039 | label values educ gsp006x; 1040 | label values paeduc gsp007x; 1041 | label values maeduc gsp008x; 1042 | label values speduc gsp009x; 1043 | label values degree gsp010x; 1044 | label values padeg gsp011x; 1045 | label values madeg gsp012x; 1046 | label values spdeg gsp013x; 1047 | label values sex gsp014x; 1048 | label values race gsp015x; 1049 | label values res16 gsp016x; 1050 | label values reg16 gsp017x; 1051 | label values srcbelt gsp018x; 1052 | label values partyid gsp019x; 1053 | label values pres04 gsp020x; 1054 | label values pres08 gsp021x; 1055 | label values pres12 gsp022x; 1056 | label values polviews gsp023x; 1057 | label values natspac gsp024x; 1058 | label values natenvir gsp025x; 1059 | label values natheal gsp026x; 1060 | label values natcity gsp027x; 1061 | label values natcrime gsp028x; 1062 | label values natdrug gsp029x; 1063 | label values nateduc gsp030x; 1064 | label values natrace gsp031x; 1065 | label values natarms gsp032x; 1066 | label values nataid gsp033x; 1067 | label values natfare gsp034x; 1068 | label values spkath gsp035x; 1069 | label values colath gsp036x; 1070 | label values libath gsp037x; 1071 | label values spkhomo gsp038x; 1072 | label values colhomo gsp039x; 1073 | label values libhomo gsp040x; 1074 | label values cappun gsp041x; 1075 | label values gunlaw gsp042x; 1076 | label values grass gsp043x; 1077 | label values relig gsp044x; 1078 | label values fund gsp045x; 1079 | label values attend gsp046x; 1080 | label values reliten gsp047x; 1081 | label values postlife gsp048x; 1082 | label values pray gsp049x; 1083 | label values relig16 gsp050x; 1084 | label values fund16 gsp051x; 1085 | label values sprel16 gsp052x; 1086 | label values prayer gsp053x; 1087 | label values bible gsp054x; 1088 | label values racmar gsp055x; 1089 | label values racpres gsp056x; 1090 | label values affrmact gsp057x; 1091 | label values happy gsp058x; 1092 | label values hapmar gsp059x; 1093 | label values health gsp060x; 1094 | label values life gsp061x; 1095 | label values helpful gsp062x; 1096 | label values fair gsp063x; 1097 | label values trust gsp064x; 1098 | label values conclerg gsp065x; 1099 | label values coneduc gsp066x; 1100 | label values confed gsp067x; 1101 | label values conpress gsp068x; 1102 | label values conjudge gsp069x; 1103 | label values conlegis gsp070x; 1104 | label values conarmy gsp071x; 1105 | label values satjob gsp072x; 1106 | label values class_ gsp073x; 1107 | label values satfin gsp074x; 1108 | label values finrela gsp075x; 1109 | label values union_ gsp076x; 1110 | label values fepol gsp077x; 1111 | label values abany gsp078x; 1112 | label values chldidel gsp079x; 1113 | label values sexeduc gsp080x; 1114 | label values premarsx gsp081x; 1115 | label values xmarsex gsp082x; 1116 | label values homosex gsp083x; 1117 | label values spanking gsp084x; 1118 | label values fear gsp085x; 1119 | label values owngun gsp086x; 1120 | label values pistol gsp087x; 1121 | label values hunt gsp088x; 1122 | label values phone gsp089x; 1123 | label values memchurh gsp090x; 1124 | label values realinc gsp091x; 1125 | label values cohort gsp092x; 1126 | label values marcohrt gsp093x; 1127 | label values ballot gsp094x; 1128 | label values adults gsp095x; 1129 | label values compuse gsp096x; 1130 | label values databank gsp097x; 1131 | label values spkrac gsp098x; 1132 | label values spkcom gsp099x; 1133 | label values spkmil gsp100x; 1134 | label values spkmslm gsp101x; 1135 | 1136 | 1137 | -------------------------------------------------------------------------------- /gss_validate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exploratory Data Analysis\n", 8 | "\n", 9 | "Load and validate GSS data\n", 10 | "\n", 11 | "Allen Downey\n", 12 | "\n", 13 | "[MIT License](https://en.wikipedia.org/wiki/MIT_License)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 33, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "%matplotlib inline\n", 23 | "\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "sns.set(style='white')\n", 30 | "\n", 31 | "import utils\n", 32 | "from utils import decorate\n", 33 | "from distribution import Pmf, Cdf" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 34, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "def underride(d, **options):\n", 43 | " \"\"\"Add key-value pairs to d only if key is not in d.\n", 44 | "\n", 45 | " d: dictionary\n", 46 | " options: keyword args to add to d\n", 47 | " \"\"\"\n", 48 | " for key, val in options.items():\n", 49 | " d.setdefault(key, val)\n", 50 | "\n", 51 | " return d" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Loading and validation\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 35, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import re\n", 68 | "\n", 69 | "class FixedWidthVariables(object):\n", 70 | " \"\"\"Represents a set of variables in a fixed width file.\"\"\"\n", 71 | "\n", 72 | " def __init__(self, variables, index_base=0):\n", 73 | " \"\"\"Initializes.\n", 74 | "\n", 75 | " variables: DataFrame\n", 76 | " index_base: are the indices 0 or 1 based?\n", 77 | "\n", 78 | " Attributes:\n", 79 | " colspecs: list of (start, end) index tuples\n", 80 | " names: list of string variable names\n", 81 | " \"\"\"\n", 82 | " self.variables = variables\n", 83 | "\n", 84 | " # note: by default, subtract 1 from colspecs\n", 85 | " self.colspecs = variables[['start', 'end']] - index_base\n", 86 | "\n", 87 | " # convert colspecs to a list of pair of int\n", 88 | " self.colspecs = self.colspecs.astype(np.int).values.tolist()\n", 89 | " self.names = variables['name']\n", 90 | "\n", 91 | " def ReadFixedWidth(self, filename, **options):\n", 92 | " \"\"\"Reads a fixed width ASCII file.\n", 93 | "\n", 94 | " filename: string filename\n", 95 | "\n", 96 | " returns: DataFrame\n", 97 | " \"\"\"\n", 98 | " df = pd.read_fwf(filename,\n", 99 | " colspecs=self.colspecs, \n", 100 | " names=self.names,\n", 101 | " **options)\n", 102 | " return df\n", 103 | "\n", 104 | "\n", 105 | "def ReadStataDct(dct_file, **options):\n", 106 | " \"\"\"Reads a Stata dictionary file.\n", 107 | "\n", 108 | " dct_file: string filename\n", 109 | " options: dict of options passed to open()\n", 110 | "\n", 111 | " returns: FixedWidthVariables object\n", 112 | " \"\"\"\n", 113 | " type_map = dict(byte=int, int=int, long=int, float=float, \n", 114 | " double=float, numeric=float)\n", 115 | "\n", 116 | " var_info = []\n", 117 | " with open(dct_file, **options) as f:\n", 118 | " for line in f:\n", 119 | " match = re.search( r'_column\\(([^)]*)\\)', line)\n", 120 | " if not match:\n", 121 | " continue\n", 122 | " start = int(match.group(1))\n", 123 | " t = line.split()\n", 124 | " vtype, name, fstring = t[1:4]\n", 125 | " name = name.lower()\n", 126 | " if vtype.startswith('str'):\n", 127 | " vtype = str\n", 128 | " else:\n", 129 | " vtype = type_map[vtype]\n", 130 | " long_desc = ' '.join(t[4:]).strip('\"')\n", 131 | " var_info.append((start, vtype, name, fstring, long_desc))\n", 132 | " \n", 133 | " columns = ['start', 'type', 'name', 'fstring', 'desc']\n", 134 | " variables = pd.DataFrame(var_info, columns=columns)\n", 135 | "\n", 136 | " # fill in the end column by shifting the start column\n", 137 | " variables['end'] = variables.start.shift(-1)\n", 138 | " variables.loc[len(variables)-1, 'end'] = 0\n", 139 | "\n", 140 | " dct = FixedWidthVariables(variables, index_base=1)\n", 141 | " return dct\n", 142 | "\n", 143 | "def read_gss(dirname):\n", 144 | " \"\"\"Reads GSS files from the given directory.\n", 145 | " \n", 146 | " dirname: string\n", 147 | " \n", 148 | " returns: DataFrame\n", 149 | " \"\"\"\n", 150 | " dct = ReadStataDct(dirname + '/GSS.dct')\n", 151 | " gss = dct.ReadFixedWidth(dirname + '/GSS.dat.gz',\n", 152 | " compression='gzip')\n", 153 | " return gss" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 36, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "(62466, 101)\n" 166 | ] 167 | }, 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | "
yearid_ageweddivorcesibschildsageeducpaeducmaeduc...memchurhrealinccohortmarcohrtballotwtssalladultscompusedatabankwtssnr
019721003023161097...018951.01949000.44461001.0
11972221245701088...024366.01902192300.88932001.0
21972320254481288...024366.01924194400.88932001.0
3197242425027171612...030458.01945196900.88932001.0
41972522222611288...050763.01911193300.88932001.0
\n", 335 | "

5 rows × 101 columns

\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " year id_ agewed divorce sibs childs age educ paeduc maeduc \\\n", 340 | "0 1972 1 0 0 3 0 23 16 10 97 \n", 341 | "1 1972 2 21 2 4 5 70 10 8 8 \n", 342 | "2 1972 3 20 2 5 4 48 12 8 8 \n", 343 | "3 1972 4 24 2 5 0 27 17 16 12 \n", 344 | "4 1972 5 22 2 2 2 61 12 8 8 \n", 345 | "\n", 346 | " ... memchurh realinc cohort marcohrt ballot wtssall adults \\\n", 347 | "0 ... 0 18951.0 1949 0 0 0.4446 1 \n", 348 | "1 ... 0 24366.0 1902 1923 0 0.8893 2 \n", 349 | "2 ... 0 24366.0 1924 1944 0 0.8893 2 \n", 350 | "3 ... 0 30458.0 1945 1969 0 0.8893 2 \n", 351 | "4 ... 0 50763.0 1911 1933 0 0.8893 2 \n", 352 | "\n", 353 | " compuse databank wtssnr \n", 354 | "0 0 0 1.0 \n", 355 | "1 0 0 1.0 \n", 356 | "2 0 0 1.0 \n", 357 | "3 0 0 1.0 \n", 358 | "4 0 0 1.0 \n", 359 | "\n", 360 | "[5 rows x 101 columns]" 361 | ] 362 | }, 363 | "execution_count": 36, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "gss = read_gss('gss_eda')\n", 370 | "print(gss.shape)\n", 371 | "gss.head()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 37, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/plain": [ 382 | "0 21165\n", 383 | "1 30936\n", 384 | "2 9536\n", 385 | "8 722\n", 386 | "9 107\n", 387 | "Name: gunlaw, dtype: int64" 388 | ] 389 | }, 390 | "execution_count": 37, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "gss.gunlaw.value_counts().sort_index()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 38, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "0 24364\n", 408 | "1 10946\n", 409 | "2 25153\n", 410 | "8 1892\n", 411 | "9 111\n", 412 | "Name: grass, dtype: int64" 413 | ] 414 | }, 415 | "execution_count": 38, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "gss.grass.value_counts().sort_index()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 39, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "def replace_invalid(df):\n", 431 | " df.realinc.replace([0], np.nan, inplace=True) \n", 432 | " df.educ.replace([98,99], np.nan, inplace=True)\n", 433 | " # 89 means 89 or older\n", 434 | " df.age.replace([98, 99], np.nan, inplace=True) \n", 435 | " df.cohort.replace([9999], np.nan, inplace=True)\n", 436 | " df.adults.replace([9], np.nan, inplace=True)\n", 437 | " df.gunlaw.replace([0,8,9], np.nan, inplace=True)\n", 438 | " df.grass.replace([0,8,9], np.nan, inplace=True)\n", 439 | "\n", 440 | "replace_invalid(gss)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "The proportion of women in this dataset is slightly higher than it probably is in the population, even after weighting.\n", 448 | "\n", 449 | "The issue seems to be that the GSS excludes people living in institutions, including prisons and army housing, which disproportionaly excludes men." 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 40, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "sex = gss.loc[gss.year==2010, 'sex']" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 41, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "data": { 468 | "text/plain": [ 469 | "1 0.43591\n", 470 | "2 0.56409\n", 471 | "Name: Pmf, dtype: float64" 472 | ] 473 | }, 474 | "execution_count": 41, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "pmf = Pmf([1,2])\n", 481 | "pmf[1] = np.sum(sex==1)\n", 482 | "pmf[2] = np.sum(sex==2)\n", 483 | "pmf.normalize()\n", 484 | "pmf" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 42, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "data": { 494 | "text/plain": [ 495 | "1 0.451634\n", 496 | "2 0.548366\n", 497 | "Name: Pmf, dtype: float64" 498 | ] 499 | }, 500 | "execution_count": 42, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "pmf = Pmf([1,2])\n", 507 | "pmf[1] = np.sum((sex==1) * gss.wtssall)\n", 508 | "pmf[2] = np.sum((sex==2) * gss.wtssall)\n", 509 | "pmf.normalize()\n", 510 | "pmf" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 43, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "1 0.453784\n", 522 | "2 0.546216\n", 523 | "Name: Pmf, dtype: float64" 524 | ] 525 | }, 526 | "execution_count": 43, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "pmf = Pmf([1,2])\n", 533 | "pmf[1] = np.sum((sex==1) * gss.wtssnr)\n", 534 | "pmf[2] = np.sum((sex==2) * gss.wtssnr)\n", 535 | "pmf.normalize()\n", 536 | "pmf" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 44, 542 | "metadata": {}, 543 | "outputs": [ 544 | { 545 | "data": { 546 | "text/plain": [ 547 | "1 0.463868\n", 548 | "2 0.536132\n", 549 | "Name: Pmf, dtype: float64" 550 | ] 551 | }, 552 | "execution_count": 44, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "pmf = Pmf([1,2])\n", 559 | "pmf[1] = np.sum((sex==1) * gss.wtssall * gss.adults)\n", 560 | "pmf[2] = np.sum((sex==2) * gss.wtssall * gss.adults)\n", 561 | "pmf.normalize()\n", 562 | "pmf" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 45, 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [ 573 | "1 0.485397\n", 574 | "2 0.514603\n", 575 | "Name: Pmf, dtype: float64" 576 | ] 577 | }, 578 | "execution_count": 45, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "pmf = Pmf([1,2])\n", 585 | "pmf[1] = 114173831\n", 586 | "pmf[2] = 121043794\n", 587 | "pmf.normalize()\n", 588 | "pmf" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 46, 594 | "metadata": { 595 | "scrolled": true 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "gss['wtsample'] = gss['wtssall']\n", 600 | "gss.loc[gss.sex==1, 'wtsample'] *= 1.145" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 47, 606 | "metadata": {}, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "text/plain": [ 611 | "1 0.485338\n", 612 | "2 0.514662\n", 613 | "Name: Pmf, dtype: float64" 614 | ] 615 | }, 616 | "execution_count": 47, 617 | "metadata": {}, 618 | "output_type": "execute_result" 619 | } 620 | ], 621 | "source": [ 622 | "pmf = Pmf([1,2])\n", 623 | "pmf[1] = np.sum((sex==1) * gss.wtsample)\n", 624 | "pmf[2] = np.sum((sex==2) * gss.wtsample)\n", 625 | "pmf.normalize()\n", 626 | "pmf" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 49, 632 | "metadata": { 633 | "scrolled": true 634 | }, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/html": [ 639 | "
\n", 640 | "\n", 653 | "\n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | "
yearagecohortsexraceeducrealincgunlawgrasswtssall
0197223.01949.02116.018951.01.0NaN0.4446
1197270.01902.01110.024366.01.0NaN0.8893
2197248.01924.02112.024366.01.0NaN0.8893
3197227.01945.02117.030458.01.0NaN0.8893
4197261.01911.02112.050763.01.0NaN0.8893
\n", 737 | "
" 738 | ], 739 | "text/plain": [ 740 | " year age cohort sex race educ realinc gunlaw grass wtssall\n", 741 | "0 1972 23.0 1949.0 2 1 16.0 18951.0 1.0 NaN 0.4446\n", 742 | "1 1972 70.0 1902.0 1 1 10.0 24366.0 1.0 NaN 0.8893\n", 743 | "2 1972 48.0 1924.0 2 1 12.0 24366.0 1.0 NaN 0.8893\n", 744 | "3 1972 27.0 1945.0 2 1 17.0 30458.0 1.0 NaN 0.8893\n", 745 | "4 1972 61.0 1911.0 2 1 12.0 50763.0 1.0 NaN 0.8893" 746 | ] 747 | }, 748 | "execution_count": 49, 749 | "metadata": {}, 750 | "output_type": "execute_result" 751 | } 752 | ], 753 | "source": [ 754 | "variables = ['year', 'age', 'cohort', 'sex', 'race', \n", 755 | " 'educ', 'realinc', 'gunlaw', 'grass', 'wtssall']\n", 756 | "\n", 757 | "subset = gss[variables]\n", 758 | "subset.head()" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": 50, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "# drop the 65 respondents with unknown household size\n", 768 | "# subset = subset.dropna(subset=['adults'])" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 51, 774 | "metadata": {}, 775 | "outputs": [], 776 | "source": [ 777 | "np.random.seed(19)\n", 778 | "sample = utils.resample_by_year(subset, 'wtssall')" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 52, 784 | "metadata": {}, 785 | "outputs": [], 786 | "source": [ 787 | "!rm gss.hdf5\n", 788 | "sample.to_hdf('gss.hdf5', 'gss')" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": 53, 794 | "metadata": {}, 795 | "outputs": [ 796 | { 797 | "name": "stdout", 798 | "output_type": "stream", 799 | "text": [ 800 | "CPU times: user 16 ms, sys: 4 ms, total: 20 ms\n", 801 | "Wall time: 18.8 ms\n" 802 | ] 803 | }, 804 | { 805 | "data": { 806 | "text/plain": [ 807 | "(62466, 10)" 808 | ] 809 | }, 810 | "execution_count": 53, 811 | "metadata": {}, 812 | "output_type": "execute_result" 813 | } 814 | ], 815 | "source": [ 816 | "%time gss = pd.read_hdf('gss.hdf5', 'gss')\n", 817 | "gss.shape" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 54, 823 | "metadata": {}, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "text/html": [ 828 | "
\n", 829 | "\n", 842 | "\n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | "
yearagecohortsexraceeducrealincgunlawgrasswtssall
0197226.01946.01118.013537.02.0NaN0.8893
1197238.01934.02112.018951.01.0NaN0.4446
2197257.01915.01112.030458.01.0NaN1.3339
3197261.01911.02114.037226.01.0NaN0.8893
4197259.01913.01112.030458.02.0NaN0.8893
\n", 926 | "
" 927 | ], 928 | "text/plain": [ 929 | " year age cohort sex race educ realinc gunlaw grass wtssall\n", 930 | "0 1972 26.0 1946.0 1 1 18.0 13537.0 2.0 NaN 0.8893\n", 931 | "1 1972 38.0 1934.0 2 1 12.0 18951.0 1.0 NaN 0.4446\n", 932 | "2 1972 57.0 1915.0 1 1 12.0 30458.0 1.0 NaN 1.3339\n", 933 | "3 1972 61.0 1911.0 2 1 14.0 37226.0 1.0 NaN 0.8893\n", 934 | "4 1972 59.0 1913.0 1 1 12.0 30458.0 2.0 NaN 0.8893" 935 | ] 936 | }, 937 | "execution_count": 54, 938 | "metadata": {}, 939 | "output_type": "execute_result" 940 | } 941 | ], 942 | "source": [ 943 | "gss.head()" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 24, 949 | "metadata": {}, 950 | "outputs": [ 951 | { 952 | "data": { 953 | "text/plain": [ 954 | "count 62466.000000\n", 955 | "mean 1994.072359\n", 956 | "std 12.937941\n", 957 | "min 1972.000000\n", 958 | "25% 1984.000000\n", 959 | "50% 1994.000000\n", 960 | "75% 2006.000000\n", 961 | "max 2016.000000\n", 962 | "Name: year, dtype: float64" 963 | ] 964 | }, 965 | "execution_count": 24, 966 | "metadata": {}, 967 | "output_type": "execute_result" 968 | } 969 | ], 970 | "source": [ 971 | "gss['year'].describe()" 972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": 25, 977 | "metadata": {}, 978 | "outputs": [ 979 | { 980 | "data": { 981 | "text/plain": [ 982 | "count 62466.000000\n", 983 | "mean 1.541415\n", 984 | "std 0.498286\n", 985 | "min 1.000000\n", 986 | "25% 1.000000\n", 987 | "50% 2.000000\n", 988 | "75% 2.000000\n", 989 | "max 2.000000\n", 990 | "Name: sex, dtype: float64" 991 | ] 992 | }, 993 | "execution_count": 25, 994 | "metadata": {}, 995 | "output_type": "execute_result" 996 | } 997 | ], 998 | "source": [ 999 | "gss['sex'].describe()" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 26, 1005 | "metadata": {}, 1006 | "outputs": [ 1007 | { 1008 | "data": { 1009 | "text/plain": [ 1010 | "count 62281.000000\n", 1011 | "mean 44.648320\n", 1012 | "std 17.072244\n", 1013 | "min 18.000000\n", 1014 | "25% 30.000000\n", 1015 | "50% 43.000000\n", 1016 | "75% 57.000000\n", 1017 | "max 89.000000\n", 1018 | "Name: age, dtype: float64" 1019 | ] 1020 | }, 1021 | "execution_count": 26, 1022 | "metadata": {}, 1023 | "output_type": "execute_result" 1024 | } 1025 | ], 1026 | "source": [ 1027 | "gss['age'].describe()" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 27, 1033 | "metadata": {}, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "text/plain": [ 1038 | "count 62282.000000\n", 1039 | "mean 1949.429996\n", 1040 | "std 20.734302\n", 1041 | "min 1883.000000\n", 1042 | "25% 1935.000000\n", 1043 | "50% 1951.000000\n", 1044 | "75% 1964.000000\n", 1045 | "max 1998.000000\n", 1046 | "Name: cohort, dtype: float64" 1047 | ] 1048 | }, 1049 | "execution_count": 27, 1050 | "metadata": {}, 1051 | "output_type": "execute_result" 1052 | } 1053 | ], 1054 | "source": [ 1055 | "gss['cohort'].describe()" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 28, 1061 | "metadata": {}, 1062 | "outputs": [ 1063 | { 1064 | "data": { 1065 | "text/plain": [ 1066 | "count 62466.000000\n", 1067 | "mean 1.254955\n", 1068 | "std 0.554694\n", 1069 | "min 1.000000\n", 1070 | "25% 1.000000\n", 1071 | "50% 1.000000\n", 1072 | "75% 1.000000\n", 1073 | "max 3.000000\n", 1074 | "Name: race, dtype: float64" 1075 | ] 1076 | }, 1077 | "execution_count": 28, 1078 | "metadata": {}, 1079 | "output_type": "execute_result" 1080 | } 1081 | ], 1082 | "source": [ 1083 | "gss['race'].describe()" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": 29, 1089 | "metadata": {}, 1090 | "outputs": [ 1091 | { 1092 | "data": { 1093 | "text/plain": [ 1094 | "count 62304.000000\n", 1095 | "mean 12.831311\n", 1096 | "std 3.117027\n", 1097 | "min 0.000000\n", 1098 | "25% 12.000000\n", 1099 | "50% 12.000000\n", 1100 | "75% 15.000000\n", 1101 | "max 20.000000\n", 1102 | "Name: educ, dtype: float64" 1103 | ] 1104 | }, 1105 | "execution_count": 29, 1106 | "metadata": {}, 1107 | "output_type": "execute_result" 1108 | } 1109 | ], 1110 | "source": [ 1111 | "gss['educ'].describe()" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": 30, 1117 | "metadata": {}, 1118 | "outputs": [ 1119 | { 1120 | "data": { 1121 | "text/plain": [ 1122 | "count 55499.000000\n", 1123 | "mean 34702.430164\n", 1124 | "std 30665.659411\n", 1125 | "min 234.000000\n", 1126 | "25% 13750.000000\n", 1127 | "50% 26015.000000\n", 1128 | "75% 43426.000000\n", 1129 | "max 162607.000000\n", 1130 | "Name: realinc, dtype: float64" 1131 | ] 1132 | }, 1133 | "execution_count": 30, 1134 | "metadata": {}, 1135 | "output_type": "execute_result" 1136 | } 1137 | ], 1138 | "source": [ 1139 | "gss['realinc'].describe()" 1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": 31, 1145 | "metadata": {}, 1146 | "outputs": [ 1147 | { 1148 | "data": { 1149 | "text/plain": [ 1150 | "count 62466.000000\n", 1151 | "mean 1.213340\n", 1152 | "std 0.585544\n", 1153 | "min 0.411898\n", 1154 | "25% 0.918400\n", 1155 | "50% 1.062100\n", 1156 | "75% 1.515500\n", 1157 | "max 8.739876\n", 1158 | "Name: wtssall, dtype: float64" 1159 | ] 1160 | }, 1161 | "execution_count": 31, 1162 | "metadata": {}, 1163 | "output_type": "execute_result" 1164 | } 1165 | ], 1166 | "source": [ 1167 | "gss['wtssall'].describe()" 1168 | ] 1169 | } 1170 | ], 1171 | "metadata": { 1172 | "kernelspec": { 1173 | "display_name": "Python 3", 1174 | "language": "python", 1175 | "name": "python3" 1176 | }, 1177 | "language_info": { 1178 | "codemirror_mode": { 1179 | "name": "ipython", 1180 | "version": 3 1181 | }, 1182 | "file_extension": ".py", 1183 | "mimetype": "text/x-python", 1184 | "name": "python", 1185 | "nbconvert_exporter": "python", 1186 | "pygments_lexer": "ipython3", 1187 | "version": "3.6.7" 1188 | } 1189 | }, 1190 | "nbformat": 4, 1191 | "nbformat_minor": 1 1192 | } 1193 | --------------------------------------------------------------------------------