├── .gitattributes
├── nsfg.hdf5
├── data
├── GSS.dat.gz
├── 2013_2015_FemPregData.dat.gz
├── GSS.do
├── GSS.dct
└── 2013_2015_FemPregSetup.dct
├── gss_eda
├── GSS.dat.gz
├── GSS.dct
└── GSS.do
├── README.md
├── LLCP2017.ASC.gz
├── environment.yml
├── LICENSE
├── .gitignore
├── distribution.py
├── utils.py
├── eds01_gss_clean.ipynb
└── gss_validate.ipynb
/.gitattributes:
--------------------------------------------------------------------------------
1 | LLCP2017.ASC.gz filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/nsfg.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/nsfg.hdf5
--------------------------------------------------------------------------------
/data/GSS.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/data/GSS.dat.gz
--------------------------------------------------------------------------------
/gss_eda/GSS.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/gss_eda/GSS.dat.gz
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ExploratoryDataAnalysis
2 | Repository for an online class on Exploratory Data Analysis in Python
3 |
--------------------------------------------------------------------------------
/data/2013_2015_FemPregData.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/data/2013_2015_FemPregData.dat.gz
--------------------------------------------------------------------------------
/LLCP2017.ASC.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b13013ed06f8a69eb58b6c352208b4f5d8cfd0780896873ca39d81efcfb97a4c
3 | size 69310674
4 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ExploratoryDataAnalysis
2 |
3 | dependencies:
4 | - python=3.7
5 | - jupyter
6 | - numpy
7 | - matplotlib
8 | - seaborn
9 | - pandas
10 | - pytables
11 | - scipy
12 | - scikit-learn
13 | - pip
14 | - pip:
15 | - empiricaldist
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Allen Downey
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/distribution.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | import matplotlib.pyplot as plt
5 |
6 |
def underride(d, **options):
    """Fill d with defaults, leaving keys that are already present alone.

    d: dictionary, updated in place
    options: keyword args giving the default value for each key

    returns: the same dictionary d
    """
    for name in options:
        if name not in d:
            d[name] = options[name]
    return d
17 |
18 |
class Pmf(pd.Series):
    """Probability mass function backed by a Series.

    The index holds the distinct quantities found in the input and the
    values hold their counts, or their probabilities once normalized.
    """

    def __init__(self, seq, name='Pmf', normalize=True):
        """Tallies the values in seq.

        seq: sequence of values
        name: string name of the Series, also the default plot label
        normalize: whether to divide the counts by their total
        """
        counts = pd.Series(seq).value_counts().sort_index()
        super().__init__(counts, name=name)
        if not normalize:
            return
        self.normalize()

    @property
    def qs(self):
        """The quantities (index) as an array."""
        quantities = self.index
        return quantities.values

    @property
    def ps(self):
        """The values stored for each quantity, as an array."""
        return self.values

    def __call__(self, qs):
        """Looks up qs in the index; returns 0 when it is not there."""
        return self.get(qs, 0)

    def normalize(self):
        """Divides every value by the sum of the values, in place."""
        total = self.sum()
        self /= total

    def bar(self, **options):
        """Draws the Pmf as a bar chart.

        options: passed to plt.bar; label defaults to the Series name
        """
        options.setdefault('label', self.name)
        plt.bar(self.index, self.values, **options)

    def plot(self, **options):
        """Draws the Pmf as a line.

        options: passed to plt.plot; label defaults to the Series name
        """
        options.setdefault('label', self.name)
        plt.plot(self.index, self.values, **options)
48 |
49 |
50 |
51 | from scipy.interpolate import interp1d
52 |
class Cdf(pd.Series):
    """Cumulative distribution function backed by a Series.

    The index holds the quantities and the values hold the running
    total of the normalized Pmf built from the input.
    """

    def __init__(self, seq, name='Cdf'):
        """Builds the Cdf by accumulating the Pmf of seq.

        seq: sequence of values
        name: string name of the Series, also the default plot label
        """
        cumulative = Pmf(seq).cumsum()
        super().__init__(cumulative, name=name)

    @property
    def qs(self):
        """The quantities (index)."""
        return self.index

    @property
    def ps(self):
        """The cumulative probabilities, as an array."""
        return self.values

    @property
    def forward(self):
        """Interpolator from quantities to cumulative probabilities.

        Uses the previous known point; out-of-range queries give 0 on
        the low side and 1 on the high side.
        """
        settings = dict(kind='previous',
                        assume_sorted=True,
                        bounds_error=False,
                        fill_value=(0, 1))
        return interp1d(self.qs, self.ps, **settings)

    @property
    def inverse(self):
        """Interpolator from cumulative probabilities to quantities.

        Uses the next known point; queries below the range give the
        first quantity and queries above it give NaN.
        """
        settings = dict(kind='next',
                        assume_sorted=True,
                        bounds_error=False,
                        fill_value=(self.qs[0], np.nan))
        return interp1d(self.ps, self.qs, **settings)

    def __call__(self, qs):
        """Evaluates the Cdf at the given quantities."""
        to_prob = self.forward
        return to_prob(qs)

    def percentile_rank(self, qs):
        """Evaluates the Cdf at qs and scales the result to 0-100."""
        return 100 * self.forward(qs)

    def percentile(self, percentile_ranks):
        """Maps percentile ranks (0-100) back to quantities."""
        fractions = percentile_ranks / 100
        return self.inverse(fractions)

    def step(self, **options):
        """Draws the Cdf as a step function.

        options: passed to plt.step; label defaults to the Series name
                 and where defaults to 'post'
        """
        options.setdefault('label', self.name)
        options.setdefault('where', 'post')
        plt.step(self.index, self.values, **options)

    def plot(self, **options):
        """Draws the Cdf as a line.

        options: passed to plt.plot; label defaults to the Series name
        """
        options.setdefault('label', self.name)
        plt.plot(self.index, self.values, **options)
--------------------------------------------------------------------------------
/data/GSS.do:
--------------------------------------------------------------------------------
1 | #delimit ;
2 |
3 | infix
4 | year 1 - 20
5 | gun 21 - 40
6 | gunage 41 - 60
7 | gunnum 61 - 80
8 | owngun 81 - 100
9 | rowngun 101 - 120
10 | realinc 121 - 140
11 | conrinc 141 - 160
12 | hispanic 161 - 180
13 | cohort 181 - 200
14 | ballot 201 - 220
15 | gunlaw 221 - 240
16 | cappun 241 - 260
17 | id_ 261 - 280
18 | age 281 - 300
19 | educ 301 - 320
20 | sex 321 - 340
21 | race 341 - 360
22 | income 361 - 380
23 | rincome 381 - 400
24 | srcbelt 401 - 420
25 | polviews 421 - 440
26 | natcrime 441 - 460
27 | wtssall 461 - 480
28 | using GSS.dat;
29 |
30 | label variable year "Gss year for this respondent ";
31 | label variable gun "Ever threatened with gun or shot at";
32 | label variable gunage "Threatened or shot at as child or adult";
33 | label variable gunnum "If threatened or shot at--how many times";
34 | label variable owngun "Have gun in home";
35 | label variable rowngun "Does gun belong to r";
36 | label variable realinc "Family income in constant $";
37 | label variable conrinc "Respondent income in constant dollars";
38 | label variable hispanic "Hispanic specified";
39 | label variable cohort "Year of birth";
40 | label variable ballot "Ballot used for interview";
41 | label variable gunlaw "Favor or oppose gun permits";
42 | label variable cappun "Favor or oppose death penalty for murder";
43 | label variable id_ "Respondent id number";
44 | label variable age "Age of respondent";
45 | label variable educ "Highest year of school completed";
46 | label variable sex "Respondents sex";
47 | label variable race "Race of respondent";
48 | label variable income "Total family income";
49 | label variable rincome "Respondents income";
50 | label variable srcbelt "Src beltcode";
51 | label variable polviews "Think of self as liberal or conservative";
52 | label variable natcrime "Halting rising crime rate";
53 | label variable wtssall "Weight variable";
54 |
55 |
56 | label define gsp001x
57 | 9 "No answer"
58 | 8 "Don't know"
59 | 2 "No"
60 | 1 "Yes"
61 | 0 "Not applicable"
62 | ;
63 | label define gsp002x
64 | 9 "No answer"
65 | 8 "Don't know"
66 | 3 "Both"
67 | 2 "Adult"
68 | 1 "Child"
69 | 0 "Not applicable"
70 | ;
71 | label define gsp003x
72 | 9 "No answer"
73 | 8 "Not sure"
74 | 3 "4+ times"
75 | 2 "2-3 times"
76 | 1 "Once"
77 | 0 "Not applicable"
78 | ;
79 | label define gsp004x
80 | 9 "No answer"
81 | 8 "Don't know"
82 | 3 "Refused"
83 | 2 "No"
84 | 1 "Yes"
85 | 0 "Not applicable"
86 | ;
87 | label define gsp005x
88 | 9 "No answer"
89 | 8 "Don't know"
90 | 3 "Refused"
91 | 2 "No"
92 | 1 "Yes"
93 | 0 "Not applicable"
94 | ;
95 | label define gsp006x
96 | 999999 "No answer"
97 | 999998 "Dont know"
98 | 0 "Not applicable"
99 | ;
100 | label define gsp007x
101 | 999999 "No answer"
102 | 999998 "Dont know"
103 | 0 "Not applicable"
104 | ;
105 | label define gsp008x
106 | 99 "No answer"
107 | 98 "Don't know"
108 | 50 "Other, not specified"
109 | 47 "Hispanic"
110 | 46 "Latino/a"
111 | 45 "Latin"
112 | 41 "South american"
113 | 40 "Latin american"
114 | 35 "Filipino/a"
115 | 31 "Basque"
116 | 30 "Spanish"
117 | 25 "Chilean"
118 | 24 "Argentinian"
119 | 23 "Venezuelan"
120 | 22 "Columbian"
121 | 21 "Equadorian"
122 | 20 "Peruvian"
123 | 16 "West indian"
124 | 15 "Dominican"
125 | 11 "Honduran"
126 | 10 "Central american"
127 | 9 "Costa rican"
128 | 8 "Nicaraguan"
129 | 7 "Panamanian"
130 | 6 "Guatemalan"
131 | 5 "Salvadorian"
132 | 4 "Cuban"
133 | 3 "Puerto rican"
134 | 2 "Mexican, mexican american, chicano/a"
135 | 1 "Not hispanic"
136 | 0 "Not applicable"
137 | ;
138 | label define gsp009x
139 | 9999 "No answer"
140 | 0 "Not applicable"
141 | ;
142 | label define gsp010x
143 | 4 "Ballot d"
144 | 3 "Ballot c"
145 | 2 "Ballot b"
146 | 1 "Ballot a"
147 | 0 "Not applicable"
148 | ;
149 | label define gsp011x
150 | 9 "No answer"
151 | 8 "Don't know"
152 | 2 "Oppose"
153 | 1 "Favor"
154 | 0 "Not applicable"
155 | ;
156 | label define gsp012x
157 | 9 "No answer"
158 | 8 "Don't know"
159 | 2 "Oppose"
160 | 1 "Favor"
161 | 0 "Not applicable"
162 | ;
163 | label define gsp013x
164 | 99 "No answer"
165 | 98 "Don't know"
166 | 89 "89 or older"
167 | ;
168 | label define gsp014x
169 | 99 "No answer"
170 | 98 "Don't know"
171 | 97 "Not applicable"
172 | ;
173 | label define gsp015x
174 | 2 "Female"
175 | 1 "Male"
176 | ;
177 | label define gsp016x
178 | 3 "Other"
179 | 2 "Black"
180 | 1 "White"
181 | 0 "Not applicable"
182 | ;
183 | label define gsp017x
184 | 99 "No answer"
185 | 98 "Don't know"
186 | 13 "Refused"
187 | 12 "$25000 or more"
188 | 11 "$20000 - 24999"
189 | 10 "$15000 - 19999"
190 | 9 "$10000 - 14999"
191 | 8 "$8000 to 9999"
192 | 7 "$7000 to 7999"
193 | 6 "$6000 to 6999"
194 | 5 "$5000 to 5999"
195 | 4 "$4000 to 4999"
196 | 3 "$3000 to 3999"
197 | 2 "$1000 to 2999"
198 | 1 "Lt $1000"
199 | 0 "Not applicable"
200 | ;
201 | label define gsp018x
202 | 99 "No answer"
203 | 98 "Don't know"
204 | 13 "Refused"
205 | 12 "$25000 or more"
206 | 11 "$20000 - 24999"
207 | 10 "$15000 - 19999"
208 | 9 "$10000 - 14999"
209 | 8 "$8000 to 9999"
210 | 7 "$7000 to 7999"
211 | 6 "$6000 to 6999"
212 | 5 "$5000 to 5999"
213 | 4 "$4000 to 4999"
214 | 3 "$3000 to 3999"
215 | 2 "$1000 to 2999"
216 | 1 "Lt $1000"
217 | 0 "Not applicable"
218 | ;
219 | label define gsp019x
220 | 6 "Other rural"
221 | 5 "Other urban"
222 | 4 "Suburb, 13-100"
223 | 3 "Suburb, 12 lrgst"
224 | 2 "Smsa's 13-100"
225 | 1 "12 lrgst smsa's"
226 | 0 "Not assigned"
227 | ;
228 | label define gsp020x
229 | 9 "No answer"
230 | 8 "Don't know"
231 | 7 "Extrmly conservative"
232 | 6 "Conservative"
233 | 5 "Slghtly conservative"
234 | 4 "Moderate"
235 | 3 "Slightly liberal"
236 | 2 "Liberal"
237 | 1 "Extremely liberal"
238 | 0 "Not applicable"
239 | ;
240 | label define gsp021x
241 | 9 "No answer"
242 | 8 "Don't know"
243 | 3 "Too much"
244 | 2 "About right"
245 | 1 "Too little"
246 | 0 "Not applicable"
247 | ;
248 |
249 |
250 | label values gun gsp001x;
251 | label values gunage gsp002x;
252 | label values gunnum gsp003x;
253 | label values owngun gsp004x;
254 | label values rowngun gsp005x;
255 | label values realinc gsp006x;
256 | label values conrinc gsp007x;
257 | label values hispanic gsp008x;
258 | label values cohort gsp009x;
259 | label values ballot gsp010x;
260 | label values gunlaw gsp011x;
261 | label values cappun gsp012x;
262 | label values age gsp013x;
263 | label values educ gsp014x;
264 | label values sex gsp015x;
265 | label values race gsp016x;
266 | label values income gsp017x;
267 | label values rincome gsp018x;
268 | label values srcbelt gsp019x;
269 | label values polviews gsp020x;
270 | label values natcrime gsp021x;
271 |
272 |
273 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | import re
6 |
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with at least 'start', 'end' and 'name' columns
        index_base: are the indices 0 or 1 based?

        Attributes:
        colspecs: list of [start, end] index pairs
        names: Series of string variable names
        """
        self.variables = variables

        # subtract index_base from both bounds; with the default of 0 the
        # positions are used as given
        offsets = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin int is
        # used because the np.int alias no longer exists in current NumPy
        self.colspecs = offsets.astype(int).values.tolist()
        self.names = variables['name']

    def read_fixed_width(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename (handed to pd.read_fwf)
        options: additional keyword args passed to pd.read_fwf

        returns: DataFrame
        """
        return pd.read_fwf(filename,
                           colspecs=self.colspecs,
                           names=self.names,
                           **options)
41 |
42 |
def read_stata_dict(dct_file, **options):
    """Parses a Stata dictionary file into a FixedWidthVariables object.

    Only lines containing a `_column(N)` marker are used; each one must
    also carry a type, a name and a format string.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    type_map = {
        'byte': int,
        'int': int,
        'long': int,
        'float': float,
        'double': float,
        'numeric': float,
    }
    column_marker = re.compile(r'_column\(([^)]*)\)')

    rows = []
    with open(dct_file, **options) as f:
        for line in f:
            found = column_marker.search(line)
            if found is None:
                continue
            start = int(found.group(1))
            tokens = line.split()
            vtype, name, fstring = tokens[1:4]
            name = name.lower()
            # any str* type maps to str; everything else must be in type_map
            vtype = str if vtype.startswith('str') else type_map[vtype]
            long_desc = ' '.join(tokens[4:]).strip('"')
            rows.append((start, vtype, name, fstring, long_desc))

    variables = pd.DataFrame(
        rows, columns=['start', 'type', 'name', 'fstring', 'desc'])

    # each variable ends where the next one starts; the last one gets 0
    variables['end'] = variables['start'].shift(-1)
    last_row = len(variables) - 1
    variables.loc[last_row, 'end'] = 0

    return FixedWidthVariables(variables, index_base=1)
80 |
81 |
def read_stata(dct_name, dat_name, **options):
    """Reads a fixed width data file described by a Stata dictionary.

    dct_name: string filename of the Stata dictionary
    dat_name: string filename of the data file
    options: keyword args passed on when reading the data file

    returns: DataFrame
    """
    layout = read_stata_dict(dct_name)
    return layout.read_fixed_width(dat_name, **options)
92 |
93 |
def sample_rows(df, nrows, replace=False):
    """Draws a random selection of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows to draw
    replace: whether to sample with replacement

    returns: DataFrame holding the selected rows
    """
    chosen = np.random.choice(df.index, size=nrows, replace=replace)
    return df.loc[chosen]
106 |
107 |
def resample_rows(df):
    """Draws as many rows as df has, with replacement.

    df: DataFrame

    returns: DataFrame
    """
    size = len(df)
    return sample_rows(df, size, replace=True)
116 |
117 |
def resample_rows_weighted(df, column='finalwgt'):
    """Draws len(df) rows with replacement, weighted by a column.

    Each row is chosen with probability proportional to its value in
    the given column; df itself is not modified.

    df: DataFrame
    column: string column name to use as weights

    returns: DataFrame
    """
    raw = df[column]
    probs = raw / sum(raw)
    picks = np.random.choice(df.index, size=len(df), replace=True, p=probs)
    return df.loc[picks]
131 |
132 |
def resample_by_year(df, column='wtssall'):
    """Resamples each year's rows separately and stacks the results.

    df: DataFrame with a 'year' column
    column: string name of weight variable

    returns: DataFrame with a fresh integer index
    """
    pieces = []
    for _, rows in df.groupby('year'):
        pieces.append(resample_rows_weighted(rows, column))
    return pd.concat(pieces, ignore_index=True)
146 |
147 |
def values(df, varname):
    """Counts how often each value of a column occurs, sorted by value.

    df: DataFrame
    varname: string column name

    returns: Series that maps from value to frequency
    """
    column = df[varname]
    frequencies = column.value_counts()
    return frequencies.sort_index()
157 |
def count_by_year(gss, varname):
    """Counts respondents per category and year.

    Rows (years) where any category has a zero or missing count are
    dropped from the result.

    gss: DataFrame with a 'year' column
    varname: string variable to group by

    returns: DataFrame with one row per year, one column per category.
    """
    per_cell = gss.groupby([varname, 'year'])[varname].count()
    table = per_cell.unstack(level=0)

    # note: treating 0 like NA is not ideal, because the two cannot be
    # told apart afterwards, but in this dataset the only zeros are
    # during years when the question was not asked.
    return table.replace(0, np.nan).dropna()
174 |
def fill_missing(df, varname, badvals=(98, 99)):
    """Fill missing data with random values.

    Values listed in badvals are treated as missing; every missing entry
    is then replaced by a random draw (with replacement) from the
    non-missing entries of the same column. df is modified in place.

    df: DataFrame
    varname: string column name
    badvals: list (or tuple) of values to be replaced

    returns: number of missing values replaced
    """
    # pandas' replace is handed a list, so convert a tuple default
    to_replace = list(badvals) if isinstance(badvals, tuple) else badvals

    # replace badvals with NaN; assign the column back instead of calling
    # replace(inplace=True) on df[varname], which may act on a temporary
    # and leave df untouched
    df[varname] = df[varname].replace(to_replace, np.nan)

    # boolean mask of rows missing varname
    null = df[varname].isnull()
    n_missing = int(null.sum())
    if n_missing == 0:
        return n_missing

    # choose a random sample from the non-missing values
    fill = np.random.choice(df[varname].dropna(), n_missing, replace=True)

    # replace missing data with the samples
    df.loc[null, varname] = fill

    # return the number of missing values replaced
    return n_missing
197 |
198 |
def round_into_bins(df, var, bin_width, high=None, low=0):
    """Maps each value to the lower edge of the bin it falls in.

    df: DataFrame
    var: string variable name
    bin_width: number, width of the bins
    high: number, upper limit for the bin edges; defaults to the
          largest value of the column
    low: number, lower edge of the first bin

    returns: array of bin values
    """
    column = df[var]
    upper = column.max() if high is None else high

    edges = np.arange(low, upper + bin_width, bin_width)
    # digitize numbers the bins from 1, so step back to the left edge
    positions = np.digitize(column, edges) - 1
    return edges[positions]
214 |
215 |
def underride(d, **options):
    """Fill d with defaults, leaving keys that are already present alone.

    d: dictionary, updated in place
    options: keyword args giving the default value for each key

    returns: the same dictionary d
    """
    for name in options:
        if name not in d:
            d[name] = options[name]
    return d
226 |
227 |
def decorate(**options):
    """Decorate the current axes.

    Call decorate with keyword arguments like
    decorate(title='Title',
             xlabel='x',
             ylabel='y')

    The keyword arguments can be any of the axis properties
    https://matplotlib.org/api/axes_api.html

    Two keywords are handled here rather than passed to the axes:
    `legend=False` suppresses the legend, and `loc` sets the location
    of the legend (the default value is 'best').
    """
    legend_loc = options.pop('loc', 'best')
    show_legend = options.pop('legend', True)
    if show_legend:
        legend(loc=legend_loc)

    # whatever is left is treated as axes properties
    axes = plt.gca()
    axes.set(**options)
    plt.tight_layout()
246 |
247 |
def legend(**options):
    """Draws a legend only if there is at least one labeled item.

    options are passed to plt.legend()
    https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html

    options: keyword args for the legend; loc defaults to 'best'
    """
    underride(options, loc='best')

    ax = plt.gca()
    handles, labels = ax.get_legend_handles_labels()

    # nothing labeled on these axes: skip the call so that no empty
    # legend is drawn
    if not handles:
        return

    ax.legend(handles, labels, **options)
259 |
260 | from statsmodels.nonparametric.smoothers_lowess import lowess
261 |
def make_lowess(series):
    """Smooths a series with LOWESS.

    The values of the series are the dependent variable and its index
    is the independent variable. The first column of the lowess result
    becomes the index of the returned Series and the second column its
    data.

    series: pd.Series

    returns: pd.Series
    """
    ys = series.values
    xs = series.index.values

    fitted = lowess(ys, xs)
    smooth_x, smooth_y = np.transpose(fitted)

    return pd.Series(smooth_y, index=smooth_x)
276 |
def plot_series_lowess(series, color):
    """Plots the data points as markers plus a LOWESS-smoothed line.

    The smooth line is given the label '_'.

    series: pd.Series
    color: string or tuple
    """
    marker_style = dict(lw=0, marker='o', color=color, alpha=0.5)
    series.plot(**marker_style)

    make_lowess(series).plot(label='_', color=color)
286 |
def plot_columns_lowess(df, columns, colors):
    """Plots each requested column as points with a smooth line.

    df: pd.DataFrame
    columns: list of column names, in the desired order
    colors: mapping from column names to colors
    """
    for name in columns:
        plot_series_lowess(df[name], colors[name])
297 |
def anchor_legend(x, y):
    """Draws the legend with its upper left corner anchored at (x, y).

    x: axis coordinate
    y: axis coordinate
    """
    placement = dict(bbox_to_anchor=(x, y), loc='upper left', ncol=1)
    plt.legend(**placement)
305 |
--------------------------------------------------------------------------------
/data/GSS.dct:
--------------------------------------------------------------------------------
1 | infile dictionary {
2 | _column(1) numeric POSTLIFE %20f "Belief in life after death"
3 | _column(21) numeric LIFE %20f "Is life exciting or dull"
4 | _column(41) numeric HELPFUL %20f "People helpful or looking out for selves"
5 | _column(61) numeric FAIR %20f "People fair or try to take advantage"
6 | _column(81) numeric TRUST %20f "Can people be trusted"
7 | _column(101) numeric CONCLERG %20f "Confidence in organized religion"
8 | _column(121) numeric CONEDUC %20f "Confidence in education"
9 | _column(141) numeric CONFED %20f "Confid. in exec branch of fed govt"
10 | _column(161) numeric CONPRESS %20f "Confidence in press"
11 | _column(181) numeric CONJUDGE %20f "Confid. in united states supreme court"
12 | _column(201) numeric CONLEGIS %20f "Confidence in congress"
13 | _column(221) numeric HEALTH %20f "Condition of health"
14 | _column(241) numeric HAPMAR %20f "Happiness of marriage"
15 | _column(261) numeric PRAY %20f "How often does r pray"
16 | _column(281) numeric RELIG16 %20f "Religion in which raised"
17 | _column(301) numeric FUND16 %20f "How fundamentalist was r at age 16"
18 | _column(321) numeric SPREL16 %20f "Religion in which spouse raised"
19 | _column(341) numeric PRAYER %20f "Bible prayer in public schools"
20 | _column(361) numeric BIBLE %20f "Feelings about the bible"
21 | _column(381) numeric RACMAR %20f "Favor law against racial intermarriage"
22 | _column(401) numeric RACPRES %20f "Would vote for black president"
23 | _column(421) numeric AFFRMACT %20f "Favor preference in hiring blacks"
24 | _column(441) numeric HAPPY %20f "General happiness"
25 | _column(461) numeric CONARMY %20f "Confidence in military"
26 | _column(481) numeric SATJOB %20f "Job or housework"
27 | _column(501) numeric FEAR %20f "Afraid to walk at night in neighborhood"
28 | _column(521) numeric OWNGUN %20f "Have gun in home"
29 | _column(541) numeric PISTOL %20f "Pistol or revolver in home"
30 | _column(561) numeric HUNT %20f "Does r or spouse hunt"
31 | _column(581) numeric PHONE %20f "Does r have telephone"
32 | _column(601) numeric MEMCHURH %20f "Membership in church group"
33 | _column(621) float REALINC %20f "Family income in constant $"
34 | _column(641) numeric COHORT %20f "Year of birth"
35 | _column(661) numeric MARCOHRT %20f "Year of first marriage"
36 | _column(681) numeric BALLOT %20f "Ballot used for interview"
37 | _column(701) numeric SPANKING %20f "Favor spanking to discipline child"
38 | _column(721) numeric HOMOSEX %20f "Homosexual sex relations"
39 | _column(741) numeric CLASS_ %20f "Subjective class identification"
40 | _column(761) numeric SATFIN %20f "Satisfaction with financial situation"
41 | _column(781) numeric FINRELA %20f "Opinion of family income"
42 | _column(801) numeric UNION_ %20f "Does r or spouse belong to union"
43 | _column(821) numeric FEPOL %20f "Women not suited for politics"
44 | _column(841) numeric ABANY %20f "Abortion if woman wants for any reason"
45 | _column(861) numeric CHLDIDEL %20f "Ideal number of children"
46 | _column(881) numeric SEXEDUC %20f "Sex education in public schools"
47 | _column(901) numeric PREMARSX %20f "Sex before marriage"
48 | _column(921) numeric XMARSEX %20f "Sex with person other than spouse"
49 | _column(941) numeric WTSSALL %20f "Weight variable"
50 | _column(961) numeric RELITEN %20f "Strength of affiliation"
51 | _column(981) numeric YEAR %20f "Gss year for this respondent "
52 | _column(1001) numeric MADEG %20f "Mothers highest degree"
53 | _column(1021) numeric SPDEG %20f "Spouses highest degree"
54 | _column(1041) numeric SEX %20f "Respondents sex"
55 | _column(1061) numeric RACE %20f "Race of respondent"
56 | _column(1081) numeric RES16 %20f "Type of place lived in when 16 yrs old"
57 | _column(1101) numeric REG16 %20f "Region of residence, age 16"
58 | _column(1121) numeric SRCBELT %20f "Src beltcode"
59 | _column(1141) numeric PARTYID %20f "Political party affiliation"
60 | _column(1161) numeric PRES04 %20f "Vote for kerry, bush, nader"
61 | _column(1181) numeric PRES08 %20f "Vote obama or mccain"
62 | _column(1201) numeric PADEG %20f "Fathers highest degree"
63 | _column(1221) numeric DEGREE %20f "Rs highest degree"
64 | _column(1241) numeric ID_ %20f "Respondent id number"
65 | _column(1261) numeric AGEWED %20f "Age when first married"
66 | _column(1281) numeric DIVORCE %20f "Ever been divorced or separated"
67 | _column(1301) numeric SIBS %20f "Number of brothers and sisters"
68 | _column(1321) numeric CHILDS %20f "Number of children"
69 | _column(1341) numeric AGE %20f "Age of respondent"
70 | _column(1361) numeric EDUC %20f "Highest year of school completed"
71 | _column(1381) numeric PAEDUC %20f "Highest year school completed, father"
72 | _column(1401) numeric MAEDUC %20f "Highest year school completed, mother"
73 | _column(1421) numeric SPEDUC %20f "Highest year school completed, spouse"
74 | _column(1441) numeric PRES12 %20f "Vote obama or romney"
75 | _column(1461) numeric POLVIEWS %20f "Think of self as liberal or conservative"
76 | _column(1481) numeric COLATH %20f "Allow anti-religionist to teach"
77 | _column(1501) numeric LIBATH %20f "Allow anti-religious book in library"
78 | _column(1521) numeric SPKHOMO %20f "Allow homosexual to speak"
79 | _column(1541) numeric COLHOMO %20f "Allow homosexual to teach"
80 | _column(1561) numeric LIBHOMO %20f "Allow homosexuals book in library"
81 | _column(1581) numeric CAPPUN %20f "Favor or oppose death penalty for murder"
82 | _column(1601) numeric GUNLAW %20f "Favor or oppose gun permits"
83 | _column(1621) numeric GRASS %20f "Should marijuana be made legal"
84 | _column(1641) numeric RELIG %20f "Rs religious preference"
85 | _column(1661) numeric FUND %20f "How fundamentalist is r currently"
86 | _column(1681) numeric SPKATH %20f "Allow anti-religionist to speak"
87 | _column(1701) numeric NATFARE %20f "Welfare"
88 | _column(1721) numeric NATSPAC %20f "Space exploration program"
89 | _column(1741) numeric NATENVIR %20f "Improving & protecting environment"
90 | _column(1761) numeric NATHEAL %20f "Improving & protecting nations health"
91 | _column(1781) numeric NATCITY %20f "Solving problems of big cities"
92 | _column(1801) numeric NATCRIME %20f "Halting rising crime rate"
93 | _column(1821) numeric NATDRUG %20f "Dealing with drug addiction"
94 | _column(1841) numeric NATEDUC %20f "Improving nations education system"
95 | _column(1861) numeric NATRACE %20f "Improving the conditions of blacks"
96 | _column(1881) numeric NATARMS %20f "Military, armaments, and defense"
97 | _column(1901) numeric NATAID %20f "Foreign aid"
98 | _column(1921) numeric ATTEND %20f "How often r attends religious services"
99 | }
100 |
--------------------------------------------------------------------------------
/gss_eda/GSS.dct:
--------------------------------------------------------------------------------
1 | infile dictionary {
2 | _column(1) numeric YEAR %20f "Gss year for this respondent "
3 | _column(21) numeric ID_ %20f "Respondent id number"
4 | _column(41) numeric AGEWED %20f "Age when first married"
5 | _column(61) numeric DIVORCE %20f "Ever been divorced or separated"
6 | _column(81) numeric SIBS %20f "Number of brothers and sisters"
7 | _column(101) numeric CHILDS %20f "Number of children"
8 | _column(121) numeric AGE %20f "Age of respondent"
9 | _column(141) numeric EDUC %20f "Highest year of school completed"
10 | _column(161) numeric PAEDUC %20f "Highest year school completed, father"
11 | _column(181) numeric MAEDUC %20f "Highest year school completed, mother"
12 | _column(201) numeric SPEDUC %20f "Highest year school completed, spouse"
13 | _column(221) numeric DEGREE %20f "Rs highest degree"
14 | _column(241) numeric PADEG %20f "Fathers highest degree"
15 | _column(261) numeric MADEG %20f "Mothers highest degree"
16 | _column(281) numeric SPDEG %20f "Spouses highest degree"
17 | _column(301) numeric SEX %20f "Respondents sex"
18 | _column(321) numeric RACE %20f "Race of respondent"
19 | _column(341) numeric RES16 %20f "Type of place lived in when 16 yrs old"
20 | _column(361) numeric REG16 %20f "Region of residence, age 16"
21 | _column(381) numeric SRCBELT %20f "Src beltcode"
22 | _column(401) numeric PARTYID %20f "Political party affiliation"
23 | _column(421) numeric PRES04 %20f "Vote for kerry, bush, nader"
24 | _column(441) numeric PRES08 %20f "Vote obama or mccain"
25 | _column(461) numeric PRES12 %20f "Vote obama or romney"
26 | _column(481) numeric POLVIEWS %20f "Think of self as liberal or conservative"
27 | _column(501) numeric NATSPAC %20f "Space exploration program"
28 | _column(521) numeric NATENVIR %20f "Improving & protecting environment"
29 | _column(541) numeric NATHEAL %20f "Improving & protecting nations health"
30 | _column(561) numeric NATCITY %20f "Solving problems of big cities"
31 | _column(581) numeric NATCRIME %20f "Halting rising crime rate"
32 | _column(601) numeric NATDRUG %20f "Dealing with drug addiction"
33 | _column(621) numeric NATEDUC %20f "Improving nations education system"
34 | _column(641) numeric NATRACE %20f "Improving the conditions of blacks"
35 | _column(661) numeric NATARMS %20f "Military, armaments, and defense"
36 | _column(681) numeric NATAID %20f "Foreign aid"
37 | _column(701) numeric NATFARE %20f "Welfare"
38 | _column(721) numeric SPKATH %20f "Allow anti-religionist to speak"
39 | _column(741) numeric COLATH %20f "Allow anti-religionist to teach"
40 | _column(761) numeric LIBATH %20f "Allow anti-religious book in library"
41 | _column(781) numeric SPKHOMO %20f "Allow homosexual to speak"
42 | _column(801) numeric COLHOMO %20f "Allow homosexual to teach"
43 | _column(821) numeric LIBHOMO %20f "Allow homosexuals book in library"
44 | _column(841) numeric CAPPUN %20f "Favor or oppose death penalty for murder"
45 | _column(861) numeric GUNLAW %20f "Favor or oppose gun permits"
46 | _column(881) numeric GRASS %20f "Should marijuana be made legal"
47 | _column(901) numeric RELIG %20f "Rs religious preference"
48 | _column(921) numeric FUND %20f "How fundamentalist is r currently"
49 | _column(941) numeric ATTEND %20f "How often r attends religious services"
50 | _column(961) numeric RELITEN %20f "Strength of affiliation"
51 | _column(981) numeric POSTLIFE %20f "Belief in life after death"
52 | _column(1001) numeric PRAY %20f "How often does r pray"
53 | _column(1021) numeric RELIG16 %20f "Religion in which raised"
54 | _column(1041) numeric FUND16 %20f "How fundamentalist was r at age 16"
55 | _column(1061) numeric SPREL16 %20f "Religion in which spouse raised"
56 | _column(1081) numeric PRAYER %20f "Bible prayer in public schools"
57 | _column(1101) numeric BIBLE %20f "Feelings about the bible"
58 | _column(1121) numeric RACMAR %20f "Favor law against racial intermarriage"
59 | _column(1141) numeric RACPRES %20f "Would vote for black president"
60 | _column(1161) numeric AFFRMACT %20f "Favor preference in hiring blacks"
61 | _column(1181) numeric HAPPY %20f "General happiness"
62 | _column(1201) numeric HAPMAR %20f "Happiness of marriage"
63 | _column(1221) numeric HEALTH %20f "Condition of health"
64 | _column(1241) numeric LIFE %20f "Is life exciting or dull"
65 | _column(1261) numeric HELPFUL %20f "People helpful or looking out for selves"
66 | _column(1281) numeric FAIR %20f "People fair or try to take advantage"
67 | _column(1301) numeric TRUST %20f "Can people be trusted"
68 | _column(1321) numeric CONCLERG %20f "Confidence in organized religion"
69 | _column(1341) numeric CONEDUC %20f "Confidence in education"
70 | _column(1361) numeric CONFED %20f "Confid. in exec branch of fed govt"
71 | _column(1381) numeric CONPRESS %20f "Confidence in press"
72 | _column(1401) numeric CONJUDGE %20f "Confid. in united states supreme court"
73 | _column(1421) numeric CONLEGIS %20f "Confidence in congress"
74 | _column(1441) numeric CONARMY %20f "Confidence in military"
75 | _column(1461) numeric SATJOB %20f "Job or housework"
76 | _column(1481) numeric CLASS_ %20f "Subjective class identification"
77 | _column(1501) numeric SATFIN %20f "Satisfaction with financial situation"
78 | _column(1521) numeric FINRELA %20f "Opinion of family income"
79 | _column(1541) numeric UNION_ %20f "Does r or spouse belong to union"
80 | _column(1561) numeric FEPOL %20f "Women not suited for politics"
81 | _column(1581) numeric ABANY %20f "Abortion if woman wants for any reason"
82 | _column(1601) numeric CHLDIDEL %20f "Ideal number of children"
83 | _column(1621) numeric SEXEDUC %20f "Sex education in public schools"
84 | _column(1641) numeric PREMARSX %20f "Sex before marriage"
85 | _column(1661) numeric XMARSEX %20f "Sex with person other than spouse"
86 | _column(1681) numeric HOMOSEX %20f "Homosexual sex relations"
87 | _column(1701) numeric SPANKING %20f "Favor spanking to discipline child"
88 | _column(1721) numeric FEAR %20f "Afraid to walk at night in neighborhood"
89 | _column(1741) numeric OWNGUN %20f "Have gun in home"
90 | _column(1761) numeric PISTOL %20f "Pistol or revolver in home"
91 | _column(1781) numeric HUNT %20f "Does r or spouse hunt"
92 | _column(1801) numeric PHONE %20f "Does r have telephone"
93 | _column(1821) numeric MEMCHURH %20f "Membership in church group"
94 | _column(1841) float REALINC %20f "Family income in constant $"
95 | _column(1861) numeric COHORT %20f "Year of birth"
96 | _column(1881) numeric MARCOHRT %20f "Year of first marriage"
97 | _column(1901) numeric BALLOT %20f "Ballot used for interview"
98 | _column(1921) numeric WTSSALL %20f "Weight variable"
99 | _column(1941) numeric ADULTS %20f "Household members 18 yrs and older"
100 | _column(1961) numeric COMPUSE %20f "R use computer"
101 | _column(1981) numeric DATABANK %20f "Computer data threat to individual privacy"
102 | _column(2001) numeric WTSSNR %20f "Weight variable"
103 | _column(2021) numeric SPKRAC %20f "Allow racist to speak"
104 | _column(2041) numeric SPKCOM %20f "Allow communist to speak"
105 | _column(2061) numeric SPKMIL %20f "Allow militarist to speak"
106 | _column(2081) numeric SPKMSLM %20f "Allow muslim clergymen preaching hatred of the us"
107 | }
108 |
--------------------------------------------------------------------------------
/eds01_gss_clean.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EDS Case Study\n",
8 | "\n",
9 | "Load and resample GSS data\n",
10 | "\n",
11 | "Allen Downey\n",
12 | "\n",
13 | "[MIT License](https://en.wikipedia.org/wiki/MIT_License)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# If we're running in Colab, set up the environment\n",
23 | "\n",
24 | "import sys\n",
25 | "IN_COLAB = 'google.colab' in sys.modules\n",
26 | "\n",
27 | "if IN_COLAB:\n",
28 | " !pip install empiricaldist\n",
29 | " !git clone --depth 1 https://github.com/AllenDowney/ExploratoryDataAnalysis\n",
30 | " %cd ExploratoryDataAnalysis"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import pandas as pd\n",
40 | "import numpy as np\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import seaborn as sns\n",
43 | "\n",
44 | "import utils"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "### Reading the extract\n",
52 | "\n",
53 | "https://gssdataexplorer.norc.org/projects/52787/extracts\n",
54 | "\n",
55 | "Currently Pandas is not able to read the files generated by GSS in any of the standard formats: Stata, SPSS, Excel.\n",
56 | "\n",
57 | "As a workaround, I wrote the following functions to read the Stata dictionary file and use the information there to read the Stata data file using `pd.read_fwf` which reads fixed-width files."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 3,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "import re\n",
67 | "import os\n",
68 | "\n",
class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file.

    Attributes:
        variables: the DataFrame passed to the constructor
        colspecs: list of [start, end] pairs of int
        names: the 'name' column of variables
    """

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with 'start', 'end' and 'name' columns
        index_base: offset subtracted from every start and end index;
                    0 (the default) leaves them unchanged, 1 converts
                    1-based indices to 0-based
        """
        self.variables = variables

        # shift the indices by index_base, then convert to a list of
        # pairs of int; the builtin int is used because the np.int alias
        # was removed in NumPy 1.24
        shifted = variables[['start', 'end']] - index_base
        self.colspecs = shifted.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename (passed straight to pd.read_fwf)
        options: additional keyword arguments forwarded to pd.read_fwf

        returns: DataFrame
        """
        df = pd.read_fwf(filename,
                         colspecs=self.colspecs,
                         names=self.names,
                         **options)
        return df
103 | "\n",
104 | "\n",
def ReadStataDct(dct_file, **options):
    """Reads a Stata dictionary file.

    Only lines containing a `_column(N)` specifier are used; every other
    line (braces, comments, `_lines`/`_line` directives) is skipped.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object

    raises: ValueError if the file contains no `_column` lines
            KeyError if a variable has a type that is neither a str type
            nor one of the numeric types in type_map
    """
    type_map = dict(byte=int, int=int, long=int, float=float,
                    double=float, numeric=float)

    # compiled once, outside the per-line loop
    column_pattern = re.compile(r'_column\(([^)]*)\)')

    var_info = []
    with open(dct_file, **options) as f:
        for line in f:
            match = column_pattern.search(line)
            if not match:
                continue
            start = int(match.group(1))

            # tokens: _column(N), type, NAME, format, then the description
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = str
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    if not var_info:
        # without this check the .loc assignment below would insert a
        # bogus row labelled -1 and fail later with an obscure cast error
        raise ValueError(
            'no _column entries found in Stata dictionary %r' % (dct_file,))

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pd.DataFrame(var_info, columns=columns)

    # fill in the end column by shifting the start column
    variables['end'] = variables.start.shift(-1)

    # the last variable has no successor; its end is set to 0, which
    # FixedWidthVariables turns into -1 when it subtracts index_base=1
    variables.loc[len(variables)-1, 'end'] = 0

    dct = FixedWidthVariables(variables, index_base=1)
    return dct
142 | "\n",
def read_gss(dirname):
    """Loads the GSS extract stored in a directory.

    The directory must contain the Stata dictionary 'GSS.dct' and the
    gzip-compressed fixed-width data file 'GSS.dat.gz'.

    dirname: string name of the directory

    returns: DataFrame
    """
    dictionary_path = os.path.join(dirname, 'GSS.dct')
    data_path = os.path.join(dirname, 'GSS.dat.gz')

    # the dictionary supplies the column positions and names for the data
    variables = ReadStataDct(dictionary_path)
    return variables.ReadFixedWidth(data_path, compression='gzip')
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 4,
162 | "metadata": {
163 | "scrolled": true
164 | },
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "(64814, 105)\n"
171 | ]
172 | },
173 | {
174 | "data": {
175 | "text/html": [
176 | "
\n",
177 | "\n",
190 | "
\n",
191 | " \n",
192 | " \n",
193 | " | \n",
194 | " year | \n",
195 | " id_ | \n",
196 | " agewed | \n",
197 | " divorce | \n",
198 | " sibs | \n",
199 | " childs | \n",
200 | " age | \n",
201 | " educ | \n",
202 | " paeduc | \n",
203 | " maeduc | \n",
204 | " ... | \n",
205 | " ballot | \n",
206 | " wtssall | \n",
207 | " adults | \n",
208 | " compuse | \n",
209 | " databank | \n",
210 | " wtssnr | \n",
211 | " spkrac | \n",
212 | " spkcom | \n",
213 | " spkmil | \n",
214 | " spkmslm | \n",
215 | "
\n",
216 | " \n",
217 | " \n",
218 | " \n",
219 | " | 0 | \n",
220 | " 1972 | \n",
221 | " 1 | \n",
222 | " 0 | \n",
223 | " 0 | \n",
224 | " 3 | \n",
225 | " 0 | \n",
226 | " 23 | \n",
227 | " 16 | \n",
228 | " 10 | \n",
229 | " 97 | \n",
230 | " ... | \n",
231 | " 0 | \n",
232 | " 0.4446 | \n",
233 | " 1 | \n",
234 | " 0 | \n",
235 | " 0 | \n",
236 | " 1.0 | \n",
237 | " 0 | \n",
238 | " 1 | \n",
239 | " 0 | \n",
240 | " 0 | \n",
241 | "
\n",
242 | " \n",
243 | " | 1 | \n",
244 | " 1972 | \n",
245 | " 2 | \n",
246 | " 21 | \n",
247 | " 2 | \n",
248 | " 4 | \n",
249 | " 5 | \n",
250 | " 70 | \n",
251 | " 10 | \n",
252 | " 8 | \n",
253 | " 8 | \n",
254 | " ... | \n",
255 | " 0 | \n",
256 | " 0.8893 | \n",
257 | " 2 | \n",
258 | " 0 | \n",
259 | " 0 | \n",
260 | " 1.0 | \n",
261 | " 0 | \n",
262 | " 2 | \n",
263 | " 0 | \n",
264 | " 0 | \n",
265 | "
\n",
266 | " \n",
267 | " | 2 | \n",
268 | " 1972 | \n",
269 | " 3 | \n",
270 | " 20 | \n",
271 | " 2 | \n",
272 | " 5 | \n",
273 | " 4 | \n",
274 | " 48 | \n",
275 | " 12 | \n",
276 | " 8 | \n",
277 | " 8 | \n",
278 | " ... | \n",
279 | " 0 | \n",
280 | " 0.8893 | \n",
281 | " 2 | \n",
282 | " 0 | \n",
283 | " 0 | \n",
284 | " 1.0 | \n",
285 | " 0 | \n",
286 | " 2 | \n",
287 | " 0 | \n",
288 | " 0 | \n",
289 | "
\n",
290 | " \n",
291 | " | 3 | \n",
292 | " 1972 | \n",
293 | " 4 | \n",
294 | " 24 | \n",
295 | " 2 | \n",
296 | " 5 | \n",
297 | " 0 | \n",
298 | " 27 | \n",
299 | " 17 | \n",
300 | " 16 | \n",
301 | " 12 | \n",
302 | " ... | \n",
303 | " 0 | \n",
304 | " 0.8893 | \n",
305 | " 2 | \n",
306 | " 0 | \n",
307 | " 0 | \n",
308 | " 1.0 | \n",
309 | " 0 | \n",
310 | " 1 | \n",
311 | " 0 | \n",
312 | " 0 | \n",
313 | "
\n",
314 | " \n",
315 | " | 4 | \n",
316 | " 1972 | \n",
317 | " 5 | \n",
318 | " 22 | \n",
319 | " 2 | \n",
320 | " 2 | \n",
321 | " 2 | \n",
322 | " 61 | \n",
323 | " 12 | \n",
324 | " 8 | \n",
325 | " 8 | \n",
326 | " ... | \n",
327 | " 0 | \n",
328 | " 0.8893 | \n",
329 | " 2 | \n",
330 | " 0 | \n",
331 | " 0 | \n",
332 | " 1.0 | \n",
333 | " 0 | \n",
334 | " 1 | \n",
335 | " 0 | \n",
336 | " 0 | \n",
337 | "
\n",
338 | " \n",
339 | "
\n",
340 | "
5 rows × 105 columns
\n",
341 | "
"
342 | ],
343 | "text/plain": [
344 | " year id_ agewed divorce sibs childs age educ paeduc maeduc ... \\\n",
345 | "0 1972 1 0 0 3 0 23 16 10 97 ... \n",
346 | "1 1972 2 21 2 4 5 70 10 8 8 ... \n",
347 | "2 1972 3 20 2 5 4 48 12 8 8 ... \n",
348 | "3 1972 4 24 2 5 0 27 17 16 12 ... \n",
349 | "4 1972 5 22 2 2 2 61 12 8 8 ... \n",
350 | "\n",
351 | " ballot wtssall adults compuse databank wtssnr spkrac spkcom spkmil \\\n",
352 | "0 0 0.4446 1 0 0 1.0 0 1 0 \n",
353 | "1 0 0.8893 2 0 0 1.0 0 2 0 \n",
354 | "2 0 0.8893 2 0 0 1.0 0 2 0 \n",
355 | "3 0 0.8893 2 0 0 1.0 0 1 0 \n",
356 | "4 0 0.8893 2 0 0 1.0 0 1 0 \n",
357 | "\n",
358 | " spkmslm \n",
359 | "0 0 \n",
360 | "1 0 \n",
361 | "2 0 \n",
362 | "3 0 \n",
363 | "4 0 \n",
364 | "\n",
365 | "[5 rows x 105 columns]"
366 | ]
367 | },
368 | "execution_count": 4,
369 | "metadata": {},
370 | "output_type": "execute_result"
371 | }
372 | ],
373 | "source": [
374 | "gss = read_gss('gss_eda')\n",
375 | "print(gss.shape)\n",
376 | "gss.head()"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "### Missing data\n",
384 | "\n",
385 | "For many variables, missing values are encoded with numbers, so we need to replace them before we do any analysis.\n",
386 | "\n",
387 | "For example, for `polviews`, the values 8, 9, and 0 represent \"Don't know\", \"No answer\", and \"Not applicable\".\n",
388 | "\n",
389 | "\"Not applicable\" usually means the respondent was not asked a particular question.\n",
390 | "\n",
391 | "To keep things simple, we'll treat all of these values as equivalent, but we should keep in mind that we lose some information by doing that. For example, if a respondent refuses to answer a question, that might suggest something about their answer. If so, treating their response as missing data might bias the results.\n",
392 | "\n",
393 | "Fortunately, for most questions the number of respondents who refused to answer is small."
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 5,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
def replace_invalid(df):
    """Replace invalid data with NaN.

    Modifies df in place: for each column listed below, every occurrence
    of one of its invalid codes is replaced with NaN. Columns that are not
    listed are left untouched.

    Each column is reassigned with `df[col] = df[col].replace(...)` rather
    than calling `replace(..., inplace=True)` on `df.col`. The chained
    in-place form acts on a temporary Series, so it warns on pandas 2.x and
    leaves df unchanged when Copy-on-Write is enabled.

    df: DataFrame containing all of the columns listed below

    returns: None

    raises: KeyError if one of the listed columns is missing from df
    """
    # (invalid codes, columns they apply to); each column appears once
    invalid_codes = [
        ([0], ['realinc']),
        ([9], ['adults', 'attend', 'childs']),
        ([8, 9], ['degree', 'padeg', 'madeg', 'spdeg', 'partyid']),
        ([-1, 8, 9], ['chldidel']),
        ([0, 2, 9], ['phone']),
        ([0, 3, 8, 9], ['owngun', 'pistol']),
        ([0, 5, 8, 9], ['homosex', 'class_']),
        # NOTE(review): sprel16 is kept with the single-digit codes as in
        # the original; if it is coded like relig16, 8 and 9 may be valid
        # answers -- confirm against the GSS codebook.
        ([0, 8, 9], [
            'colhomo', 'libhomo', 'cappun', 'gunlaw', 'grass', 'fepol',
            'abany', 'prayer', 'sexeduc', 'premarsx', 'xmarsex',
            'racmar', 'spanking', 'racpres', 'fear', 'databank',
            'affrmact', 'happy', 'hapmar',
            'natspac', 'natenvir', 'natheal', 'natcity', 'natcrime',
            'natdrug', 'nateduc', 'natrace', 'natarms', 'nataid',
            'natfare',
            'health', 'life', 'helpful', 'fair', 'trust',
            'conclerg', 'coneduc', 'confed', 'conpress', 'conjudge',
            'conlegis', 'conarmy',
            'spkhomo', 'spkath', 'colath', 'libath', 'spkrac', 'spkcom',
            'spkmil',
            'satjob', 'satfin', 'finrela',
            'union_', 'res16',
            'fund', 'memchurh', 'fund16', 'reliten', 'postlife', 'pray',
            'sprel16', 'hunt', 'polviews',
            'compuse', 'divorce',
            'pres04', 'pres08', 'pres12',
        ]),
        # for age, 89 means 89 or older, so it is kept
        ([0, 98, 99], ['age', 'agewed', 'relig', 'relig16']),
        # note: sibs contains some unlikely numbers
        ([-1, 98, 99], ['sibs']),
        ([97, 98, 99], ['educ', 'maeduc', 'paeduc', 'speduc']),
        ([0, 9999], ['cohort', 'marcohrt']),
    ]

    for codes, columns in invalid_codes:
        for column in columns:
            df[column] = df[column].replace(codes, np.nan)
517 | "\n",
518 | "replace_invalid(gss)"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "metadata": {},
524 | "source": [
525 | "### Resampling\n",
526 | "\n",
527 | "The GSS uses stratified sampling, which means that some groups are deliberately oversampled to help with statistical validity.\n",
528 | "\n",
529 | "As a result, each respondent has a sampling weight which is proportional to the number of people in the population represented by the respondent.\n",
530 | "\n",
531 | "Before running any analysis, we should compensate for stratified sampling by \"resampling\", that is, by drawing a random sample from the dataset, where each respondent's chance of appearing in the sample is proportional to their sampling weight.\n",
532 | "\n",
533 | "`utils` provides a function to do this resampling."
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 6,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": [
542 | "np.random.seed(19)\n",
543 | "sample = utils.resample_by_year(gss, 'wtssall')"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "### Saving the results\n",
551 | "\n",
552 | "I'll save the results to an HDF5 file, which is a binary format that makes it much faster to read the data back."
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 7,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "!rm eds.gss.hdf5"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 8,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": [
# Draw three resamples of gss, one per seed, and store each one under its
# own key ('gss0', 'gss1', 'gss2') in the same HDF5 file.
for i in range(3):
    # reseed NumPy's global generator before each draw; presumably
    # utils.resample_by_year samples from it -- confirm in utils
    np.random.seed(i)
    sample = utils.resample_by_year(gss, 'wtssall')

    key = f'gss{i}'
    # key is passed by keyword: positional arguments after the path are
    # deprecated since pandas 2.1 and keyword-only in pandas 3
    sample.to_hdf('eds.gss.hdf5', key=key)
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": 9,
581 | "metadata": {},
582 | "outputs": [
583 | {
584 | "name": "stdout",
585 | "output_type": "stream",
586 | "text": [
587 | "CPU times: user 9.86 ms, sys: 19.8 ms, total: 29.7 ms\n",
588 | "Wall time: 28.8 ms\n"
589 | ]
590 | },
591 | {
592 | "data": {
593 | "text/plain": [
594 | "(64814, 105)"
595 | ]
596 | },
597 | "execution_count": 9,
598 | "metadata": {},
599 | "output_type": "execute_result"
600 | }
601 | ],
602 | "source": [
603 | "%time gss = pd.read_hdf('eds.gss.hdf5', 'gss0')\n",
604 | "gss.shape"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": null,
610 | "metadata": {},
611 | "outputs": [],
612 | "source": []
613 | }
614 | ],
615 | "metadata": {
616 | "kernelspec": {
617 | "display_name": "Python 3",
618 | "language": "python",
619 | "name": "python3"
620 | },
621 | "language_info": {
622 | "codemirror_mode": {
623 | "name": "ipython",
624 | "version": 3
625 | },
626 | "file_extension": ".py",
627 | "mimetype": "text/x-python",
628 | "name": "python",
629 | "nbconvert_exporter": "python",
630 | "pygments_lexer": "ipython3",
631 | "version": "3.7.3"
632 | }
633 | },
634 | "nbformat": 4,
635 | "nbformat_minor": 1
636 | }
637 |
--------------------------------------------------------------------------------
/data/2013_2015_FemPregSetup.dct:
--------------------------------------------------------------------------------
1 | infile dictionary {
2 |
3 | *****************************************************************
4 | * NATIONAL SURVEY OF FAMILY GROWTH (NSFG), 2013-2015
5 | * STATA Pregnancy Data Dictionary
6 | *
7 | * Warning: Edit this file at your own risk
8 | *
9 | *****************************************************************
10 |
11 | _lines(1)
12 |
13 | _line(1)
14 |
15 | _column(1) long CASEID %5f "Case identification number"
16 | _column(6) byte PREGORDR %2f "Pregnancy order (number)"
17 | _column(8) byte HOWPREG_N %2f "BB-2 # of Weeks or Months Currently Pregnant"
18 | _column(10) byte HOWPREG_P %1f "BB-2 Current pregnancy length reported in months or weeks"
19 | _column(11) byte MOSCURRP %1f "Number of Months Currently Pregnant"
20 | _column(12) byte NOWPRGDK %1f "BB-3 Which Trimester -- Current Pregnancy"
21 | _column(13) byte PREGEND1 %1f "BC-1 How Pregnancy Ended - 1st mention"
22 | _column(14) byte PREGEND2 %1f "BC-1 How Pregnancy Ended - 2nd mention"
23 | _column(15) byte HOWENDDK %1f "BC-1b Did pregnancy result in a baby/babies born alive or did it end in some other way?"
24 | _column(16) byte NBRNALIV %1f "BC-2 Number of babies born alive from this pregnancy"
25 | _column(17) byte MULTBRTH %1f "BC-3 Was this a multiple birth"
26 | _column(18) byte BORNALIV %1f "Number of babies born alive from this pregnancy"
27 | _column(19) byte DATPRGEN_M %2f "BC-4a Month when pregnancy ended (if nonlivebirth)"
28 | _column(21) int DATPRGEN_Y %4f "BC-4a Year when pregnancy ended (if nonlivebirth)"
29 | _column(25) int CMOTPREG %4f "CM for Pregnancy End Date (if nonlivebirth)"
30 | _column(29) byte AGEATEND %2f "BC-4b R's age at pregnancy's end date"
31 | _column(31) byte HPAGEEND %2f "BC-4c Father's age at pregnancy's end date"
32 | _column(33) byte GESTASUN_M %2f "BC-5 Gestational Length of Pregnancy in Months"
33 | _column(35) byte GESTASUN_W %2f "BC-5 Gestational Length of Pregnancy in Weeks"
34 | _column(37) byte WKSGEST %2f "Gestational length of completed pregnancy (in weeks)"
35 | _column(39) byte MOSGEST %2f "Gestational length of completed pregnancy (in months)"
36 | _column(41) byte DK1GEST %1f "BC-6 DK followup for gestational length of a stillbirth"
37 | _column(42) byte DK2GEST %1f "BC-7 DK followup for gestational length of a livebirth"
38 | _column(43) byte DK3GEST %1f "BC-8 DK followup for gestational length of a miscarr/abor/ectop"
39 | _column(44) byte BPA_BDSCHECK1 %1f "Whether 1st liveborn baby from this pregnancy was BPA or BDS"
40 | _column(45) byte BABYSEX1 %1f "BD-2 Sex of 1st Liveborn Baby from This Pregnancy"
41 | _column(46) byte BIRTHWGT_LB1 %2f "BD-3 Birthweight in Pounds - 1st baby from this pregnancy"
42 | _column(48) byte BIRTHWGT_OZ1 %2f "BD-3 Birthweight in Ounces - 1st baby from this pregnancy"
43 | _column(50) byte LOBTHWGT1 %1f "BD-4 Is Baby Low Birthweight- 1st baby from this pregnancy"
44 | _column(51) byte BABYSEX2 %1f "BD-2 Sex of 2nd Liveborn Baby from This Pregnancy"
45 | _column(52) byte BIRTHWGT_LB2 %2f "BD-3 Birthweight in Pounds - 2nd baby from this pregnancy"
46 | _column(54) byte BIRTHWGT_OZ2 %2f "BD-3 Birthweight in Ounces - 2nd baby from this pregnancy"
47 | _column(56) byte LOBTHWGT2 %1f "BD-4 Is Baby Low Birthweight- 2nd baby from this pregnancy"
48 | _column(57) byte BABYSEX3 %1f "BD-2 Sex of 3rd Liveborn Baby from This Pregnancy"
49 | _column(58) byte BIRTHWGT_LB3 %1f "BD-3 Birthweight in Pounds - 3rd baby from this pregnancy"
50 | _column(59) byte BIRTHWGT_OZ3 %2f "BD-3 Birthweight in Ounces - 3rd baby from this pregnancy"
51 | _column(61) byte LOBTHWGT3 %1f "BD-4 Is Baby Low Birthweight- 3rd baby from this pregnancy"
52 | _column(62) byte BABYDOB_M %2f "BD-5 Month of delivery for this pregnancy"
53 | _column(64) int BABYDOB_Y %4f "BD-5 Year of delivery for this pregnancy"
54 | _column(68) int CMBABDOB %4f "CM for baby's or babies' date of birth (delivery date)"
55 | _column(72) int KIDAGE %3f "Current Age (in mos) of R's child(ren) from this pregnancy"
56 | _column(75) byte HPAGELB %2f "BD-6 Father's age at time of child(ren) s birth"
57 | _column(77) byte BIRTHPLC %1f "BD-7 Place where R gave birth"
58 | _column(78) byte PAYBIRTH1 %1f "BD-8 Payment for delivery - 1st mention"
59 | _column(79) byte PAYBIRTH2 %1f "BD-8 Payment for delivery - 2nd mention"
60 | _column(80) byte PAYBIRTH3 %1f "BD-8 Payment for delivery - 3rd mention"
61 | _column(81) byte CSECPRIM %1f "BD-9 Is this R's first c-section?"
62 | _column(82) byte CSECMED1 %1f "BD-10 Medical reasons reported for this C-section - 1st mention"
63 | _column(83) byte CSECMED2 %1f "BD-10 Medical reasons reported for this C-section - 2nd mention"
64 | _column(84) byte CSECMED3 %1f "BD-10 Medical reasons reported for this C-section - 3rd mention"
65 | _column(85) byte CSECMED4 %1f "BD-10 Medical reasons reported for this C-section - 4th mention"
66 | _column(86) byte CSECPLAN %1f "BD-11 Was this c-section planned for by R?"
67 | _column(87) byte KNEWPREG %2f "BE-1 Weeks pregnant when R learned she was pregnant"
68 | _column(89) byte TRIMESTR %1f "BE-2a DK followup for KNEWPREG when gestation >= 6 mos"
69 | _column(90) byte LTRIMEST %1f "BE-2b DK followup for KNEWPREG when gestation < 6 mos"
70 | _column(91) byte PRIORSMK %1f "BE-3 Amount R smoked in 6 mos before R knew she was pregnant"
71 | _column(92) byte POSTSMKS %1f "BE-4 R smoked at all after R knew she was pregnant"
72 | _column(93) byte NPOSTSMK %1f "BE-5 Amount R smoked during pregnancy after R knew she was preg"
73 | _column(94) byte GETPRENA %1f "BE-6 Any prenatal care for this pregnancy"
74 | _column(95) byte BGNPRENA %2f "BE-7 Weeks pregnant at first prenatal care visit"
75 | _column(97) byte PNCTRIM %1f "BE-8a DK followup for BGNPRENA when gestation >= 6 mos"
76 | _column(98) byte LPNCTRI %1f "BE-8b DK followup for BGNPRENA when gestation < 6 mos"
77 | _column(99) byte LIVEHERE1 %1f "BG-1 Whether child lives with R - 1st from this pregnancy"
78 | _column(100) byte ALIVENOW1 %1f "BG-2 Whether child is still alive - 1st from this pregnancy"
79 | _column(101) byte WHENDIED_M1 %2f "BG-3 Month when child died - 1st from this pregnancy"
80 | _column(103) int WHENDIED_Y1 %4f "BG-3 Year when child died - 1st from this pregnancy"
81 | _column(107) int CMKIDIED1 %4f "CM for child's date of death - 1st from this pregnancy"
82 | _column(111) byte WHENLEFT_M1 %2f "BG-4 Month when child stopped living with R- 1st from this preg"
83 | _column(113) int WHENLEFT_Y1 %4f "BG-4 Year when child stopped living with R- 1st from this preg"
84 | _column(117) int CMKIDLFT1 %4f "CM for date child stopped living w/R - 1st from this pregnancy"
85 | _column(121) int LASTAGE1 %3f "Age (in mos) when child last lived w/R-1st from this pregnancy"
86 | _column(124) byte WHERENOW1 %1f "BG-5 Where child lives now - 1st from this pregnancy"
87 | _column(125) byte LEGAGREE1 %1f "BG-6 Legal agreement for where child lives - 1st from this preg"
88 | _column(126) byte PARENEND1 %1f "BG-7 Is R still legal mother of child - 1st from this pregnancy"
89 | _column(127) byte ANYNURSE1 %1f "BH-1 Whether R breastfed this child at all - 1st from this preg"
90 | _column(128) byte FEDSOLID1 %1f "BH-2 Has R fed this child anything other than breastmilk - 1st from this preg"
91 | _column(129) int FRSTEATD_N1 %3f "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 1st from this preg"
92 | _column(132) byte FRSTEATD_P1 %1f "BH-3 Units (mos/wks/days) for FRSTEATD_N - 1st from this preg"
93 | _column(133) byte FRSTEATD1 %2f "Age (in mos) when 1st fed non-breastmilk - 1st from this preg"
94 | _column(135) byte QUITNURS1 %1f "BH-4 Has R stopped breastfeeding child - 1st from this preg"
95 | _column(136) int AGEQTNUR_N1 %3f "BH-5 Age (mos/wks/day) when stopped breastfeeding - 1st from this preg"
96 | _column(139) byte AGEQTNUR_P1 %1f "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 1st from this preg"
97 | _column(140) byte AGEQTNUR1 %2f "Age (in mos) when R stopped nursing child - 1st from this preg"
98 | _column(142) byte LIVEHERE2 %1f "BG-1 Whether child lives with R - 2nd from this pregnancy"
99 | _column(143) byte ALIVENOW2 %1f "BG-2 Whether child is still alive - 2nd from this pregnancy"
100 | _column(144) byte WHENDIED_M2 %1f "BG-3 Month when child died - 2nd from this pregnancy"
101 | _column(145) int WHENDIED_Y2 %4f "BG-3 Year when child died - 2nd from this pregnancy"
102 | _column(149) int CMKIDIED2 %4f "CM for child's date of death - 2nd from this pregnancy"
103 | _column(153) byte WHENLEFT_M2 %1f "BG-4 Month when child stopped living with R - 2nd from this preg"
104 | _column(154) int WHENLEFT_Y2 %4f "BG-4 Year when child stopped living with R - 2nd from this preg"
105 | _column(158) int CMKIDLFT2 %4f "CM for date child stopped living w/R - 2nd from this pregnancy"
106 | _column(162) int LASTAGE2 %3f "Age (in mos) when child last lived w/R - 2nd from this pregnancy"
107 | _column(165) byte WHERENOW2 %1f "BG-5 Where child lives now - 2nd from this pregnancy"
108 | _column(166) byte LEGAGREE2 %1f "BG-6 Legal agreement for where child lives - 2nd from this preg"
109 | _column(167) byte PARENEND2 %1f "BG-7 Is R still legal mother of child - 2nd from this pregnancy"
110 | _column(168) byte ANYNURSE2 %1f "BH-1 Whether R breastfed this child at all - 2nd from this preg"
111 | _column(169) byte FEDSOLID2 %1f "BH-2 Has R fed this child anything other than breastmilk - 2nd from this preg"
112 | _column(170) byte FRSTEATD_N2 %1f "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 2nd from this preg"
113 | _column(171) byte FRSTEATD_P2 %1f "BH-3 Units (mos/wks/days) for FRSTEATD_N - 2nd from this preg"
114 | _column(172) byte FRSTEATD2 %1f "Age (in mos) when 1st fed non-breastmilk - 2nd from this preg"
115 | _column(173) byte QUITNURS2 %1f "BH-4 Has R stopped breastfeeding child - 2nd from this preg"
116 | _column(174) byte AGEQTNUR_N2 %2f "BH-5 Age (mos/wks/day) when stopped breastfeeding - 2nd from this preg"
117 | _column(176) byte AGEQTNUR_P2 %1f "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 2nd from this preg"
118 | _column(177) byte AGEQTNUR2 %2f "Age (in mos) when R stopped nursing child - 2nd from this preg"
119 | _column(179) byte LIVEHERE3 %1f "BG-1 Whether child lives with R - 3rd from this pregnancy"
120 | _column(180) byte ALIVENOW3 %1f "BG-2 Whether child is still alive - 3rd from this pregnancy"
121 | _column(181) byte WHENDIED_M3 %1f "BG-3 Month when child died - 3rd from this pregnancy"
122 | _column(182) byte WHENDIED_Y3 %1f "BG-3 Year when child died - 3rd from this pregnancy"
123 | _column(183) byte CMKIDIED3 %1f "CM for child's date of death - 3rd from this pregnancy"
124 | _column(184) byte WHENLEFT_M3 %1f "BG-4 Month when child stopped living with R - 3rd from this preg"
125 | _column(185) byte WHENLEFT_Y3 %1f "BG-4 Year when child stopped living with R - 3rd from this preg"
126 | _column(186) byte CMKIDLFT3 %1f "CM for date child stopped living w/R - 3rd from this pregnancy"
127 | _column(187) byte LASTAGE3 %1f "Age (in mos) when child last lived w/R - 3rd from this pregnancy"
128 | _column(188) byte WHERENOW3 %1f "BG-5 Where child lives now - 3rd from this pregnancy"
129 | _column(189) byte LEGAGREE3 %1f "BG-6 Legal agreement for where child lives - 3rd from this preg"
130 | _column(190) byte PARENEND3 %1f "BG-7 Is R still legal mother of child - 3rd from this pregnancy"
131 | _column(191) byte ANYNURSE3 %1f "BH-1 Whether R breastfed this child at all - 3rd from this preg"
132 | _column(192) byte FEDSOLID3 %1f "BH-2 Has R fed this child anything other than breastmilk - 3rd from this preg"
133 | _column(193) byte FRSTEATD_N3 %1f "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 3rd from this preg"
134 | _column(194) byte FRSTEATD_P3 %1f "BH-3 Units (mos/wks/days) for FRSTEATD_N - 3rd from this preg"
135 | _column(195) byte FRSTEATD3 %1f "Age (in mos) when 1st fed non-breastmilk - 3rd from this preg"
136 | _column(196) byte QUITNURS3 %1f "BH-4 Has R stopped breastfeeding child - 3rd from this preg"
137 | _column(197) byte AGEQTNUR_N3 %1f "BH-5 Age (mos/wks/day) when stopped breastfeeding - 3rd from this preg"
138 | _column(198) byte AGEQTNUR_P3 %1f "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 3rd from this preg"
139 | _column(199) byte AGEQTNUR3 %1f "Age (in mos) when R stopped nursing child - 3rd from this preg"
140 | _column(200) byte PRGOUTCOME %1f "Outcome of Pregnancy (based on priority ordering)"
141 | _column(201) byte OUTCOM_S %1f "Outcome of pregnancy (based on corrected/chron sorted data)"
142 | _column(202) byte NBRNLV_S %1f "# of babies born alive from this preg (based on CCSD)"
143 | _column(203) int CMPRGEND %4f "CM for Pregnancy End Date (regardless of outcome)"
144 | _column(207) int CMENDP_S %4f "CM date when pregnancy ended (based on CCSD)"
145 | _column(211) int CMPRGBEG %4f "CM for Pregnancy Start Date"
146 | _column(215) int CMPBEG_S %4f "CM date when pregnancy began (based on CCSD)"
147 | _column(219) int CMLASTLB %4f "CM for R's most recent live birth (based on CCSD)"
148 | _column(223) int CMLSTPRG %4f "CM for R's most recent completed pregnancy (based on CCSD)"
149 | _column(227) int CMFSTPRG %4f "CM for R's first completed pregnancy (based on CCSD)"
150 | _column(231) int CMPG1BEG %4f "CM date when R's 1st pregnancy began (based on CCSD)"
151 | _column(235) int CMINTSTR %4f "CM for date of beginning of pregnancy interval"
152 | _column(239) int CMINTFIN %4f "CM for date of end of pregnancy interval"
153 | _column(243) int CMINTSTROP %4f "Open interval: CM of date of beginning"
154 | _column(247) int CMINTFINOP %4f "Open interval: CM of date of end (mon of interview)"
155 | _column(251) int CMINTSTRCR %4f "Currently pregnant: CM of date of beginning of interval"
156 | _column(255) int CMINTFINCR %4f "Currently pregnant: CM of date of end of interval (mon of interview)"
157 | _column(259) byte ANYUSINT %1f "Any method use in pregnancy interval"
158 | _column(260) byte EVUSEINT %1f "EG-1 Use any method in pregnancy interval?"
159 | _column(261) byte STOPDUSE %1f "EG-2 Before you became preg, stop using all methods?"
160 | _column(262) byte WHYSTOPD %1f "EG-3 Stop using methods before preg because wanted preg?"
161 | _column(263) byte WHATMETH01 %2f "EG-4 Method(s) using when became preg - 1st mention"
162 | _column(265) byte WHATMETH02 %2f "EG-4 Method(s) using when became preg - 2nd mention"
163 | _column(267) byte WHATMETH03 %2f "EG-4 Method(s) using when became preg - 3rd mention"
164 | _column(269) byte WHATMETH04 %2f "EG-4 Method(s) using when became preg - 4th mention"
165 | _column(271) byte RESNOUSE %1f "EG-5 Reason not using/had stopped using method bec. wanted preg?"
166 | _column(272) byte WANTBOLD %1f "EG-6 Right bef preg, want to have baby at any time in future?"
167 | _column(273) byte PROBBABE %1f "EG-7 probably want baby at any time in future or not?"
168 | _column(274) byte CNFRMNO %1f "EG-8 Verify didn't want baby at any time in future"
169 | _column(275) byte WANTBLD2 %1f "EG-9 Right before preg, want to have baby at any time in future? (2nd asking)"
170 | _column(276) byte TIMINGOK %1f "EG-10 Become preg too soon, right time, or later than you wanted?"
171 | _column(277) int TOOSOON_N %3f "EG-11 How much sooner than wanted became preg (months or years)"
172 | _column(280) byte TOOSOON_P %1f "EG-11 Choose mons or yrs for how much sooner became preg than wanted"
173 | _column(281) byte WTHPART1 %1f "EG-12a Right before preg, want to have baby with that partner?"
174 | _column(282) byte WTHPART2 %1f "EG-12b Right bef. preg, think might ever want to have baby w/that partner?"
175 | _column(283) byte FEELINPG %2f "EG-13 How happy to be preg. scale (0-10)"
176 | _column(285) byte HPWNOLD %1f "EG-16 Right bef preg, did the father want R to have baby at any time in future?"
177 | _column(286) byte TIMOKHP %1f "EG-17 R became preg sooner, right time, or later than father of preg wanted"
178 | _column(287) byte COHPBEG %1f "EG-18a Was R living w/father of preg at beginning of preg"
179 | _column(288) byte COHPEND %1f "EG-18b Was R living w/father of preg when preg ended/baby was born"
180 | _column(289) byte TELLFATH %1f "EG-19 Did R tell father of preg that she was pregnant"
181 | _column(290) byte WHENTELL %1f "EG-20 When did R tell father of preg about pregnancy: during or after?"
182 | _column(291) byte TRYSCALE %2f "EG-21 How hard trying to get/avoid pregnancy (0-10 scale)"
183 | _column(293) byte WANTSCAL %2f "EG-22 How much wanted to get/avoid pregnancy (0-10 scale)"
184 | _column(295) byte WHYPRG1 %1f "EG-23 (unintended preg): method fail or R wasn't using properly-1st mention"
185 | _column(296) byte WHYPRG2 %1f "EG-23 (unintended preg): method fail or R wasn't using properly-2nd mention"
186 | _column(297) byte WHYNOUSE1 %2f "EG-24 (unintended preg) Reason didn't use contraceptn - 1st"
187 | _column(299) byte WHYNOUSE2 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 2nd"
188 | _column(300) byte WHYNOUSE3 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 3rd"
189 | _column(301) byte WHYNOUSE4 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 4th"
190 | _column(302) byte WHYNOUSE5 %1f "EG-24 (unintended preg) Reason didn't use contraceptn - 5th"
191 | _column(303) byte WHYNOPG1 %2f "EG-24aa open-ended response: reason didn't think could get preg - 1st"
192 | _column(305) byte WHYNOPG2 %2f "EG-24aa open-ended response: reason didn't think could get preg - 2nd"
193 | _column(307) byte MAINOUSE %2f "EG-24a (unintended preg) Main reason didn't use contraception"
194 | _column(309) byte PRGLNGTH %2f "Duration of completed pregnancy in weeks"
195 | _column(311) byte OUTCOME %1f "Pregnancy outcome"
196 | _column(312) byte BIRTHORD %2f "Birth order"
197 | _column(314) int DATEND %4f "CM date pregnancy ended"
198 | _column(318) int AGEPREG %4f "Age at pregnancy outcome"
199 | _column(322) int DATECON %4f "CM date of conception"
200 | _column(326) int AGECON %4f "Age at time of conception"
201 | _column(330) byte FMAROUT5 %1f "Formal marital status at pregnancy outcome"
202 | _column(331) byte PMARPREG %1f "Whether pregnancy ended before R's 1st marriage (premaritally)"
203 | _column(332) byte RMAROUT6 %1f "Informal marital status at pregnancy outcome - 6 categories"
204 | _column(333) byte FMARCON5 %1f "Formal marital status at conception - 5 categories"
205 | _column(334) byte RMARCON6 %1f "Informal marital status at conception - 6 categories"
206 | _column(335) byte LEARNPRG %2f "Number of weeks pregnant when R learned she was pregnant"
207 | _column(337) byte PNCAREWK %2f "Number of weeks pregnant at first prenatal care"
208 | _column(339) byte PAYDELIV %1f "Payment for delivery"
209 | _column(340) byte LBW1 %1f "Low birthweight - 1st baby from this preg"
210 | _column(341) byte LIVCHILD %1f "Living arrangements for 1st liveborn child from this pregnancy"
211 | _column(342) int BFEEDWKS %3f "Duration of breastfeeding in weeks"
212 | _column(345) byte OLDWANTR %1f "Wantedness of pregnancy - respondent - Cycle 4 version"
213 | _column(346) byte OLDWANTP %1f "Wantedness of preg - R's partner (father of pregnancy) - Cycle 4 version"
214 | _column(347) byte WANTRESP %1f "Wantedness of pregnancy -- Respondent (RECODE)"
215 | _column(348) byte WANTPART %1f "Wantedness of pregnancy -- R's partner (RECODE)"
216 | _column(349) int TOOSOON %3f "Number of months too soon pregnancy occurred"
217 | _column(352) byte NEWWANTR %1f "Detailed wantedness of pregnancy - respondent"
218 | _column(353) int CMBIRTH %4f "Century month of R's birth"
219 | _column(357) byte AGER %2f "Age at interview"
220 | _column(359) byte AGESCRN %2f "R's age at screener"
221 | _column(361) byte FMARITAL %1f "Formal marital status"
222 | _column(362) byte RMARITAL %1f "Informal Marital Status"
223 | _column(363) byte EDUCAT %2f "Education (completed years of schooling)"
224 | _column(365) byte HIEDUC %2f "Highest completed year of school or degree"
225 | _column(367) byte RACE %1f "Race"
226 | _column(368) byte HISPANIC %1f "Hispanic origin"
227 | _column(369) byte HISPRACE %1f "Race & Hispanic origin of respondent - 1977 OMB standards (respondent recode)"
228 | _column(370) byte HISPRACE2 %1f "Race & Hispanic origin of respondent - 1997 OMB standards (respondent recode)"
229 | _column(371) byte RCURPREG %1f "Pregnant at time of interview"
230 | _column(372) byte PREGNUM %2f "CAPI-based total number of pregnancies"
231 | _column(374) byte PARITY %2f "Total number of live births"
232 | _column(376) byte CURR_INS %1f "Current health insurance coverage"
233 | _column(377) byte PUBASSIS %1f "Whether R received public assistance in prior calendar year"
234 | _column(378) int POVERTY %3f "Poverty level income"
235 | _column(381) byte LABORFOR %1f "Labor force status"
236 | _column(382) byte RELIGION %1f "Current religious affiliation"
237 | _column(383) byte METRO %1f "Place of residence (Metropolitan / Nonmetropolitan)"
238 | _column(384) byte BRNOUT %1f "IB-8 R born outside of US"
239 | _column(385) int YRSTRUS %4f "Year R came to the United States"
240 | _column(389) byte PRGLNGTH_I %1f "PRGLNGTH Imputation Flag"
241 | _column(390) byte OUTCOME_I %1f "OUTCOME Imputation Flag"
242 | _column(391) byte BIRTHORD_I %1f "BIRTHORD Imputation Flag"
243 | _column(392) byte DATEND_I %1f "DATEND Imputation Flag"
244 | _column(393) byte AGEPREG_I %1f "AGEPREG Imputation Flag"
245 | _column(394) byte DATECON_I %1f "DATECON Imputation Flag"
246 | _column(395) byte AGECON_I %1f "AGECON Imputation Flag"
247 | _column(396) byte FMAROUT5_I %1f "FMAROUT5 Imputation Flag"
248 | _column(397) byte PMARPREG_I %1f "PMARPREG Imputation Flag"
249 | _column(398) byte RMAROUT6_I %1f "RMAROUT6 Imputation Flag"
250 | _column(399) byte FMARCON5_I %1f "FMARCON5 Imputation Flag"
251 | _column(400) byte RMARCON6_I %1f "RMARCON6 Imputation Flag"
252 | _column(401) byte LEARNPRG_I %1f "LEARNPRG Imputation Flag"
253 | _column(402) byte PNCAREWK_I %1f "PNCAREWK Imputation Flag"
254 | _column(403) byte PAYDELIV_I %1f "PAYDELIV Imputation Flag"
255 | _column(404) byte LBW1_I %1f "LBW1 Imputation Flag"
256 | _column(405) byte LIVCHILD_I %1f "LIVCHILD Imputation Flag"
257 | _column(406) byte BFEEDWKS_I %1f "BFEEDWKS Imputation Flag"
258 | _column(407) byte OLDWANTR_I %1f "OLDWANTR Imputation Flag"
259 | _column(408) byte OLDWANTP_I %1f "OLDWANTP Imputation Flag"
260 | _column(409) byte WANTRESP_I %1f "WANTRESP Imputation Flag"
261 | _column(410) byte WANTPART_I %1f "WANTPART Imputation Flag"
262 | _column(411) byte TOOSOON_I %1f "TOOSOON Imputation Flag"
263 | _column(412) byte NEWWANTR_I %1f "NEWWANTR Imputation Flag"
264 | _column(413) byte AGER_I %1f "AGER Imputation Flag"
265 | _column(414) byte FMARITAL_I %1f "FMARITAL Imputation Flag"
266 | _column(415) byte RMARITAL_I %1f "RMARITAL Imputation Flag"
267 | _column(416) byte EDUCAT_I %1f "EDUCAT Imputation Flag"
268 | _column(417) byte HIEDUC_I %1f "HIEDUC Imputation Flag"
269 | _column(418) byte RACE_I %1f "RACE Imputation Flag"
270 | _column(419) byte HISPANIC_I %1f "HISPANIC Imputation Flag"
271 | _column(420) byte HISPRACE_I %1f "HISPRACE Imputation Flag"
272 | _column(421) byte HISPRACE2_I %1f "HISPRACE2 Imputation Flag"
273 | _column(422) byte RCURPREG_I %1f "RCURPREG Imputation Flag"
274 | _column(423) byte PREGNUM_I %1f "PREGNUM Imputation Flag"
275 | _column(424) byte PARITY_I %1f "PARITY Imputation Flag"
276 | _column(425) byte CURR_INS_I %1f "CURR_INS Imputation Flag"
277 | _column(426) byte PUBASSIS_I %1f "PUBASSIS Imputation Flag"
278 | _column(427) byte POVERTY_I %1f "POVERTY Imputation Flag"
279 | _column(428) byte LABORFOR_I %1f "LABORFOR Imputation Flag"
280 | _column(429) byte RELIGION_I %1f "RELIGION Imputation Flag"
281 | _column(430) byte METRO_I %1f "METRO Imputation Flag"
282 | _column(431) double WGT2013_2015 %16f "Final weight for the 2013-2015 NSFG"
283 | _column(447) byte SECU %1f "Randomized version of the sampling error computational unit"
284 | _column(448) int SEST %3f "Randomized version of the stratum"
285 | _column(451) int CMINTVW %4f "Century month for date of interview (Computed in Flow Check A-1)"
286 | _column(455) int CMLSTYR %4f "Century month for month/year of interview minus one year (Computed in Flow Check A-1)"
287 | _column(459) int CMJAN3YR %4f "Century month of January Three Years Prior to Year of interview (Computed in Flow Check A-1)"
288 | _column(463) int CMJAN4YR %4f "Century month of January Four Years Prior to Year of Interview (Computed in Flow Check A-1)"
289 | _column(467) int CMJAN5YR %4f "Century month of January Five Years Prior to Year of Interview (Computed in Flow Check A-1)"
290 | _column(471) str2 QUARTER %2s "Quarter when case was sampled"
291 | _column(473) str1 PHASE %1s "Regular- or double-sample portion of the quarter"
292 | _column(474) str4 INTVWYEAR %4s "Calendar year when interview occurred"
293 | }
294 |
--------------------------------------------------------------------------------
/gss_eda/GSS.do:
--------------------------------------------------------------------------------
1 | #delimit ;
2 |
3 | infix
4 | year 1 - 20
5 | id_ 21 - 40
6 | agewed 41 - 60
7 | divorce 61 - 80
8 | sibs 81 - 100
9 | childs 101 - 120
10 | age 121 - 140
11 | educ 141 - 160
12 | paeduc 161 - 180
13 | maeduc 181 - 200
14 | speduc 201 - 220
15 | degree 221 - 240
16 | padeg 241 - 260
17 | madeg 261 - 280
18 | spdeg 281 - 300
19 | sex 301 - 320
20 | race 321 - 340
21 | res16 341 - 360
22 | reg16 361 - 380
23 | srcbelt 381 - 400
24 | partyid 401 - 420
25 | pres04 421 - 440
26 | pres08 441 - 460
27 | pres12 461 - 480
28 | polviews 481 - 500
29 | natspac 501 - 520
30 | natenvir 521 - 540
31 | natheal 541 - 560
32 | natcity 561 - 580
33 | natcrime 581 - 600
34 | natdrug 601 - 620
35 | nateduc 621 - 640
36 | natrace 641 - 660
37 | natarms 661 - 680
38 | nataid 681 - 700
39 | natfare 701 - 720
40 | spkath 721 - 740
41 | colath 741 - 760
42 | libath 761 - 780
43 | spkhomo 781 - 800
44 | colhomo 801 - 820
45 | libhomo 821 - 840
46 | cappun 841 - 860
47 | gunlaw 861 - 880
48 | grass 881 - 900
49 | relig 901 - 920
50 | fund 921 - 940
51 | attend 941 - 960
52 | reliten 961 - 980
53 | postlife 981 - 1000
54 | pray 1001 - 1020
55 | relig16 1021 - 1040
56 | fund16 1041 - 1060
57 | sprel16 1061 - 1080
58 | prayer 1081 - 1100
59 | bible 1101 - 1120
60 | racmar 1121 - 1140
61 | racpres 1141 - 1160
62 | affrmact 1161 - 1180
63 | happy 1181 - 1200
64 | hapmar 1201 - 1220
65 | health 1221 - 1240
66 | life 1241 - 1260
67 | helpful 1261 - 1280
68 | fair 1281 - 1300
69 | trust 1301 - 1320
70 | conclerg 1321 - 1340
71 | coneduc 1341 - 1360
72 | confed 1361 - 1380
73 | conpress 1381 - 1400
74 | conjudge 1401 - 1420
75 | conlegis 1421 - 1440
76 | conarmy 1441 - 1460
77 | satjob 1461 - 1480
78 | class_ 1481 - 1500
79 | satfin 1501 - 1520
80 | finrela 1521 - 1540
81 | union_ 1541 - 1560
82 | fepol 1561 - 1580
83 | abany 1581 - 1600
84 | chldidel 1601 - 1620
85 | sexeduc 1621 - 1640
86 | premarsx 1641 - 1660
87 | xmarsex 1661 - 1680
88 | homosex 1681 - 1700
89 | spanking 1701 - 1720
90 | fear 1721 - 1740
91 | owngun 1741 - 1760
92 | pistol 1761 - 1780
93 | hunt 1781 - 1800
94 | phone 1801 - 1820
95 | memchurh 1821 - 1840
96 | realinc 1841 - 1860
97 | cohort 1861 - 1880
98 | marcohrt 1881 - 1900
99 | ballot 1901 - 1920
100 | wtssall 1921 - 1940
101 | adults 1941 - 1960
102 | compuse 1961 - 1980
103 | databank 1981 - 2000
104 | wtssnr 2001 - 2020
105 | spkrac 2021 - 2040
106 | spkcom 2041 - 2060
107 | spkmil 2061 - 2080
108 | spkmslm 2081 - 2100
109 | using GSS.dat;
110 |
111 | label variable year "Gss year for this respondent ";
112 | label variable id_ "Respondent id number";
113 | label variable agewed "Age when first married";
114 | label variable divorce "Ever been divorced or separated";
115 | label variable sibs "Number of brothers and sisters";
116 | label variable childs "Number of children";
117 | label variable age "Age of respondent";
118 | label variable educ "Highest year of school completed";
119 | label variable paeduc "Highest year school completed, father";
120 | label variable maeduc "Highest year school completed, mother";
121 | label variable speduc "Highest year school completed, spouse";
122 | label variable degree "Rs highest degree";
123 | label variable padeg "Fathers highest degree";
124 | label variable madeg "Mothers highest degree";
125 | label variable spdeg "Spouses highest degree";
126 | label variable sex "Respondents sex";
127 | label variable race "Race of respondent";
128 | label variable res16 "Type of place lived in when 16 yrs old";
129 | label variable reg16 "Region of residence, age 16";
130 | label variable srcbelt "Src beltcode";
131 | label variable partyid "Political party affiliation";
132 | label variable pres04 "Vote for kerry, bush, nader";
133 | label variable pres08 "Vote obama or mccain";
134 | label variable pres12 "Vote obama or romney";
135 | label variable polviews "Think of self as liberal or conservative";
136 | label variable natspac "Space exploration program";
137 | label variable natenvir "Improving & protecting environment";
138 | label variable natheal "Improving & protecting nations health";
139 | label variable natcity "Solving problems of big cities";
140 | label variable natcrime "Halting rising crime rate";
141 | label variable natdrug "Dealing with drug addiction";
142 | label variable nateduc "Improving nations education system";
143 | label variable natrace "Improving the conditions of blacks";
144 | label variable natarms "Military, armaments, and defense";
145 | label variable nataid "Foreign aid";
146 | label variable natfare "Welfare";
147 | label variable spkath "Allow anti-religionist to speak";
148 | label variable colath "Allow anti-religionist to teach";
149 | label variable libath "Allow anti-religious book in library";
150 | label variable spkhomo "Allow homosexual to speak";
151 | label variable colhomo "Allow homosexual to teach";
152 | label variable libhomo "Allow homosexuals book in library";
153 | label variable cappun "Favor or oppose death penalty for murder";
154 | label variable gunlaw "Favor or oppose gun permits";
155 | label variable grass "Should marijuana be made legal";
156 | label variable relig "Rs religious preference";
157 | label variable fund "How fundamentalist is r currently";
158 | label variable attend "How often r attends religious services";
159 | label variable reliten "Strength of affiliation";
160 | label variable postlife "Belief in life after death";
161 | label variable pray "How often does r pray";
162 | label variable relig16 "Religion in which raised";
163 | label variable fund16 "How fundamentalist was r at age 16";
164 | label variable sprel16 "Religion in which spouse raised";
165 | label variable prayer "Bible prayer in public schools";
166 | label variable bible "Feelings about the bible";
167 | label variable racmar "Favor law against racial intermarriage";
168 | label variable racpres "Would vote for black president";
169 | label variable affrmact "Favor preference in hiring blacks";
170 | label variable happy "General happiness";
171 | label variable hapmar "Happiness of marriage";
172 | label variable health "Condition of health";
173 | label variable life "Is life exciting or dull";
174 | label variable helpful "People helpful or looking out for selves";
175 | label variable fair "People fair or try to take advantage";
176 | label variable trust "Can people be trusted";
177 | label variable conclerg "Confidence in organized religion";
178 | label variable coneduc "Confidence in education";
179 | label variable confed "Confid. in exec branch of fed govt";
180 | label variable conpress "Confidence in press";
181 | label variable conjudge "Confid. in united states supreme court";
182 | label variable conlegis "Confidence in congress";
183 | label variable conarmy "Confidence in military";
184 | label variable satjob "Job or housework";
185 | label variable class_ "Subjective class identification";
186 | label variable satfin "Satisfaction with financial situation";
187 | label variable finrela "Opinion of family income";
188 | label variable union_ "Does r or spouse belong to union";
189 | label variable fepol "Women not suited for politics";
190 | label variable abany "Abortion if woman wants for any reason";
191 | label variable chldidel "Ideal number of children";
192 | label variable sexeduc "Sex education in public schools";
193 | label variable premarsx "Sex before marriage";
194 | label variable xmarsex "Sex with person other than spouse";
195 | label variable homosex "Homosexual sex relations";
196 | label variable spanking "Favor spanking to discipline child";
197 | label variable fear "Afraid to walk at night in neighborhood";
198 | label variable owngun "Have gun in home";
199 | label variable pistol "Pistol or revolver in home";
200 | label variable hunt "Does r or spouse hunt";
201 | label variable phone "Does r have telephone";
202 | label variable memchurh "Membership in church group";
203 | label variable realinc "Family income in constant $";
204 | label variable cohort "Year of birth";
205 | label variable marcohrt "Year of first marriage";
206 | label variable ballot "Ballot used for interview";
207 | label variable wtssall "Weight variable";
208 | label variable adults "Household members 18 yrs and older";
209 | label variable compuse "R use computer";
210 | label variable databank "Computer data threat to individual privacy";
211 | label variable wtssnr "Weight variable";
212 | label variable spkrac "Allow racist to speak";
213 | label variable spkcom "Allow communist to speak";
214 | label variable spkmil "Allow militarist to speak";
215 | label variable spkmslm "Allow muslim clergymen preaching hatred of the us";
216 |
217 |
218 | label define gsp001x
219 | 99 "No answer"
220 | 98 "Don't know"
221 | 0 "Not applicable"
222 | ;
223 | label define gsp002x
224 | 9 "No answer"
225 | 8 "Don't know"
226 | 2 "No"
227 | 1 "Yes"
228 | 0 "Not applicable"
229 | ;
230 | label define gsp003x
231 | 99 "No answer"
232 | 98 "Don't know"
233 | -1 "Not applicable"
234 | ;
235 | label define gsp004x
236 | 9 "Dk na"
237 | 8 "Eight or more"
238 | ;
239 | label define gsp005x
240 | 99 "No answer"
241 | 98 "Don't know"
242 | 89 "89 or older"
243 | ;
244 | label define gsp006x
245 | 99 "No answer"
246 | 98 "Don't know"
247 | 97 "Not applicable"
248 | ;
249 | label define gsp007x
250 | 99 "No answer"
251 | 98 "Don't know"
252 | 97 "Not applicable"
253 | ;
254 | label define gsp008x
255 | 99 "No answer"
256 | 98 "Don't know"
257 | 97 "Not applicable"
258 | ;
259 | label define gsp009x
260 | 99 "No answer"
261 | 98 "Don't know"
262 | 97 "Not applicable"
263 | ;
264 | label define gsp010x
265 | 9 "No answer"
266 | 8 "Don't know"
267 | 7 "Not applicable"
268 | 4 "Graduate"
269 | 3 "Bachelor"
270 | 2 "Junior college"
271 | 1 "High school"
272 | 0 "Lt high school"
273 | ;
274 | label define gsp011x
275 | 9 "No answer"
276 | 8 "Don't know"
277 | 7 "Not applicable"
278 | 4 "Graduate"
279 | 3 "Bachelor"
280 | 2 "Junior college"
281 | 1 "High school"
282 | 0 "Lt high school"
283 | ;
284 | label define gsp012x
285 | 9 "No answer"
286 | 8 "Don't know"
287 | 7 "Not applicable"
288 | 4 "Graduate"
289 | 3 "Bachelor"
290 | 2 "Junior college"
291 | 1 "High school"
292 | 0 "Lt high school"
293 | ;
294 | label define gsp013x
295 | 9 "No answer"
296 | 8 "Don't know"
297 | 7 "Not applicable"
298 | 4 "Graduate"
299 | 3 "Bachelor"
300 | 2 "Junior college"
301 | 1 "High school"
302 | 0 "Lt high school"
303 | ;
304 | label define gsp014x
305 | 2 "Female"
306 | 1 "Male"
307 | ;
308 | label define gsp015x
309 | 3 "Other"
310 | 2 "Black"
311 | 1 "White"
312 | 0 "Not applicable"
313 | ;
314 | label define gsp016x
315 | 9 "No answer"
316 | 8 "Don't know"
317 | 6 "City gt 250000"
318 | 5 "Big-city suburb"
319 | 4 "50000 to 250000"
320 | 3 "Town lt 50000"
321 | 2 "Farm"
322 | 1 "Country,nonfarm"
323 | 0 "Not applicable"
324 | ;
325 | label define gsp017x
326 | 9 "Pacific"
327 | 8 "Mountain"
328 | 7 "W. sou. central"
329 | 6 "E. sou. central"
330 | 5 "South atlantic"
331 | 4 "W. nor. central"
332 | 3 "E. nor. central"
333 | 2 "Middle atlantic"
334 | 1 "New england"
335 | 0 "Foreign"
336 | ;
337 | label define gsp018x
338 | 6 "Other rural"
339 | 5 "Other urban"
340 | 4 "Suburb, 13-100"
341 | 3 "Suburb, 12 lrgst"
342 | 2 "Smsa's 13-100"
343 | 1 "12 lrgst smsa's"
344 | 0 "Not assigned"
345 | ;
346 | label define gsp019x
347 | 9 "No answer"
348 | 8 "Don't know"
349 | 7 "Other party"
350 | 6 "Strong republican"
351 | 5 "Not str republican"
352 | 4 "Ind,near rep"
353 | 3 "Independent"
354 | 2 "Ind,near dem"
355 | 1 "Not str democrat"
356 | 0 "Strong democrat"
357 | ;
358 | label define gsp020x
359 | 9 "No answer"
360 | 8 "Dont know"
361 | 6 "Didnt vote"
362 | 4 "Other (specify)"
363 | 3 "Nader"
364 | 2 "Bush"
365 | 1 "Kerry"
366 | 0 "Not applicable"
367 | ;
368 | label define gsp021x
369 | 9 "No answer"
370 | 8 "Don't know"
371 | 4 "Didn't vote"
372 | 3 "Other candidate (specify)"
373 | 2 "Mccain"
374 | 1 "Obama"
375 | 0 "Not applicable"
376 | ;
377 | label define gsp022x
378 | 9 "No answer"
379 | 8 "Don't know"
380 | 4 "Didn't vote for president"
381 | 3 "Other candidate (specify)"
382 | 2 "Romney"
383 | 1 "Obama"
384 | 0 "Not applicable"
385 | ;
386 | label define gsp023x
387 | 9 "No answer"
388 | 8 "Don't know"
389 | 7 "Extrmly conservative"
390 | 6 "Conservative"
391 | 5 "Slghtly conservative"
392 | 4 "Moderate"
393 | 3 "Slightly liberal"
394 | 2 "Liberal"
395 | 1 "Extremely liberal"
396 | 0 "Not applicable"
397 | ;
398 | label define gsp024x
399 | 9 "No answer"
400 | 8 "Don't know"
401 | 3 "Too much"
402 | 2 "About right"
403 | 1 "Too little"
404 | 0 "Not applicable"
405 | ;
406 | label define gsp025x
407 | 9 "No answer"
408 | 8 "Don't know"
409 | 3 "Too much"
410 | 2 "About right"
411 | 1 "Too little"
412 | 0 "Not applicable"
413 | ;
414 | label define gsp026x
415 | 9 "No answer"
416 | 8 "Don't know"
417 | 3 "Too much"
418 | 2 "About right"
419 | 1 "Too little"
420 | 0 "Not applicable"
421 | ;
422 | label define gsp027x
423 | 9 "No answer"
424 | 8 "Don't know"
425 | 3 "Too much"
426 | 2 "About right"
427 | 1 "Too little"
428 | 0 "Not applicable"
429 | ;
430 | label define gsp028x
431 | 9 "No answer"
432 | 8 "Don't know"
433 | 3 "Too much"
434 | 2 "About right"
435 | 1 "Too little"
436 | 0 "Not applicable"
437 | ;
438 | label define gsp029x
439 | 9 "No answer"
440 | 8 "Don't know"
441 | 3 "Too much"
442 | 2 "About right"
443 | 1 "Too little"
444 | 0 "Not applicable"
445 | ;
446 | label define gsp030x
447 | 9 "No answer"
448 | 8 "Don't know"
449 | 3 "Too much"
450 | 2 "About right"
451 | 1 "Too little"
452 | 0 "Not applicable"
453 | ;
454 | label define gsp031x
455 | 9 "No answer"
456 | 8 "Don't know"
457 | 3 "Too much"
458 | 2 "About right"
459 | 1 "Too little"
460 | 0 "Not applicable"
461 | ;
462 | label define gsp032x
463 | 9 "No answer"
464 | 8 "Don't know"
465 | 3 "Too much"
466 | 2 "About right"
467 | 1 "Too little"
468 | 0 "Not applicable"
469 | ;
470 | label define gsp033x
471 | 9 "No answer"
472 | 8 "Don't know"
473 | 3 "Too much"
474 | 2 "About right"
475 | 1 "Too little"
476 | 0 "Not applicable"
477 | ;
478 | label define gsp034x
479 | 9 "No answer"
480 | 8 "Don't know"
481 | 3 "Too much"
482 | 2 "About right"
483 | 1 "Too little"
484 | 0 "Not applicable"
485 | ;
486 | label define gsp035x
487 | 9 "No answer"
488 | 8 "Don't know"
489 | 2 "Not allowed"
490 | 1 "Allowed"
491 | 0 "Not applicable"
492 | ;
493 | label define gsp036x
494 | 9 "No answer"
495 | 8 "Don't know"
496 | 5 "Not allowed"
497 | 4 "Allowed"
498 | 0 "Not applicable"
499 | ;
500 | label define gsp037x
501 | 9 "No answer"
502 | 8 "Don't know"
503 | 2 "Not remove"
504 | 1 "Remove"
505 | 0 "Not applicable"
506 | ;
507 | label define gsp038x
508 | 9 "No answer"
509 | 8 "Don't know"
510 | 2 "Not allowed"
511 | 1 "Allowed"
512 | 0 "Not applicable"
513 | ;
514 | label define gsp039x
515 | 9 "No answer"
516 | 8 "Don't know"
517 | 5 "Not allowed"
518 | 4 "Allowed"
519 | 0 "Not applicable"
520 | ;
521 | label define gsp040x
522 | 9 "No answer"
523 | 8 "Don't know"
524 | 2 "Not remove"
525 | 1 "Remove"
526 | 0 "Not applicable"
527 | ;
528 | label define gsp041x
529 | 9 "No answer"
530 | 8 "Don't know"
531 | 2 "Oppose"
532 | 1 "Favor"
533 | 0 "Not applicable"
534 | ;
535 | label define gsp042x
536 | 9 "No answer"
537 | 8 "Don't know"
538 | 2 "Oppose"
539 | 1 "Favor"
540 | 0 "Not applicable"
541 | ;
542 | label define gsp043x
543 | 9 "No answer"
544 | 8 "Don't know"
545 | 2 "Not legal"
546 | 1 "Legal"
547 | 0 "Not applicable"
548 | ;
549 | label define gsp044x
550 | 99 "No answer"
551 | 98 "Don't know"
552 | 13 "Inter-nondenominational"
553 | 12 "Native american"
554 | 11 "Christian"
555 | 10 "Orthodox-christian"
556 | 9 "Moslem/islam"
557 | 8 "Other eastern"
558 | 7 "Hinduism"
559 | 6 "Buddhism"
560 | 5 "Other"
561 | 4 "None"
562 | 3 "Jewish"
563 | 2 "Catholic"
564 | 1 "Protestant"
565 | 0 "Not applicable"
566 | ;
567 | label define gsp045x
568 | 9 "Na-excluded"
569 | 8 "Don't know"
570 | 3 "Liberal"
571 | 2 "Moderate"
572 | 1 "Fundamentalist"
573 | 0 "Not applicable"
574 | ;
575 | label define gsp046x
576 | 9 "Dk,na"
577 | 8 "More thn once wk"
578 | 7 "Every week"
579 | 6 "Nrly every week"
580 | 5 "2-3x a month"
581 | 4 "Once a month"
582 | 3 "Sevrl times a yr"
583 | 2 "Once a year"
584 | 1 "Lt once a year"
585 | 0 "Never"
586 | ;
587 | label define gsp047x
588 | 9 "No answer"
589 | 8 "Don't know"
590 | 4 "No religion"
591 | 3 "Somewhat strong"
592 | 2 "Not very strong"
593 | 1 "Strong"
594 | 0 "Not applicable"
595 | ;
596 | label define gsp048x
597 | 9 "No answer"
598 | 8 "Don't know"
599 | 2 "No"
600 | 1 "Yes"
601 | 0 "Not applicable"
602 | ;
603 | label define gsp049x
604 | 9 "No answer"
605 | 8 "Don't know"
606 | 6 "Never"
607 | 5 "Lt once a week"
608 | 4 "Once a week"
609 | 3 "Several times a week"
610 | 2 "Once a day"
611 | 1 "Several times a day"
612 | 0 "Not applicable"
613 | ;
614 | label define gsp050x
615 | 99 "No answer"
616 | 98 "Don't know"
617 | 13 "Inter-nondenominational"
618 | 12 "Native american"
619 | 11 "Christian"
620 | 10 "Orthodox-christian"
621 | 9 "Moslem/islam"
622 | 8 "Other eastern"
623 | 7 "Hinduism"
624 | 6 "Buddhism"
625 | 5 "Other"
626 | 4 "None"
627 | 3 "Jewish"
628 | 2 "Catholic"
629 | 1 "Protestant"
630 | 0 "Not applicable"
631 | ;
632 | label define gsp051x
633 | 9 "Na-excluded"
634 | 8 "Don't know"
635 | 3 "Liberal"
636 | 2 "Moderate"
637 | 1 "Fundamentalist"
638 | 0 "Not applicable"
639 | ;
640 | label define gsp052x
641 | 9 "No answer"
642 | 8 "Dont know"
643 | 5 "Other"
644 | 4 "None"
645 | 3 "Jewish"
646 | 2 "Catholic"
647 | 1 "Protestant"
648 | 0 "Not applicable"
649 | ;
650 | label define gsp053x
651 | 9 "No answer"
652 | 8 "Don't know"
653 | 2 "Disapprove"
654 | 1 "Approve"
655 | 0 "Not applicable"
656 | ;
657 | label define gsp054x
658 | 9 "No answer"
659 | 8 "Don't know"
660 | 4 "Other"
661 | 3 "Book of fables"
662 | 2 "Inspired word"
663 | 1 "Word of god"
664 | 0 "Not applicable"
665 | ;
666 | label define gsp055x
667 | 9 "No answer"
668 | 8 "Don't know"
669 | 2 "No"
670 | 1 "Yes"
671 | 0 "Not applicable"
672 | ;
673 | label define gsp056x
674 | 9 "No answer"
675 | 8 "Don't know"
676 | 2 "No"
677 | 1 "Yes"
678 | 0 "Not applicable"
679 | ;
680 | label define gsp057x
681 | 9 "No answer"
682 | 8 "Don't know"
683 | 4 "Strongly oppose pref"
684 | 3 "Oppose pref"
685 | 2 "Support pref"
686 | 1 "Strongly support pref"
687 | 0 "Not applicable"
688 | ;
689 | label define gsp058x
690 | 9 "No answer"
691 | 8 "Don't know"
692 | 3 "Not too happy"
693 | 2 "Pretty happy"
694 | 1 "Very happy"
695 | 0 "Not applicable"
696 | ;
697 | label define gsp059x
698 | 9 "No answer"
699 | 8 "Don't know"
700 | 3 "Not too happy"
701 | 2 "Pretty happy"
702 | 1 "Very happy"
703 | 0 "Not applicable"
704 | ;
705 | label define gsp060x
706 | 9 "No answer"
707 | 8 "Don't know"
708 | 4 "Poor"
709 | 3 "Fair"
710 | 2 "Good"
711 | 1 "Excellent"
712 | 0 "Not applicable"
713 | ;
714 | label define gsp061x
715 | 9 "No answer"
716 | 8 "Don't know"
717 | 3 "Dull"
718 | 2 "Routine"
719 | 1 "Exciting"
720 | 0 "Not applicable"
721 | ;
722 | label define gsp062x
723 | 9 "No answer"
724 | 8 "Don't know"
725 | 3 "Depends"
726 | 2 "Lookout for self"
727 | 1 "Helpful"
728 | 0 "Not applicable"
729 | ;
730 | label define gsp063x
731 | 9 "No answer"
732 | 8 "Don't know"
733 | 3 "Depends"
734 | 2 "Fair"
735 | 1 "Take advantage"
736 | 0 "Not applicable"
737 | ;
738 | label define gsp064x
739 | 9 "No answer"
740 | 8 "Don't know"
741 | 3 "Depends"
742 | 2 "Cannot trust"
743 | 1 "Can trust"
744 | 0 "Not applicable"
745 | ;
746 | label define gsp065x
747 | 9 "No answer"
748 | 8 "Don't know"
749 | 3 "Hardly any"
750 | 2 "Only some"
751 | 1 "A great deal"
752 | 0 "Not applicable"
753 | ;
754 | label define gsp066x
755 | 9 "No answer"
756 | 8 "Don't know"
757 | 3 "Hardly any"
758 | 2 "Only some"
759 | 1 "A great deal"
760 | 0 "Not applicable"
761 | ;
762 | label define gsp067x
763 | 9 "No answer"
764 | 8 "Don't know"
765 | 3 "Hardly any"
766 | 2 "Only some"
767 | 1 "A great deal"
768 | 0 "Not applicable"
769 | ;
770 | label define gsp068x
771 | 9 "No answer"
772 | 8 "Don't know"
773 | 3 "Hardly any"
774 | 2 "Only some"
775 | 1 "A great deal"
776 | 0 "Not applicable"
777 | ;
778 | label define gsp069x
779 | 9 "No answer"
780 | 8 "Don't know"
781 | 3 "Hardly any"
782 | 2 "Only some"
783 | 1 "A great deal"
784 | 0 "Not applicable"
785 | ;
786 | label define gsp070x
787 | 9 "No answer"
788 | 8 "Don't know"
789 | 3 "Hardly any"
790 | 2 "Only some"
791 | 1 "A great deal"
792 | 0 "Not applicable"
793 | ;
794 | label define gsp071x
795 | 9 "No answer"
796 | 8 "Don't know"
797 | 3 "Hardly any"
798 | 2 "Only some"
799 | 1 "A great deal"
800 | 0 "Not applicable"
801 | ;
802 | label define gsp072x
803 | 9 "No answer"
804 | 8 "Don't know"
805 | 4 "Very dissatisfied"
806 | 3 "A little dissat"
807 | 2 "Mod. satisfied"
808 | 1 "Very satisfied"
809 | 0 "Not applicable"
810 | ;
811 | label define gsp073x
812 | 9 "No answer"
813 | 8 "Don't know"
814 | 5 "No class"
815 | 4 "Upper class"
816 | 3 "Middle class"
817 | 2 "Working class"
818 | 1 "Lower class"
819 | 0 "Not applicable"
820 | ;
821 | label define gsp074x
822 | 9 "No answer"
823 | 8 "Don't know"
824 | 3 "Not at all sat"
825 | 2 "More or less"
826 | 1 "Satisfied"
827 | 0 "Not applicable"
828 | ;
829 | label define gsp075x
830 | 9 "No answer"
831 | 8 "Don't know"
832 | 5 "Far above average"
833 | 4 "Above average"
834 | 3 "Average"
835 | 2 "Below average"
836 | 1 "Far below average"
837 | 0 "Not applicable"
838 | ;
839 | label define gsp076x
840 | 9 "No answer"
841 | 8 "Don't know"
842 | 4 "Neither belongs"
843 | 3 "R and spouse belong"
844 | 2 "Spouse belongs"
845 | 1 "R belongs"
846 | 0 "Not applicable"
847 | ;
848 | label define gsp077x
849 | 9 "No answer"
850 | 8 "Not sure"
851 | 2 "Disagree"
852 | 1 "Agree"
853 | 0 "Not applicable"
854 | ;
855 | label define gsp078x
856 | 9 "No answer"
857 | 8 "Don't know"
858 | 2 "No"
859 | 1 "Yes"
860 | 0 "Not applicable"
861 | ;
862 | label define gsp079x
863 | 9 "Dk,na"
864 | 8 "As many as want"
865 | 7 "Seven+"
866 | -1 "Not applicable"
867 | ;
868 | label define gsp080x
869 | 9 "No answer"
870 | 8 "Don't know"
871 | 3 "Depends"
872 | 2 "Oppose"
873 | 1 "Favor"
874 | 0 "Not applicable"
875 | ;
876 | label define gsp081x
877 | 9 "No answer"
878 | 8 "Don't know"
879 | 5 "Other"
880 | 4 "Not wrong at all"
881 | 3 "Sometimes wrong"
882 | 2 "Almst always wrg"
883 | 1 "Always wrong"
884 | 0 "Not applicable"
885 | ;
886 | label define gsp082x
887 | 9 "No answer"
888 | 8 "Don't know"
889 | 5 "Other"
890 | 4 "Not wrong at all"
891 | 3 "Sometimes wrong"
892 | 2 "Almst always wrg"
893 | 1 "Always wrong"
894 | 0 "Not applicable"
895 | ;
896 | label define gsp083x
897 | 9 "No answer"
898 | 8 "Don't know"
899 | 5 "Other"
900 | 4 "Not wrong at all"
901 | 3 "Sometimes wrong"
902 | 2 "Almst always wrg"
903 | 1 "Always wrong"
904 | 0 "Not applicable"
905 | ;
906 | label define gsp084x
907 | 9 "No answer"
908 | 8 "Don't know"
909 | 4 "Strongly disagree"
910 | 3 "Disagree"
911 | 2 "Agree"
912 | 1 "Strongly agree"
913 | 0 "Not applicable"
914 | ;
915 | label define gsp085x
916 | 9 "No answer"
917 | 8 "Don't know"
918 | 2 "No"
919 | 1 "Yes"
920 | 0 "Not applicable"
921 | ;
922 | label define gsp086x
923 | 9 "No answer"
924 | 8 "Don't know"
925 | 3 "Refused"
926 | 2 "No"
927 | 1 "Yes"
928 | 0 "Not applicable"
929 | ;
930 | label define gsp087x
931 | 9 "No answer"
932 | 8 "Don't know"
933 | 3 "Refused"
934 | 2 "No"
935 | 1 "Yes"
936 | 0 "Not applicable"
937 | ;
938 | label define gsp088x
939 | 9 "No answer"
940 | 8 "Don't know"
941 | 4 "Neither"
942 | 3 "Both"
943 | 2 "Spouse"
944 | 1 "Resp"
945 | 0 "Not applicable"
946 | ;
947 | label define gsp089x
948 | 9 "No answer"
949 | 6 "Cellphone"
950 | 5 "Phone,dk where"
951 | 4 "Phone elsewhere"
952 | 3 "Phone in home"
953 | 2 "Refused"
954 | 1 "No phone"
955 | 0 "Not applicable"
956 | ;
957 | label define gsp090x
958 | 9 "No answer"
959 | 8 "Don't know"
960 | 2 "No"
961 | 1 "Yes"
962 | 0 "Not applicable"
963 | ;
964 | label define gsp091x
965 | 999999 "No answer"
966 | 999998 "Dont know"
967 | 0 "Not applicable"
968 | ;
969 | label define gsp092x
970 | 9999 "No answer"
971 | 0 "Not applicable"
972 | ;
973 | label define gsp093x
974 | 9999 "No answer"
975 | 0 "Not applicable"
976 | ;
977 | label define gsp094x
978 | 4 "Ballot d"
979 | 3 "Ballot c"
980 | 2 "Ballot b"
981 | 1 "Ballot a"
982 | 0 "Not applicable"
983 | ;
984 | label define gsp095x
985 | 9 "No answer"
986 | 8 "8 or more"
987 | ;
988 | label define gsp096x
989 | 9 "No answer"
990 | 8 "Don't know"
991 | 2 "No"
992 | 1 "Yes"
993 | 0 "Not applicable"
994 | ;
995 | label define gsp097x
996 | 9 "No answer"
997 | 8 "Cant choose"
998 | 4 "Not a threat at all"
999 | 3 "Not serious"
1000 | 2 "Fairly serious"
1001 | 1 "Very serious threat"
1002 | 0 "Not applicable"
1003 | ;
1004 | label define gsp098x
1005 | 9 "No answer"
1006 | 8 "Don't know"
1007 | 2 "Not allowed"
1008 | 1 "Allowed"
1009 | 0 "Not applicable"
1010 | ;
1011 | label define gsp099x
1012 | 9 "No answer"
1013 | 8 "Don't know"
1014 | 2 "Not allowed"
1015 | 1 "Allowed"
1016 | 0 "Not applicable"
1017 | ;
1018 | label define gsp100x
1019 | 9 "No answer"
1020 | 8 "Don't know"
1021 | 2 "Not allowed"
1022 | 1 "Allowed"
1023 | 0 "Not applicable"
1024 | ;
1025 | label define gsp101x
1026 | 9 "No answer"
1027 | 8 "Dont know"
1028 | 2 "Not allowed"
1029 | 1 "Yes, allowed"
1030 | 0 "Not applicable"
1031 | ;
1032 |
1033 |
1034 | label values agewed gsp001x;
1035 | label values divorce gsp002x;
1036 | label values sibs gsp003x;
1037 | label values childs gsp004x;
1038 | label values age gsp005x;
1039 | label values educ gsp006x;
1040 | label values paeduc gsp007x;
1041 | label values maeduc gsp008x;
1042 | label values speduc gsp009x;
1043 | label values degree gsp010x;
1044 | label values padeg gsp011x;
1045 | label values madeg gsp012x;
1046 | label values spdeg gsp013x;
1047 | label values sex gsp014x;
1048 | label values race gsp015x;
1049 | label values res16 gsp016x;
1050 | label values reg16 gsp017x;
1051 | label values srcbelt gsp018x;
1052 | label values partyid gsp019x;
1053 | label values pres04 gsp020x;
1054 | label values pres08 gsp021x;
1055 | label values pres12 gsp022x;
1056 | label values polviews gsp023x;
1057 | label values natspac gsp024x;
1058 | label values natenvir gsp025x;
1059 | label values natheal gsp026x;
1060 | label values natcity gsp027x;
1061 | label values natcrime gsp028x;
1062 | label values natdrug gsp029x;
1063 | label values nateduc gsp030x;
1064 | label values natrace gsp031x;
1065 | label values natarms gsp032x;
1066 | label values nataid gsp033x;
1067 | label values natfare gsp034x;
1068 | label values spkath gsp035x;
1069 | label values colath gsp036x;
1070 | label values libath gsp037x;
1071 | label values spkhomo gsp038x;
1072 | label values colhomo gsp039x;
1073 | label values libhomo gsp040x;
1074 | label values cappun gsp041x;
1075 | label values gunlaw gsp042x;
1076 | label values grass gsp043x;
1077 | label values relig gsp044x;
1078 | label values fund gsp045x;
1079 | label values attend gsp046x;
1080 | label values reliten gsp047x;
1081 | label values postlife gsp048x;
1082 | label values pray gsp049x;
1083 | label values relig16 gsp050x;
1084 | label values fund16 gsp051x;
1085 | label values sprel16 gsp052x;
1086 | label values prayer gsp053x;
1087 | label values bible gsp054x;
1088 | label values racmar gsp055x;
1089 | label values racpres gsp056x;
1090 | label values affrmact gsp057x;
1091 | label values happy gsp058x;
1092 | label values hapmar gsp059x;
1093 | label values health gsp060x;
1094 | label values life gsp061x;
1095 | label values helpful gsp062x;
1096 | label values fair gsp063x;
1097 | label values trust gsp064x;
1098 | label values conclerg gsp065x;
1099 | label values coneduc gsp066x;
1100 | label values confed gsp067x;
1101 | label values conpress gsp068x;
1102 | label values conjudge gsp069x;
1103 | label values conlegis gsp070x;
1104 | label values conarmy gsp071x;
1105 | label values satjob gsp072x;
1106 | label values class_ gsp073x;
1107 | label values satfin gsp074x;
1108 | label values finrela gsp075x;
1109 | label values union_ gsp076x;
1110 | label values fepol gsp077x;
1111 | label values abany gsp078x;
1112 | label values chldidel gsp079x;
1113 | label values sexeduc gsp080x;
1114 | label values premarsx gsp081x;
1115 | label values xmarsex gsp082x;
1116 | label values homosex gsp083x;
1117 | label values spanking gsp084x;
1118 | label values fear gsp085x;
1119 | label values owngun gsp086x;
1120 | label values pistol gsp087x;
1121 | label values hunt gsp088x;
1122 | label values phone gsp089x;
1123 | label values memchurh gsp090x;
1124 | label values realinc gsp091x;
1125 | label values cohort gsp092x;
1126 | label values marcohrt gsp093x;
1127 | label values ballot gsp094x;
1128 | label values adults gsp095x;
1129 | label values compuse gsp096x;
1130 | label values databank gsp097x;
1131 | label values spkrac gsp098x;
1132 | label values spkcom gsp099x;
1133 | label values spkmil gsp100x;
1134 | label values spkmslm gsp101x;
1135 |
1136 |
1137 |
--------------------------------------------------------------------------------
/gss_validate.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Exploratory Data Analysis\n",
8 | "\n",
9 | "Load and validate GSS data\n",
10 | "\n",
11 | "Allen Downey\n",
12 | "\n",
13 | "[MIT License](https://en.wikipedia.org/wiki/MIT_License)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 33,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "%matplotlib inline\n",
23 | "\n",
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "import seaborn as sns\n",
29 | "sns.set(style='white')\n",
30 | "\n",
31 | "import utils\n",
32 | "from utils import decorate\n",
33 | "from distribution import Pmf, Cdf"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 34,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "def underride(d, **options):\n",
43 | " \"\"\"Add key-value pairs to d only if key is not in d.\n",
44 | "\n",
45 | " d: dictionary\n",
46 | " options: keyword args to add to d\n",
47 | " \"\"\"\n",
48 | " for key, val in options.items():\n",
49 | " d.setdefault(key, val)\n",
50 | "\n",
51 | " return d"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Loading and validation\n"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 35,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "import re\n",
68 | "\n",
69 | "class FixedWidthVariables(object):\n",
70 | " \"\"\"Represents a set of variables in a fixed width file.\"\"\"\n",
71 | "\n",
72 | " def __init__(self, variables, index_base=0):\n",
73 | " \"\"\"Initializes.\n",
74 | "\n",
75 | " variables: DataFrame\n",
76 | " index_base: are the indices 0 or 1 based?\n",
77 | "\n",
78 | " Attributes:\n",
79 | " colspecs: list of (start, end) index tuples\n",
80 | " names: list of string variable names\n",
81 | " \"\"\"\n",
82 | " self.variables = variables\n",
83 | "\n",
84 | " # note: subtract index_base so colspecs are 0-based (no shift when index_base=0)\n",
85 | " self.colspecs = variables[['start', 'end']] - index_base\n",
86 | "\n",
87 | " # convert colspecs to a list of pair of int\n",
88 | " self.colspecs = self.colspecs.astype(np.int).values.tolist()\n",
89 | " self.names = variables['name']\n",
90 | "\n",
91 | " def ReadFixedWidth(self, filename, **options):\n",
92 | " \"\"\"Reads a fixed width ASCII file.\n",
93 | "\n",
94 | " filename: string filename\n",
95 | "\n",
96 | " returns: DataFrame\n",
97 | " \"\"\"\n",
98 | " df = pd.read_fwf(filename,\n",
99 | " colspecs=self.colspecs, \n",
100 | " names=self.names,\n",
101 | " **options)\n",
102 | " return df\n",
103 | "\n",
104 | "\n",
105 | "def ReadStataDct(dct_file, **options):\n",
106 | " \"\"\"Reads a Stata dictionary file.\n",
107 | "\n",
108 | " dct_file: string filename\n",
109 | " options: dict of options passed to open()\n",
110 | "\n",
111 | " returns: FixedWidthVariables object\n",
112 | " \"\"\"\n",
113 | " type_map = dict(byte=int, int=int, long=int, float=float, \n",
114 | " double=float, numeric=float)\n",
115 | "\n",
116 | " var_info = []\n",
117 | " with open(dct_file, **options) as f:\n",
118 | " for line in f:\n",
119 | " match = re.search( r'_column\\(([^)]*)\\)', line)\n",
120 | " if not match:\n",
121 | " continue\n",
122 | " start = int(match.group(1))\n",
123 | " t = line.split()\n",
124 | " vtype, name, fstring = t[1:4]\n",
125 | " name = name.lower()\n",
126 | " if vtype.startswith('str'):\n",
127 | " vtype = str\n",
128 | " else:\n",
129 | " vtype = type_map[vtype]\n",
130 | " long_desc = ' '.join(t[4:]).strip('\"')\n",
131 | " var_info.append((start, vtype, name, fstring, long_desc))\n",
132 | " \n",
133 | " columns = ['start', 'type', 'name', 'fstring', 'desc']\n",
134 | " variables = pd.DataFrame(var_info, columns=columns)\n",
135 | "\n",
136 | " # fill in the end column by shifting the start column\n",
137 | " variables['end'] = variables.start.shift(-1)\n",
138 | " variables.loc[len(variables)-1, 'end'] = 0\n",
139 | "\n",
140 | " dct = FixedWidthVariables(variables, index_base=1)\n",
141 | " return dct\n",
142 | "\n",
143 | "def read_gss(dirname):\n",
144 | " \"\"\"Reads GSS files from the given directory.\n",
145 | " \n",
146 | " dirname: string\n",
147 | " \n",
148 | " returns: DataFrame\n",
149 | " \"\"\"\n",
150 | " dct = ReadStataDct(dirname + '/GSS.dct')\n",
151 | " gss = dct.ReadFixedWidth(dirname + '/GSS.dat.gz',\n",
152 | " compression='gzip')\n",
153 | " return gss"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 36,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "(62466, 101)\n"
166 | ]
167 | },
168 | {
169 | "data": {
170 | "text/html": [
171 | "\n",
172 | "\n",
185 | "
\n",
186 | " \n",
187 | " \n",
188 | " | \n",
189 | " year | \n",
190 | " id_ | \n",
191 | " agewed | \n",
192 | " divorce | \n",
193 | " sibs | \n",
194 | " childs | \n",
195 | " age | \n",
196 | " educ | \n",
197 | " paeduc | \n",
198 | " maeduc | \n",
199 | " ... | \n",
200 | " memchurh | \n",
201 | " realinc | \n",
202 | " cohort | \n",
203 | " marcohrt | \n",
204 | " ballot | \n",
205 | " wtssall | \n",
206 | " adults | \n",
207 | " compuse | \n",
208 | " databank | \n",
209 | " wtssnr | \n",
210 | "
\n",
211 | " \n",
212 | " \n",
213 | " \n",
214 | " | 0 | \n",
215 | " 1972 | \n",
216 | " 1 | \n",
217 | " 0 | \n",
218 | " 0 | \n",
219 | " 3 | \n",
220 | " 0 | \n",
221 | " 23 | \n",
222 | " 16 | \n",
223 | " 10 | \n",
224 | " 97 | \n",
225 | " ... | \n",
226 | " 0 | \n",
227 | " 18951.0 | \n",
228 | " 1949 | \n",
229 | " 0 | \n",
230 | " 0 | \n",
231 | " 0.4446 | \n",
232 | " 1 | \n",
233 | " 0 | \n",
234 | " 0 | \n",
235 | " 1.0 | \n",
236 | "
\n",
237 | " \n",
238 | " | 1 | \n",
239 | " 1972 | \n",
240 | " 2 | \n",
241 | " 21 | \n",
242 | " 2 | \n",
243 | " 4 | \n",
244 | " 5 | \n",
245 | " 70 | \n",
246 | " 10 | \n",
247 | " 8 | \n",
248 | " 8 | \n",
249 | " ... | \n",
250 | " 0 | \n",
251 | " 24366.0 | \n",
252 | " 1902 | \n",
253 | " 1923 | \n",
254 | " 0 | \n",
255 | " 0.8893 | \n",
256 | " 2 | \n",
257 | " 0 | \n",
258 | " 0 | \n",
259 | " 1.0 | \n",
260 | "
\n",
261 | " \n",
262 | " | 2 | \n",
263 | " 1972 | \n",
264 | " 3 | \n",
265 | " 20 | \n",
266 | " 2 | \n",
267 | " 5 | \n",
268 | " 4 | \n",
269 | " 48 | \n",
270 | " 12 | \n",
271 | " 8 | \n",
272 | " 8 | \n",
273 | " ... | \n",
274 | " 0 | \n",
275 | " 24366.0 | \n",
276 | " 1924 | \n",
277 | " 1944 | \n",
278 | " 0 | \n",
279 | " 0.8893 | \n",
280 | " 2 | \n",
281 | " 0 | \n",
282 | " 0 | \n",
283 | " 1.0 | \n",
284 | "
\n",
285 | " \n",
286 | " | 3 | \n",
287 | " 1972 | \n",
288 | " 4 | \n",
289 | " 24 | \n",
290 | " 2 | \n",
291 | " 5 | \n",
292 | " 0 | \n",
293 | " 27 | \n",
294 | " 17 | \n",
295 | " 16 | \n",
296 | " 12 | \n",
297 | " ... | \n",
298 | " 0 | \n",
299 | " 30458.0 | \n",
300 | " 1945 | \n",
301 | " 1969 | \n",
302 | " 0 | \n",
303 | " 0.8893 | \n",
304 | " 2 | \n",
305 | " 0 | \n",
306 | " 0 | \n",
307 | " 1.0 | \n",
308 | "
\n",
309 | " \n",
310 | " | 4 | \n",
311 | " 1972 | \n",
312 | " 5 | \n",
313 | " 22 | \n",
314 | " 2 | \n",
315 | " 2 | \n",
316 | " 2 | \n",
317 | " 61 | \n",
318 | " 12 | \n",
319 | " 8 | \n",
320 | " 8 | \n",
321 | " ... | \n",
322 | " 0 | \n",
323 | " 50763.0 | \n",
324 | " 1911 | \n",
325 | " 1933 | \n",
326 | " 0 | \n",
327 | " 0.8893 | \n",
328 | " 2 | \n",
329 | " 0 | \n",
330 | " 0 | \n",
331 | " 1.0 | \n",
332 | "
\n",
333 | " \n",
334 | "
\n",
335 | "
5 rows × 101 columns
\n",
336 | "
"
337 | ],
338 | "text/plain": [
339 | " year id_ agewed divorce sibs childs age educ paeduc maeduc \\\n",
340 | "0 1972 1 0 0 3 0 23 16 10 97 \n",
341 | "1 1972 2 21 2 4 5 70 10 8 8 \n",
342 | "2 1972 3 20 2 5 4 48 12 8 8 \n",
343 | "3 1972 4 24 2 5 0 27 17 16 12 \n",
344 | "4 1972 5 22 2 2 2 61 12 8 8 \n",
345 | "\n",
346 | " ... memchurh realinc cohort marcohrt ballot wtssall adults \\\n",
347 | "0 ... 0 18951.0 1949 0 0 0.4446 1 \n",
348 | "1 ... 0 24366.0 1902 1923 0 0.8893 2 \n",
349 | "2 ... 0 24366.0 1924 1944 0 0.8893 2 \n",
350 | "3 ... 0 30458.0 1945 1969 0 0.8893 2 \n",
351 | "4 ... 0 50763.0 1911 1933 0 0.8893 2 \n",
352 | "\n",
353 | " compuse databank wtssnr \n",
354 | "0 0 0 1.0 \n",
355 | "1 0 0 1.0 \n",
356 | "2 0 0 1.0 \n",
357 | "3 0 0 1.0 \n",
358 | "4 0 0 1.0 \n",
359 | "\n",
360 | "[5 rows x 101 columns]"
361 | ]
362 | },
363 | "execution_count": 36,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "gss = read_gss('gss_eda')\n",
370 | "print(gss.shape)\n",
371 | "gss.head()"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 37,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "data": {
381 | "text/plain": [
382 | "0 21165\n",
383 | "1 30936\n",
384 | "2 9536\n",
385 | "8 722\n",
386 | "9 107\n",
387 | "Name: gunlaw, dtype: int64"
388 | ]
389 | },
390 | "execution_count": 37,
391 | "metadata": {},
392 | "output_type": "execute_result"
393 | }
394 | ],
395 | "source": [
396 | "gss.gunlaw.value_counts().sort_index()"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 38,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "data": {
406 | "text/plain": [
407 | "0 24364\n",
408 | "1 10946\n",
409 | "2 25153\n",
410 | "8 1892\n",
411 | "9 111\n",
412 | "Name: grass, dtype: int64"
413 | ]
414 | },
415 | "execution_count": 38,
416 | "metadata": {},
417 | "output_type": "execute_result"
418 | }
419 | ],
420 | "source": [
421 | "gss.grass.value_counts().sort_index()"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 39,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "def replace_invalid(df):\n",
431 | " df.realinc.replace([0], np.nan, inplace=True) \n",
432 | " df.educ.replace([98,99], np.nan, inplace=True)\n",
433 | " # 89 means 89 or older\n",
434 | " df.age.replace([98, 99], np.nan, inplace=True) \n",
435 | " df.cohort.replace([9999], np.nan, inplace=True)\n",
436 | " df.adults.replace([9], np.nan, inplace=True)\n",
437 | " df.gunlaw.replace([0,8,9], np.nan, inplace=True)\n",
438 | " df.grass.replace([0,8,9], np.nan, inplace=True)\n",
439 | "\n",
440 | "replace_invalid(gss)"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "The proportion of women in this dataset is slightly higher than it probably is in the population, even after weighting.\n",
448 | "\n",
449 | "The issue seems to be that the GSS excludes people living in institutions, including prisons and army housing, which disproportionately excludes men."
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 40,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "sex = gss.loc[gss.year==2010, 'sex']"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": 41,
464 | "metadata": {},
465 | "outputs": [
466 | {
467 | "data": {
468 | "text/plain": [
469 | "1 0.43591\n",
470 | "2 0.56409\n",
471 | "Name: Pmf, dtype: float64"
472 | ]
473 | },
474 | "execution_count": 41,
475 | "metadata": {},
476 | "output_type": "execute_result"
477 | }
478 | ],
479 | "source": [
480 | "pmf = Pmf([1,2])\n",
481 | "pmf[1] = np.sum(sex==1)\n",
482 | "pmf[2] = np.sum(sex==2)\n",
483 | "pmf.normalize()\n",
484 | "pmf"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 42,
490 | "metadata": {},
491 | "outputs": [
492 | {
493 | "data": {
494 | "text/plain": [
495 | "1 0.451634\n",
496 | "2 0.548366\n",
497 | "Name: Pmf, dtype: float64"
498 | ]
499 | },
500 | "execution_count": 42,
501 | "metadata": {},
502 | "output_type": "execute_result"
503 | }
504 | ],
505 | "source": [
506 | "pmf = Pmf([1,2])\n",
507 | "pmf[1] = np.sum((sex==1) * gss.wtssall)\n",
508 | "pmf[2] = np.sum((sex==2) * gss.wtssall)\n",
509 | "pmf.normalize()\n",
510 | "pmf"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 43,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "data": {
520 | "text/plain": [
521 | "1 0.453784\n",
522 | "2 0.546216\n",
523 | "Name: Pmf, dtype: float64"
524 | ]
525 | },
526 | "execution_count": 43,
527 | "metadata": {},
528 | "output_type": "execute_result"
529 | }
530 | ],
531 | "source": [
532 | "pmf = Pmf([1,2])\n",
533 | "pmf[1] = np.sum((sex==1) * gss.wtssnr)\n",
534 | "pmf[2] = np.sum((sex==2) * gss.wtssnr)\n",
535 | "pmf.normalize()\n",
536 | "pmf"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": 44,
542 | "metadata": {},
543 | "outputs": [
544 | {
545 | "data": {
546 | "text/plain": [
547 | "1 0.463868\n",
548 | "2 0.536132\n",
549 | "Name: Pmf, dtype: float64"
550 | ]
551 | },
552 | "execution_count": 44,
553 | "metadata": {},
554 | "output_type": "execute_result"
555 | }
556 | ],
557 | "source": [
558 | "pmf = Pmf([1,2])\n",
559 | "pmf[1] = np.sum((sex==1) * gss.wtssall * gss.adults)\n",
560 | "pmf[2] = np.sum((sex==2) * gss.wtssall * gss.adults)\n",
561 | "pmf.normalize()\n",
562 | "pmf"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": 45,
568 | "metadata": {},
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "1 0.485397\n",
574 | "2 0.514603\n",
575 | "Name: Pmf, dtype: float64"
576 | ]
577 | },
578 | "execution_count": 45,
579 | "metadata": {},
580 | "output_type": "execute_result"
581 | }
582 | ],
583 | "source": [
584 | "pmf = Pmf([1,2])\n",
585 | "pmf[1] = 114173831\n",
586 | "pmf[2] = 121043794\n",
587 | "pmf.normalize()\n",
588 | "pmf"
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": 46,
594 | "metadata": {
595 | "scrolled": true
596 | },
597 | "outputs": [],
598 | "source": [
599 | "gss['wtsample'] = gss['wtssall']\n",
600 | "gss.loc[gss.sex==1, 'wtsample'] *= 1.145"
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": 47,
606 | "metadata": {},
607 | "outputs": [
608 | {
609 | "data": {
610 | "text/plain": [
611 | "1 0.485338\n",
612 | "2 0.514662\n",
613 | "Name: Pmf, dtype: float64"
614 | ]
615 | },
616 | "execution_count": 47,
617 | "metadata": {},
618 | "output_type": "execute_result"
619 | }
620 | ],
621 | "source": [
622 | "pmf = Pmf([1,2])\n",
623 | "pmf[1] = np.sum((sex==1) * gss.wtsample)\n",
624 | "pmf[2] = np.sum((sex==2) * gss.wtsample)\n",
625 | "pmf.normalize()\n",
626 | "pmf"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 49,
632 | "metadata": {
633 | "scrolled": true
634 | },
635 | "outputs": [
636 | {
637 | "data": {
638 | "text/html": [
639 | "\n",
640 | "\n",
653 | "
\n",
654 | " \n",
655 | " \n",
656 | " | \n",
657 | " year | \n",
658 | " age | \n",
659 | " cohort | \n",
660 | " sex | \n",
661 | " race | \n",
662 | " educ | \n",
663 | " realinc | \n",
664 | " gunlaw | \n",
665 | " grass | \n",
666 | " wtssall | \n",
667 | "
\n",
668 | " \n",
669 | " \n",
670 | " \n",
671 | " | 0 | \n",
672 | " 1972 | \n",
673 | " 23.0 | \n",
674 | " 1949.0 | \n",
675 | " 2 | \n",
676 | " 1 | \n",
677 | " 16.0 | \n",
678 | " 18951.0 | \n",
679 | " 1.0 | \n",
680 | " NaN | \n",
681 | " 0.4446 | \n",
682 | "
\n",
683 | " \n",
684 | " | 1 | \n",
685 | " 1972 | \n",
686 | " 70.0 | \n",
687 | " 1902.0 | \n",
688 | " 1 | \n",
689 | " 1 | \n",
690 | " 10.0 | \n",
691 | " 24366.0 | \n",
692 | " 1.0 | \n",
693 | " NaN | \n",
694 | " 0.8893 | \n",
695 | "
\n",
696 | " \n",
697 | " | 2 | \n",
698 | " 1972 | \n",
699 | " 48.0 | \n",
700 | " 1924.0 | \n",
701 | " 2 | \n",
702 | " 1 | \n",
703 | " 12.0 | \n",
704 | " 24366.0 | \n",
705 | " 1.0 | \n",
706 | " NaN | \n",
707 | " 0.8893 | \n",
708 | "
\n",
709 | " \n",
710 | " | 3 | \n",
711 | " 1972 | \n",
712 | " 27.0 | \n",
713 | " 1945.0 | \n",
714 | " 2 | \n",
715 | " 1 | \n",
716 | " 17.0 | \n",
717 | " 30458.0 | \n",
718 | " 1.0 | \n",
719 | " NaN | \n",
720 | " 0.8893 | \n",
721 | "
\n",
722 | " \n",
723 | " | 4 | \n",
724 | " 1972 | \n",
725 | " 61.0 | \n",
726 | " 1911.0 | \n",
727 | " 2 | \n",
728 | " 1 | \n",
729 | " 12.0 | \n",
730 | " 50763.0 | \n",
731 | " 1.0 | \n",
732 | " NaN | \n",
733 | " 0.8893 | \n",
734 | "
\n",
735 | " \n",
736 | "
\n",
737 | "
"
738 | ],
739 | "text/plain": [
740 | " year age cohort sex race educ realinc gunlaw grass wtssall\n",
741 | "0 1972 23.0 1949.0 2 1 16.0 18951.0 1.0 NaN 0.4446\n",
742 | "1 1972 70.0 1902.0 1 1 10.0 24366.0 1.0 NaN 0.8893\n",
743 | "2 1972 48.0 1924.0 2 1 12.0 24366.0 1.0 NaN 0.8893\n",
744 | "3 1972 27.0 1945.0 2 1 17.0 30458.0 1.0 NaN 0.8893\n",
745 | "4 1972 61.0 1911.0 2 1 12.0 50763.0 1.0 NaN 0.8893"
746 | ]
747 | },
748 | "execution_count": 49,
749 | "metadata": {},
750 | "output_type": "execute_result"
751 | }
752 | ],
753 | "source": [
754 | "variables = ['year', 'age', 'cohort', 'sex', 'race', \n",
755 | " 'educ', 'realinc', 'gunlaw', 'grass', 'wtssall']\n",
756 | "\n",
757 | "subset = gss[variables]\n",
758 | "subset.head()"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": 50,
764 | "metadata": {},
765 | "outputs": [],
766 | "source": [
767 | "# disabled ('adults' is not among the columns selected into subset): drop the 65 respondents with unknown household size\n",
768 | "# subset = subset.dropna(subset=['adults'])"
769 | ]
770 | },
771 | {
772 | "cell_type": "code",
773 | "execution_count": 51,
774 | "metadata": {},
775 | "outputs": [],
776 | "source": [
777 | "np.random.seed(19)\n",
778 | "sample = utils.resample_by_year(subset, 'wtssall')"
779 | ]
780 | },
781 | {
782 | "cell_type": "code",
783 | "execution_count": 52,
784 | "metadata": {},
785 | "outputs": [],
786 | "source": [
787 | "!rm gss.hdf5\n",
788 | "sample.to_hdf('gss.hdf5', 'gss')"
789 | ]
790 | },
791 | {
792 | "cell_type": "code",
793 | "execution_count": 53,
794 | "metadata": {},
795 | "outputs": [
796 | {
797 | "name": "stdout",
798 | "output_type": "stream",
799 | "text": [
800 | "CPU times: user 16 ms, sys: 4 ms, total: 20 ms\n",
801 | "Wall time: 18.8 ms\n"
802 | ]
803 | },
804 | {
805 | "data": {
806 | "text/plain": [
807 | "(62466, 10)"
808 | ]
809 | },
810 | "execution_count": 53,
811 | "metadata": {},
812 | "output_type": "execute_result"
813 | }
814 | ],
815 | "source": [
816 | "%time gss = pd.read_hdf('gss.hdf5', 'gss')\n",
817 | "gss.shape"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 54,
823 | "metadata": {},
824 | "outputs": [
825 | {
826 | "data": {
827 | "text/html": [
828 | "\n",
829 | "\n",
842 | "
\n",
843 | " \n",
844 | " \n",
845 | " | \n",
846 | " year | \n",
847 | " age | \n",
848 | " cohort | \n",
849 | " sex | \n",
850 | " race | \n",
851 | " educ | \n",
852 | " realinc | \n",
853 | " gunlaw | \n",
854 | " grass | \n",
855 | " wtssall | \n",
856 | "
\n",
857 | " \n",
858 | " \n",
859 | " \n",
860 | " | 0 | \n",
861 | " 1972 | \n",
862 | " 26.0 | \n",
863 | " 1946.0 | \n",
864 | " 1 | \n",
865 | " 1 | \n",
866 | " 18.0 | \n",
867 | " 13537.0 | \n",
868 | " 2.0 | \n",
869 | " NaN | \n",
870 | " 0.8893 | \n",
871 | "
\n",
872 | " \n",
873 | " | 1 | \n",
874 | " 1972 | \n",
875 | " 38.0 | \n",
876 | " 1934.0 | \n",
877 | " 2 | \n",
878 | " 1 | \n",
879 | " 12.0 | \n",
880 | " 18951.0 | \n",
881 | " 1.0 | \n",
882 | " NaN | \n",
883 | " 0.4446 | \n",
884 | "
\n",
885 | " \n",
886 | " | 2 | \n",
887 | " 1972 | \n",
888 | " 57.0 | \n",
889 | " 1915.0 | \n",
890 | " 1 | \n",
891 | " 1 | \n",
892 | " 12.0 | \n",
893 | " 30458.0 | \n",
894 | " 1.0 | \n",
895 | " NaN | \n",
896 | " 1.3339 | \n",
897 | "
\n",
898 | " \n",
899 | " | 3 | \n",
900 | " 1972 | \n",
901 | " 61.0 | \n",
902 | " 1911.0 | \n",
903 | " 2 | \n",
904 | " 1 | \n",
905 | " 14.0 | \n",
906 | " 37226.0 | \n",
907 | " 1.0 | \n",
908 | " NaN | \n",
909 | " 0.8893 | \n",
910 | "
\n",
911 | " \n",
912 | " | 4 | \n",
913 | " 1972 | \n",
914 | " 59.0 | \n",
915 | " 1913.0 | \n",
916 | " 1 | \n",
917 | " 1 | \n",
918 | " 12.0 | \n",
919 | " 30458.0 | \n",
920 | " 2.0 | \n",
921 | " NaN | \n",
922 | " 0.8893 | \n",
923 | "
\n",
924 | " \n",
925 | "
\n",
926 | "
"
927 | ],
928 | "text/plain": [
929 | " year age cohort sex race educ realinc gunlaw grass wtssall\n",
930 | "0 1972 26.0 1946.0 1 1 18.0 13537.0 2.0 NaN 0.8893\n",
931 | "1 1972 38.0 1934.0 2 1 12.0 18951.0 1.0 NaN 0.4446\n",
932 | "2 1972 57.0 1915.0 1 1 12.0 30458.0 1.0 NaN 1.3339\n",
933 | "3 1972 61.0 1911.0 2 1 14.0 37226.0 1.0 NaN 0.8893\n",
934 | "4 1972 59.0 1913.0 1 1 12.0 30458.0 2.0 NaN 0.8893"
935 | ]
936 | },
937 | "execution_count": 54,
938 | "metadata": {},
939 | "output_type": "execute_result"
940 | }
941 | ],
942 | "source": [
943 | "gss.head()"
944 | ]
945 | },
946 | {
947 | "cell_type": "code",
948 | "execution_count": 24,
949 | "metadata": {},
950 | "outputs": [
951 | {
952 | "data": {
953 | "text/plain": [
954 | "count 62466.000000\n",
955 | "mean 1994.072359\n",
956 | "std 12.937941\n",
957 | "min 1972.000000\n",
958 | "25% 1984.000000\n",
959 | "50% 1994.000000\n",
960 | "75% 2006.000000\n",
961 | "max 2016.000000\n",
962 | "Name: year, dtype: float64"
963 | ]
964 | },
965 | "execution_count": 24,
966 | "metadata": {},
967 | "output_type": "execute_result"
968 | }
969 | ],
970 | "source": [
971 | "gss['year'].describe()"
972 | ]
973 | },
974 | {
975 | "cell_type": "code",
976 | "execution_count": 25,
977 | "metadata": {},
978 | "outputs": [
979 | {
980 | "data": {
981 | "text/plain": [
982 | "count 62466.000000\n",
983 | "mean 1.541415\n",
984 | "std 0.498286\n",
985 | "min 1.000000\n",
986 | "25% 1.000000\n",
987 | "50% 2.000000\n",
988 | "75% 2.000000\n",
989 | "max 2.000000\n",
990 | "Name: sex, dtype: float64"
991 | ]
992 | },
993 | "execution_count": 25,
994 | "metadata": {},
995 | "output_type": "execute_result"
996 | }
997 | ],
998 | "source": [
999 | "gss['sex'].describe()"
1000 | ]
1001 | },
1002 | {
1003 | "cell_type": "code",
1004 | "execution_count": 26,
1005 | "metadata": {},
1006 | "outputs": [
1007 | {
1008 | "data": {
1009 | "text/plain": [
1010 | "count 62281.000000\n",
1011 | "mean 44.648320\n",
1012 | "std 17.072244\n",
1013 | "min 18.000000\n",
1014 | "25% 30.000000\n",
1015 | "50% 43.000000\n",
1016 | "75% 57.000000\n",
1017 | "max 89.000000\n",
1018 | "Name: age, dtype: float64"
1019 | ]
1020 | },
1021 | "execution_count": 26,
1022 | "metadata": {},
1023 | "output_type": "execute_result"
1024 | }
1025 | ],
1026 | "source": [
1027 | "gss['age'].describe()"
1028 | ]
1029 | },
1030 | {
1031 | "cell_type": "code",
1032 | "execution_count": 27,
1033 | "metadata": {},
1034 | "outputs": [
1035 | {
1036 | "data": {
1037 | "text/plain": [
1038 | "count 62282.000000\n",
1039 | "mean 1949.429996\n",
1040 | "std 20.734302\n",
1041 | "min 1883.000000\n",
1042 | "25% 1935.000000\n",
1043 | "50% 1951.000000\n",
1044 | "75% 1964.000000\n",
1045 | "max 1998.000000\n",
1046 | "Name: cohort, dtype: float64"
1047 | ]
1048 | },
1049 | "execution_count": 27,
1050 | "metadata": {},
1051 | "output_type": "execute_result"
1052 | }
1053 | ],
1054 | "source": [
1055 | "gss['cohort'].describe()"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "execution_count": 28,
1061 | "metadata": {},
1062 | "outputs": [
1063 | {
1064 | "data": {
1065 | "text/plain": [
1066 | "count 62466.000000\n",
1067 | "mean 1.254955\n",
1068 | "std 0.554694\n",
1069 | "min 1.000000\n",
1070 | "25% 1.000000\n",
1071 | "50% 1.000000\n",
1072 | "75% 1.000000\n",
1073 | "max 3.000000\n",
1074 | "Name: race, dtype: float64"
1075 | ]
1076 | },
1077 | "execution_count": 28,
1078 | "metadata": {},
1079 | "output_type": "execute_result"
1080 | }
1081 | ],
1082 | "source": [
1083 | "gss['race'].describe()"
1084 | ]
1085 | },
1086 | {
1087 | "cell_type": "code",
1088 | "execution_count": 29,
1089 | "metadata": {},
1090 | "outputs": [
1091 | {
1092 | "data": {
1093 | "text/plain": [
1094 | "count 62304.000000\n",
1095 | "mean 12.831311\n",
1096 | "std 3.117027\n",
1097 | "min 0.000000\n",
1098 | "25% 12.000000\n",
1099 | "50% 12.000000\n",
1100 | "75% 15.000000\n",
1101 | "max 20.000000\n",
1102 | "Name: educ, dtype: float64"
1103 | ]
1104 | },
1105 | "execution_count": 29,
1106 | "metadata": {},
1107 | "output_type": "execute_result"
1108 | }
1109 | ],
1110 | "source": [
1111 | "gss['educ'].describe()"
1112 | ]
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "execution_count": 30,
1117 | "metadata": {},
1118 | "outputs": [
1119 | {
1120 | "data": {
1121 | "text/plain": [
1122 | "count 55499.000000\n",
1123 | "mean 34702.430164\n",
1124 | "std 30665.659411\n",
1125 | "min 234.000000\n",
1126 | "25% 13750.000000\n",
1127 | "50% 26015.000000\n",
1128 | "75% 43426.000000\n",
1129 | "max 162607.000000\n",
1130 | "Name: realinc, dtype: float64"
1131 | ]
1132 | },
1133 | "execution_count": 30,
1134 | "metadata": {},
1135 | "output_type": "execute_result"
1136 | }
1137 | ],
1138 | "source": [
1139 | "gss['realinc'].describe()"
1140 | ]
1141 | },
1142 | {
1143 | "cell_type": "code",
1144 | "execution_count": 31,
1145 | "metadata": {},
1146 | "outputs": [
1147 | {
1148 | "data": {
1149 | "text/plain": [
1150 | "count 62466.000000\n",
1151 | "mean 1.213340\n",
1152 | "std 0.585544\n",
1153 | "min 0.411898\n",
1154 | "25% 0.918400\n",
1155 | "50% 1.062100\n",
1156 | "75% 1.515500\n",
1157 | "max 8.739876\n",
1158 | "Name: wtssall, dtype: float64"
1159 | ]
1160 | },
1161 | "execution_count": 31,
1162 | "metadata": {},
1163 | "output_type": "execute_result"
1164 | }
1165 | ],
1166 | "source": [
1167 | "gss['wtssall'].describe()"
1168 | ]
1169 | }
1170 | ],
1171 | "metadata": {
1172 | "kernelspec": {
1173 | "display_name": "Python 3",
1174 | "language": "python",
1175 | "name": "python3"
1176 | },
1177 | "language_info": {
1178 | "codemirror_mode": {
1179 | "name": "ipython",
1180 | "version": 3
1181 | },
1182 | "file_extension": ".py",
1183 | "mimetype": "text/x-python",
1184 | "name": "python",
1185 | "nbconvert_exporter": "python",
1186 | "pygments_lexer": "ipython3",
1187 | "version": "3.6.7"
1188 | }
1189 | },
1190 | "nbformat": 4,
1191 | "nbformat_minor": 1
1192 | }
1193 |
--------------------------------------------------------------------------------