├── .gitattributes
├── nsfg.hdf5
├── data
    ├── GSS.dat.gz
    ├── 2013_2015_FemPregData.dat.gz
    ├── GSS.do
    ├── GSS.dct
    └── 2013_2015_FemPregSetup.dct
├── gss_eda
    ├── GSS.dat.gz
    ├── GSS.dct
    └── GSS.do
├── README.md
├── LLCP2017.ASC.gz
├── environment.yml
├── LICENSE
├── .gitignore
├── distribution.py
├── utils.py
├── eds01_gss_clean.ipynb
└── gss_validate.ipynb


/.gitattributes:
--------------------------------------------------------------------------------
1 | LLCP2017.ASC.gz filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/nsfg.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/nsfg.hdf5


--------------------------------------------------------------------------------
/data/GSS.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/data/GSS.dat.gz


--------------------------------------------------------------------------------
/gss_eda/GSS.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/gss_eda/GSS.dat.gz


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ExploratoryDataAnalysis
2 | Repository for an online class on Exploratory Data Analysis in Python
3 | 


--------------------------------------------------------------------------------
/data/2013_2015_FemPregData.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllenDowney/ExploratoryDataAnalysis/HEAD/data/2013_2015_FemPregData.dat.gz


--------------------------------------------------------------------------------
/LLCP2017.ASC.gz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b13013ed06f8a69eb58b6c352208b4f5d8cfd0780896873ca39d81efcfb97a4c
3 | size 69310674
4 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: ExploratoryDataAnalysis
 2 | 
 3 | dependencies:
 4 |   - python=3.7
 5 |   - jupyter
 6 |   - numpy
 7 |   - matplotlib
 8 |   - seaborn
 9 |   - pandas
10 |   - pytables
11 |   - scipy
12 |   - scikit-learn
13 |   # utils.py imports lowess from statsmodels
14 |   - statsmodels
15 |   - pip
16 |   - pip:
17 |     - empiricaldist
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Allen Downey
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/distribution.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | 
 7 | def underride(d, **options):
 8 |     """Add key-value pairs to d only if key is not in d.
 9 | 
10 |     d: dictionary
11 |     options: keyword args to add to d
12 |     """
13 |     for key, val in options.items():
14 |         d.setdefault(key, val)
15 | 
16 |     return d
17 | 
18 | 
19 | class Pmf(pd.Series):
20 |     
21 |     def __init__(self, seq, name='Pmf', normalize=True):
22 |         series = pd.Series(seq).value_counts().sort_index()
23 |         super().__init__(series, name=name)
24 |         if normalize:
25 |             self.normalize()
26 | 
27 |     @property
28 |     def qs(self):
29 |         return self.index.values
30 | 
31 |     @property
32 |     def ps(self):
33 |         return self.values
34 | 
35 |     def __call__(self, qs):
36 |         return self.get(qs, 0)
37 | 
38 |     def normalize(self):
39 |         self /= self.sum()
40 | 
41 |     def bar(self, **options):
42 |         underride(options, label=self.name)
43 |         plt.bar(self.index, self.values, **options)
44 | 
45 |     def plot(self, **options):
46 |         underride(options, label=self.name)
47 |         plt.plot(self.index, self.values, **options)
48 | 
49 | 
50 | 
51 | from scipy.interpolate import interp1d
52 | 
53 | class Cdf(pd.Series):
54 | 
55 |     def __init__(self, seq, name='Cdf'):
56 |         pmf = Pmf(seq)
57 |         super().__init__(pmf.cumsum(), name=name)
58 | 
59 |     @property
60 |     def qs(self):
61 |         return self.index
62 | 
63 |     @property
64 |     def ps(self):
65 |         return self.values
66 | 
67 |     @property
68 |     def forward(self):
69 |         return interp1d(self.qs, self.ps,
70 |                         kind='previous',
71 |                         assume_sorted=True,
72 |                         bounds_error=False,
73 |                         fill_value=(0,1))
74 | 
75 |     @property
76 |     def inverse(self):
77 |         return interp1d(self.ps, self.qs,
78 |                         kind='next',
79 |                         assume_sorted=True,
80 |                         bounds_error=False,
81 |                         fill_value=(self.qs[0], np.nan))
82 | 
83 |     def __call__(self, qs):
84 |         return self.forward(qs)
85 | 
86 |     def percentile_rank(self, qs):
87 |         return self.forward(qs) * 100
88 | 
89 |     def percentile(self, percentile_ranks):
90 |         return self.inverse(percentile_ranks / 100)
91 | 
92 |     def step(self, **options):
93 |         underride(options, label=self.name, where='post')
94 |         plt.step(self.index, self.values, **options)
95 | 
96 |     def plot(self, **options):
97 |         underride(options, label=self.name)
98 |         plt.plot(self.index, self.values, **options)


--------------------------------------------------------------------------------
/data/GSS.do:
--------------------------------------------------------------------------------
  1 | #delimit ;
  2 | /* Read 24 variables from GSS.dat, each from its own 20-column field (columns 1-480). */
  3 |    infix
  4 |       year     1 - 20
  5 |       gun      21 - 40
  6 |       gunage   41 - 60
  7 |       gunnum   61 - 80
  8 |       owngun   81 - 100
  9 |       rowngun  101 - 120
 10 |       realinc  121 - 140
 11 |       conrinc  141 - 160
 12 |       hispanic 161 - 180
 13 |       cohort   181 - 200
 14 |       ballot   201 - 220
 15 |       gunlaw   221 - 240
 16 |       cappun   241 - 260
 17 |       id_      261 - 280
 18 |       age      281 - 300
 19 |       educ     301 - 320
 20 |       sex      321 - 340
 21 |       race     341 - 360
 22 |       income   361 - 380
 23 |       rincome  381 - 400
 24 |       srcbelt  401 - 420
 25 |       polviews 421 - 440
 26 |       natcrime 441 - 460
 27 |       wtssall  461 - 480
 28 | using GSS.dat;
 29 | 
 30 | label variable year     "Gss year for this respondent                       ";
 31 | label variable gun      "Ever threatened with gun or shot at";
 32 | label variable gunage   "Threatened or shot at as child or adult";
 33 | label variable gunnum   "If threatened or shot at--how many times";
 34 | label variable owngun   "Have gun in home";
 35 | label variable rowngun  "Does gun belong to r";
 36 | label variable realinc  "Family income in constant $";
 37 | label variable conrinc  "Respondent income in constant dollars";
 38 | label variable hispanic "Hispanic specified";
 39 | label variable cohort   "Year of birth";
 40 | label variable ballot   "Ballot used for interview";
 41 | label variable gunlaw   "Favor or oppose gun permits";
 42 | label variable cappun   "Favor or oppose death penalty for murder";
 43 | label variable id_      "Respondent id number";
 44 | label variable age      "Age of respondent";
 45 | label variable educ     "Highest year of school completed";
 46 | label variable sex      "Respondents sex";
 47 | label variable race     "Race of respondent";
 48 | label variable income   "Total family income";
 49 | label variable rincome  "Respondents income";
 50 | label variable srcbelt  "Src beltcode";
 51 | label variable polviews "Think of self as liberal or conservative";
 52 | label variable natcrime "Halting rising crime rate";
 53 | label variable wtssall  "Weight variable";
 54 | 
 55 | 
 56 | label define gsp001x
 57 |    9        "No answer"
 58 |    8        "Don't know"
 59 |    2        "No"
 60 |    1        "Yes"
 61 |    0        "Not applicable"
 62 | ;
 63 | label define gsp002x
 64 |    9        "No answer"
 65 |    8        "Don't know"
 66 |    3        "Both"
 67 |    2        "Adult"
 68 |    1        "Child"
 69 |    0        "Not applicable"
 70 | ;
 71 | label define gsp003x
 72 |    9        "No answer"
 73 |    8        "Not sure"
 74 |    3        "4+ times"
 75 |    2        "2-3 times"
 76 |    1        "Once"
 77 |    0        "Not applicable"
 78 | ;
 79 | label define gsp004x
 80 |    9        "No answer"
 81 |    8        "Don't know"
 82 |    3        "Refused"
 83 |    2        "No"
 84 |    1        "Yes"
 85 |    0        "Not applicable"
 86 | ;
 87 | label define gsp005x
 88 |    9        "No answer"
 89 |    8        "Don't know"
 90 |    3        "Refused"
 91 |    2        "No"
 92 |    1        "Yes"
 93 |    0        "Not applicable"
 94 | ;
 95 | label define gsp006x
 96 |    999999   "No answer"
 97 |    999998   "Dont know"
 98 |    0        "Not applicable"
 99 | ;
100 | label define gsp007x
101 |    999999   "No answer"
102 |    999998   "Dont know"
103 |    0        "Not applicable"
104 | ;
105 | label define gsp008x
106 |    99       "No answer"
107 |    98       "Don't know"
108 |    50       "Other, not specified"
109 |    47       "Hispanic"
110 |    46       "Latino/a"
111 |    45       "Latin"
112 |    41       "South american"
113 |    40       "Latin american"
114 |    35       "Filipino/a"
115 |    31       "Basque"
116 |    30       "Spanish"
117 |    25       "Chilean"
118 |    24       "Argentinian"
119 |    23       "Venezuelan"
120 |    22       "Columbian"
121 |    21       "Equadorian"
122 |    20       "Peruvian"
123 |    16       "West indian"
124 |    15       "Dominican"
125 |    11       "Honduran"
126 |    10       "Central american"
127 |    9        "Costa rican"
128 |    8        "Nicaraguan"
129 |    7        "Panamanian"
130 |    6        "Guatemalan"
131 |    5        "Salvadorian"
132 |    4        "Cuban"
133 |    3        "Puerto rican"
134 |    2        "Mexican, mexican american, chicano/a"
135 |    1        "Not hispanic"
136 |    0        "Not applicable"
137 | ;
138 | label define gsp009x
139 |    9999     "No answer"
140 |    0        "Not applicable"
141 | ;
142 | label define gsp010x
143 |    4        "Ballot d"
144 |    3        "Ballot c"
145 |    2        "Ballot b"
146 |    1        "Ballot a"
147 |    0        "Not applicable"
148 | ;
149 | label define gsp011x
150 |    9        "No answer"
151 |    8        "Don't know"
152 |    2        "Oppose"
153 |    1        "Favor"
154 |    0        "Not applicable"
155 | ;
156 | label define gsp012x
157 |    9        "No answer"
158 |    8        "Don't know"
159 |    2        "Oppose"
160 |    1        "Favor"
161 |    0        "Not applicable"
162 | ;
163 | label define gsp013x
164 |    99       "No answer"
165 |    98       "Don't know"
166 |    89       "89 or older"
167 | ;
168 | label define gsp014x
169 |    99       "No answer"
170 |    98       "Don't know"
171 |    97       "Not applicable"
172 | ;
173 | label define gsp015x
174 |    2        "Female"
175 |    1        "Male"
176 | ;
177 | label define gsp016x
178 |    3        "Other"
179 |    2        "Black"
180 |    1        "White"
181 |    0        "Not applicable"
182 | ;
183 | label define gsp017x
184 |    99       "No answer"
185 |    98       "Don't know"
186 |    13       "Refused"
187 |    12       "$25000 or more"
188 |    11       "$20000 - 24999"
189 |    10       "$15000 - 19999"
190 |    9        "$10000 - 14999"
191 |    8        "$8000 to 9999"
192 |    7        "$7000 to 7999"
193 |    6        "$6000 to 6999"
194 |    5        "$5000 to 5999"
195 |    4        "$4000 to 4999"
196 |    3        "$3000 to 3999"
197 |    2        "$1000 to 2999"
198 |    1        "Lt $1000"
199 |    0        "Not applicable"
200 | ;
201 | label define gsp018x
202 |    99       "No answer"
203 |    98       "Don't know"
204 |    13       "Refused"
205 |    12       "$25000 or more"
206 |    11       "$20000 - 24999"
207 |    10       "$15000 - 19999"
208 |    9        "$10000 - 14999"
209 |    8        "$8000 to 9999"
210 |    7        "$7000 to 7999"
211 |    6        "$6000 to 6999"
212 |    5        "$5000 to 5999"
213 |    4        "$4000 to 4999"
214 |    3        "$3000 to 3999"
215 |    2        "$1000 to 2999"
216 |    1        "Lt $1000"
217 |    0        "Not applicable"
218 | ;
219 | label define gsp019x
220 |    6        "Other rural"
221 |    5        "Other urban"
222 |    4        "Suburb, 13-100"
223 |    3        "Suburb, 12 lrgst"
224 |    2        "Smsa's 13-100"
225 |    1        "12 lrgst smsa's"
226 |    0        "Not assigned"
227 | ;
228 | label define gsp020x
229 |    9        "No answer"
230 |    8        "Don't know"
231 |    7        "Extrmly conservative"
232 |    6        "Conservative"
233 |    5        "Slghtly conservative"
234 |    4        "Moderate"
235 |    3        "Slightly liberal"
236 |    2        "Liberal"
237 |    1        "Extremely liberal"
238 |    0        "Not applicable"
239 | ;
240 | label define gsp021x
241 |    9        "No answer"
242 |    8        "Don't know"
243 |    3        "Too much"
244 |    2        "About right"
245 |    1        "Too little"
246 |    0        "Not applicable"
247 | ;
248 | 
249 | 
250 | label values gun      gsp001x;
251 | label values gunage   gsp002x;
252 | label values gunnum   gsp003x;
253 | label values owngun   gsp004x;
254 | label values rowngun  gsp005x;
255 | label values realinc  gsp006x;
256 | label values conrinc  gsp007x;
257 | label values hispanic gsp008x;
258 | label values cohort   gsp009x;
259 | label values ballot   gsp010x;
260 | label values gunlaw   gsp011x;
261 | label values cappun   gsp012x;
262 | label values age      gsp013x;
263 | label values educ     gsp014x;
264 | label values sex      gsp015x;
265 | label values race     gsp016x;
266 | label values income   gsp017x;
267 | label values rincome  gsp018x;
268 | label values srcbelt  gsp019x;
269 | label values polviews gsp020x;
270 | label values natcrime gsp021x;
271 | 
272 | 
273 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | 
  5 | import re
  6 | 
  7 | class FixedWidthVariables(object):
  8 |     """Represents a set of variables in a fixed width file."""
  9 | 
 10 |     def __init__(self, variables, index_base=0):
 11 |         """Initializes.
 12 | 
 13 |         variables: DataFrame
 14 |         index_base: are the indices 0 or 1 based?
 15 | 
 16 |         Attributes:
 17 |         colspecs: list of (start, end) index tuples
 18 |         names: list of string variable names
 19 |         """
 20 |         self.variables = variables
 21 | 
 22 |         # note: by default, subtract 1 from colspecs
 23 |         self.colspecs = variables[['start', 'end']] - index_base
 24 | 
 25 |         # convert colspecs to a list of pair of int
 26 |         self.colspecs = self.colspecs.astype(np.int).values.tolist()
 27 |         self.names = variables['name']
 28 | 
 29 |     def read_fixed_width(self, filename, **options):
 30 |         """Reads a fixed width ASCII file.
 31 | 
 32 |         filename: string filename
 33 | 
 34 |         returns: DataFrame
 35 |         """
 36 |         df = pd.read_fwf(filename,
 37 |                              colspecs=self.colspecs,
 38 |                              names=self.names,
 39 |                              **options)
 40 |         return df
 41 | 
 42 | 
 43 | def read_stata_dict(dct_file, **options):
 44 |     """Reads a Stata dictionary file.
 45 | 
 46 |     dct_file: string filename
 47 |     options: dict of options passed to open()
 48 | 
 49 |     returns: FixedWidthVariables object
 50 |     """
 51 |     type_map = dict(byte=int, int=int, long=int, float=float,
 52 |                     double=float, numeric=float)
 53 | 
 54 |     var_info = []
 55 |     with open(dct_file, **options) as f:
 56 |         for line in f:
 57 |             match = re.search( r'_column\(([^)]*)\)', line)
 58 |             if not match:
 59 |                 continue
 60 |             start = int(match.group(1))
 61 |             t = line.split()
 62 |             vtype, name, fstring = t[1:4]
 63 |             name = name.lower()
 64 |             if vtype.startswith('str'):
 65 |                 vtype = str
 66 |             else:
 67 |                 vtype = type_map[vtype]
 68 |             long_desc = ' '.join(t[4:]).strip('"')
 69 |             var_info.append((start, vtype, name, fstring, long_desc))
 70 | 
 71 |     columns = ['start', 'type', 'name', 'fstring', 'desc']
 72 |     variables = pd.DataFrame(var_info, columns=columns)
 73 | 
 74 |     # fill in the end column by shifting the start column
 75 |     variables['end'] = variables.start.shift(-1)
 76 |     variables.loc[len(variables)-1, 'end'] = 0
 77 | 
 78 |     dct = FixedWidthVariables(variables, index_base=1)
 79 |     return dct
 80 | 
 81 | 
 82 | def read_stata(dct_name, dat_name, **options):
 83 |     """Reads Stata files from the given directory.
 84 | 
 85 |     dirname: string
 86 | 
 87 |     returns: DataFrame
 88 |     """
 89 |     dct = read_stata_dict(dct_name)
 90 |     df = dct.read_fixed_width(dat_name, **options)
 91 |     return df
 92 | 
 93 | 
 94 | def sample_rows(df, nrows, replace=False):
 95 |     """Choose a sample of rows from a DataFrame.
 96 | 
 97 |     df: DataFrame
 98 |     nrows: number of rows
 99 |     replace: whether to sample with replacement
100 | 
101 |     returns: DataDf
102 |     """
103 |     indices = np.random.choice(df.index, nrows, replace=replace)
104 |     sample = df.loc[indices]
105 |     return sample
106 | 
107 | 
108 | def resample_rows(df):
109 |     """Resamples rows from a DataFrame.
110 | 
111 |     df: DataFrame
112 | 
113 |     returns: DataFrame
114 |     """
115 |     return sample_rows(df, len(df), replace=True)
116 | 
117 | 
118 | def resample_rows_weighted(df, column='finalwgt'):
119 |     """Resamples a DataFrame using probabilities proportional to given column.
120 | 
121 |     df: DataFrame
122 |     column: string column name to use as weights
123 | 
124 |     returns: DataFrame
125 |     """
126 |     weights = df[column].copy()
127 |     weights /= sum(weights)
128 |     indices = np.random.choice(df.index, len(df), replace=True, p=weights)
129 |     sample = df.loc[indices]
130 |     return sample
131 | 
132 | 
133 | def resample_by_year(df, column='wtssall'):
134 |     """Resample rows within each year.
135 | 
136 |     df: DataFrame
137 |     column: string name of weight variable
138 | 
139 |     returns DataFrame
140 |     """
141 |     grouped = df.groupby('year')
142 |     samples = [resample_rows_weighted(group, column)
143 |                for _, group in grouped]
144 |     sample = pd.concat(samples, ignore_index=True)
145 |     return sample
146 | 
147 | 
148 | def values(df, varname):
149 |     """Values and counts in index order.
150 | 
151 |     df: DataFrame
152 |     varname: strign column name
153 | 
154 |     returns: Series that maps from value to frequency
155 |     """
156 |     return df[varname].value_counts().sort_index()
157 | 
158 | def count_by_year(gss, varname):
159 |     """Groups by category and year and counts.
160 | 
161 |     gss: DataFrame
162 |     varname: string variable to group by
163 | 
164 |     returns: DataFrame with one row per year, one column per category.
165 |     """
166 |     grouped = gss.groupby([varname, 'year'])
167 |     count = grouped[varname].count().unstack(level=0)
168 | 
169 |     # note: the following is not ideal, because it does not
170 |     # distinguish 0 from NA, but in this dataset the only
171 |     # zeros are during years when the question was not asked.
172 |     count = count.replace(0, np.nan).dropna()
173 |     return count
174 |     
175 | def fill_missing(df, varname, badvals=[98, 99]):
176 |     """Fill missing data with random values.
177 | 
178 |     df: DataFrame
179 |     varname: string column name
180 |     badvals: list of values to be replaced
181 |     """
182 |     # replace badvals with NaN
183 |     df[varname].replace(badvals, np.nan, inplace=True)
184 | 
185 |     # get the index of rows missing varname
186 |     null = df[varname].isnull()
187 |     n_missing = sum(null)
188 | 
189 |     # choose a random sample from the non-missing values
190 |     fill = np.random.choice(df[varname].dropna(), n_missing, replace=True)
191 | 
192 |     # replace missing data with the samples
193 |     df.loc[null, varname] = fill
194 | 
195 |     # return the number of missing values replaced
196 |     return n_missing
197 | 
198 | 
199 | def round_into_bins(df, var, bin_width, high=None, low=0):
200 |     """Rounds values down to the bin they belong in.
201 | 
202 |     df: DataFrame
203 |     var: string variable name
204 |     bin_width: number, width of the bins
205 | 
206 |     returns: array of bin values
207 |     """
208 |     if high is None:
209 |         high = df[var].max()
210 | 
211 |     bins = np.arange(low, high+bin_width, bin_width)
212 |     indices = np.digitize(df[var], bins)
213 |     return bins[indices-1]
214 | 
215 | 
216 | def underride(d, **options):
217 |     """Add key-value pairs to d only if key is not in d.
218 | 
219 |     d: dictionary
220 |     options: keyword args to add to d
221 |     """
222 |     for key, val in options.items():
223 |         d.setdefault(key, val)
224 | 
225 |     return d
226 | 
227 | 
228 | def decorate(**options):
229 |     """Decorate the current axes.
230 |     Call decorate with keyword arguments like
231 |     decorate(title='Title',
232 |              xlabel='x',
233 |              ylabel='y')
234 |     The keyword arguments can be any of the axis properties
235 |     https://matplotlib.org/api/axes_api.html
236 |     In addition, you can use `legend=False` to suppress the legend.
237 |     And you can use `loc` to indicate the location of the legend
238 |     (the default value is 'best')
239 |     """
240 |     loc = options.pop('loc', 'best')
241 |     if options.pop('legend', True):
242 |         legend(loc=loc)
243 | 
244 |     plt.gca().set(**options)
245 |     plt.tight_layout()
246 | 
247 | 
248 | def legend(**options):
249 |     """Draws a legend only if there is at least one labeled item.
250 |     options are passed to plt.legend()
251 |     https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
252 |     """
253 |     underride(options, loc='best')
254 | 
255 |     ax = plt.gca()
256 |     handles, labels = ax.get_legend_handles_labels()
257 |     #TODO: don't draw if there are none
258 |     ax.legend(handles, labels, **options)
259 | 
260 | from statsmodels.nonparametric.smoothers_lowess import lowess
261 | 
262 | def make_lowess(series):
263 |     """Use LOWESS to compute a smooth line.
264 | 
265 |     series: pd.Series
266 | 
267 |     returns: pd.Series
268 |     """
269 |     endog = series.values
270 |     exog = series.index.values
271 | 
272 |     smooth = lowess(endog, exog)
273 |     index, data = np.transpose(smooth)
274 | 
275 |     return pd.Series(data, index=index)
276 | 
277 | def plot_series_lowess(series, color):
278 |     """Plots a series of data points and a smooth line.
279 | 
280 |     series: pd.Series
281 |     color: string or tuple
282 |     """
283 |     series.plot(lw=0, marker='o', color=color, alpha=0.5)
284 |     smooth = make_lowess(series)
285 |     smooth.plot(label='_', color=color)
286 | 
287 | def plot_columns_lowess(df, columns, colors):
288 |     """Plot the columns in a DataFrame.
289 | 
290 |     df: pd.DataFrame
291 |     columns: list of column names, in the desired order
292 |     colors: mapping from column names to colors
293 |     """
294 |     for col in columns:
295 |         series = df[col]
296 |         plot_series_lowess(series, colors[col])
297 | 
298 | def anchor_legend(x, y):
299 |     """Put the legend at the given locationself.
300 | 
301 |     x: axis coordinate
302 |     y: axis coordinate
303 |     """
304 |     plt.legend(bbox_to_anchor=(x, y), loc='upper left', ncol=1)
305 | 


--------------------------------------------------------------------------------
/data/GSS.dct:
--------------------------------------------------------------------------------
  1 | infile dictionary {
  2 |     _column(1)    numeric                           POSTLIFE   %20f  "Belief in life after death"
  3 |    _column(21)    numeric                               LIFE   %20f  "Is life exciting or dull"
  4 |    _column(41)    numeric                            HELPFUL   %20f  "People helpful or looking out for selves"
  5 |    _column(61)    numeric                               FAIR   %20f  "People fair or try to take advantage"
  6 |    _column(81)    numeric                              TRUST   %20f  "Can people be trusted"
  7 |   _column(101)    numeric                           CONCLERG   %20f  "Confidence in organized religion"
  8 |   _column(121)    numeric                            CONEDUC   %20f  "Confidence in education"
  9 |   _column(141)    numeric                             CONFED   %20f  "Confid. in exec branch of fed govt"
 10 |   _column(161)    numeric                           CONPRESS   %20f  "Confidence in press"
 11 |   _column(181)    numeric                           CONJUDGE   %20f  "Confid. in united states supreme court"
 12 |   _column(201)    numeric                           CONLEGIS   %20f  "Confidence in congress"
 13 |   _column(221)    numeric                             HEALTH   %20f  "Condition of health"
 14 |   _column(241)    numeric                             HAPMAR   %20f  "Happiness of marriage"
 15 |   _column(261)    numeric                               PRAY   %20f  "How often does r pray"
 16 |   _column(281)    numeric                            RELIG16   %20f  "Religion in which raised"
 17 |   _column(301)    numeric                             FUND16   %20f  "How fundamentalist was r at age 16"
 18 |   _column(321)    numeric                            SPREL16   %20f  "Religion in which spouse raised"
 19 |   _column(341)    numeric                             PRAYER   %20f  "Bible prayer in public schools"
 20 |   _column(361)    numeric                              BIBLE   %20f  "Feelings about the bible"
 21 |   _column(381)    numeric                             RACMAR   %20f  "Favor law against racial intermarriage"
 22 |   _column(401)    numeric                            RACPRES   %20f  "Would vote for black president"
 23 |   _column(421)    numeric                           AFFRMACT   %20f  "Favor preference in hiring blacks"
 24 |   _column(441)    numeric                              HAPPY   %20f  "General happiness"
 25 |   _column(461)    numeric                            CONARMY   %20f  "Confidence in military"
 26 |   _column(481)    numeric                             SATJOB   %20f  "Job or housework"
 27 |   _column(501)    numeric                               FEAR   %20f  "Afraid to walk at night in neighborhood"
 28 |   _column(521)    numeric                             OWNGUN   %20f  "Have gun in home"
 29 |   _column(541)    numeric                             PISTOL   %20f  "Pistol or revolver in home"
 30 |   _column(561)    numeric                               HUNT   %20f  "Does r or spouse hunt"
 31 |   _column(581)    numeric                              PHONE   %20f  "Does r have telephone"
 32 |   _column(601)    numeric                           MEMCHURH   %20f  "Membership in church group"
 33 |   _column(621)      float                            REALINC   %20f  "Family income in constant $"
 34 |   _column(641)    numeric                             COHORT   %20f  "Year of birth"
 35 |   _column(661)    numeric                           MARCOHRT   %20f  "Year of first marriage"
 36 |   _column(681)    numeric                             BALLOT   %20f  "Ballot used for interview"
 37 |   _column(701)    numeric                           SPANKING   %20f  "Favor spanking to discipline child"
 38 |   _column(721)    numeric                            HOMOSEX   %20f  "Homosexual sex relations"
 39 |   _column(741)    numeric                             CLASS_   %20f  "Subjective class identification"
 40 |   _column(761)    numeric                             SATFIN   %20f  "Satisfaction with financial situation"
 41 |   _column(781)    numeric                            FINRELA   %20f  "Opinion of family income"
 42 |   _column(801)    numeric                             UNION_   %20f  "Does r or spouse belong to union"
 43 |   _column(821)    numeric                              FEPOL   %20f  "Women not suited for politics"
 44 |   _column(841)    numeric                              ABANY   %20f  "Abortion if woman wants for any reason"
 45 |   _column(861)    numeric                           CHLDIDEL   %20f  "Ideal number of children"
 46 |   _column(881)    numeric                            SEXEDUC   %20f  "Sex education in public schools"
 47 |   _column(901)    numeric                           PREMARSX   %20f  "Sex before marriage"
 48 |   _column(921)    numeric                            XMARSEX   %20f  "Sex with person other than spouse"
 49 |   _column(941)    numeric                            WTSSALL   %20f  "Weight variable"
 50 |   _column(961)    numeric                            RELITEN   %20f  "Strength of affiliation"
 51 |   _column(981)    numeric                               YEAR   %20f  "Gss year for this respondent                       "
 52 |  _column(1001)    numeric                              MADEG   %20f  "Mothers highest degree"
 53 |  _column(1021)    numeric                              SPDEG   %20f  "Spouses highest degree"
 54 |  _column(1041)    numeric                                SEX   %20f  "Respondents sex"
 55 |  _column(1061)    numeric                               RACE   %20f  "Race of respondent"
 56 |  _column(1081)    numeric                              RES16   %20f  "Type of place lived in when 16 yrs old"
 57 |  _column(1101)    numeric                              REG16   %20f  "Region of residence, age 16"
 58 |  _column(1121)    numeric                            SRCBELT   %20f  "Src beltcode"
 59 |  _column(1141)    numeric                            PARTYID   %20f  "Political party affiliation"
 60 |  _column(1161)    numeric                             PRES04   %20f  "Vote for kerry, bush, nader"
 61 |  _column(1181)    numeric                             PRES08   %20f  "Vote obama or mccain"
 62 |  _column(1201)    numeric                              PADEG   %20f  "Fathers highest degree"
 63 |  _column(1221)    numeric                             DEGREE   %20f  "Rs highest degree"
 64 |  _column(1241)    numeric                                ID_   %20f  "Respondent id number"
 65 |  _column(1261)    numeric                             AGEWED   %20f  "Age when first married"
 66 |  _column(1281)    numeric                            DIVORCE   %20f  "Ever been divorced or separated"
 67 |  _column(1301)    numeric                               SIBS   %20f  "Number of brothers and sisters"
 68 |  _column(1321)    numeric                             CHILDS   %20f  "Number of children"
 69 |  _column(1341)    numeric                                AGE   %20f  "Age of respondent"
 70 |  _column(1361)    numeric                               EDUC   %20f  "Highest year of school completed"
 71 |  _column(1381)    numeric                             PAEDUC   %20f  "Highest year school completed, father"
 72 |  _column(1401)    numeric                             MAEDUC   %20f  "Highest year school completed, mother"
 73 |  _column(1421)    numeric                             SPEDUC   %20f  "Highest year school completed, spouse"
 74 |  _column(1441)    numeric                             PRES12   %20f  "Vote obama or romney"
 75 |  _column(1461)    numeric                           POLVIEWS   %20f  "Think of self as liberal or conservative"
 76 |  _column(1481)    numeric                             COLATH   %20f  "Allow anti-religionist to teach"
 77 |  _column(1501)    numeric                             LIBATH   %20f  "Allow anti-religious book in library"
 78 |  _column(1521)    numeric                            SPKHOMO   %20f  "Allow homosexual to speak"
 79 |  _column(1541)    numeric                            COLHOMO   %20f  "Allow homosexual to teach"
 80 |  _column(1561)    numeric                            LIBHOMO   %20f  "Allow homosexuals book in library"
 81 |  _column(1581)    numeric                             CAPPUN   %20f  "Favor or oppose death penalty for murder"
 82 |  _column(1601)    numeric                             GUNLAW   %20f  "Favor or oppose gun permits"
 83 |  _column(1621)    numeric                              GRASS   %20f  "Should marijuana be made legal"
 84 |  _column(1641)    numeric                              RELIG   %20f  "Rs religious preference"
 85 |  _column(1661)    numeric                               FUND   %20f  "How fundamentalist is r currently"
 86 |  _column(1681)    numeric                             SPKATH   %20f  "Allow anti-religionist to speak"
 87 |  _column(1701)    numeric                            NATFARE   %20f  "Welfare"
 88 |  _column(1721)    numeric                            NATSPAC   %20f  "Space exploration program"
 89 |  _column(1741)    numeric                           NATENVIR   %20f  "Improving & protecting environment"
 90 |  _column(1761)    numeric                            NATHEAL   %20f  "Improving & protecting nations health"
 91 |  _column(1781)    numeric                            NATCITY   %20f  "Solving problems of big cities"
 92 |  _column(1801)    numeric                           NATCRIME   %20f  "Halting rising crime rate"
 93 |  _column(1821)    numeric                            NATDRUG   %20f  "Dealing with drug addiction"
 94 |  _column(1841)    numeric                            NATEDUC   %20f  "Improving nations education system"
 95 |  _column(1861)    numeric                            NATRACE   %20f  "Improving the conditions of blacks"
 96 |  _column(1881)    numeric                            NATARMS   %20f  "Military, armaments, and defense"
 97 |  _column(1901)    numeric                             NATAID   %20f  "Foreign aid"
 98 |  _column(1921)    numeric                             ATTEND   %20f  "How often r attends religious services"
 99 | }
100 | 


--------------------------------------------------------------------------------
/gss_eda/GSS.dct:
--------------------------------------------------------------------------------
  1 | infile dictionary {
  2 |     _column(1)    numeric                               YEAR   %20f  "Gss year for this respondent                       "
  3 |    _column(21)    numeric                                ID_   %20f  "Respondent id number"
  4 |    _column(41)    numeric                             AGEWED   %20f  "Age when first married"
  5 |    _column(61)    numeric                            DIVORCE   %20f  "Ever been divorced or separated"
  6 |    _column(81)    numeric                               SIBS   %20f  "Number of brothers and sisters"
  7 |   _column(101)    numeric                             CHILDS   %20f  "Number of children"
  8 |   _column(121)    numeric                                AGE   %20f  "Age of respondent"
  9 |   _column(141)    numeric                               EDUC   %20f  "Highest year of school completed"
 10 |   _column(161)    numeric                             PAEDUC   %20f  "Highest year school completed, father"
 11 |   _column(181)    numeric                             MAEDUC   %20f  "Highest year school completed, mother"
 12 |   _column(201)    numeric                             SPEDUC   %20f  "Highest year school completed, spouse"
 13 |   _column(221)    numeric                             DEGREE   %20f  "Rs highest degree"
 14 |   _column(241)    numeric                              PADEG   %20f  "Fathers highest degree"
 15 |   _column(261)    numeric                              MADEG   %20f  "Mothers highest degree"
 16 |   _column(281)    numeric                              SPDEG   %20f  "Spouses highest degree"
 17 |   _column(301)    numeric                                SEX   %20f  "Respondents sex"
 18 |   _column(321)    numeric                               RACE   %20f  "Race of respondent"
 19 |   _column(341)    numeric                              RES16   %20f  "Type of place lived in when 16 yrs old"
 20 |   _column(361)    numeric                              REG16   %20f  "Region of residence, age 16"
 21 |   _column(381)    numeric                            SRCBELT   %20f  "Src beltcode"
 22 |   _column(401)    numeric                            PARTYID   %20f  "Political party affiliation"
 23 |   _column(421)    numeric                             PRES04   %20f  "Vote for kerry, bush, nader"
 24 |   _column(441)    numeric                             PRES08   %20f  "Vote obama or mccain"
 25 |   _column(461)    numeric                             PRES12   %20f  "Vote obama or romney"
 26 |   _column(481)    numeric                           POLVIEWS   %20f  "Think of self as liberal or conservative"
 27 |   _column(501)    numeric                            NATSPAC   %20f  "Space exploration program"
 28 |   _column(521)    numeric                           NATENVIR   %20f  "Improving & protecting environment"
 29 |   _column(541)    numeric                            NATHEAL   %20f  "Improving & protecting nations health"
 30 |   _column(561)    numeric                            NATCITY   %20f  "Solving problems of big cities"
 31 |   _column(581)    numeric                           NATCRIME   %20f  "Halting rising crime rate"
 32 |   _column(601)    numeric                            NATDRUG   %20f  "Dealing with drug addiction"
 33 |   _column(621)    numeric                            NATEDUC   %20f  "Improving nations education system"
 34 |   _column(641)    numeric                            NATRACE   %20f  "Improving the conditions of blacks"
 35 |   _column(661)    numeric                            NATARMS   %20f  "Military, armaments, and defense"
 36 |   _column(681)    numeric                             NATAID   %20f  "Foreign aid"
 37 |   _column(701)    numeric                            NATFARE   %20f  "Welfare"
 38 |   _column(721)    numeric                             SPKATH   %20f  "Allow anti-religionist to speak"
 39 |   _column(741)    numeric                             COLATH   %20f  "Allow anti-religionist to teach"
 40 |   _column(761)    numeric                             LIBATH   %20f  "Allow anti-religious book in library"
 41 |   _column(781)    numeric                            SPKHOMO   %20f  "Allow homosexual to speak"
 42 |   _column(801)    numeric                            COLHOMO   %20f  "Allow homosexual to teach"
 43 |   _column(821)    numeric                            LIBHOMO   %20f  "Allow homosexuals book in library"
 44 |   _column(841)    numeric                             CAPPUN   %20f  "Favor or oppose death penalty for murder"
 45 |   _column(861)    numeric                             GUNLAW   %20f  "Favor or oppose gun permits"
 46 |   _column(881)    numeric                              GRASS   %20f  "Should marijuana be made legal"
 47 |   _column(901)    numeric                              RELIG   %20f  "Rs religious preference"
 48 |   _column(921)    numeric                               FUND   %20f  "How fundamentalist is r currently"
 49 |   _column(941)    numeric                             ATTEND   %20f  "How often r attends religious services"
 50 |   _column(961)    numeric                            RELITEN   %20f  "Strength of affiliation"
 51 |   _column(981)    numeric                           POSTLIFE   %20f  "Belief in life after death"
 52 |  _column(1001)    numeric                               PRAY   %20f  "How often does r pray"
 53 |  _column(1021)    numeric                            RELIG16   %20f  "Religion in which raised"
 54 |  _column(1041)    numeric                             FUND16   %20f  "How fundamentalist was r at age 16"
 55 |  _column(1061)    numeric                            SPREL16   %20f  "Religion in which spouse raised"
 56 |  _column(1081)    numeric                             PRAYER   %20f  "Bible prayer in public schools"
 57 |  _column(1101)    numeric                              BIBLE   %20f  "Feelings about the bible"
 58 |  _column(1121)    numeric                             RACMAR   %20f  "Favor law against racial intermarriage"
 59 |  _column(1141)    numeric                            RACPRES   %20f  "Would vote for black president"
 60 |  _column(1161)    numeric                           AFFRMACT   %20f  "Favor preference in hiring blacks"
 61 |  _column(1181)    numeric                              HAPPY   %20f  "General happiness"
 62 |  _column(1201)    numeric                             HAPMAR   %20f  "Happiness of marriage"
 63 |  _column(1221)    numeric                             HEALTH   %20f  "Condition of health"
 64 |  _column(1241)    numeric                               LIFE   %20f  "Is life exciting or dull"
 65 |  _column(1261)    numeric                            HELPFUL   %20f  "People helpful or looking out for selves"
 66 |  _column(1281)    numeric                               FAIR   %20f  "People fair or try to take advantage"
 67 |  _column(1301)    numeric                              TRUST   %20f  "Can people be trusted"
 68 |  _column(1321)    numeric                           CONCLERG   %20f  "Confidence in organized religion"
 69 |  _column(1341)    numeric                            CONEDUC   %20f  "Confidence in education"
 70 |  _column(1361)    numeric                             CONFED   %20f  "Confid. in exec branch of fed govt"
 71 |  _column(1381)    numeric                           CONPRESS   %20f  "Confidence in press"
 72 |  _column(1401)    numeric                           CONJUDGE   %20f  "Confid. in united states supreme court"
 73 |  _column(1421)    numeric                           CONLEGIS   %20f  "Confidence in congress"
 74 |  _column(1441)    numeric                            CONARMY   %20f  "Confidence in military"
 75 |  _column(1461)    numeric                             SATJOB   %20f  "Job or housework"
 76 |  _column(1481)    numeric                             CLASS_   %20f  "Subjective class identification"
 77 |  _column(1501)    numeric                             SATFIN   %20f  "Satisfaction with financial situation"
 78 |  _column(1521)    numeric                            FINRELA   %20f  "Opinion of family income"
 79 |  _column(1541)    numeric                             UNION_   %20f  "Does r or spouse belong to union"
 80 |  _column(1561)    numeric                              FEPOL   %20f  "Women not suited for politics"
 81 |  _column(1581)    numeric                              ABANY   %20f  "Abortion if woman wants for any reason"
 82 |  _column(1601)    numeric                           CHLDIDEL   %20f  "Ideal number of children"
 83 |  _column(1621)    numeric                            SEXEDUC   %20f  "Sex education in public schools"
 84 |  _column(1641)    numeric                           PREMARSX   %20f  "Sex before marriage"
 85 |  _column(1661)    numeric                            XMARSEX   %20f  "Sex with person other than spouse"
 86 |  _column(1681)    numeric                            HOMOSEX   %20f  "Homosexual sex relations"
 87 |  _column(1701)    numeric                           SPANKING   %20f  "Favor spanking to discipline child"
 88 |  _column(1721)    numeric                               FEAR   %20f  "Afraid to walk at night in neighborhood"
 89 |  _column(1741)    numeric                             OWNGUN   %20f  "Have gun in home"
 90 |  _column(1761)    numeric                             PISTOL   %20f  "Pistol or revolver in home"
 91 |  _column(1781)    numeric                               HUNT   %20f  "Does r or spouse hunt"
 92 |  _column(1801)    numeric                              PHONE   %20f  "Does r have telephone"
 93 |  _column(1821)    numeric                           MEMCHURH   %20f  "Membership in church group"
 94 |  _column(1841)      float                            REALINC   %20f  "Family income in constant $"
 95 |  _column(1861)    numeric                             COHORT   %20f  "Year of birth"
 96 |  _column(1881)    numeric                           MARCOHRT   %20f  "Year of first marriage"
 97 |  _column(1901)    numeric                             BALLOT   %20f  "Ballot used for interview"
 98 |  _column(1921)    numeric                            WTSSALL   %20f  "Weight variable"
 99 |  _column(1941)    numeric                             ADULTS   %20f  "Household members 18 yrs and older"
100 |  _column(1961)    numeric                            COMPUSE   %20f  "R use computer"
101 |  _column(1981)    numeric                           DATABANK   %20f  "Computer data threat to individual privacy"
102 |  _column(2001)    numeric                             WTSSNR   %20f  "Weight variable"
103 |  _column(2021)    numeric                             SPKRAC   %20f  "Allow racist to speak"
104 |  _column(2041)    numeric                             SPKCOM   %20f  "Allow communist to speak"
105 |  _column(2061)    numeric                             SPKMIL   %20f  "Allow militarist to speak"
106 |  _column(2081)    numeric                            SPKMSLM   %20f  "Allow muslim clergymen preaching hatred of the us"
107 | }
108 | 


--------------------------------------------------------------------------------
/eds01_gss_clean.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# EDS Case Study\n",
  8 |     "\n",
  9 |     "Load and resample GSS data\n",
 10 |     "\n",
 11 |     "Allen Downey\n",
 12 |     "\n",
 13 |     "[MIT License](https://en.wikipedia.org/wiki/MIT_License)"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 1,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "# If we're running in Colab, set up the environment\n",
 23 |     "\n",
 24 |     "import sys\n",
 25 |     "IN_COLAB = 'google.colab' in sys.modules\n",
 26 |     "\n",
 27 |     "if IN_COLAB:\n",
 28 |     "    !pip install empiricaldist\n",
 29 |     "    !git clone --depth 1 https://github.com/AllenDowney/ExploratoryDataAnalysis\n",
 30 |     "    %cd ExploratoryDataAnalysis"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 2,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "import pandas as pd\n",
 40 |     "import numpy as np\n",
 41 |     "import matplotlib.pyplot as plt\n",
 42 |     "import seaborn as sns\n",
 43 |     "\n",
 44 |     "import utils"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "### Reading the extract\n",
 52 |     "\n",
 53 |     "https://gssdataexplorer.norc.org/projects/52787/extracts\n",
 54 |     "\n",
 55 |     "Currently Pandas is not able to read the files generated by GSS in any of the standard formats: Stata, SPSS, Excel.\n",
 56 |     "\n",
 57 |     "As a workaround, I wrote the following functions to read the Stata dictionary file and use the information there to read the Stata data file using `pd.read_fwf`, which reads fixed-width files."
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 3,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "import re\n",
 67 |     "import os\n",
 68 |     "\n",
 69 |     "class FixedWidthVariables(object):\n",
 70 |     "    \"\"\"Represents a set of variables in a fixed width file.\"\"\"\n",
 71 |     "\n",
 72 |     "    def __init__(self, variables, index_base=0):\n",
 73 |     "        \"\"\"Initializes.\n",
 74 |     "\n",
 75 |     "        variables: DataFrame\n",
 76 |     "        index_base: are the indices 0 or 1 based?\n",
 77 |     "\n",
 78 |     "        Attributes:\n",
 79 |     "        colspecs: list of (start, end) index tuples\n",
 80 |     "        names: list of string variable names\n",
 81 |     "        \"\"\"\n",
 82 |     "        self.variables = variables\n",
 83 |     "\n",
 84 |     "        # note: subtract index_base from colspecs (pass 1 for 1-based column numbers)\n",
 85 |     "        self.colspecs = variables[['start', 'end']] - index_base\n",
 86 |     "\n",
 87 |     "        # convert colspecs to a list of pair of int\n",
 88 |     "        self.colspecs = self.colspecs.astype(np.int).values.tolist()\n",
 89 |     "        self.names = variables['name']\n",
 90 |     "\n",
 91 |     "    def ReadFixedWidth(self, filename, **options):\n",
 92 |     "        \"\"\"Reads a fixed width ASCII file.\n",
 93 |     "\n",
 94 |     "        filename: string filename\n",
 95 |     "\n",
 96 |     "        returns: DataFrame\n",
 97 |     "        \"\"\"\n",
 98 |     "        df = pd.read_fwf(filename,\n",
 99 |     "                         colspecs=self.colspecs, \n",
100 |     "                         names=self.names,\n",
101 |     "                         **options)\n",
102 |     "        return df\n",
103 |     "\n",
104 |     "\n",
105 |     "def ReadStataDct(dct_file, **options):\n",
106 |     "    \"\"\"Reads a Stata dictionary file.\n",
107 |     "\n",
108 |     "    dct_file: string filename\n",
109 |     "    options: dict of options passed to open()\n",
110 |     "\n",
111 |     "    returns: FixedWidthVariables object\n",
112 |     "    \"\"\"\n",
113 |     "    type_map = dict(byte=int, int=int, long=int, float=float, \n",
114 |     "                    double=float, numeric=float)\n",
115 |     "\n",
116 |     "    var_info = []\n",
117 |     "    with open(dct_file, **options) as f:\n",
118 |     "        for line in f:\n",
119 |     "            match = re.search( r'_column\\(([^)]*)\\)', line)\n",
120 |     "            if not match:\n",
121 |     "                continue\n",
122 |     "            start = int(match.group(1))\n",
123 |     "            t = line.split()\n",
124 |     "            vtype, name, fstring = t[1:4]\n",
125 |     "            name = name.lower()\n",
126 |     "            if vtype.startswith('str'):\n",
127 |     "                vtype = str\n",
128 |     "            else:\n",
129 |     "                vtype = type_map[vtype]\n",
130 |     "            long_desc = ' '.join(t[4:]).strip('\"')\n",
131 |     "            var_info.append((start, vtype, name, fstring, long_desc))\n",
132 |     "            \n",
133 |     "    columns = ['start', 'type', 'name', 'fstring', 'desc']\n",
134 |     "    variables = pd.DataFrame(var_info, columns=columns)\n",
135 |     "\n",
136 |     "    # fill in the end column by shifting the start column\n",
137 |     "    variables['end'] = variables.start.shift(-1)\n",
138 |     "    variables.loc[len(variables)-1, 'end'] = 0\n",
139 |     "\n",
140 |     "    dct = FixedWidthVariables(variables, index_base=1)\n",
141 |     "    return dct\n",
142 |     "\n",
143 |     "def read_gss(dirname):\n",
144 |     "    \"\"\"Reads GSS files from the given directory.\n",
145 |     "    \n",
146 |     "    dirname: string\n",
147 |     "    \n",
148 |     "    returns: DataFrame\n",
149 |     "    \"\"\"\n",
150 |     "    dct_file = os.path.join(dirname, 'GSS.dct')\n",
151 |     "    dct = ReadStataDct(dct_file)\n",
152 |     "    \n",
153 |     "    data_file = os.path.join(dirname, 'GSS.dat.gz')\n",
154 |     "    gss = dct.ReadFixedWidth(data_file, compression='gzip')\n",
155 |     "    \n",
156 |     "    return gss"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 4,
162 |    "metadata": {
163 |     "scrolled": true
164 |    },
165 |    "outputs": [
166 |     {
167 |      "name": "stdout",
168 |      "output_type": "stream",
169 |      "text": [
170 |       "(64814, 105)\n"
171 |      ]
172 |     },
173 |     {
174 |      "data": {
175 |       "text/html": [
176 |        "<div>\n",
177 |        "<style scoped>\n",
178 |        "    .dataframe tbody tr th:only-of-type {\n",
179 |        "        vertical-align: middle;\n",
180 |        "    }\n",
181 |        "\n",
182 |        "    .dataframe tbody tr th {\n",
183 |        "        vertical-align: top;\n",
184 |        "    }\n",
185 |        "\n",
186 |        "    .dataframe thead th {\n",
187 |        "        text-align: right;\n",
188 |        "    }\n",
189 |        "</style>\n",
190 |        "<table border=\"1\" class=\"dataframe\">\n",
191 |        "  <thead>\n",
192 |        "    <tr style=\"text-align: right;\">\n",
193 |        "      <th></th>\n",
194 |        "      <th>year</th>\n",
195 |        "      <th>id_</th>\n",
196 |        "      <th>agewed</th>\n",
197 |        "      <th>divorce</th>\n",
198 |        "      <th>sibs</th>\n",
199 |        "      <th>childs</th>\n",
200 |        "      <th>age</th>\n",
201 |        "      <th>educ</th>\n",
202 |        "      <th>paeduc</th>\n",
203 |        "      <th>maeduc</th>\n",
204 |        "      <th>...</th>\n",
205 |        "      <th>ballot</th>\n",
206 |        "      <th>wtssall</th>\n",
207 |        "      <th>adults</th>\n",
208 |        "      <th>compuse</th>\n",
209 |        "      <th>databank</th>\n",
210 |        "      <th>wtssnr</th>\n",
211 |        "      <th>spkrac</th>\n",
212 |        "      <th>spkcom</th>\n",
213 |        "      <th>spkmil</th>\n",
214 |        "      <th>spkmslm</th>\n",
215 |        "    </tr>\n",
216 |        "  </thead>\n",
217 |        "  <tbody>\n",
218 |        "    <tr>\n",
219 |        "      <th>0</th>\n",
220 |        "      <td>1972</td>\n",
221 |        "      <td>1</td>\n",
222 |        "      <td>0</td>\n",
223 |        "      <td>0</td>\n",
224 |        "      <td>3</td>\n",
225 |        "      <td>0</td>\n",
226 |        "      <td>23</td>\n",
227 |        "      <td>16</td>\n",
228 |        "      <td>10</td>\n",
229 |        "      <td>97</td>\n",
230 |        "      <td>...</td>\n",
231 |        "      <td>0</td>\n",
232 |        "      <td>0.4446</td>\n",
233 |        "      <td>1</td>\n",
234 |        "      <td>0</td>\n",
235 |        "      <td>0</td>\n",
236 |        "      <td>1.0</td>\n",
237 |        "      <td>0</td>\n",
238 |        "      <td>1</td>\n",
239 |        "      <td>0</td>\n",
240 |        "      <td>0</td>\n",
241 |        "    </tr>\n",
242 |        "    <tr>\n",
243 |        "      <th>1</th>\n",
244 |        "      <td>1972</td>\n",
245 |        "      <td>2</td>\n",
246 |        "      <td>21</td>\n",
247 |        "      <td>2</td>\n",
248 |        "      <td>4</td>\n",
249 |        "      <td>5</td>\n",
250 |        "      <td>70</td>\n",
251 |        "      <td>10</td>\n",
252 |        "      <td>8</td>\n",
253 |        "      <td>8</td>\n",
254 |        "      <td>...</td>\n",
255 |        "      <td>0</td>\n",
256 |        "      <td>0.8893</td>\n",
257 |        "      <td>2</td>\n",
258 |        "      <td>0</td>\n",
259 |        "      <td>0</td>\n",
260 |        "      <td>1.0</td>\n",
261 |        "      <td>0</td>\n",
262 |        "      <td>2</td>\n",
263 |        "      <td>0</td>\n",
264 |        "      <td>0</td>\n",
265 |        "    </tr>\n",
266 |        "    <tr>\n",
267 |        "      <th>2</th>\n",
268 |        "      <td>1972</td>\n",
269 |        "      <td>3</td>\n",
270 |        "      <td>20</td>\n",
271 |        "      <td>2</td>\n",
272 |        "      <td>5</td>\n",
273 |        "      <td>4</td>\n",
274 |        "      <td>48</td>\n",
275 |        "      <td>12</td>\n",
276 |        "      <td>8</td>\n",
277 |        "      <td>8</td>\n",
278 |        "      <td>...</td>\n",
279 |        "      <td>0</td>\n",
280 |        "      <td>0.8893</td>\n",
281 |        "      <td>2</td>\n",
282 |        "      <td>0</td>\n",
283 |        "      <td>0</td>\n",
284 |        "      <td>1.0</td>\n",
285 |        "      <td>0</td>\n",
286 |        "      <td>2</td>\n",
287 |        "      <td>0</td>\n",
288 |        "      <td>0</td>\n",
289 |        "    </tr>\n",
290 |        "    <tr>\n",
291 |        "      <th>3</th>\n",
292 |        "      <td>1972</td>\n",
293 |        "      <td>4</td>\n",
294 |        "      <td>24</td>\n",
295 |        "      <td>2</td>\n",
296 |        "      <td>5</td>\n",
297 |        "      <td>0</td>\n",
298 |        "      <td>27</td>\n",
299 |        "      <td>17</td>\n",
300 |        "      <td>16</td>\n",
301 |        "      <td>12</td>\n",
302 |        "      <td>...</td>\n",
303 |        "      <td>0</td>\n",
304 |        "      <td>0.8893</td>\n",
305 |        "      <td>2</td>\n",
306 |        "      <td>0</td>\n",
307 |        "      <td>0</td>\n",
308 |        "      <td>1.0</td>\n",
309 |        "      <td>0</td>\n",
310 |        "      <td>1</td>\n",
311 |        "      <td>0</td>\n",
312 |        "      <td>0</td>\n",
313 |        "    </tr>\n",
314 |        "    <tr>\n",
315 |        "      <th>4</th>\n",
316 |        "      <td>1972</td>\n",
317 |        "      <td>5</td>\n",
318 |        "      <td>22</td>\n",
319 |        "      <td>2</td>\n",
320 |        "      <td>2</td>\n",
321 |        "      <td>2</td>\n",
322 |        "      <td>61</td>\n",
323 |        "      <td>12</td>\n",
324 |        "      <td>8</td>\n",
325 |        "      <td>8</td>\n",
326 |        "      <td>...</td>\n",
327 |        "      <td>0</td>\n",
328 |        "      <td>0.8893</td>\n",
329 |        "      <td>2</td>\n",
330 |        "      <td>0</td>\n",
331 |        "      <td>0</td>\n",
332 |        "      <td>1.0</td>\n",
333 |        "      <td>0</td>\n",
334 |        "      <td>1</td>\n",
335 |        "      <td>0</td>\n",
336 |        "      <td>0</td>\n",
337 |        "    </tr>\n",
338 |        "  </tbody>\n",
339 |        "</table>\n",
340 |        "<p>5 rows × 105 columns</p>\n",
341 |        "</div>"
342 |       ],
343 |       "text/plain": [
344 |        "   year  id_  agewed  divorce  sibs  childs  age  educ  paeduc  maeduc  ...  \\\n",
345 |        "0  1972    1       0        0     3       0   23    16      10      97  ...   \n",
346 |        "1  1972    2      21        2     4       5   70    10       8       8  ...   \n",
347 |        "2  1972    3      20        2     5       4   48    12       8       8  ...   \n",
348 |        "3  1972    4      24        2     5       0   27    17      16      12  ...   \n",
349 |        "4  1972    5      22        2     2       2   61    12       8       8  ...   \n",
350 |        "\n",
351 |        "   ballot  wtssall  adults  compuse  databank  wtssnr  spkrac  spkcom  spkmil  \\\n",
352 |        "0       0   0.4446       1        0         0     1.0       0       1       0   \n",
353 |        "1       0   0.8893       2        0         0     1.0       0       2       0   \n",
354 |        "2       0   0.8893       2        0         0     1.0       0       2       0   \n",
355 |        "3       0   0.8893       2        0         0     1.0       0       1       0   \n",
356 |        "4       0   0.8893       2        0         0     1.0       0       1       0   \n",
357 |        "\n",
358 |        "   spkmslm  \n",
359 |        "0        0  \n",
360 |        "1        0  \n",
361 |        "2        0  \n",
362 |        "3        0  \n",
363 |        "4        0  \n",
364 |        "\n",
365 |        "[5 rows x 105 columns]"
366 |       ]
367 |      },
368 |      "execution_count": 4,
369 |      "metadata": {},
370 |      "output_type": "execute_result"
371 |     }
372 |    ],
373 |    "source": [
374 |     "gss = read_gss('gss_eda')\n",
375 |     "print(gss.shape)\n",
376 |     "gss.head()"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "metadata": {},
382 |    "source": [
383 |     "### Missing data\n",
384 |     "\n",
385 |     "For many variables, missing values are encoded with numbers, so we need to replace them before we do any analysis.\n",
386 |     "\n",
387 |     "For example, for `polviews`, the values 8, 9, and 0 represent \"Don't know\", \"No answer\", and \"Not applicable\", respectively.\n",
388 |     "\n",
389 |     "\"Not applicable\" usually means the respondent was not asked a particular question.\n",
390 |     "\n",
391 |     "To keep things simple, we'll treat all of these values as equivalent, but we should keep in mind that we lose some information by doing that.  For example, if a respondent refuses to answer a question, that might suggest something about their answer.  If so, treating their response as missing data might bias the results.\n",
392 |     "\n",
393 |     "Fortunately, for most questions the number of respondents who refused to answer is small."
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "code",
398 |    "execution_count": 5,
399 |    "metadata": {},
400 |    "outputs": [],
401 |    "source": [
402 |     "def replace_invalid(df):\n",
403 |     "    \"\"\"Replace invalid data with NaN.\n",
404 |     "    \n",
405 |     "    df: DataFrame\n",
406 |     "    \"\"\"\n",
407 |     "    df.realinc.replace([0], np.nan, inplace=True)                  \n",
408 |     "    df.educ.replace([98, 99], np.nan, inplace=True)\n",
409 |     "    # 89 means 89 or older\n",
410 |     "    df.age.replace([98, 99], np.nan, inplace=True) \n",
411 |     "    df.cohort.replace([9999], np.nan, inplace=True)\n",
412 |     "    df.adults.replace([9], np.nan, inplace=True)\n",
413 |     "    df.colhomo.replace([0, 8, 9], np.nan, inplace=True)\n",
414 |     "    df.libhomo.replace([0, 8, 9], np.nan, inplace=True)\n",
415 |     "    df.cappun.replace([0, 8, 9], np.nan, inplace=True)\n",
416 |     "    df.gunlaw.replace([0, 8, 9], np.nan, inplace=True)\n",
417 |     "    df.grass.replace([0, 8, 9], np.nan, inplace=True)\n",
418 |     "    df.fepol.replace([0, 8, 9], np.nan, inplace=True)\n",
419 |     "    df.abany.replace([0, 8, 9], np.nan, inplace=True)\n",
420 |     "    df.prayer.replace([0, 8, 9], np.nan, inplace=True)\n",
421 |     "    df.sexeduc.replace([0, 8, 9], np.nan, inplace=True)\n",
422 |     "    df.premarsx.replace([0, 8, 9], np.nan, inplace=True)\n",
423 |     "    df.xmarsex.replace([0, 8, 9], np.nan, inplace=True)\n",
424 |     "    df.homosex.replace([0, 5, 8, 9], np.nan, inplace=True)\n",
425 |     "    df.racmar.replace([0, 8, 9], np.nan, inplace=True)\n",
426 |     "    df.spanking.replace([0, 8, 9], np.nan, inplace=True)\n",
427 |     "    df.racpres.replace([0, 8, 9], np.nan, inplace=True)\n",
428 |     "    df.fear.replace([0, 8, 9], np.nan, inplace=True)\n",
429 |     "    df.databank.replace([0, 8, 9], np.nan, inplace=True)\n",
430 |     "    df.affrmact.replace([0, 8, 9], np.nan, inplace=True)\n",
431 |     "    df.happy.replace([0, 8, 9], np.nan, inplace=True)\n",
432 |     "    df.hapmar.replace([0, 8, 9], np.nan, inplace=True)\n",
433 |     "    df.natspac.replace([0, 8, 9], np.nan, inplace=True)\n",
434 |     "    df.natenvir.replace([0, 8, 9], np.nan, inplace=True)\n",
435 |     "    df.natheal.replace([0, 8, 9], np.nan, inplace=True)\n",
436 |     "    df.natcity.replace([0, 8, 9], np.nan, inplace=True)\n",
437 |     "    df.natcrime.replace([0, 8, 9], np.nan, inplace=True)\n",
438 |     "    df.natdrug.replace([0, 8, 9], np.nan, inplace=True)\n",
439 |     "    df.nateduc.replace([0, 8, 9], np.nan, inplace=True)\n",
440 |     "    df.natrace.replace([0, 8, 9], np.nan, inplace=True)\n",
441 |     "    df.natarms.replace([0, 8, 9], np.nan, inplace=True)\n",
442 |     "    df.nataid.replace([0, 8, 9], np.nan, inplace=True)\n",
443 |     "    df.natfare.replace([0, 8, 9], np.nan, inplace=True)\n",
444 |     "    df.health.replace([0, 8, 9], np.nan, inplace=True)\n",
445 |     "    df.life.replace([0, 8, 9], np.nan, inplace=True)\n",
446 |     "    df.helpful.replace([0, 8, 9], np.nan, inplace=True)\n",
447 |     "    df.fair.replace([0, 8, 9], np.nan, inplace=True)\n",
448 |     "    df.trust.replace([0, 8, 9], np.nan, inplace=True)\n",
449 |     "    df.conclerg.replace([0, 8, 9], np.nan, inplace=True)\n",
450 |     "    df.coneduc.replace([0, 8, 9], np.nan, inplace=True)\n",
451 |     "    df.confed.replace([0, 8, 9], np.nan, inplace=True)\n",
452 |     "    df.conpress.replace([0, 8, 9], np.nan, inplace=True)\n",
453 |     "    df.conjudge.replace([0, 8, 9], np.nan, inplace=True)\n",
454 |     "    df.conlegis.replace([0, 8, 9], np.nan, inplace=True)\n",
455 |     "    df.conarmy.replace([0, 8, 9], np.nan, inplace=True)\n",
456 |     "    df.spkhomo.replace([0, 8, 9], np.nan, inplace=True)\n",
457 |     "    df.spkath.replace([0, 8, 9], np.nan, inplace=True)\n",
458 |     "    df.colath.replace([0, 8, 9], np.nan, inplace=True)\n",
459 |     "    df.libath.replace([0, 8, 9], np.nan, inplace=True)\n",
460 |     "    df.spkrac.replace([0, 8, 9], np.nan, inplace=True)\n",
461 |     "    df.spkcom.replace([0, 8, 9], np.nan, inplace=True)\n",
462 |     "    df.spkmil.replace([0, 8, 9], np.nan, inplace=True)\n",
463 |     "    df.satjob.replace([0, 8, 9], np.nan, inplace=True)\n",
464 |     "    df.satfin.replace([0, 8, 9], np.nan, inplace=True)\n",
465 |     "    df.finrela.replace([0, 8, 9], np.nan, inplace=True)\n",
466 |     "\n",
467 |     "    df.union_.replace([0, 8, 9], np.nan, inplace=True)\n",
468 |     "    df.res16.replace([0, 8, 9], np.nan, inplace=True)\n",
469 |     "\n",
470 |     "    df.fund.replace([0, 8, 9], np.nan, inplace=True)\n",
471 |     "    df.memchurh.replace([0, 8, 9], np.nan, inplace=True)\n",
472 |     "    df.fund16.replace([0, 8, 9], np.nan, inplace=True)\n",
473 |     "    df.reliten.replace([0, 8, 9], np.nan, inplace=True)\n",
474 |     "    df.postlife.replace([0, 8, 9], np.nan, inplace=True)\n",
475 |     "    df.pray.replace([0, 8, 9], np.nan, inplace=True)\n",
476 |     "    df.sprel16.replace([0, 8, 9], np.nan, inplace=True)\n",
477 |     "    df.hunt.replace([0, 8, 9], np.nan, inplace=True)\n",
478 |     "    df.polviews.replace([0, 8, 9], np.nan, inplace=True)\n",
479 |     "\n",
480 |     "    df.compuse.replace([0, 8, 9], np.nan, inplace=True)\n",
481 |     "\n",
482 |     "    df.degree.replace([8, 9], np.nan, inplace=True)\n",
483 |     "    df.padeg.replace([8, 9], np.nan, inplace=True)\n",
484 |     "    df.madeg.replace([8, 9], np.nan, inplace=True)\n",
485 |     "    df.spdeg.replace([8, 9], np.nan, inplace=True)\n",
486 |     "    df.partyid.replace([8, 9], np.nan, inplace=True)\n",
487 |     "\n",
488 |     "    df.chldidel.replace([-1, 8, 9], np.nan, inplace=True)\n",
489 |     "\n",
490 |     "    df.attend.replace([9], np.nan, inplace=True)\n",
491 |     "    df.childs.replace([9], np.nan, inplace=True)\n",
492 |     "    df.adults.replace([9], np.nan, inplace=True)\n",
493 |     "\n",
494 |     "    df.divorce.replace([0, 8, 9], np.nan, inplace=True)\n",
495 |     "    df.agewed.replace([0, 98, 99], np.nan, inplace=True)\n",
496 |     "    df.relig.replace([0, 98, 99], np.nan, inplace=True)\n",
497 |     "    df.relig16.replace([0, 98, 99], np.nan, inplace=True)\n",
498 |     "    df.age.replace([0, 98, 99], np.nan, inplace=True)\n",
499 |     "    \n",
500 |     "    # note: sibs contains some unlikely numbers\n",
501 |     "    df.sibs.replace([-1, 98, 99], np.nan, inplace=True)\n",
502 |     "    df.educ.replace([97, 98, 99], np.nan, inplace=True)\n",
503 |     "    df.maeduc.replace([97, 98, 99], np.nan, inplace=True)\n",
504 |     "    df.paeduc.replace([97, 98, 99], np.nan, inplace=True)\n",
505 |     "    df.speduc.replace([97, 98, 99], np.nan, inplace=True)\n",
506 |     "\n",
507 |     "    df.cohort.replace([0, 9999], np.nan, inplace=True)\n",
508 |     "    df.marcohrt.replace([0, 9999], np.nan, inplace=True)\n",
509 |     "\n",
510 |     "    df.phone.replace([0, 2, 9], np.nan, inplace=True)\n",
511 |     "    df.owngun.replace([0, 3, 8, 9], np.nan, inplace=True)\n",
512 |     "    df.pistol.replace([0, 3, 8, 9], np.nan, inplace=True)\n",
513 |     "    df.class_.replace([0, 5, 8, 9], np.nan, inplace=True)\n",
514 |     "    df.pres04.replace([0, 8, 9], np.nan, inplace=True)\n",
515 |     "    df.pres08.replace([0, 8, 9], np.nan, inplace=True)\n",
516 |     "    df.pres12.replace([0, 8, 9], np.nan, inplace=True)\n",
517 |     "\n",
518 |     "replace_invalid(gss)"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "markdown",
523 |    "metadata": {},
524 |    "source": [
525 |     "### Resampling\n",
526 |     "\n",
527 |     "The GSS uses stratified sampling, which means that some groups are deliberately oversampled to help with statistical validity.\n",
528 |     "\n",
529 |     "As a result, each respondent has a sampling weight which is proportional to the number of people in the population represented by the respondent.\n",
530 |     "\n",
531 |     "Before running any analysis, we should compensate for stratified sampling by \"resampling\", that is, by drawing a random sample from the dataset, where each respondent's chance of appearing in the sample is proportional to their sampling weight.\n",
532 |     "\n",
533 |     "`utils` provides a function to do this resampling."
534 |    ]
535 |   },
536 |   {
537 |    "cell_type": "code",
538 |    "execution_count": 6,
539 |    "metadata": {},
540 |    "outputs": [],
541 |    "source": [
542 |     "np.random.seed(19)\n",
543 |     "sample = utils.resample_by_year(gss, 'wtssall')"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "markdown",
548 |    "metadata": {},
549 |    "source": [
550 |     "### Saving the results\n",
551 |     "\n",
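552 |     "I'll save the results to an HDF5 file, which is a binary format that makes it much faster to read the data back. The cells below remove any existing copy of `eds.gss.hdf5`, then write three resampled copies of the data (generated with random seeds 0, 1, and 2) under the keys `gss0`, `gss1`, and `gss2`."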
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": 7,
558 |    "metadata": {},
559 |    "outputs": [],
560 |    "source": [
561 |     "!rm eds.gss.hdf5"
562 |    ]
563 |   },
564 |   {
565 |    "cell_type": "code",
566 |    "execution_count": 8,
567 |    "metadata": {},
568 |    "outputs": [],
569 |    "source": [
570 |     "for i in range(3):\n",
571 |     "    np.random.seed(i)\n",
572 |     "    sample = utils.resample_by_year(gss, 'wtssall')\n",
573 |     "\n",
574 |     "    key = f'gss{i}'\n",
575 |     "    sample.to_hdf('eds.gss.hdf5', key)"
576 |    ]
577 |   },
578 |   {
579 |    "cell_type": "code",
580 |    "execution_count": 9,
581 |    "metadata": {},
582 |    "outputs": [
583 |     {
584 |      "name": "stdout",
585 |      "output_type": "stream",
586 |      "text": [
587 |       "CPU times: user 9.86 ms, sys: 19.8 ms, total: 29.7 ms\n",
588 |       "Wall time: 28.8 ms\n"
589 |      ]
590 |     },
591 |     {
592 |      "data": {
593 |       "text/plain": [
594 |        "(64814, 105)"
595 |       ]
596 |      },
597 |      "execution_count": 9,
598 |      "metadata": {},
599 |      "output_type": "execute_result"
600 |     }
601 |    ],
602 |    "source": [
603 |     "%time gss = pd.read_hdf('eds.gss.hdf5', 'gss0')\n",
604 |     "gss.shape"
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "code",
609 |    "execution_count": null,
610 |    "metadata": {},
611 |    "outputs": [],
612 |    "source": []
613 |   }
614 |  ],
615 |  "metadata": {
616 |   "kernelspec": {
617 |    "display_name": "Python 3",
618 |    "language": "python",
619 |    "name": "python3"
620 |   },
621 |   "language_info": {
622 |    "codemirror_mode": {
623 |     "name": "ipython",
624 |     "version": 3
625 |    },
626 |    "file_extension": ".py",
627 |    "mimetype": "text/x-python",
628 |    "name": "python",
629 |    "nbconvert_exporter": "python",
630 |    "pygments_lexer": "ipython3",
631 |    "version": "3.7.3"
632 |   }
633 |  },
634 |  "nbformat": 4,
635 |  "nbformat_minor": 1
636 | }
637 | 


--------------------------------------------------------------------------------
/data/2013_2015_FemPregSetup.dct:
--------------------------------------------------------------------------------
  1 | infile dictionary {
  2 | 
  3 | *****************************************************************
  4 | * NATIONAL SURVEY OF FAMILY GROWTH (NSFG), 2013-2015
  5 | * STATA Pregnancy Data Dictionary
  6 | *
  7 | * Warning: Edit this file at your own risk
  8 | *
  9 | *****************************************************************
 10 | 
 11 | _lines(1)
 12 | 
 13 | _line(1)
 14 | 
 15 |     _column(1)       long                             CASEID   %5f  "Case identification number"
 16 |     _column(6)       byte                           PREGORDR   %2f  "Pregnancy order (number)"
 17 |     _column(8)       byte                          HOWPREG_N   %2f  "BB-2 # of Weeks or Months Currently Pregnant"
 18 |    _column(10)       byte                          HOWPREG_P   %1f  "BB-2 Current pregnancy length reported in months or weeks"
 19 |    _column(11)       byte                           MOSCURRP   %1f  "Number of Months Currently Pregnant"
 20 |    _column(12)       byte                           NOWPRGDK   %1f  "BB-3 Which Trimester -- Current Pregnancy"
 21 |    _column(13)       byte                           PREGEND1   %1f  "BC-1 How Pregnancy Ended - 1st mention"
 22 |    _column(14)       byte                           PREGEND2   %1f  "BC-1 How Pregnancy Ended - 2nd mention"
 23 |    _column(15)       byte                           HOWENDDK   %1f  "BC-1b Did pregnancy result in a baby/babies born alive or did it end in some other way?"
 24 |    _column(16)       byte                           NBRNALIV   %1f  "BC-2 Number of babies born alive from this pregnancy"
 25 |    _column(17)       byte                           MULTBRTH   %1f  "BC-3 Was this a multiple birth"
 26 |    _column(18)       byte                           BORNALIV   %1f  "Number of babies born alive from this pregnancy"
 27 |    _column(19)       byte                         DATPRGEN_M   %2f  "BC-4a Month when pregnancy ended (if nonlivebirth)"
 28 |    _column(21)        int                         DATPRGEN_Y   %4f  "BC-4a Year when pregnancy ended (if nonlivebirth)"
 29 |    _column(25)        int                           CMOTPREG   %4f  "CM for Pregnancy End Date (if nonlivebirth)"
 30 |    _column(29)       byte                           AGEATEND   %2f  "BC-4b R's age at pregnancy's end date"
 31 |    _column(31)       byte                           HPAGEEND   %2f  "BC-4c Father's age at pregnancy's end date"
 32 |    _column(33)       byte                         GESTASUN_M   %2f  "BC-5 Gestational Length of Pregnancy in Months"
 33 |    _column(35)       byte                         GESTASUN_W   %2f  "BC-5 Gestational Length of Pregnancy in Weeks"
 34 |    _column(37)       byte                            WKSGEST   %2f  "Gestational length of completed pregnancy (in weeks)"
 35 |    _column(39)       byte                            MOSGEST   %2f  "Gestational length of completed pregnancy (in months)"
 36 |    _column(41)       byte                            DK1GEST   %1f  "BC-6 DK followup for gestational length of a stillbirth"
 37 |    _column(42)       byte                            DK2GEST   %1f  "BC-7 DK followup for gestational length of a livebirth"
 38 |    _column(43)       byte                            DK3GEST   %1f  "BC-8 DK followup for gestational length of a miscarr/abor/ectop"
 39 |    _column(44)       byte                      BPA_BDSCHECK1   %1f  "Whether 1st liveborn baby from this pregnancy was BPA or BDS"
 40 |    _column(45)       byte                           BABYSEX1   %1f  "BD-2 Sex of 1st Liveborn Baby from This Pregnancy"
 41 |    _column(46)       byte                       BIRTHWGT_LB1   %2f  "BD-3 Birthweight in Pounds - 1st baby from this pregnancy"
 42 |    _column(48)       byte                       BIRTHWGT_OZ1   %2f  "BD-3 Birthweight in Ounces - 1st baby from this pregnancy"
 43 |    _column(50)       byte                          LOBTHWGT1   %1f  "BD-4 Is Baby Low Birthweight- 1st baby from this pregnancy"
 44 |    _column(51)       byte                           BABYSEX2   %1f  "BD-2 Sex of 2nd Liveborn Baby from This Pregnancy"
 45 |    _column(52)       byte                       BIRTHWGT_LB2   %2f  "BD-3 Birthweight in Pounds - 2nd baby from this pregnancy"
 46 |    _column(54)       byte                       BIRTHWGT_OZ2   %2f  "BD-3 Birthweight in Ounces - 2nd baby from this pregnancy"
 47 |    _column(56)       byte                          LOBTHWGT2   %1f  "BD-4 Is Baby Low Birthweight- 2nd baby from this pregnancy"
 48 |    _column(57)       byte                           BABYSEX3   %1f  "BD-2 Sex of 3rd Liveborn Baby from This Pregnancy"
 49 |    _column(58)       byte                       BIRTHWGT_LB3   %1f  "BD-3 Birthweight in Pounds - 3rd baby from this pregnancy"
 50 |    _column(59)       byte                       BIRTHWGT_OZ3   %2f  "BD-3 Birthweight in Ounces - 3rd baby from this pregnancy"
 51 |    _column(61)       byte                          LOBTHWGT3   %1f  "BD-4 Is Baby Low Birthweight- 3rd baby from this pregnancy"
 52 |    _column(62)       byte                          BABYDOB_M   %2f  "BD-5 Month of delivery for this pregnancy"
 53 |    _column(64)        int                          BABYDOB_Y   %4f  "BD-5 Year of delivery for this pregnancy"
 54 |    _column(68)        int                           CMBABDOB   %4f  "CM for baby's or babies' date of birth (delivery date)"
 55 |    _column(72)        int                             KIDAGE   %3f  "Current Age (in mos) of R's child(ren) from this pregnancy"
 56 |    _column(75)       byte                            HPAGELB   %2f  "BD-6 Father's age at time of child(ren)'s birth"
 57 |    _column(77)       byte                           BIRTHPLC   %1f  "BD-7 Place where R gave birth"
 58 |    _column(78)       byte                          PAYBIRTH1   %1f  "BD-8 Payment for delivery - 1st mention"
 59 |    _column(79)       byte                          PAYBIRTH2   %1f  "BD-8 Payment for delivery - 2nd mention"
 60 |    _column(80)       byte                          PAYBIRTH3   %1f  "BD-8 Payment for delivery - 3rd mention"
 61 |    _column(81)       byte                           CSECPRIM   %1f  "BD-9 Is this R's first c-section?"
 62 |    _column(82)       byte                           CSECMED1   %1f  "BD-10 Medical reasons reported for this C-section - 1st mention"
 63 |    _column(83)       byte                           CSECMED2   %1f  "BD-10 Medical reasons reported for this C-section - 2nd mention"
 64 |    _column(84)       byte                           CSECMED3   %1f  "BD-10 Medical reasons reported for this C-section - 3rd mention"
 65 |    _column(85)       byte                           CSECMED4   %1f  "BD-10 Medical reasons reported for this C-section - 4th mention"
 66 |    _column(86)       byte                           CSECPLAN   %1f  "BD-11 Was this c-section planned for by R?"
 67 |    _column(87)       byte                           KNEWPREG   %2f  "BE-1 Weeks pregnant when R learned she was pregnant"
 68 |    _column(89)       byte                           TRIMESTR   %1f  "BE-2a DK followup for KNEWPREG when gestation >= 6 mos"
 69 |    _column(90)       byte                           LTRIMEST   %1f  "BE-2b DK followup for KNEWPREG when gestation < 6 mos"
 70 |    _column(91)       byte                           PRIORSMK   %1f  "BE-3 Amount R smoked in 6 mos before R knew she was pregnant"
 71 |    _column(92)       byte                           POSTSMKS   %1f  "BE-4 R smoked at all after R knew she was pregnant"
 72 |    _column(93)       byte                           NPOSTSMK   %1f  "BE-5 Amount R smoked during pregnancy after R knew she was preg"
 73 |    _column(94)       byte                           GETPRENA   %1f  "BE-6 Any prenatal care for this pregnancy"
 74 |    _column(95)       byte                           BGNPRENA   %2f  "BE-7 Weeks pregnant at first prenatal care visit"
 75 |    _column(97)       byte                            PNCTRIM   %1f  "BE-8a DK followup for BGNPRENA when gestation >= 6 mos"
 76 |    _column(98)       byte                            LPNCTRI   %1f  "BE-8b DK followup for BGNPRENA when gestation < 6 mos"
 77 |    _column(99)       byte                          LIVEHERE1   %1f  "BG-1 Whether child lives with R - 1st from this pregnancy"
 78 |   _column(100)       byte                          ALIVENOW1   %1f  "BG-2 Whether child is still alive - 1st from this pregnancy"
 79 |   _column(101)       byte                        WHENDIED_M1   %2f  "BG-3 Month when child died - 1st from this pregnancy"
 80 |   _column(103)        int                        WHENDIED_Y1   %4f  "BG-3 Year when child died - 1st from this pregnancy"
 81 |   _column(107)        int                          CMKIDIED1   %4f  "CM for child's date of death - 1st from this pregnancy"
 82 |   _column(111)       byte                        WHENLEFT_M1   %2f  "BG-4 Month when child stopped living with R- 1st from this preg"
 83 |   _column(113)        int                        WHENLEFT_Y1   %4f  "BG-4 Year when child stopped living with R- 1st from this preg"
 84 |   _column(117)        int                          CMKIDLFT1   %4f  "CM for date child stopped living w/R - 1st from this pregnancy"
 85 |   _column(121)        int                           LASTAGE1   %3f  "Age (in mos) when child last lived w/R-1st from this pregnancy"
 86 |   _column(124)       byte                          WHERENOW1   %1f  "BG-5 Where child lives now - 1st from this pregnancy"
 87 |   _column(125)       byte                          LEGAGREE1   %1f  "BG-6 Legal agreement for where child lives - 1st from this preg"
 88 |   _column(126)       byte                          PARENEND1   %1f  "BG-7 Is R still legal mother of child - 1st from this pregnancy"
 89 |   _column(127)       byte                          ANYNURSE1   %1f  "BH-1 Whether R breastfed this child at all - 1st from this preg"
 90 |   _column(128)       byte                          FEDSOLID1   %1f  "BH-2 Has R fed this child anything other than breastmilk - 1st from this preg"
 91 |   _column(129)        int                        FRSTEATD_N1   %3f  "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 1st from this preg"
 92 |   _column(132)       byte                        FRSTEATD_P1   %1f  "BH-3 Units (mos/wks/days) for FRSTEATD_N - 1st from this preg"
 93 |   _column(133)       byte                          FRSTEATD1   %2f  "Age (in mos) when 1st fed non-breastmilk - 1st from this preg"
 94 |   _column(135)       byte                          QUITNURS1   %1f  "BH-4 Has R stopped breastfeeding child - 1st from this preg"
 95 |   _column(136)        int                        AGEQTNUR_N1   %3f  "BH-5 Age (mos/wks/day) when stopped breastfeeding - 1st from this preg"
 96 |   _column(139)       byte                        AGEQTNUR_P1   %1f  "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 1st from this preg"
 97 |   _column(140)       byte                          AGEQTNUR1   %2f  "Age (in mos) when R stopped nursing child - 1st from this preg"
 98 |   _column(142)       byte                          LIVEHERE2   %1f  "BG-1 Whether child lives with R - 2nd from this pregnancy"
 99 |   _column(143)       byte                          ALIVENOW2   %1f  "BG-2 Whether child is still alive - 2nd from this pregnancy"
100 |   _column(144)       byte                        WHENDIED_M2   %1f  "BG-3 Month when child died - 2nd from this pregnancy"
101 |   _column(145)        int                        WHENDIED_Y2   %4f  "BG-3 Year when child died - 2nd from this pregnancy"
102 |   _column(149)        int                          CMKIDIED2   %4f  "CM for child's date of death - 2nd from this pregnancy"
103 |   _column(153)       byte                        WHENLEFT_M2   %1f  "BG-4 Month when child stopped living with R - 2nd from this preg"
104 |   _column(154)        int                        WHENLEFT_Y2   %4f  "BG-4 Year when child stopped living with R - 2nd from this preg"
105 |   _column(158)        int                          CMKIDLFT2   %4f  "CM for date child stopped living w/R - 2nd from this pregnancy"
106 |   _column(162)        int                           LASTAGE2   %3f  "Age (in mos) when child last lived w/R - 2nd from this pregnancy"
107 |   _column(165)       byte                          WHERENOW2   %1f  "BG-5 Where child lives now - 2nd from this pregnancy"
108 |   _column(166)       byte                          LEGAGREE2   %1f  "BG-6 Legal agreement for where child lives - 2nd from this preg"
109 |   _column(167)       byte                          PARENEND2   %1f  "BG-7 Is R still legal mother of child - 2nd from this pregnancy"
110 |   _column(168)       byte                          ANYNURSE2   %1f  "BH-1 Whether R breastfed this child at all - 2nd from this preg"
111 |   _column(169)       byte                          FEDSOLID2   %1f  "BH-2 Has R fed this child anything other than breastmilk - 2nd from this preg"
112 |   _column(170)       byte                        FRSTEATD_N2   %1f  "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 2nd from this preg"
113 |   _column(171)       byte                        FRSTEATD_P2   %1f  "BH-3 Units (mos/wks/days) for FRSTEATD_N - 2nd from this preg"
114 |   _column(172)       byte                          FRSTEATD2   %1f  "Age (in mos) when 1st fed non-breastmilk - 2nd from this preg"
115 |   _column(173)       byte                          QUITNURS2   %1f  "BH-4 Has R stopped breastfeeding child - 2nd from this preg"
116 |   _column(174)       byte                        AGEQTNUR_N2   %2f  "BH-5 Age (mos/wks/day) when stopped breastfeeding - 2nd from this preg"
117 |   _column(176)       byte                        AGEQTNUR_P2   %1f  "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 2nd from this preg"
118 |   _column(177)       byte                          AGEQTNUR2   %2f  "Age (in mos) when R stopped nursing child - 2nd from this preg"
119 |   _column(179)       byte                          LIVEHERE3   %1f  "BG-1 Whether child lives with R - 3rd from this pregnancy"
120 |   _column(180)       byte                          ALIVENOW3   %1f  "BG-2 Whether child is still alive - 3rd from this pregnancy"
121 |   _column(181)       byte                        WHENDIED_M3   %1f  "BG-3 Month when child died - 3rd from this pregnancy"
122 |   _column(182)       byte                        WHENDIED_Y3   %1f  "BG-3 Year when child died - 3rd from this pregnancy"
123 |   _column(183)       byte                          CMKIDIED3   %1f  "CM for child's date of death - 3rd from this pregnancy"
124 |   _column(184)       byte                        WHENLEFT_M3   %1f  "BG-4 Month when child stopped living with R - 3rd from this preg"
125 |   _column(185)       byte                        WHENLEFT_Y3   %1f  "BG-4 Year when child stopped living with R - 3rd from this preg"
126 |   _column(186)       byte                          CMKIDLFT3   %1f  "CM for date child stopped living w/R - 3rd from this pregnancy"
127 |   _column(187)       byte                           LASTAGE3   %1f  "Age (in mos) when child last lived w/R - 3rd from this pregnancy"
128 |   _column(188)       byte                          WHERENOW3   %1f  "BG-5 Where child lives now - 3rd from this pregnancy"
129 |   _column(189)       byte                          LEGAGREE3   %1f  "BG-6 Legal agreement for where child lives - 3rd from this preg"
130 |   _column(190)       byte                          PARENEND3   %1f  "BG-7 Is R still legal mother of child - 3rd from this pregnancy"
131 |   _column(191)       byte                          ANYNURSE3   %1f  "BH-1 Whether R breastfed this child at all - 3rd from this preg"
132 |   _column(192)       byte                          FEDSOLID3   %1f  "BH-2 Has R fed this child anything other than breastmilk - 3rd from this preg"
133 |   _column(193)       byte                        FRSTEATD_N3   %1f  "BH-3 Age (mos/wks/day) when 1st fed non-breastmilk - 3rd from this preg"
134 |   _column(194)       byte                        FRSTEATD_P3   %1f  "BH-3 Units (mos/wks/days) for FRSTEATD_N - 3rd from this preg"
135 |   _column(195)       byte                          FRSTEATD3   %1f  "Age (in mos) when 1st fed non-breastmilk - 3rd from this preg"
136 |   _column(196)       byte                          QUITNURS3   %1f  "BH-4 Has R stopped breastfeeding child - 3rd from this preg"
137 |   _column(197)       byte                        AGEQTNUR_N3   %1f  "BH-5 Age (mos/wks/day) when stopped breastfeeding - 3rd from this preg"
138 |   _column(198)       byte                        AGEQTNUR_P3   %1f  "BH-5 Units (mos/wks/days) for AGEQTNUR_N - 3rd from this preg"
139 |   _column(199)       byte                          AGEQTNUR3   %1f  "Age (in mos) when R stopped nursing child - 3rd from this preg"
140 |   _column(200)       byte                         PRGOUTCOME   %1f  "Outcome of Pregnancy (based on priority ordering)"
141 |   _column(201)       byte                           OUTCOM_S   %1f  "Outcome of pregnancy (based on corrected/chron sorted data)"
142 |   _column(202)       byte                           NBRNLV_S   %1f  "# of babies born alive from this preg (based on CCSD)"
143 |   _column(203)        int                           CMPRGEND   %4f  "CM for Pregnancy End Date (regardless of outcome)"
144 |   _column(207)        int                           CMENDP_S   %4f  "CM date when pregnancy ended (based on CCSD)"
145 |   _column(211)        int                           CMPRGBEG   %4f  "CM for Pregnancy Start Date"
146 |   _column(215)        int                           CMPBEG_S   %4f  "CM date when pregnancy began (based on CCSD)"
147 |   _column(219)        int                           CMLASTLB   %4f  "CM for R's most recent live birth (based on CCSD)"
148 |   _column(223)        int                           CMLSTPRG   %4f  "CM for R's most recent completed pregnancy (based on CCSD)"
149 |   _column(227)        int                           CMFSTPRG   %4f  "CM for R's first completed pregnancy (based on CCSD)"
150 |   _column(231)        int                           CMPG1BEG   %4f  "CM date when R's 1st pregnancy began (based on CCSD)"
151 |   _column(235)        int                           CMINTSTR   %4f  "CM for date of beginning of pregnancy interval"
152 |   _column(239)        int                           CMINTFIN   %4f  "CM for date of end of pregnancy interval"
153 |   _column(243)        int                         CMINTSTROP   %4f  "Open interval: CM of date of beginning"
154 |   _column(247)        int                         CMINTFINOP   %4f  "Open interval: CM of date of end (mon of interview)"
155 |   _column(251)        int                         CMINTSTRCR   %4f  "Currently pregnant: CM of date of beginning of interval"
156 |   _column(255)        int                         CMINTFINCR   %4f  "Currently pregnant: CM of date of end of interval (mon of interview)"
157 |   _column(259)       byte                           ANYUSINT   %1f  "Any method use in pregnancy interval"
158 |   _column(260)       byte                           EVUSEINT   %1f  "EG-1 Use any method in pregnancy interval?"
159 |   _column(261)       byte                           STOPDUSE   %1f  "EG-2 Before you became preg, stop using all methods?"
160 |   _column(262)       byte                           WHYSTOPD   %1f  "EG-3 Stop using methods before preg because wanted preg?"
161 |   _column(263)       byte                         WHATMETH01   %2f  "EG-4 Method(s) using when became preg - 1st mention"
162 |   _column(265)       byte                         WHATMETH02   %2f  "EG-4 Method(s) using when became preg - 2nd mention"
163 |   _column(267)       byte                         WHATMETH03   %2f  "EG-4 Method(s) using when became preg - 3rd mention"
164 |   _column(269)       byte                         WHATMETH04   %2f  "EG-4 Method(s) using when became preg - 4th mention"
165 |   _column(271)       byte                           RESNOUSE   %1f  "EG-5 Reason not using/had stopped using method bec. wanted preg?"
166 |   _column(272)       byte                           WANTBOLD   %1f  "EG-6 Right bef preg, want to have baby at any time in future?"
167 |   _column(273)       byte                           PROBBABE   %1f  "EG-7 probably want baby at any time in future or not?"
168 |   _column(274)       byte                            CNFRMNO   %1f  "EG-8 Verify didn't want baby at any time in future"
169 |   _column(275)       byte                           WANTBLD2   %1f  "EG-9 Right before preg, want to have baby at any time in future? (2nd asking)"
170 |   _column(276)       byte                           TIMINGOK   %1f  "EG-10 Become preg too soon, right time, or later than you wanted?"
171 |   _column(277)        int                          TOOSOON_N   %3f  "EG-11 How much sooner than wanted became preg (months or years)"
172 |   _column(280)       byte                          TOOSOON_P   %1f  "EG-11 Choose mons or yrs for how much sooner became preg than wanted"
173 |   _column(281)       byte                           WTHPART1   %1f  "EG-12a Right before preg, want to have baby with that partner?"
174 |   _column(282)       byte                           WTHPART2   %1f  "EG-12b Right bef. preg, think might ever want to have baby w/that partner?"
175 |   _column(283)       byte                           FEELINPG   %2f  "EG-13 How happy to be preg. scale (0-10)"
176 |   _column(285)       byte                            HPWNOLD   %1f  "EG-16 Right bef preg, did the father want R to have baby at any time in future?"
177 |   _column(286)       byte                            TIMOKHP   %1f  "EG-17 R became preg sooner, right time, or later than father of preg wanted"
178 |   _column(287)       byte                            COHPBEG   %1f  "EG-18a Was R living w/father of preg at beginning of preg"
179 |   _column(288)       byte                            COHPEND   %1f  "EG-18b Was R living w/father of preg when preg ended/baby was born"
180 |   _column(289)       byte                           TELLFATH   %1f  "EG-19 Did R tell father of preg that she was pregnant"
181 |   _column(290)       byte                           WHENTELL   %1f  "EG-20 When did R tell father of preg about pregnancy: during or after?"
182 |   _column(291)       byte                           TRYSCALE   %2f  "EG-21 How hard trying to get/avoid pregnancy (0-10 scale)"
183 |   _column(293)       byte                           WANTSCAL   %2f  "EG-22 How much wanted to get/avoid pregnancy (0-10 scale)"
184 |   _column(295)       byte                            WHYPRG1   %1f  "EG-23 (unintended preg): method fail or R wasn't using properly-1st mention"
185 |   _column(296)       byte                            WHYPRG2   %1f  "EG-23 (unintended preg): method fail or R wasn't using properly-2nd mention"
186 |   _column(297)       byte                          WHYNOUSE1   %2f  "EG-24 (unintended preg) Reason didn't use contraceptn - 1st"
187 |   _column(299)       byte                          WHYNOUSE2   %1f  "EG-24 (unintended preg) Reason didn't use contraceptn - 2nd"
188 |   _column(300)       byte                          WHYNOUSE3   %1f  "EG-24 (unintended preg) Reason didn't use contraceptn - 3rd"
189 |   _column(301)       byte                          WHYNOUSE4   %1f  "EG-24 (unintended preg) Reason didn't use contraceptn - 4th"
190 |   _column(302)       byte                          WHYNOUSE5   %1f  "EG-24 (unintended preg) Reason didn't use contraceptn - 5th"
191 |   _column(303)       byte                           WHYNOPG1   %2f  "EG-24aa open-ended response: reason didn't think could get preg - 1st"
192 |   _column(305)       byte                           WHYNOPG2   %2f  "EG-24aa open-ended response: reason didn't think could get preg - 2nd"
193 |   _column(307)       byte                           MAINOUSE   %2f  "EG-24a (unintended preg) Main reason didn't use contraception"
194 |   _column(309)       byte                           PRGLNGTH   %2f  "Duration of completed pregnancy in weeks"
195 |   _column(311)       byte                            OUTCOME   %1f  "Pregnancy outcome"
196 |   _column(312)       byte                           BIRTHORD   %2f  "Birth order"
197 |   _column(314)        int                             DATEND   %4f  "CM date pregnancy ended"
198 |   _column(318)        int                            AGEPREG   %4f  "Age at pregnancy outcome"
199 |   _column(322)        int                            DATECON   %4f  "CM date of conception"
200 |   _column(326)        int                             AGECON   %4f  "Age at time of conception"
201 |   _column(330)       byte                           FMAROUT5   %1f  "Formal marital status at pregnancy outcome"
202 |   _column(331)       byte                           PMARPREG   %1f  "Whether pregnancy ended before R's 1st marriage (premaritally)"
203 |   _column(332)       byte                           RMAROUT6   %1f  "Informal marital status at pregnancy outcome - 6 categories"
204 |   _column(333)       byte                           FMARCON5   %1f  "Formal marital status at conception - 5 categories"
205 |   _column(334)       byte                           RMARCON6   %1f  "Informal marital status at conception - 6 categories"
206 |   _column(335)       byte                           LEARNPRG   %2f  "Number of weeks pregnant when R learned she was pregnant"
207 |   _column(337)       byte                           PNCAREWK   %2f  "Number of weeks pregnant at first prenatal care"
208 |   _column(339)       byte                           PAYDELIV   %1f  "Payment for delivery"
209 |   _column(340)       byte                               LBW1   %1f  "Low birthweight - 1st baby from this preg"
210 |   _column(341)       byte                           LIVCHILD   %1f  "Living arrangements for 1st liveborn child from this pregnancy"
211 |   _column(342)        int                           BFEEDWKS   %3f  "Duration of breastfeeding in weeks"
212 |   _column(345)       byte                           OLDWANTR   %1f  "Wantedness of pregnancy - respondent - Cycle 4 version"
213 |   _column(346)       byte                           OLDWANTP   %1f  "Wantedness of preg - R's partner (father of pregnancy) - Cycle 4 version"
214 |   _column(347)       byte                           WANTRESP   %1f  "Wantedness of pregnancy -- Respondent (RECODE)"
215 |   _column(348)       byte                           WANTPART   %1f  "Wantedness of pregnancy -- R's partner (RECODE)"
216 |   _column(349)        int                            TOOSOON   %3f  "Number of months too soon pregnancy occurred"
217 |   _column(352)       byte                           NEWWANTR   %1f  "Detailed wantedness of pregnancy - respondent"
218 |   _column(353)        int                            CMBIRTH   %4f  "Century month of R's birth"
219 |   _column(357)       byte                               AGER   %2f  "Age at interview"
220 |   _column(359)       byte                            AGESCRN   %2f  "R's age at screener"
221 |   _column(361)       byte                           FMARITAL   %1f  "Formal marital status"
222 |   _column(362)       byte                           RMARITAL   %1f  "Informal Marital Status"
223 |   _column(363)       byte                             EDUCAT   %2f  "Education (completed years of schooling)"
224 |   _column(365)       byte                             HIEDUC   %2f  "Highest completed year of school or degree"
225 |   _column(367)       byte                               RACE   %1f  "Race"
226 |   _column(368)       byte                           HISPANIC   %1f  "Hispanic origin"
227 |   _column(369)       byte                           HISPRACE   %1f  "Race & Hispanic origin of respondent - 1977 OMB standards (respondent recode)"
228 |   _column(370)       byte                          HISPRACE2   %1f  "Race & Hispanic origin of respondent - 1997 OMB standards (respondent recode)"
229 |   _column(371)       byte                           RCURPREG   %1f  "Pregnant at time of interview"
230 |   _column(372)       byte                            PREGNUM   %2f  "CAPI-based total number of pregnancies"
231 |   _column(374)       byte                             PARITY   %2f  "Total number of live births"
232 |   _column(376)       byte                           CURR_INS   %1f  "Current health insurance coverage"
233 |   _column(377)       byte                           PUBASSIS   %1f  "Whether R received public assistance in prior calendar year"
234 |   _column(378)        int                            POVERTY   %3f  "Poverty level income"
235 |   _column(381)       byte                           LABORFOR   %1f  "Labor force status"
236 |   _column(382)       byte                           RELIGION   %1f  "Current religious affiliation"
237 |   _column(383)       byte                              METRO   %1f  "Place of residence (Metropolitan / Nonmetropolitan)"
238 |   _column(384)       byte                             BRNOUT   %1f  "IB-8 R born outside of US"
239 |   _column(385)        int                            YRSTRUS   %4f  "Year R came to the United States"
240 |   _column(389)       byte                         PRGLNGTH_I   %1f  "PRGLNGTH Imputation Flag"
241 |   _column(390)       byte                          OUTCOME_I   %1f  "OUTCOME Imputation Flag"
242 |   _column(391)       byte                         BIRTHORD_I   %1f  "BIRTHORD Imputation Flag"
243 |   _column(392)       byte                           DATEND_I   %1f  "DATEND Imputation Flag"
244 |   _column(393)       byte                          AGEPREG_I   %1f  "AGEPREG Imputation Flag"
245 |   _column(394)       byte                          DATECON_I   %1f  "DATECON Imputation Flag"
246 |   _column(395)       byte                           AGECON_I   %1f  "AGECON Imputation Flag"
247 |   _column(396)       byte                         FMAROUT5_I   %1f  "FMAROUT5 Imputation Flag"
248 |   _column(397)       byte                         PMARPREG_I   %1f  "PMARPREG Imputation Flag"
249 |   _column(398)       byte                         RMAROUT6_I   %1f  "RMAROUT6 Imputation Flag"
250 |   _column(399)       byte                         FMARCON5_I   %1f  "FMARCON5 Imputation Flag"
251 |   _column(400)       byte                         RMARCON6_I   %1f  "RMARCON6 Imputation Flag"
252 |   _column(401)       byte                         LEARNPRG_I   %1f  "LEARNPRG Imputation Flag"
253 |   _column(402)       byte                         PNCAREWK_I   %1f  "PNCAREWK Imputation Flag"
254 |   _column(403)       byte                         PAYDELIV_I   %1f  "PAYDELIV Imputation Flag"
255 |   _column(404)       byte                             LBW1_I   %1f  "LBW1 Imputation Flag"
256 |   _column(405)       byte                         LIVCHILD_I   %1f  "LIVCHILD Imputation Flag"
257 |   _column(406)       byte                         BFEEDWKS_I   %1f  "BFEEDWKS Imputation Flag"
258 |   _column(407)       byte                         OLDWANTR_I   %1f  "OLDWANTR Imputation Flag"
259 |   _column(408)       byte                         OLDWANTP_I   %1f  "OLDWANTP Imputation Flag"
260 |   _column(409)       byte                         WANTRESP_I   %1f  "WANTRESP Imputation Flag"
261 |   _column(410)       byte                         WANTPART_I   %1f  "WANTPART Imputation Flag"
262 |   _column(411)       byte                          TOOSOON_I   %1f  "TOOSOON Imputation Flag"
263 |   _column(412)       byte                         NEWWANTR_I   %1f  "NEWWANTR Imputation Flag"
264 |   _column(413)       byte                             AGER_I   %1f  "AGER Imputation Flag"
265 |   _column(414)       byte                         FMARITAL_I   %1f  "FMARITAL Imputation Flag"
266 |   _column(415)       byte                         RMARITAL_I   %1f  "RMARITAL Imputation Flag"
267 |   _column(416)       byte                           EDUCAT_I   %1f  "EDUCAT Imputation Flag"
268 |   _column(417)       byte                           HIEDUC_I   %1f  "HIEDUC Imputation Flag"
269 |   _column(418)       byte                             RACE_I   %1f  "RACE Imputation Flag"
270 |   _column(419)       byte                         HISPANIC_I   %1f  "HISPANIC Imputation Flag"
271 |   _column(420)       byte                         HISPRACE_I   %1f  "HISPRACE Imputation Flag"
272 |   _column(421)       byte                        HISPRACE2_I   %1f  "HISPRACE2 Imputation Flag"
273 |   _column(422)       byte                         RCURPREG_I   %1f  "RCURPREG Imputation Flag"
274 |   _column(423)       byte                          PREGNUM_I   %1f  "PREGNUM Imputation Flag"
275 |   _column(424)       byte                           PARITY_I   %1f  "PARITY Imputation Flag"
276 |   _column(425)       byte                         CURR_INS_I   %1f  "CURR_INS Imputation Flag"
277 |   _column(426)       byte                         PUBASSIS_I   %1f  "PUBASSIS Imputation Flag"
278 |   _column(427)       byte                          POVERTY_I   %1f  "POVERTY Imputation Flag"
279 |   _column(428)       byte                         LABORFOR_I   %1f  "LABORFOR Imputation Flag"
280 |   _column(429)       byte                         RELIGION_I   %1f  "RELIGION Imputation Flag"
281 |   _column(430)       byte                            METRO_I   %1f  "METRO Imputation Flag"
282 |   _column(431)     double                       WGT2013_2015  %16f  "Final weight for the 2013-2015 NSFG"
283 |   _column(447)       byte                               SECU   %1f  "Randomized version of the sampling error computational unit"
284 |   _column(448)        int                               SEST   %3f  "Randomized version of the stratum"
285 |   _column(451)        int                            CMINTVW   %4f  "Century month for date of interview  (Computed in Flow Check A-1)"
286 |   _column(455)        int                            CMLSTYR   %4f  "Century month for month/year of interview minus one year (Computed in Flow Check A-1)"
287 |   _column(459)        int                           CMJAN3YR   %4f  "Century month of January Three Years Prior to Year of interview (Computed in Flow Check A-1)"
288 |   _column(463)        int                           CMJAN4YR   %4f  "Century month of January Four Years Prior to Year of Interview (Computed in Flow Check A-1)"
289 |   _column(467)        int                           CMJAN5YR   %4f  "Century month of January Five Years Prior to Year of Interview (Computed in Flow Check A-1)"
290 |   _column(471)       str2                            QUARTER   %2s  "Quarter when case was sampled"
291 |   _column(473)       str1                              PHASE   %1s  "Regular- or double-sample portion of the quarter"
292 |   _column(474)       str4                          INTVWYEAR   %4s  "Calendar year when interview occurred"
293 | }
294 | 


--------------------------------------------------------------------------------
/gss_eda/GSS.do:
--------------------------------------------------------------------------------
   1 | #delimit ;
   2 | /* Fixed-column read of the GSS extract: 105 variables, one per 20-column field, covering columns 1-2100 of each record */
   3 |    infix   /* each entry is: variable name, first column - last column. No str prefix is given, so fields are read as numbers */
   4 |       year     1 - 20
   5 |       id_      21 - 40
   6 |       agewed   41 - 60
   7 |       divorce  61 - 80
   8 |       sibs     81 - 100
   9 |       childs   101 - 120
  10 |       age      121 - 140
  11 |       educ     141 - 160
  12 |       paeduc   161 - 180
  13 |       maeduc   181 - 200
  14 |       speduc   201 - 220
  15 |       degree   221 - 240
  16 |       padeg    241 - 260
  17 |       madeg    261 - 280
  18 |       spdeg    281 - 300
  19 |       sex      301 - 320
  20 |       race     321 - 340
  21 |       res16    341 - 360
  22 |       reg16    361 - 380
  23 |       srcbelt  381 - 400
  24 |       partyid  401 - 420
  25 |       pres04   421 - 440
  26 |       pres08   441 - 460
  27 |       pres12   461 - 480
  28 |       polviews 481 - 500
  29 |       natspac  501 - 520
  30 |       natenvir 521 - 540
  31 |       natheal  541 - 560
  32 |       natcity  561 - 580
  33 |       natcrime 581 - 600
  34 |       natdrug  601 - 620
  35 |       nateduc  621 - 640
  36 |       natrace  641 - 660
  37 |       natarms  661 - 680
  38 |       nataid   681 - 700
  39 |       natfare  701 - 720
  40 |       spkath   721 - 740
  41 |       colath   741 - 760
  42 |       libath   761 - 780
  43 |       spkhomo  781 - 800
  44 |       colhomo  801 - 820
  45 |       libhomo  821 - 840
  46 |       cappun   841 - 860
  47 |       gunlaw   861 - 880
  48 |       grass    881 - 900
  49 |       relig    901 - 920
  50 |       fund     921 - 940
  51 |       attend   941 - 960
  52 |       reliten  961 - 980
  53 |       postlife 981 - 1000
  54 |       pray     1001 - 1020
  55 |       relig16  1021 - 1040
  56 |       fund16   1041 - 1060
  57 |       sprel16  1061 - 1080
  58 |       prayer   1081 - 1100
  59 |       bible    1101 - 1120
  60 |       racmar   1121 - 1140
  61 |       racpres  1141 - 1160
  62 |       affrmact 1161 - 1180
  63 |       happy    1181 - 1200
  64 |       hapmar   1201 - 1220
  65 |       health   1221 - 1240
  66 |       life     1241 - 1260
  67 |       helpful  1261 - 1280
  68 |       fair     1281 - 1300
  69 |       trust    1301 - 1320
  70 |       conclerg 1321 - 1340
  71 |       coneduc  1341 - 1360
  72 |       confed   1361 - 1380
  73 |       conpress 1381 - 1400
  74 |       conjudge 1401 - 1420
  75 |       conlegis 1421 - 1440
  76 |       conarmy  1441 - 1460
  77 |       satjob   1461 - 1480
  78 |       class_   1481 - 1500
  79 |       satfin   1501 - 1520
  80 |       finrela  1521 - 1540
  81 |       union_   1541 - 1560
  82 |       fepol    1561 - 1580
  83 |       abany    1581 - 1600
  84 |       chldidel 1601 - 1620
  85 |       sexeduc  1621 - 1640
  86 |       premarsx 1641 - 1660
  87 |       xmarsex  1661 - 1680
  88 |       homosex  1681 - 1700
  89 |       spanking 1701 - 1720
  90 |       fear     1721 - 1740
  91 |       owngun   1741 - 1760
  92 |       pistol   1761 - 1780
  93 |       hunt     1781 - 1800
  94 |       phone    1801 - 1820
  95 |       memchurh 1821 - 1840
  96 |       realinc  1841 - 1860
  97 |       cohort   1861 - 1880
  98 |       marcohrt 1881 - 1900
  99 |       ballot   1901 - 1920
 100 |       wtssall  1921 - 1940
 101 |       adults   1941 - 1960
 102 |       compuse  1961 - 1980
 103 |       databank 1981 - 2000
 104 |       wtssnr   2001 - 2020
 105 |       spkrac   2021 - 2040
 106 |       spkcom   2041 - 2060
 107 |       spkmil   2061 - 2080
 108 |       spkmslm  2081 - 2100
 109 | using GSS.dat;   /* NOTE(review): the repository tree lists GSS.dat.gz only - presumably it must be decompressed to GSS.dat in the working directory first, confirm */
 110 | 
 111 | label variable year     "Gss year for this respondent                       ";   /* one descriptive label per variable named in the infix list. This label carries trailing spaces inside the quotes - they are part of the stored text */
 112 | label variable id_      "Respondent id number";
 113 | label variable agewed   "Age when first married";
 114 | label variable divorce  "Ever been divorced or separated";
 115 | label variable sibs     "Number of brothers and sisters";
 116 | label variable childs   "Number of children";
 117 | label variable age      "Age of respondent";
 118 | label variable educ     "Highest year of school completed";
 119 | label variable paeduc   "Highest year school completed, father";
 120 | label variable maeduc   "Highest year school completed, mother";
 121 | label variable speduc   "Highest year school completed, spouse";
 122 | label variable degree   "Rs highest degree";
 123 | label variable padeg    "Fathers highest degree";
 124 | label variable madeg    "Mothers highest degree";
 125 | label variable spdeg    "Spouses highest degree";
 126 | label variable sex      "Respondents sex";
 127 | label variable race     "Race of respondent";
 128 | label variable res16    "Type of place lived in when 16 yrs old";
 129 | label variable reg16    "Region of residence, age 16";
 130 | label variable srcbelt  "Src beltcode";
 131 | label variable partyid  "Political party affiliation";
 132 | label variable pres04   "Vote for kerry, bush, nader";
 133 | label variable pres08   "Vote obama or mccain";
 134 | label variable pres12   "Vote obama or romney";
 135 | label variable polviews "Think of self as liberal or conservative";
 136 | label variable natspac  "Space exploration program";
 137 | label variable natenvir "Improving & protecting environment";
 138 | label variable natheal  "Improving & protecting nations health";
 139 | label variable natcity  "Solving problems of big cities";
 140 | label variable natcrime "Halting rising crime rate";
 141 | label variable natdrug  "Dealing with drug addiction";
 142 | label variable nateduc  "Improving nations education system";
 143 | label variable natrace  "Improving the conditions of blacks";
 144 | label variable natarms  "Military, armaments, and defense";
 145 | label variable nataid   "Foreign aid";
 146 | label variable natfare  "Welfare";
 147 | label variable spkath   "Allow anti-religionist to speak";
 148 | label variable colath   "Allow anti-religionist to teach";
 149 | label variable libath   "Allow anti-religious book in library";
 150 | label variable spkhomo  "Allow homosexual to speak";
 151 | label variable colhomo  "Allow homosexual to teach";
 152 | label variable libhomo  "Allow homosexuals book in library";
 153 | label variable cappun   "Favor or oppose death penalty for murder";
 154 | label variable gunlaw   "Favor or oppose gun permits";
 155 | label variable grass    "Should marijuana be made legal";
 156 | label variable relig    "Rs religious preference";
 157 | label variable fund     "How fundamentalist is r currently";
 158 | label variable attend   "How often r attends religious services";
 159 | label variable reliten  "Strength of affiliation";
 160 | label variable postlife "Belief in life after death";
 161 | label variable pray     "How often does r pray";
 162 | label variable relig16  "Religion in which raised";
 163 | label variable fund16   "How fundamentalist was r at age 16";
 164 | label variable sprel16  "Religion in which spouse raised";
 165 | label variable prayer   "Bible prayer in public schools";
 166 | label variable bible    "Feelings about the bible";
 167 | label variable racmar   "Favor law against racial intermarriage";
 168 | label variable racpres  "Would vote for black president";
 169 | label variable affrmact "Favor preference in hiring blacks";
 170 | label variable happy    "General happiness";
 171 | label variable hapmar   "Happiness of marriage";
 172 | label variable health   "Condition of health";
 173 | label variable life     "Is life exciting or dull";
 174 | label variable helpful  "People helpful or looking out for selves";
 175 | label variable fair     "People fair or try to take advantage";
 176 | label variable trust    "Can people be trusted";
 177 | label variable conclerg "Confidence in organized religion";
 178 | label variable coneduc  "Confidence in education";
 179 | label variable confed   "Confid. in exec branch of fed govt";
 180 | label variable conpress "Confidence in press";
 181 | label variable conjudge "Confid. in united states supreme court";
 182 | label variable conlegis "Confidence in congress";
 183 | label variable conarmy  "Confidence in military";
 184 | label variable satjob   "Job or housework";
 185 | label variable class_   "Subjective class identification";
 186 | label variable satfin   "Satisfaction with financial situation";
 187 | label variable finrela  "Opinion of family income";
 188 | label variable union_   "Does r or spouse belong to union";
 189 | label variable fepol    "Women not suited for politics";
 190 | label variable abany    "Abortion if woman wants for any reason";
 191 | label variable chldidel "Ideal number of children";
 192 | label variable sexeduc  "Sex education in public schools";
 193 | label variable premarsx "Sex before marriage";
 194 | label variable xmarsex  "Sex with person other than spouse";
 195 | label variable homosex  "Homosexual sex relations";
 196 | label variable spanking "Favor spanking to discipline child";
 197 | label variable fear     "Afraid to walk at night in neighborhood";
 198 | label variable owngun   "Have gun in home";
 199 | label variable pistol   "Pistol or revolver in home";
 200 | label variable hunt     "Does r or spouse hunt";
 201 | label variable phone    "Does r have telephone";
 202 | label variable memchurh "Membership in church group";
 203 | label variable realinc  "Family income in constant $";   /* NOTE(review): the label ends with a dollar sign right before the closing quote - confirm Stata keeps it literal rather than treating it as a global macro prefix */
 204 | label variable cohort   "Year of birth";
 205 | label variable marcohrt "Year of first marriage";
 206 | label variable ballot   "Ballot used for interview";
 207 | label variable wtssall  "Weight variable";   /* same label text as wtssnr below - the labels alone do not distinguish the two weights */
 208 | label variable adults   "Household members 18 yrs and older";
 209 | label variable compuse  "R use computer";
 210 | label variable databank "Computer data threat to individual privacy";
 211 | label variable wtssnr   "Weight variable";   /* same label text as wtssall above */
 212 | label variable spkrac   "Allow racist to speak";
 213 | label variable spkcom   "Allow communist to speak";
 214 | label variable spkmil   "Allow militarist to speak";
 215 | label variable spkmslm  "Allow muslim clergymen preaching hatred of the us";
 216 | 
 217 | 
 218 | label define gsp001x   /* value-label set: numeric code followed by its display text. Only defined here - presumably attached to a variable by a label values command outside this excerpt, confirm */
 219 |    99       "No answer"
 220 |    98       "Don't know"
 221 |    0        "Not applicable"
 222 | ;
 223 | label define gsp002x   /* yes-no coding: 1 is Yes, 2 is No, with 0, 8 and 9 labelling the non-substantive answers */
 224 |    9        "No answer"
 225 |    8        "Don't know"
 226 |    2        "No"
 227 |    1        "Yes"
 228 |    0        "Not applicable"
 229 | ;
 230 | label define gsp003x   /* same 98 and 99 codes as gsp001x, but Not applicable is -1 here instead of 0 */
 231 |    99       "No answer"
 232 |    98       "Don't know"
 233 |    -1       "Not applicable"
 234 | ;
 235 | label define gsp004x   /* only the two highest codes are labelled: 8 is a capped category (Eight or more). Dk na presumably abbreviates dont-know and no-answer - confirm */
 236 |    9        "Dk na"
 237 |    8        "Eight or more"
 238 | ;
 239 | label define gsp005x   /* 89 is a capped category (89 or older), so values of 89 are not exact */
 240 |    99       "No answer"
 241 |    98       "Don't know"
 242 |    89       "89 or older"
 243 | ;
 244 | label define gsp006x   /* gsp006x through gsp009x define the same three codes (97, 98, 99) with identical text. Presumably one set per variable in a generated file, attached by label values commands outside this excerpt - confirm */
 245 |    99       "No answer"
 246 |    98       "Don't know"
 247 |    97       "Not applicable"
 248 | ;
 249 | label define gsp007x   /* identical to gsp006x */
 250 |    99       "No answer"
 251 |    98       "Don't know"
 252 |    97       "Not applicable"
 253 | ;
 254 | label define gsp008x   /* identical to gsp006x */
 255 |    99       "No answer"
 256 |    98       "Don't know"
 257 |    97       "Not applicable"
 258 | ;
 259 | label define gsp009x   /* identical to gsp006x */
 260 |    99       "No answer"
 261 |    98       "Don't know"
 262 |    97       "Not applicable"
 263 | ;
 264 | label define gsp010x   /* degree scale: codes 0-4 run from Lt high school up to Graduate, and 7, 8, 9 label the non-substantive answers. gsp010x through gsp013x are identical. Attachment to variables is outside this excerpt */
 265 |    9        "No answer"
 266 |    8        "Don't know"
 267 |    7        "Not applicable"
 268 |    4        "Graduate"
 269 |    3        "Bachelor"
 270 |    2        "Junior college"
 271 |    1        "High school"
 272 |    0        "Lt high school"
 273 | ;
 274 | label define gsp011x   /* identical to gsp010x */
 275 |    9        "No answer"
 276 |    8        "Don't know"
 277 |    7        "Not applicable"
 278 |    4        "Graduate"
 279 |    3        "Bachelor"
 280 |    2        "Junior college"
 281 |    1        "High school"
 282 |    0        "Lt high school"
 283 | ;
 284 | label define gsp012x   /* identical to gsp010x */
 285 |    9        "No answer"
 286 |    8        "Don't know"
 287 |    7        "Not applicable"
 288 |    4        "Graduate"
 289 |    3        "Bachelor"
 290 |    2        "Junior college"
 291 |    1        "High school"
 292 |    0        "Lt high school"
 293 | ;
 294 | label define gsp013x   /* identical to gsp010x */
 295 |    9        "No answer"
 296 |    8        "Don't know"
 297 |    7        "Not applicable"
 298 |    4        "Graduate"
 299 |    3        "Bachelor"
 300 |    2        "Junior college"
 301 |    1        "High school"
 302 |    0        "Lt high school"
 303 | ;
 304 | label define gsp014x   /* value-label set: numeric code followed by its display text. Two categories, 1 Male and 2 Female - presumably for the sex variable, but the label values command is outside this excerpt, confirm */
 305 |    2        "Female"
 306 |    1        "Male"
 307 | ;
 308 | label define gsp015x   /* three substantive categories (1-3) plus 0 for Not applicable. This set has no dont-know or no-answer code */
 309 |    3        "Other"
 310 |    2        "Black"
 311 |    1        "White"
 312 |    0        "Not applicable"
 313 | ;
 314 | label define gsp016x   /* place-size categories 1-6. Code 7 is not defined, so a 7 in the data would display unlabelled */
 315 |    9        "No answer"
 316 |    8        "Don't know"
 317 |    6        "City gt 250000"
 318 |    5        "Big-city suburb"
 319 |    4        "50000 to 250000"
 320 |    3        "Town lt 50000"
 321 |    2        "Farm"
 322 |    1        "Country,nonfarm"
 323 |    0        "Not applicable"
 324 | ;
 325 | label define gsp017x   /* caution: unlike most sets in this file, 8 and 9 are substantive regions here (Mountain, Pacific), not dont-know or no-answer */
 326 |    9        "Pacific"
 327 |    8        "Mountain"
 328 |    7        "W. sou. central"
 329 |    6        "E. sou. central"
 330 |    5        "South atlantic"
 331 |    4        "W. nor. central"
 332 |    3        "E. nor. central"
 333 |    2        "Middle atlantic"
 334 |    1        "New england"
 335 |    0        "Foreign"
 336 | ;
 337 | label define gsp018x   /* codes 1-6 plus 0 for Not assigned. No dont-know or no-answer code is defined */
 338 |    6        "Other rural"
 339 |    5        "Other urban"
 340 |    4        "Suburb, 13-100"
 341 |    3        "Suburb, 12 lrgst"
 342 |    2        "Smsa's 13-100"
 343 |    1        "12 lrgst smsa's"
 344 |    0        "Not assigned"
 345 | ;
 346 | label define gsp019x   /* caution: 0 is a substantive answer here (Strong democrat), not Not applicable. Codes 0-6 run from Strong democrat to Strong republican and 7 is Other party */
 347 |    9        "No answer"
 348 |    8        "Don't know"
 349 |    7        "Other party"
 350 |    6        "Strong republican"
 351 |    5        "Not str republican"
 352 |    4        "Ind,near rep"
 353 |    3        "Independent"
 354 |    2        "Ind,near dem"
 355 |    1        "Not str democrat"
 356 |    0        "Strong democrat"
 357 | ;
 358 | label define gsp020x   /* value-label set: numeric code followed by its display text. Differs from gsp021x and gsp022x: the non-voter code is 6 (not 4), 4 is Other, 5 and 7 are undefined, and the texts are spelled without apostrophes. Label text is stored data, so it is left as-is */
 359 |    9        "No answer"
 360 |    8        "Dont know"
 361 |    6        "Didnt vote"
 362 |    4        "Other (specify)"
 363 |    3        "Nader"
 364 |    2        "Bush"
 365 |    1        "Kerry"
 366 |    0        "Not applicable"
 367 | ;
 368 | label define gsp021x   /* candidates are codes 1-2, 3 is another candidate, 4 is did not vote */
 369 |    9        "No answer"
 370 |    8        "Don't know"
 371 |    4        "Didn't vote"
 372 |    3        "Other candidate (specify)"
 373 |    2        "Mccain"
 374 |    1        "Obama"
 375 |    0        "Not applicable"
 376 | ;
 377 | label define gsp022x   /* same code layout as gsp021x. The text for code 4 is worded slightly differently (adds for president) */
 378 |    9        "No answer"
 379 |    8        "Don't know"
 380 |    4        "Didn't vote for president"
 381 |    3        "Other candidate (specify)"
 382 |    2        "Romney"
 383 |    1        "Obama"
 384 |    0        "Not applicable"
 385 | ;
 386 | label define gsp023x   /* seven-point scale: 1 Extremely liberal through 7 Extrmly conservative, with 4 Moderate as the midpoint. 0, 8 and 9 label the non-substantive answers */
 387 |    9        "No answer"
 388 |    8        "Don't know"
 389 |    7        "Extrmly conservative"
 390 |    6        "Conservative"
 391 |    5        "Slghtly conservative"
 392 |    4        "Moderate"
 393 |    3        "Slightly liberal"
 394 |    2        "Liberal"
 395 |    1        "Extremely liberal"
 396 |    0        "Not applicable"
 397 | ;
 398 | label define gsp024x   /* three-point scale: 1 Too little, 2 About right, 3 Too much, with 0, 8 and 9 labelling the non-substantive answers. gsp024x through gsp032x are identical. Presumably one set per variable, attached by label values commands outside this excerpt - confirm */
 399 |    9        "No answer"
 400 |    8        "Don't know"
 401 |    3        "Too much"
 402 |    2        "About right"
 403 |    1        "Too little"
 404 |    0        "Not applicable"
 405 | ;
 406 | label define gsp025x   /* identical to gsp024x */
 407 |    9        "No answer"
 408 |    8        "Don't know"
 409 |    3        "Too much"
 410 |    2        "About right"
 411 |    1        "Too little"
 412 |    0        "Not applicable"
 413 | ;
 414 | label define gsp026x   /* identical to gsp024x */
 415 |    9        "No answer"
 416 |    8        "Don't know"
 417 |    3        "Too much"
 418 |    2        "About right"
 419 |    1        "Too little"
 420 |    0        "Not applicable"
 421 | ;
 422 | label define gsp027x   /* identical to gsp024x */
 423 |    9        "No answer"
 424 |    8        "Don't know"
 425 |    3        "Too much"
 426 |    2        "About right"
 427 |    1        "Too little"
 428 |    0        "Not applicable"
 429 | ;
 430 | label define gsp028x   /* identical to gsp024x */
 431 |    9        "No answer"
 432 |    8        "Don't know"
 433 |    3        "Too much"
 434 |    2        "About right"
 435 |    1        "Too little"
 436 |    0        "Not applicable"
 437 | ;
 438 | label define gsp029x   /* identical to gsp024x */
 439 |    9        "No answer"
 440 |    8        "Don't know"
 441 |    3        "Too much"
 442 |    2        "About right"
 443 |    1        "Too little"
 444 |    0        "Not applicable"
 445 | ;
 446 | label define gsp030x   /* identical to gsp024x */
 447 |    9        "No answer"
 448 |    8        "Don't know"
 449 |    3        "Too much"
 450 |    2        "About right"
 451 |    1        "Too little"
 452 |    0        "Not applicable"
 453 | ;
 454 | label define gsp031x   /* identical to gsp024x */
 455 |    9        "No answer"
 456 |    8        "Don't know"
 457 |    3        "Too much"
 458 |    2        "About right"
 459 |    1        "Too little"
 460 |    0        "Not applicable"
 461 | ;
 462 | label define gsp032x   /* identical to gsp024x */
 463 |    9        "No answer"
 464 |    8        "Don't know"
 465 |    3        "Too much"
 466 |    2        "About right"
 467 |    1        "Too little"
 468 |    0        "Not applicable"
 469 | ;
 470 | label define gsp033x
 471 |    9        "No answer"
 472 |    8        "Don't know"
 473 |    3        "Too much"
 474 |    2        "About right"
 475 |    1        "Too little"
 476 |    0        "Not applicable"
 477 | ;
 478 | label define gsp034x
 479 |    9        "No answer"
 480 |    8        "Don't know"
 481 |    3        "Too much"
 482 |    2        "About right"
 483 |    1        "Too little"
 484 |    0        "Not applicable"
 485 | ;
 486 | label define gsp035x
 487 |    9        "No answer"
 488 |    8        "Don't know"
 489 |    2        "Not allowed"
 490 |    1        "Allowed"
 491 |    0        "Not applicable"
 492 | ;
 493 | label define gsp036x
 494 |    9        "No answer"
 495 |    8        "Don't know"
 496 |    5        "Not allowed"
 497 |    4        "Allowed"
 498 |    0        "Not applicable"
 499 | ;
 500 | label define gsp037x
 501 |    9        "No answer"
 502 |    8        "Don't know"
 503 |    2        "Not remove"
 504 |    1        "Remove"
 505 |    0        "Not applicable"
 506 | ;
 507 | label define gsp038x
 508 |    9        "No answer"
 509 |    8        "Don't know"
 510 |    2        "Not allowed"
 511 |    1        "Allowed"
 512 |    0        "Not applicable"
 513 | ;
 514 | label define gsp039x
 515 |    9        "No answer"
 516 |    8        "Don't know"
 517 |    5        "Not allowed"
 518 |    4        "Allowed"
 519 |    0        "Not applicable"
 520 | ;
 521 | label define gsp040x
 522 |    9        "No answer"
 523 |    8        "Don't know"
 524 |    2        "Not remove"
 525 |    1        "Remove"
 526 |    0        "Not applicable"
 527 | ;
 528 | label define gsp041x
 529 |    9        "No answer"
 530 |    8        "Don't know"
 531 |    2        "Oppose"
 532 |    1        "Favor"
 533 |    0        "Not applicable"
 534 | ;
 535 | label define gsp042x
 536 |    9        "No answer"
 537 |    8        "Don't know"
 538 |    2        "Oppose"
 539 |    1        "Favor"
 540 |    0        "Not applicable"
 541 | ;
 542 | label define gsp043x
 543 |    9        "No answer"
 544 |    8        "Don't know"
 545 |    2        "Not legal"
 546 |    1        "Legal"
 547 |    0        "Not applicable"
 548 | ;
 549 | label define gsp044x
 550 |    99       "No answer"
 551 |    98       "Don't know"
 552 |    13       "Inter-nondenominational"
 553 |    12       "Native american"
 554 |    11       "Christian"
 555 |    10       "Orthodox-christian"
 556 |    9        "Moslem/islam"
 557 |    8        "Other eastern"
 558 |    7        "Hinduism"
 559 |    6        "Buddhism"
 560 |    5        "Other"
 561 |    4        "None"
 562 |    3        "Jewish"
 563 |    2        "Catholic"
 564 |    1        "Protestant"
 565 |    0        "Not applicable"
 566 | ;
 567 | label define gsp045x
 568 |    9        "Na-excluded"
 569 |    8        "Don't know"
 570 |    3        "Liberal"
 571 |    2        "Moderate"
 572 |    1        "Fundamentalist"
 573 |    0        "Not applicable"
 574 | ;
 575 | label define gsp046x
 576 |    9        "Dk,na"
 577 |    8        "More thn once wk"
 578 |    7        "Every week"
 579 |    6        "Nrly every week"
 580 |    5        "2-3x a month"
 581 |    4        "Once a month"
 582 |    3        "Sevrl times a yr"
 583 |    2        "Once a year"
 584 |    1        "Lt once a year"
 585 |    0        "Never"
 586 | ;
 587 | label define gsp047x
 588 |    9        "No answer"
 589 |    8        "Don't know"
 590 |    4        "No religion"
 591 |    3        "Somewhat strong"
 592 |    2        "Not very strong"
 593 |    1        "Strong"
 594 |    0        "Not applicable"
 595 | ;
 596 | label define gsp048x
 597 |    9        "No answer"
 598 |    8        "Don't know"
 599 |    2        "No"
 600 |    1        "Yes"
 601 |    0        "Not applicable"
 602 | ;
 603 | label define gsp049x
 604 |    9        "No answer"
 605 |    8        "Don't know"
 606 |    6        "Never"
 607 |    5        "Lt once a week"
 608 |    4        "Once a week"
 609 |    3        "Several times a week"
 610 |    2        "Once a day"
 611 |    1        "Several times a day"
 612 |    0        "Not applicable"
 613 | ;
 614 | label define gsp050x
 615 |    99       "No answer"
 616 |    98       "Don't know"
 617 |    13       "Inter-nondenominational"
 618 |    12       "Native american"
 619 |    11       "Christian"
 620 |    10       "Orthodox-christian"
 621 |    9        "Moslem/islam"
 622 |    8        "Other eastern"
 623 |    7        "Hinduism"
 624 |    6        "Buddhism"
 625 |    5        "Other"
 626 |    4        "None"
 627 |    3        "Jewish"
 628 |    2        "Catholic"
 629 |    1        "Protestant"
 630 |    0        "Not applicable"
 631 | ;
 632 | label define gsp051x
 633 |    9        "Na-excluded"
 634 |    8        "Don't know"
 635 |    3        "Liberal"
 636 |    2        "Moderate"
 637 |    1        "Fundamentalist"
 638 |    0        "Not applicable"
 639 | ;
 640 | label define gsp052x
 641 |    9        "No answer"
 642 |    8        "Dont know"
 643 |    5        "Other"
 644 |    4        "None"
 645 |    3        "Jewish"
 646 |    2        "Catholic"
 647 |    1        "Protestant"
 648 |    0        "Not applicable"
 649 | ;
 650 | label define gsp053x
 651 |    9        "No answer"
 652 |    8        "Don't know"
 653 |    2        "Disapprove"
 654 |    1        "Approve"
 655 |    0        "Not applicable"
 656 | ;
 657 | label define gsp054x
 658 |    9        "No answer"
 659 |    8        "Don't know"
 660 |    4        "Other"
 661 |    3        "Book of fables"
 662 |    2        "Inspired word"
 663 |    1        "Word of god"
 664 |    0        "Not applicable"
 665 | ;
 666 | label define gsp055x
 667 |    9        "No answer"
 668 |    8        "Don't know"
 669 |    2        "No"
 670 |    1        "Yes"
 671 |    0        "Not applicable"
 672 | ;
 673 | label define gsp056x
 674 |    9        "No answer"
 675 |    8        "Don't know"
 676 |    2        "No"
 677 |    1        "Yes"
 678 |    0        "Not applicable"
 679 | ;
 680 | label define gsp057x
 681 |    9        "No answer"
 682 |    8        "Don't know"
 683 |    4        "Strongly oppose pref"
 684 |    3        "Oppose pref"
 685 |    2        "Support pref"
 686 |    1        "Strongly support pref"
 687 |    0        "Not applicable"
 688 | ;
 689 | label define gsp058x
 690 |    9        "No answer"
 691 |    8        "Don't know"
 692 |    3        "Not too happy"
 693 |    2        "Pretty happy"
 694 |    1        "Very happy"
 695 |    0        "Not applicable"
 696 | ;
 697 | label define gsp059x
 698 |    9        "No answer"
 699 |    8        "Don't know"
 700 |    3        "Not too happy"
 701 |    2        "Pretty happy"
 702 |    1        "Very happy"
 703 |    0        "Not applicable"
 704 | ;
 705 | label define gsp060x
 706 |    9        "No answer"
 707 |    8        "Don't know"
 708 |    4        "Poor"
 709 |    3        "Fair"
 710 |    2        "Good"
 711 |    1        "Excellent"
 712 |    0        "Not applicable"
 713 | ;
 714 | label define gsp061x
 715 |    9        "No answer"
 716 |    8        "Don't know"
 717 |    3        "Dull"
 718 |    2        "Routine"
 719 |    1        "Exciting"
 720 |    0        "Not applicable"
 721 | ;
 722 | label define gsp062x
 723 |    9        "No answer"
 724 |    8        "Don't know"
 725 |    3        "Depends"
 726 |    2        "Lookout for self"
 727 |    1        "Helpful"
 728 |    0        "Not applicable"
 729 | ;
 730 | label define gsp063x
 731 |    9        "No answer"
 732 |    8        "Don't know"
 733 |    3        "Depends"
 734 |    2        "Fair"
 735 |    1        "Take advantage"
 736 |    0        "Not applicable"
 737 | ;
 738 | label define gsp064x
 739 |    9        "No answer"
 740 |    8        "Don't know"
 741 |    3        "Depends"
 742 |    2        "Cannot trust"
 743 |    1        "Can trust"
 744 |    0        "Not applicable"
 745 | ;
 746 | label define gsp065x
 747 |    9        "No answer"
 748 |    8        "Don't know"
 749 |    3        "Hardly any"
 750 |    2        "Only some"
 751 |    1        "A great deal"
 752 |    0        "Not applicable"
 753 | ;
 754 | label define gsp066x
 755 |    9        "No answer"
 756 |    8        "Don't know"
 757 |    3        "Hardly any"
 758 |    2        "Only some"
 759 |    1        "A great deal"
 760 |    0        "Not applicable"
 761 | ;
 762 | label define gsp067x
 763 |    9        "No answer"
 764 |    8        "Don't know"
 765 |    3        "Hardly any"
 766 |    2        "Only some"
 767 |    1        "A great deal"
 768 |    0        "Not applicable"
 769 | ;
 770 | label define gsp068x
 771 |    9        "No answer"
 772 |    8        "Don't know"
 773 |    3        "Hardly any"
 774 |    2        "Only some"
 775 |    1        "A great deal"
 776 |    0        "Not applicable"
 777 | ;
 778 | label define gsp069x
 779 |    9        "No answer"
 780 |    8        "Don't know"
 781 |    3        "Hardly any"
 782 |    2        "Only some"
 783 |    1        "A great deal"
 784 |    0        "Not applicable"
 785 | ;
 786 | label define gsp070x
 787 |    9        "No answer"
 788 |    8        "Don't know"
 789 |    3        "Hardly any"
 790 |    2        "Only some"
 791 |    1        "A great deal"
 792 |    0        "Not applicable"
 793 | ;
 794 | label define gsp071x
 795 |    9        "No answer"
 796 |    8        "Don't know"
 797 |    3        "Hardly any"
 798 |    2        "Only some"
 799 |    1        "A great deal"
 800 |    0        "Not applicable"
 801 | ;
 802 | label define gsp072x
 803 |    9        "No answer"
 804 |    8        "Don't know"
 805 |    4        "Very dissatisfied"
 806 |    3        "A little dissat"
 807 |    2        "Mod. satisfied"
 808 |    1        "Very satisfied"
 809 |    0        "Not applicable"
 810 | ;
 811 | label define gsp073x
 812 |    9        "No answer"
 813 |    8        "Don't know"
 814 |    5        "No class"
 815 |    4        "Upper class"
 816 |    3        "Middle class"
 817 |    2        "Working class"
 818 |    1        "Lower class"
 819 |    0        "Not applicable"
 820 | ;
 821 | label define gsp074x
 822 |    9        "No answer"
 823 |    8        "Don't know"
 824 |    3        "Not at all sat"
 825 |    2        "More or less"
 826 |    1        "Satisfied"
 827 |    0        "Not applicable"
 828 | ;
 829 | label define gsp075x
 830 |    9        "No answer"
 831 |    8        "Don't know"
 832 |    5        "Far above average"
 833 |    4        "Above average"
 834 |    3        "Average"
 835 |    2        "Below average"
 836 |    1        "Far below average"
 837 |    0        "Not applicable"
 838 | ;
 839 | label define gsp076x
 840 |    9        "No answer"
 841 |    8        "Don't know"
 842 |    4        "Neither belongs"
 843 |    3        "R and spouse belong"
 844 |    2        "Spouse belongs"
 845 |    1        "R belongs"
 846 |    0        "Not applicable"
 847 | ;
 848 | label define gsp077x
 849 |    9        "No answer"
 850 |    8        "Not sure"
 851 |    2        "Disagree"
 852 |    1        "Agree"
 853 |    0        "Not applicable"
 854 | ;
 855 | label define gsp078x
 856 |    9        "No answer"
 857 |    8        "Don't know"
 858 |    2        "No"
 859 |    1        "Yes"
 860 |    0        "Not applicable"
 861 | ;
 862 | label define gsp079x
 863 |    9        "Dk,na"
 864 |    8        "As many as want"
 865 |    7        "Seven+"
 866 |    -1       "Not applicable"
 867 | ;
 868 | label define gsp080x
 869 |    9        "No answer"
 870 |    8        "Don't know"
 871 |    3        "Depends"
 872 |    2        "Oppose"
 873 |    1        "Favor"
 874 |    0        "Not applicable"
 875 | ;
 876 | label define gsp081x
 877 |    9        "No answer"
 878 |    8        "Don't know"
 879 |    5        "Other"
 880 |    4        "Not wrong at all"
 881 |    3        "Sometimes wrong"
 882 |    2        "Almst always wrg"
 883 |    1        "Always wrong"
 884 |    0        "Not applicable"
 885 | ;
 886 | label define gsp082x
 887 |    9        "No answer"
 888 |    8        "Don't know"
 889 |    5        "Other"
 890 |    4        "Not wrong at all"
 891 |    3        "Sometimes wrong"
 892 |    2        "Almst always wrg"
 893 |    1        "Always wrong"
 894 |    0        "Not applicable"
 895 | ;
 896 | label define gsp083x
 897 |    9        "No answer"
 898 |    8        "Don't know"
 899 |    5        "Other"
 900 |    4        "Not wrong at all"
 901 |    3        "Sometimes wrong"
 902 |    2        "Almst always wrg"
 903 |    1        "Always wrong"
 904 |    0        "Not applicable"
 905 | ;
 906 | label define gsp084x
 907 |    9        "No answer"
 908 |    8        "Don't know"
 909 |    4        "Strongly disagree"
 910 |    3        "Disagree"
 911 |    2        "Agree"
 912 |    1        "Strongly agree"
 913 |    0        "Not applicable"
 914 | ;
 915 | label define gsp085x
 916 |    9        "No answer"
 917 |    8        "Don't know"
 918 |    2        "No"
 919 |    1        "Yes"
 920 |    0        "Not applicable"
 921 | ;
 922 | label define gsp086x
 923 |    9        "No answer"
 924 |    8        "Don't know"
 925 |    3        "Refused"
 926 |    2        "No"
 927 |    1        "Yes"
 928 |    0        "Not applicable"
 929 | ;
 930 | label define gsp087x
 931 |    9        "No answer"
 932 |    8        "Don't know"
 933 |    3        "Refused"
 934 |    2        "No"
 935 |    1        "Yes"
 936 |    0        "Not applicable"
 937 | ;
 938 | label define gsp088x
 939 |    9        "No answer"
 940 |    8        "Don't know"
 941 |    4        "Neither"
 942 |    3        "Both"
 943 |    2        "Spouse"
 944 |    1        "Resp"
 945 |    0        "Not applicable"
 946 | ;
 947 | label define gsp089x
 948 |    9        "No answer"
 949 |    6        "Cellphone"
 950 |    5        "Phone,dk where"
 951 |    4        "Phone elsewhere"
 952 |    3        "Phone in home"
 953 |    2        "Refused"
 954 |    1        "No phone"
 955 |    0        "Not applicable"
 956 | ;
 957 | label define gsp090x
 958 |    9        "No answer"
 959 |    8        "Don't know"
 960 |    2        "No"
 961 |    1        "Yes"
 962 |    0        "Not applicable"
 963 | ;
 964 | label define gsp091x
 965 |    999999   "No answer"
 966 |    999998   "Dont know"
 967 |    0        "Not applicable"
 968 | ;
 969 | label define gsp092x
 970 |    9999     "No answer"
 971 |    0        "Not applicable"
 972 | ;
 973 | label define gsp093x
 974 |    9999     "No answer"
 975 |    0        "Not applicable"
 976 | ;
 977 | label define gsp094x
 978 |    4        "Ballot d"
 979 |    3        "Ballot c"
 980 |    2        "Ballot b"
 981 |    1        "Ballot a"
 982 |    0        "Not applicable"
 983 | ;
 984 | label define gsp095x
 985 |    9        "No answer"
 986 |    8        "8 or more"
 987 | ;
 988 | label define gsp096x
 989 |    9        "No answer"
 990 |    8        "Don't know"
 991 |    2        "No"
 992 |    1        "Yes"
 993 |    0        "Not applicable"
 994 | ;
 995 | label define gsp097x
 996 |    9        "No answer"
 997 |    8        "Cant choose"
 998 |    4        "Not a threat at all"
 999 |    3        "Not serious"
1000 |    2        "Fairly serious"
1001 |    1        "Very serious threat"
1002 |    0        "Not applicable"
1003 | ;
1004 | label define gsp098x
1005 |    9        "No answer"
1006 |    8        "Don't know"
1007 |    2        "Not allowed"
1008 |    1        "Allowed"
1009 |    0        "Not applicable"
1010 | ;
1011 | label define gsp099x
1012 |    9        "No answer"
1013 |    8        "Don't know"
1014 |    2        "Not allowed"
1015 |    1        "Allowed"
1016 |    0        "Not applicable"
1017 | ;
1018 | label define gsp100x
1019 |    9        "No answer"
1020 |    8        "Don't know"
1021 |    2        "Not allowed"
1022 |    1        "Allowed"
1023 |    0        "Not applicable"
1024 | ;
1025 | label define gsp101x
1026 |    9        "No answer"
1027 |    8        "Dont know"
1028 |    2        "Not allowed"
1029 |    1        "Yes, allowed"
1030 |    0        "Not applicable"
1031 | ;
1032 | 
1033 | 
1034 | label values agewed   gsp001x;
1035 | label values divorce  gsp002x;
1036 | label values sibs     gsp003x;
1037 | label values childs   gsp004x;
1038 | label values age      gsp005x;
1039 | label values educ     gsp006x;
1040 | label values paeduc   gsp007x;
1041 | label values maeduc   gsp008x;
1042 | label values speduc   gsp009x;
1043 | label values degree   gsp010x;
1044 | label values padeg    gsp011x;
1045 | label values madeg    gsp012x;
1046 | label values spdeg    gsp013x;
1047 | label values sex      gsp014x;
1048 | label values race     gsp015x;
1049 | label values res16    gsp016x;
1050 | label values reg16    gsp017x;
1051 | label values srcbelt  gsp018x;
1052 | label values partyid  gsp019x;
1053 | label values pres04   gsp020x;
1054 | label values pres08   gsp021x;
1055 | label values pres12   gsp022x;
1056 | label values polviews gsp023x;
1057 | label values natspac  gsp024x;
1058 | label values natenvir gsp025x;
1059 | label values natheal  gsp026x;
1060 | label values natcity  gsp027x;
1061 | label values natcrime gsp028x;
1062 | label values natdrug  gsp029x;
1063 | label values nateduc  gsp030x;
1064 | label values natrace  gsp031x;
1065 | label values natarms  gsp032x;
1066 | label values nataid   gsp033x;
1067 | label values natfare  gsp034x;
1068 | label values spkath   gsp035x;
1069 | label values colath   gsp036x;
1070 | label values libath   gsp037x;
1071 | label values spkhomo  gsp038x;
1072 | label values colhomo  gsp039x;
1073 | label values libhomo  gsp040x;
1074 | label values cappun   gsp041x;
1075 | label values gunlaw   gsp042x;
1076 | label values grass    gsp043x;
1077 | label values relig    gsp044x;
1078 | label values fund     gsp045x;
1079 | label values attend   gsp046x;
1080 | label values reliten  gsp047x;
1081 | label values postlife gsp048x;
1082 | label values pray     gsp049x;
1083 | label values relig16  gsp050x;
1084 | label values fund16   gsp051x;
1085 | label values sprel16  gsp052x;
1086 | label values prayer   gsp053x;
1087 | label values bible    gsp054x;
1088 | label values racmar   gsp055x;
1089 | label values racpres  gsp056x;
1090 | label values affrmact gsp057x;
1091 | label values happy    gsp058x;
1092 | label values hapmar   gsp059x;
1093 | label values health   gsp060x;
1094 | label values life     gsp061x;
1095 | label values helpful  gsp062x;
1096 | label values fair     gsp063x;
1097 | label values trust    gsp064x;
1098 | label values conclerg gsp065x;
1099 | label values coneduc  gsp066x;
1100 | label values confed   gsp067x;
1101 | label values conpress gsp068x;
1102 | label values conjudge gsp069x;
1103 | label values conlegis gsp070x;
1104 | label values conarmy  gsp071x;
1105 | label values satjob   gsp072x;
1106 | label values class_   gsp073x;
1107 | label values satfin   gsp074x;
1108 | label values finrela  gsp075x;
1109 | label values union_   gsp076x;
1110 | label values fepol    gsp077x;
1111 | label values abany    gsp078x;
1112 | label values chldidel gsp079x;
1113 | label values sexeduc  gsp080x;
1114 | label values premarsx gsp081x;
1115 | label values xmarsex  gsp082x;
1116 | label values homosex  gsp083x;
1117 | label values spanking gsp084x;
1118 | label values fear     gsp085x;
1119 | label values owngun   gsp086x;
1120 | label values pistol   gsp087x;
1121 | label values hunt     gsp088x;
1122 | label values phone    gsp089x;
1123 | label values memchurh gsp090x;
1124 | label values realinc  gsp091x;
1125 | label values cohort   gsp092x;
1126 | label values marcohrt gsp093x;
1127 | label values ballot   gsp094x;
1128 | label values adults   gsp095x;
1129 | label values compuse  gsp096x;
1130 | label values databank gsp097x;
1131 | label values spkrac   gsp098x;
1132 | label values spkcom   gsp099x;
1133 | label values spkmil   gsp100x;
1134 | label values spkmslm  gsp101x;
1135 | 
1136 | 
1137 | 


--------------------------------------------------------------------------------
/gss_validate.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Exploratory Data Analysis\n",
   8 |     "\n",
   9 |     "Load and validate GSS data\n",
  10 |     "\n",
  11 |     "Allen Downey\n",
  12 |     "\n",
  13 |     "[MIT License](https://en.wikipedia.org/wiki/MIT_License)"
  14 |    ]
  15 |   },
  16 |   {
  17 |    "cell_type": "code",
  18 |    "execution_count": 33,
  19 |    "metadata": {},
  20 |    "outputs": [],
  21 |    "source": [
  22 |     "%matplotlib inline\n",
  23 |     "\n",
  24 |     "import pandas as pd\n",
  25 |     "import numpy as np\n",
  26 |     "\n",
  27 |     "import matplotlib.pyplot as plt\n",
  28 |     "import seaborn as sns\n",
  29 |     "sns.set(style='white')\n",
  30 |     "\n",
  31 |     "import utils\n",
  32 |     "from utils import decorate\n",
  33 |     "from distribution import Pmf, Cdf"
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "code",
  38 |    "execution_count": 34,
  39 |    "metadata": {},
  40 |    "outputs": [],
  41 |    "source": [
  42 |     "def underride(d, **options):\n",
  43 |     "    \"\"\"Add key-value pairs to d only if key is not in d.\n",
  44 |     "\n",
  45 |     "    d: dictionary\n",
  46 |     "    options: keyword args to add to d\n",
  47 |     "    \"\"\"\n",
  48 |     "    for key, val in options.items():\n",
  49 |     "        d.setdefault(key, val)\n",
  50 |     "\n",
  51 |     "    return d"
  52 |    ]
  53 |   },
  54 |   {
  55 |    "cell_type": "markdown",
  56 |    "metadata": {},
  57 |    "source": [
  58 |     "## Loading and validation\n"
  59 |    ]
  60 |   },
  61 |   {
  62 |    "cell_type": "code",
  63 |    "execution_count": 35,
  64 |    "metadata": {},
  65 |    "outputs": [],
  66 |    "source": [
  67 |     "import re\n",
  68 |     "\n",
  69 |     "class FixedWidthVariables(object):\n",
  70 |     "    \"\"\"Represents a set of variables in a fixed width file.\"\"\"\n",
  71 |     "\n",
  72 |     "    def __init__(self, variables, index_base=0):\n",
  73 |     "        \"\"\"Initializes.\n",
  74 |     "\n",
  75 |     "        variables: DataFrame\n",
  76 |     "        index_base: are the indices 0 or 1 based?\n",
  77 |     "\n",
  78 |     "        Attributes:\n",
  79 |     "        colspecs: list of (start, end) index tuples\n",
  80 |     "        names: list of string variable names\n",
  81 |     "        \"\"\"\n",
  82 |     "        self.variables = variables\n",
  83 |     "\n",
  84 |     "        # note: subtract index_base (0 by default) to convert colspecs to 0-based indices\n",
  85 |     "        self.colspecs = variables[['start', 'end']] - index_base\n",
  86 |     "\n",
  87 |     "        # convert colspecs to a list of pairs of ints\n",
  88 |     "        self.colspecs = self.colspecs.astype(int).values.tolist()\n",
  89 |     "        self.names = variables['name']\n",
  90 |     "\n",
  91 |     "    def ReadFixedWidth(self, filename, **options):\n",
  92 |     "        \"\"\"Reads a fixed width ASCII file.\n",
  93 |     "\n",
  94 |     "        filename: string filename\n",
  95 |     "\n",
  96 |     "        returns: DataFrame\n",
  97 |     "        \"\"\"\n",
  98 |     "        df = pd.read_fwf(filename,\n",
  99 |     "                         colspecs=self.colspecs, \n",
 100 |     "                         names=self.names,\n",
 101 |     "                         **options)\n",
 102 |     "        return df\n",
 103 |     "\n",
 104 |     "\n",
 105 |     "def ReadStataDct(dct_file, **options):\n",
 106 |     "    \"\"\"Reads a Stata dictionary file.\n",
 107 |     "\n",
 108 |     "    dct_file: string filename\n",
 109 |     "    options: dict of options passed to open()\n",
 110 |     "\n",
 111 |     "    returns: FixedWidthVariables object\n",
 112 |     "    \"\"\"\n",
 113 |     "    type_map = dict(byte=int, int=int, long=int, float=float, \n",
 114 |     "                    double=float, numeric=float)\n",
 115 |     "\n",
 116 |     "    var_info = []\n",
 117 |     "    with open(dct_file, **options) as f:\n",
 118 |     "        for line in f:\n",
 119 |     "            match = re.search( r'_column\\(([^)]*)\\)', line)\n",
 120 |     "            if not match:\n",
 121 |     "                continue\n",
 122 |     "            start = int(match.group(1))\n",
 123 |     "            t = line.split()\n",
 124 |     "            vtype, name, fstring = t[1:4]\n",
 125 |     "            name = name.lower()\n",
 126 |     "            if vtype.startswith('str'):\n",
 127 |     "                vtype = str\n",
 128 |     "            else:\n",
 129 |     "                vtype = type_map[vtype]\n",
 130 |     "            long_desc = ' '.join(t[4:]).strip('\"')\n",
 131 |     "            var_info.append((start, vtype, name, fstring, long_desc))\n",
 132 |     "            \n",
 133 |     "    columns = ['start', 'type', 'name', 'fstring', 'desc']\n",
 134 |     "    variables = pd.DataFrame(var_info, columns=columns)\n",
 135 |     "\n",
 136 |     "    # fill in the end column by shifting the start column\n",
 137 |     "    variables['end'] = variables.start.shift(-1)\n",
 138 |     "    variables.loc[len(variables)-1, 'end'] = 0\n",
 139 |     "\n",
 140 |     "    dct = FixedWidthVariables(variables, index_base=1)\n",
 141 |     "    return dct\n",
 142 |     "\n",
 143 |     "def read_gss(dirname):\n",
 144 |     "    \"\"\"Reads GSS files from the given directory.\n",
 145 |     "    \n",
 146 |     "    dirname: string\n",
 147 |     "    \n",
 148 |     "    returns: DataFrame\n",
 149 |     "    \"\"\"\n",
 150 |     "    dct = ReadStataDct(dirname + '/GSS.dct')\n",
 151 |     "    gss = dct.ReadFixedWidth(dirname + '/GSS.dat.gz',\n",
 152 |     "                             compression='gzip')\n",
 153 |     "    return gss"
 154 |    ]
 155 |   },
 156 |   {
 157 |    "cell_type": "code",
 158 |    "execution_count": 36,
 159 |    "metadata": {},
 160 |    "outputs": [
 161 |     {
 162 |      "name": "stdout",
 163 |      "output_type": "stream",
 164 |      "text": [
 165 |       "(62466, 101)\n"
 166 |      ]
 167 |     },
 168 |     {
 169 |      "data": {
 170 |       "text/html": [
 171 |        "<div>\n",
 172 |        "<style scoped>\n",
 173 |        "    .dataframe tbody tr th:only-of-type {\n",
 174 |        "        vertical-align: middle;\n",
 175 |        "    }\n",
 176 |        "\n",
 177 |        "    .dataframe tbody tr th {\n",
 178 |        "        vertical-align: top;\n",
 179 |        "    }\n",
 180 |        "\n",
 181 |        "    .dataframe thead th {\n",
 182 |        "        text-align: right;\n",
 183 |        "    }\n",
 184 |        "</style>\n",
 185 |        "<table border=\"1\" class=\"dataframe\">\n",
 186 |        "  <thead>\n",
 187 |        "    <tr style=\"text-align: right;\">\n",
 188 |        "      <th></th>\n",
 189 |        "      <th>year</th>\n",
 190 |        "      <th>id_</th>\n",
 191 |        "      <th>agewed</th>\n",
 192 |        "      <th>divorce</th>\n",
 193 |        "      <th>sibs</th>\n",
 194 |        "      <th>childs</th>\n",
 195 |        "      <th>age</th>\n",
 196 |        "      <th>educ</th>\n",
 197 |        "      <th>paeduc</th>\n",
 198 |        "      <th>maeduc</th>\n",
 199 |        "      <th>...</th>\n",
 200 |        "      <th>memchurh</th>\n",
 201 |        "      <th>realinc</th>\n",
 202 |        "      <th>cohort</th>\n",
 203 |        "      <th>marcohrt</th>\n",
 204 |        "      <th>ballot</th>\n",
 205 |        "      <th>wtssall</th>\n",
 206 |        "      <th>adults</th>\n",
 207 |        "      <th>compuse</th>\n",
 208 |        "      <th>databank</th>\n",
 209 |        "      <th>wtssnr</th>\n",
 210 |        "    </tr>\n",
 211 |        "  </thead>\n",
 212 |        "  <tbody>\n",
 213 |        "    <tr>\n",
 214 |        "      <th>0</th>\n",
 215 |        "      <td>1972</td>\n",
 216 |        "      <td>1</td>\n",
 217 |        "      <td>0</td>\n",
 218 |        "      <td>0</td>\n",
 219 |        "      <td>3</td>\n",
 220 |        "      <td>0</td>\n",
 221 |        "      <td>23</td>\n",
 222 |        "      <td>16</td>\n",
 223 |        "      <td>10</td>\n",
 224 |        "      <td>97</td>\n",
 225 |        "      <td>...</td>\n",
 226 |        "      <td>0</td>\n",
 227 |        "      <td>18951.0</td>\n",
 228 |        "      <td>1949</td>\n",
 229 |        "      <td>0</td>\n",
 230 |        "      <td>0</td>\n",
 231 |        "      <td>0.4446</td>\n",
 232 |        "      <td>1</td>\n",
 233 |        "      <td>0</td>\n",
 234 |        "      <td>0</td>\n",
 235 |        "      <td>1.0</td>\n",
 236 |        "    </tr>\n",
 237 |        "    <tr>\n",
 238 |        "      <th>1</th>\n",
 239 |        "      <td>1972</td>\n",
 240 |        "      <td>2</td>\n",
 241 |        "      <td>21</td>\n",
 242 |        "      <td>2</td>\n",
 243 |        "      <td>4</td>\n",
 244 |        "      <td>5</td>\n",
 245 |        "      <td>70</td>\n",
 246 |        "      <td>10</td>\n",
 247 |        "      <td>8</td>\n",
 248 |        "      <td>8</td>\n",
 249 |        "      <td>...</td>\n",
 250 |        "      <td>0</td>\n",
 251 |        "      <td>24366.0</td>\n",
 252 |        "      <td>1902</td>\n",
 253 |        "      <td>1923</td>\n",
 254 |        "      <td>0</td>\n",
 255 |        "      <td>0.8893</td>\n",
 256 |        "      <td>2</td>\n",
 257 |        "      <td>0</td>\n",
 258 |        "      <td>0</td>\n",
 259 |        "      <td>1.0</td>\n",
 260 |        "    </tr>\n",
 261 |        "    <tr>\n",
 262 |        "      <th>2</th>\n",
 263 |        "      <td>1972</td>\n",
 264 |        "      <td>3</td>\n",
 265 |        "      <td>20</td>\n",
 266 |        "      <td>2</td>\n",
 267 |        "      <td>5</td>\n",
 268 |        "      <td>4</td>\n",
 269 |        "      <td>48</td>\n",
 270 |        "      <td>12</td>\n",
 271 |        "      <td>8</td>\n",
 272 |        "      <td>8</td>\n",
 273 |        "      <td>...</td>\n",
 274 |        "      <td>0</td>\n",
 275 |        "      <td>24366.0</td>\n",
 276 |        "      <td>1924</td>\n",
 277 |        "      <td>1944</td>\n",
 278 |        "      <td>0</td>\n",
 279 |        "      <td>0.8893</td>\n",
 280 |        "      <td>2</td>\n",
 281 |        "      <td>0</td>\n",
 282 |        "      <td>0</td>\n",
 283 |        "      <td>1.0</td>\n",
 284 |        "    </tr>\n",
 285 |        "    <tr>\n",
 286 |        "      <th>3</th>\n",
 287 |        "      <td>1972</td>\n",
 288 |        "      <td>4</td>\n",
 289 |        "      <td>24</td>\n",
 290 |        "      <td>2</td>\n",
 291 |        "      <td>5</td>\n",
 292 |        "      <td>0</td>\n",
 293 |        "      <td>27</td>\n",
 294 |        "      <td>17</td>\n",
 295 |        "      <td>16</td>\n",
 296 |        "      <td>12</td>\n",
 297 |        "      <td>...</td>\n",
 298 |        "      <td>0</td>\n",
 299 |        "      <td>30458.0</td>\n",
 300 |        "      <td>1945</td>\n",
 301 |        "      <td>1969</td>\n",
 302 |        "      <td>0</td>\n",
 303 |        "      <td>0.8893</td>\n",
 304 |        "      <td>2</td>\n",
 305 |        "      <td>0</td>\n",
 306 |        "      <td>0</td>\n",
 307 |        "      <td>1.0</td>\n",
 308 |        "    </tr>\n",
 309 |        "    <tr>\n",
 310 |        "      <th>4</th>\n",
 311 |        "      <td>1972</td>\n",
 312 |        "      <td>5</td>\n",
 313 |        "      <td>22</td>\n",
 314 |        "      <td>2</td>\n",
 315 |        "      <td>2</td>\n",
 316 |        "      <td>2</td>\n",
 317 |        "      <td>61</td>\n",
 318 |        "      <td>12</td>\n",
 319 |        "      <td>8</td>\n",
 320 |        "      <td>8</td>\n",
 321 |        "      <td>...</td>\n",
 322 |        "      <td>0</td>\n",
 323 |        "      <td>50763.0</td>\n",
 324 |        "      <td>1911</td>\n",
 325 |        "      <td>1933</td>\n",
 326 |        "      <td>0</td>\n",
 327 |        "      <td>0.8893</td>\n",
 328 |        "      <td>2</td>\n",
 329 |        "      <td>0</td>\n",
 330 |        "      <td>0</td>\n",
 331 |        "      <td>1.0</td>\n",
 332 |        "    </tr>\n",
 333 |        "  </tbody>\n",
 334 |        "</table>\n",
 335 |        "<p>5 rows × 101 columns</p>\n",
 336 |        "</div>"
 337 |       ],
 338 |       "text/plain": [
 339 |        "   year  id_  agewed  divorce  sibs  childs  age  educ  paeduc  maeduc  \\\n",
 340 |        "0  1972    1       0        0     3       0   23    16      10      97   \n",
 341 |        "1  1972    2      21        2     4       5   70    10       8       8   \n",
 342 |        "2  1972    3      20        2     5       4   48    12       8       8   \n",
 343 |        "3  1972    4      24        2     5       0   27    17      16      12   \n",
 344 |        "4  1972    5      22        2     2       2   61    12       8       8   \n",
 345 |        "\n",
 346 |        "    ...    memchurh  realinc  cohort  marcohrt  ballot  wtssall  adults  \\\n",
 347 |        "0   ...           0  18951.0    1949         0       0   0.4446       1   \n",
 348 |        "1   ...           0  24366.0    1902      1923       0   0.8893       2   \n",
 349 |        "2   ...           0  24366.0    1924      1944       0   0.8893       2   \n",
 350 |        "3   ...           0  30458.0    1945      1969       0   0.8893       2   \n",
 351 |        "4   ...           0  50763.0    1911      1933       0   0.8893       2   \n",
 352 |        "\n",
 353 |        "   compuse  databank  wtssnr  \n",
 354 |        "0        0         0     1.0  \n",
 355 |        "1        0         0     1.0  \n",
 356 |        "2        0         0     1.0  \n",
 357 |        "3        0         0     1.0  \n",
 358 |        "4        0         0     1.0  \n",
 359 |        "\n",
 360 |        "[5 rows x 101 columns]"
 361 |       ]
 362 |      },
 363 |      "execution_count": 36,
 364 |      "metadata": {},
 365 |      "output_type": "execute_result"
 366 |     }
 367 |    ],
 368 |    "source": [
 369 |     "gss = read_gss('gss_eda')\n",
 370 |     "print(gss.shape)\n",
 371 |     "gss.head()"
 372 |    ]
 373 |   },
 374 |   {
 375 |    "cell_type": "code",
 376 |    "execution_count": 37,
 377 |    "metadata": {},
 378 |    "outputs": [
 379 |     {
 380 |      "data": {
 381 |       "text/plain": [
 382 |        "0    21165\n",
 383 |        "1    30936\n",
 384 |        "2     9536\n",
 385 |        "8      722\n",
 386 |        "9      107\n",
 387 |        "Name: gunlaw, dtype: int64"
 388 |       ]
 389 |      },
 390 |      "execution_count": 37,
 391 |      "metadata": {},
 392 |      "output_type": "execute_result"
 393 |     }
 394 |    ],
 395 |    "source": [
 396 |     "gss.gunlaw.value_counts().sort_index()"
 397 |    ]
 398 |   },
 399 |   {
 400 |    "cell_type": "code",
 401 |    "execution_count": 38,
 402 |    "metadata": {},
 403 |    "outputs": [
 404 |     {
 405 |      "data": {
 406 |       "text/plain": [
 407 |        "0    24364\n",
 408 |        "1    10946\n",
 409 |        "2    25153\n",
 410 |        "8     1892\n",
 411 |        "9      111\n",
 412 |        "Name: grass, dtype: int64"
 413 |       ]
 414 |      },
 415 |      "execution_count": 38,
 416 |      "metadata": {},
 417 |      "output_type": "execute_result"
 418 |     }
 419 |    ],
 420 |    "source": [
 421 |     "gss.grass.value_counts().sort_index()"
 422 |    ]
 423 |   },
 424 |   {
 425 |    "cell_type": "code",
 426 |    "execution_count": 39,
 427 |    "metadata": {},
 428 |    "outputs": [],
 429 |    "source": [
 430 |     "def replace_invalid(df):\n",
 431 |     "    df.realinc.replace([0], np.nan, inplace=True)                  \n",
 432 |     "    df.educ.replace([98,99], np.nan, inplace=True)\n",
 433 |     "    # 89 means 89 or older\n",
 434 |     "    df.age.replace([98, 99], np.nan, inplace=True) \n",
 435 |     "    df.cohort.replace([9999], np.nan, inplace=True)\n",
 436 |     "    df.adults.replace([9], np.nan, inplace=True)\n",
 437 |     "    df.gunlaw.replace([0,8,9], np.nan, inplace=True)\n",
 438 |     "    df.grass.replace([0,8,9], np.nan, inplace=True)\n",
 439 |     "\n",
 440 |     "replace_invalid(gss)"
 441 |    ]
 442 |   },
 443 |   {
 444 |    "cell_type": "markdown",
 445 |    "metadata": {},
 446 |    "source": [
 447 |     "The proportion of women in this dataset is slightly higher than it probably is in the population, even after weighting.\n",
 448 |     "\n",
 449 |     "The issue seems to be that the GSS excludes people living in institutions, including prisons and army housing, which disproportionately excludes men."
 450 |    ]
 451 |   },
 452 |   {
 453 |    "cell_type": "code",
 454 |    "execution_count": 40,
 455 |    "metadata": {},
 456 |    "outputs": [],
 457 |    "source": [
 458 |     "sex = gss.loc[gss.year==2010, 'sex']"
 459 |    ]
 460 |   },
 461 |   {
 462 |    "cell_type": "code",
 463 |    "execution_count": 41,
 464 |    "metadata": {},
 465 |    "outputs": [
 466 |     {
 467 |      "data": {
 468 |       "text/plain": [
 469 |        "1    0.43591\n",
 470 |        "2    0.56409\n",
 471 |        "Name: Pmf, dtype: float64"
 472 |       ]
 473 |      },
 474 |      "execution_count": 41,
 475 |      "metadata": {},
 476 |      "output_type": "execute_result"
 477 |     }
 478 |    ],
 479 |    "source": [
 480 |     "pmf = Pmf([1,2])\n",
 481 |     "pmf[1] = np.sum(sex==1)\n",
 482 |     "pmf[2] = np.sum(sex==2)\n",
 483 |     "pmf.normalize()\n",
 484 |     "pmf"
 485 |    ]
 486 |   },
 487 |   {
 488 |    "cell_type": "code",
 489 |    "execution_count": 42,
 490 |    "metadata": {},
 491 |    "outputs": [
 492 |     {
 493 |      "data": {
 494 |       "text/plain": [
 495 |        "1    0.451634\n",
 496 |        "2    0.548366\n",
 497 |        "Name: Pmf, dtype: float64"
 498 |       ]
 499 |      },
 500 |      "execution_count": 42,
 501 |      "metadata": {},
 502 |      "output_type": "execute_result"
 503 |     }
 504 |    ],
 505 |    "source": [
 506 |     "pmf = Pmf([1,2])\n",
 507 |     "pmf[1] = np.sum((sex==1) * gss.wtssall)\n",
 508 |     "pmf[2] = np.sum((sex==2) * gss.wtssall)\n",
 509 |     "pmf.normalize()\n",
 510 |     "pmf"
 511 |    ]
 512 |   },
 513 |   {
 514 |    "cell_type": "code",
 515 |    "execution_count": 43,
 516 |    "metadata": {},
 517 |    "outputs": [
 518 |     {
 519 |      "data": {
 520 |       "text/plain": [
 521 |        "1    0.453784\n",
 522 |        "2    0.546216\n",
 523 |        "Name: Pmf, dtype: float64"
 524 |       ]
 525 |      },
 526 |      "execution_count": 43,
 527 |      "metadata": {},
 528 |      "output_type": "execute_result"
 529 |     }
 530 |    ],
 531 |    "source": [
 532 |     "pmf = Pmf([1,2])\n",
 533 |     "pmf[1] = np.sum((sex==1) * gss.wtssnr)\n",
 534 |     "pmf[2] = np.sum((sex==2) * gss.wtssnr)\n",
 535 |     "pmf.normalize()\n",
 536 |     "pmf"
 537 |    ]
 538 |   },
 539 |   {
 540 |    "cell_type": "code",
 541 |    "execution_count": 44,
 542 |    "metadata": {},
 543 |    "outputs": [
 544 |     {
 545 |      "data": {
 546 |       "text/plain": [
 547 |        "1    0.463868\n",
 548 |        "2    0.536132\n",
 549 |        "Name: Pmf, dtype: float64"
 550 |       ]
 551 |      },
 552 |      "execution_count": 44,
 553 |      "metadata": {},
 554 |      "output_type": "execute_result"
 555 |     }
 556 |    ],
 557 |    "source": [
 558 |     "pmf = Pmf([1,2])\n",
 559 |     "pmf[1] = np.sum((sex==1) * gss.wtssall * gss.adults)\n",
 560 |     "pmf[2] = np.sum((sex==2) * gss.wtssall * gss.adults)\n",
 561 |     "pmf.normalize()\n",
 562 |     "pmf"
 563 |    ]
 564 |   },
 565 |   {
 566 |    "cell_type": "code",
 567 |    "execution_count": 45,
 568 |    "metadata": {},
 569 |    "outputs": [
 570 |     {
 571 |      "data": {
 572 |       "text/plain": [
 573 |        "1    0.485397\n",
 574 |        "2    0.514603\n",
 575 |        "Name: Pmf, dtype: float64"
 576 |       ]
 577 |      },
 578 |      "execution_count": 45,
 579 |      "metadata": {},
 580 |      "output_type": "execute_result"
 581 |     }
 582 |    ],
 583 |    "source": [
 584 |     "pmf = Pmf([1,2])\n",
 585 |     "pmf[1] = 114173831\n",
 586 |     "pmf[2] = 121043794\n",
 587 |     "pmf.normalize()\n",
 588 |     "pmf"
 589 |    ]
 590 |   },
 591 |   {
 592 |    "cell_type": "code",
 593 |    "execution_count": 46,
 594 |    "metadata": {
 595 |     "scrolled": true
 596 |    },
 597 |    "outputs": [],
 598 |    "source": [
 599 |     "gss['wtsample'] = gss['wtssall']\n",
 600 |     "gss.loc[gss.sex==1, 'wtsample'] *= 1.145"
 601 |    ]
 602 |   },
 603 |   {
 604 |    "cell_type": "code",
 605 |    "execution_count": 47,
 606 |    "metadata": {},
 607 |    "outputs": [
 608 |     {
 609 |      "data": {
 610 |       "text/plain": [
 611 |        "1    0.485338\n",
 612 |        "2    0.514662\n",
 613 |        "Name: Pmf, dtype: float64"
 614 |       ]
 615 |      },
 616 |      "execution_count": 47,
 617 |      "metadata": {},
 618 |      "output_type": "execute_result"
 619 |     }
 620 |    ],
 621 |    "source": [
 622 |     "pmf = Pmf([1,2])\n",
 623 |     "pmf[1] = np.sum((sex==1) * gss.wtsample)\n",
 624 |     "pmf[2] = np.sum((sex==2) * gss.wtsample)\n",
 625 |     "pmf.normalize()\n",
 626 |     "pmf"
 627 |    ]
 628 |   },
 629 |   {
 630 |    "cell_type": "code",
 631 |    "execution_count": 49,
 632 |    "metadata": {
 633 |     "scrolled": true
 634 |    },
 635 |    "outputs": [
 636 |     {
 637 |      "data": {
 638 |       "text/html": [
 639 |        "<div>\n",
 640 |        "<style scoped>\n",
 641 |        "    .dataframe tbody tr th:only-of-type {\n",
 642 |        "        vertical-align: middle;\n",
 643 |        "    }\n",
 644 |        "\n",
 645 |        "    .dataframe tbody tr th {\n",
 646 |        "        vertical-align: top;\n",
 647 |        "    }\n",
 648 |        "\n",
 649 |        "    .dataframe thead th {\n",
 650 |        "        text-align: right;\n",
 651 |        "    }\n",
 652 |        "</style>\n",
 653 |        "<table border=\"1\" class=\"dataframe\">\n",
 654 |        "  <thead>\n",
 655 |        "    <tr style=\"text-align: right;\">\n",
 656 |        "      <th></th>\n",
 657 |        "      <th>year</th>\n",
 658 |        "      <th>age</th>\n",
 659 |        "      <th>cohort</th>\n",
 660 |        "      <th>sex</th>\n",
 661 |        "      <th>race</th>\n",
 662 |        "      <th>educ</th>\n",
 663 |        "      <th>realinc</th>\n",
 664 |        "      <th>gunlaw</th>\n",
 665 |        "      <th>grass</th>\n",
 666 |        "      <th>wtssall</th>\n",
 667 |        "    </tr>\n",
 668 |        "  </thead>\n",
 669 |        "  <tbody>\n",
 670 |        "    <tr>\n",
 671 |        "      <th>0</th>\n",
 672 |        "      <td>1972</td>\n",
 673 |        "      <td>23.0</td>\n",
 674 |        "      <td>1949.0</td>\n",
 675 |        "      <td>2</td>\n",
 676 |        "      <td>1</td>\n",
 677 |        "      <td>16.0</td>\n",
 678 |        "      <td>18951.0</td>\n",
 679 |        "      <td>1.0</td>\n",
 680 |        "      <td>NaN</td>\n",
 681 |        "      <td>0.4446</td>\n",
 682 |        "    </tr>\n",
 683 |        "    <tr>\n",
 684 |        "      <th>1</th>\n",
 685 |        "      <td>1972</td>\n",
 686 |        "      <td>70.0</td>\n",
 687 |        "      <td>1902.0</td>\n",
 688 |        "      <td>1</td>\n",
 689 |        "      <td>1</td>\n",
 690 |        "      <td>10.0</td>\n",
 691 |        "      <td>24366.0</td>\n",
 692 |        "      <td>1.0</td>\n",
 693 |        "      <td>NaN</td>\n",
 694 |        "      <td>0.8893</td>\n",
 695 |        "    </tr>\n",
 696 |        "    <tr>\n",
 697 |        "      <th>2</th>\n",
 698 |        "      <td>1972</td>\n",
 699 |        "      <td>48.0</td>\n",
 700 |        "      <td>1924.0</td>\n",
 701 |        "      <td>2</td>\n",
 702 |        "      <td>1</td>\n",
 703 |        "      <td>12.0</td>\n",
 704 |        "      <td>24366.0</td>\n",
 705 |        "      <td>1.0</td>\n",
 706 |        "      <td>NaN</td>\n",
 707 |        "      <td>0.8893</td>\n",
 708 |        "    </tr>\n",
 709 |        "    <tr>\n",
 710 |        "      <th>3</th>\n",
 711 |        "      <td>1972</td>\n",
 712 |        "      <td>27.0</td>\n",
 713 |        "      <td>1945.0</td>\n",
 714 |        "      <td>2</td>\n",
 715 |        "      <td>1</td>\n",
 716 |        "      <td>17.0</td>\n",
 717 |        "      <td>30458.0</td>\n",
 718 |        "      <td>1.0</td>\n",
 719 |        "      <td>NaN</td>\n",
 720 |        "      <td>0.8893</td>\n",
 721 |        "    </tr>\n",
 722 |        "    <tr>\n",
 723 |        "      <th>4</th>\n",
 724 |        "      <td>1972</td>\n",
 725 |        "      <td>61.0</td>\n",
 726 |        "      <td>1911.0</td>\n",
 727 |        "      <td>2</td>\n",
 728 |        "      <td>1</td>\n",
 729 |        "      <td>12.0</td>\n",
 730 |        "      <td>50763.0</td>\n",
 731 |        "      <td>1.0</td>\n",
 732 |        "      <td>NaN</td>\n",
 733 |        "      <td>0.8893</td>\n",
 734 |        "    </tr>\n",
 735 |        "  </tbody>\n",
 736 |        "</table>\n",
 737 |        "</div>"
 738 |       ],
 739 |       "text/plain": [
 740 |        "   year   age  cohort  sex  race  educ  realinc  gunlaw  grass  wtssall\n",
 741 |        "0  1972  23.0  1949.0    2     1  16.0  18951.0     1.0    NaN   0.4446\n",
 742 |        "1  1972  70.0  1902.0    1     1  10.0  24366.0     1.0    NaN   0.8893\n",
 743 |        "2  1972  48.0  1924.0    2     1  12.0  24366.0     1.0    NaN   0.8893\n",
 744 |        "3  1972  27.0  1945.0    2     1  17.0  30458.0     1.0    NaN   0.8893\n",
 745 |        "4  1972  61.0  1911.0    2     1  12.0  50763.0     1.0    NaN   0.8893"
 746 |       ]
 747 |      },
 748 |      "execution_count": 49,
 749 |      "metadata": {},
 750 |      "output_type": "execute_result"
 751 |     }
 752 |    ],
 753 |    "source": [
 754 |     "variables = ['year', 'age', 'cohort', 'sex', 'race', \n",
 755 |     "             'educ', 'realinc', 'gunlaw', 'grass', 'wtssall']\n",
 756 |     "\n",
 757 |     "subset = gss[variables]\n",
 758 |     "subset.head()"
 759 |    ]
 760 |   },
 761 |   {
 762 |    "cell_type": "code",
 763 |    "execution_count": 50,
 764 |    "metadata": {},
 765 |    "outputs": [],
 766 |    "source": [
 767 |     "# drop the 65 respondents with unknown household size\n",
 768 |     "# subset = subset.dropna(subset=['adults'])"
 769 |    ]
 770 |   },
 771 |   {
 772 |    "cell_type": "code",
 773 |    "execution_count": 51,
 774 |    "metadata": {},
 775 |    "outputs": [],
 776 |    "source": [
 777 |     "np.random.seed(19)\n",
 778 |     "sample = utils.resample_by_year(subset, 'wtssall')"
 779 |    ]
 780 |   },
 781 |   {
 782 |    "cell_type": "code",
 783 |    "execution_count": 52,
 784 |    "metadata": {},
 785 |    "outputs": [],
 786 |    "source": [
 787 |     "!rm gss.hdf5\n",
 788 |     "sample.to_hdf('gss.hdf5', 'gss')"
 789 |    ]
 790 |   },
 791 |   {
 792 |    "cell_type": "code",
 793 |    "execution_count": 53,
 794 |    "metadata": {},
 795 |    "outputs": [
 796 |     {
 797 |      "name": "stdout",
 798 |      "output_type": "stream",
 799 |      "text": [
 800 |       "CPU times: user 16 ms, sys: 4 ms, total: 20 ms\n",
 801 |       "Wall time: 18.8 ms\n"
 802 |      ]
 803 |     },
 804 |     {
 805 |      "data": {
 806 |       "text/plain": [
 807 |        "(62466, 10)"
 808 |       ]
 809 |      },
 810 |      "execution_count": 53,
 811 |      "metadata": {},
 812 |      "output_type": "execute_result"
 813 |     }
 814 |    ],
 815 |    "source": [
 816 |     "%time gss = pd.read_hdf('gss.hdf5', 'gss')\n",
 817 |     "gss.shape"
 818 |    ]
 819 |   },
 820 |   {
 821 |    "cell_type": "code",
 822 |    "execution_count": 54,
 823 |    "metadata": {},
 824 |    "outputs": [
 825 |     {
 826 |      "data": {
 827 |       "text/html": [
 828 |        "<div>\n",
 829 |        "<style scoped>\n",
 830 |        "    .dataframe tbody tr th:only-of-type {\n",
 831 |        "        vertical-align: middle;\n",
 832 |        "    }\n",
 833 |        "\n",
 834 |        "    .dataframe tbody tr th {\n",
 835 |        "        vertical-align: top;\n",
 836 |        "    }\n",
 837 |        "\n",
 838 |        "    .dataframe thead th {\n",
 839 |        "        text-align: right;\n",
 840 |        "    }\n",
 841 |        "</style>\n",
 842 |        "<table border=\"1\" class=\"dataframe\">\n",
 843 |        "  <thead>\n",
 844 |        "    <tr style=\"text-align: right;\">\n",
 845 |        "      <th></th>\n",
 846 |        "      <th>year</th>\n",
 847 |        "      <th>age</th>\n",
 848 |        "      <th>cohort</th>\n",
 849 |        "      <th>sex</th>\n",
 850 |        "      <th>race</th>\n",
 851 |        "      <th>educ</th>\n",
 852 |        "      <th>realinc</th>\n",
 853 |        "      <th>gunlaw</th>\n",
 854 |        "      <th>grass</th>\n",
 855 |        "      <th>wtssall</th>\n",
 856 |        "    </tr>\n",
 857 |        "  </thead>\n",
 858 |        "  <tbody>\n",
 859 |        "    <tr>\n",
 860 |        "      <th>0</th>\n",
 861 |        "      <td>1972</td>\n",
 862 |        "      <td>26.0</td>\n",
 863 |        "      <td>1946.0</td>\n",
 864 |        "      <td>1</td>\n",
 865 |        "      <td>1</td>\n",
 866 |        "      <td>18.0</td>\n",
 867 |        "      <td>13537.0</td>\n",
 868 |        "      <td>2.0</td>\n",
 869 |        "      <td>NaN</td>\n",
 870 |        "      <td>0.8893</td>\n",
 871 |        "    </tr>\n",
 872 |        "    <tr>\n",
 873 |        "      <th>1</th>\n",
 874 |        "      <td>1972</td>\n",
 875 |        "      <td>38.0</td>\n",
 876 |        "      <td>1934.0</td>\n",
 877 |        "      <td>2</td>\n",
 878 |        "      <td>1</td>\n",
 879 |        "      <td>12.0</td>\n",
 880 |        "      <td>18951.0</td>\n",
 881 |        "      <td>1.0</td>\n",
 882 |        "      <td>NaN</td>\n",
 883 |        "      <td>0.4446</td>\n",
 884 |        "    </tr>\n",
 885 |        "    <tr>\n",
 886 |        "      <th>2</th>\n",
 887 |        "      <td>1972</td>\n",
 888 |        "      <td>57.0</td>\n",
 889 |        "      <td>1915.0</td>\n",
 890 |        "      <td>1</td>\n",
 891 |        "      <td>1</td>\n",
 892 |        "      <td>12.0</td>\n",
 893 |        "      <td>30458.0</td>\n",
 894 |        "      <td>1.0</td>\n",
 895 |        "      <td>NaN</td>\n",
 896 |        "      <td>1.3339</td>\n",
 897 |        "    </tr>\n",
 898 |        "    <tr>\n",
 899 |        "      <th>3</th>\n",
 900 |        "      <td>1972</td>\n",
 901 |        "      <td>61.0</td>\n",
 902 |        "      <td>1911.0</td>\n",
 903 |        "      <td>2</td>\n",
 904 |        "      <td>1</td>\n",
 905 |        "      <td>14.0</td>\n",
 906 |        "      <td>37226.0</td>\n",
 907 |        "      <td>1.0</td>\n",
 908 |        "      <td>NaN</td>\n",
 909 |        "      <td>0.8893</td>\n",
 910 |        "    </tr>\n",
 911 |        "    <tr>\n",
 912 |        "      <th>4</th>\n",
 913 |        "      <td>1972</td>\n",
 914 |        "      <td>59.0</td>\n",
 915 |        "      <td>1913.0</td>\n",
 916 |        "      <td>1</td>\n",
 917 |        "      <td>1</td>\n",
 918 |        "      <td>12.0</td>\n",
 919 |        "      <td>30458.0</td>\n",
 920 |        "      <td>2.0</td>\n",
 921 |        "      <td>NaN</td>\n",
 922 |        "      <td>0.8893</td>\n",
 923 |        "    </tr>\n",
 924 |        "  </tbody>\n",
 925 |        "</table>\n",
 926 |        "</div>"
 927 |       ],
 928 |       "text/plain": [
 929 |        "   year   age  cohort  sex  race  educ  realinc  gunlaw  grass  wtssall\n",
 930 |        "0  1972  26.0  1946.0    1     1  18.0  13537.0     2.0    NaN   0.8893\n",
 931 |        "1  1972  38.0  1934.0    2     1  12.0  18951.0     1.0    NaN   0.4446\n",
 932 |        "2  1972  57.0  1915.0    1     1  12.0  30458.0     1.0    NaN   1.3339\n",
 933 |        "3  1972  61.0  1911.0    2     1  14.0  37226.0     1.0    NaN   0.8893\n",
 934 |        "4  1972  59.0  1913.0    1     1  12.0  30458.0     2.0    NaN   0.8893"
 935 |       ]
 936 |      },
 937 |      "execution_count": 54,
 938 |      "metadata": {},
 939 |      "output_type": "execute_result"
 940 |     }
 941 |    ],
 942 |    "source": [
 943 |     "gss.head()"
 944 |    ]
 945 |   },
 946 |   {
 947 |    "cell_type": "code",
 948 |    "execution_count": 24,
 949 |    "metadata": {},
 950 |    "outputs": [
 951 |     {
 952 |      "data": {
 953 |       "text/plain": [
 954 |        "count    62466.000000\n",
 955 |        "mean      1994.072359\n",
 956 |        "std         12.937941\n",
 957 |        "min       1972.000000\n",
 958 |        "25%       1984.000000\n",
 959 |        "50%       1994.000000\n",
 960 |        "75%       2006.000000\n",
 961 |        "max       2016.000000\n",
 962 |        "Name: year, dtype: float64"
 963 |       ]
 964 |      },
 965 |      "execution_count": 24,
 966 |      "metadata": {},
 967 |      "output_type": "execute_result"
 968 |     }
 969 |    ],
 970 |    "source": [
 971 |     "gss['year'].describe()"
 972 |    ]
 973 |   },
 974 |   {
 975 |    "cell_type": "code",
 976 |    "execution_count": 25,
 977 |    "metadata": {},
 978 |    "outputs": [
 979 |     {
 980 |      "data": {
 981 |       "text/plain": [
 982 |        "count    62466.000000\n",
 983 |        "mean         1.541415\n",
 984 |        "std          0.498286\n",
 985 |        "min          1.000000\n",
 986 |        "25%          1.000000\n",
 987 |        "50%          2.000000\n",
 988 |        "75%          2.000000\n",
 989 |        "max          2.000000\n",
 990 |        "Name: sex, dtype: float64"
 991 |       ]
 992 |      },
 993 |      "execution_count": 25,
 994 |      "metadata": {},
 995 |      "output_type": "execute_result"
 996 |     }
 997 |    ],
 998 |    "source": [
 999 |     "gss['sex'].describe()"
1000 |    ]
1001 |   },
1002 |   {
1003 |    "cell_type": "code",
1004 |    "execution_count": 26,
1005 |    "metadata": {},
1006 |    "outputs": [
1007 |     {
1008 |      "data": {
1009 |       "text/plain": [
1010 |        "count    62281.000000\n",
1011 |        "mean        44.648320\n",
1012 |        "std         17.072244\n",
1013 |        "min         18.000000\n",
1014 |        "25%         30.000000\n",
1015 |        "50%         43.000000\n",
1016 |        "75%         57.000000\n",
1017 |        "max         89.000000\n",
1018 |        "Name: age, dtype: float64"
1019 |       ]
1020 |      },
1021 |      "execution_count": 26,
1022 |      "metadata": {},
1023 |      "output_type": "execute_result"
1024 |     }
1025 |    ],
1026 |    "source": [
1027 |     "gss['age'].describe()"
1028 |    ]
1029 |   },
1030 |   {
1031 |    "cell_type": "code",
1032 |    "execution_count": 27,
1033 |    "metadata": {},
1034 |    "outputs": [
1035 |     {
1036 |      "data": {
1037 |       "text/plain": [
1038 |        "count    62282.000000\n",
1039 |        "mean      1949.429996\n",
1040 |        "std         20.734302\n",
1041 |        "min       1883.000000\n",
1042 |        "25%       1935.000000\n",
1043 |        "50%       1951.000000\n",
1044 |        "75%       1964.000000\n",
1045 |        "max       1998.000000\n",
1046 |        "Name: cohort, dtype: float64"
1047 |       ]
1048 |      },
1049 |      "execution_count": 27,
1050 |      "metadata": {},
1051 |      "output_type": "execute_result"
1052 |     }
1053 |    ],
1054 |    "source": [
1055 |     "gss['cohort'].describe()"
1056 |    ]
1057 |   },
1058 |   {
1059 |    "cell_type": "code",
1060 |    "execution_count": 28,
1061 |    "metadata": {},
1062 |    "outputs": [
1063 |     {
1064 |      "data": {
1065 |       "text/plain": [
1066 |        "count    62466.000000\n",
1067 |        "mean         1.254955\n",
1068 |        "std          0.554694\n",
1069 |        "min          1.000000\n",
1070 |        "25%          1.000000\n",
1071 |        "50%          1.000000\n",
1072 |        "75%          1.000000\n",
1073 |        "max          3.000000\n",
1074 |        "Name: race, dtype: float64"
1075 |       ]
1076 |      },
1077 |      "execution_count": 28,
1078 |      "metadata": {},
1079 |      "output_type": "execute_result"
1080 |     }
1081 |    ],
1082 |    "source": [
1083 |     "gss['race'].describe()"
1084 |    ]
1085 |   },
1086 |   {
1087 |    "cell_type": "code",
1088 |    "execution_count": 29,
1089 |    "metadata": {},
1090 |    "outputs": [
1091 |     {
1092 |      "data": {
1093 |       "text/plain": [
1094 |        "count    62304.000000\n",
1095 |        "mean        12.831311\n",
1096 |        "std          3.117027\n",
1097 |        "min          0.000000\n",
1098 |        "25%         12.000000\n",
1099 |        "50%         12.000000\n",
1100 |        "75%         15.000000\n",
1101 |        "max         20.000000\n",
1102 |        "Name: educ, dtype: float64"
1103 |       ]
1104 |      },
1105 |      "execution_count": 29,
1106 |      "metadata": {},
1107 |      "output_type": "execute_result"
1108 |     }
1109 |    ],
1110 |    "source": [
1111 |     "gss['educ'].describe()"
1112 |    ]
1113 |   },
1114 |   {
1115 |    "cell_type": "code",
1116 |    "execution_count": 30,
1117 |    "metadata": {},
1118 |    "outputs": [
1119 |     {
1120 |      "data": {
1121 |       "text/plain": [
1122 |        "count     55499.000000\n",
1123 |        "mean      34702.430164\n",
1124 |        "std       30665.659411\n",
1125 |        "min         234.000000\n",
1126 |        "25%       13750.000000\n",
1127 |        "50%       26015.000000\n",
1128 |        "75%       43426.000000\n",
1129 |        "max      162607.000000\n",
1130 |        "Name: realinc, dtype: float64"
1131 |       ]
1132 |      },
1133 |      "execution_count": 30,
1134 |      "metadata": {},
1135 |      "output_type": "execute_result"
1136 |     }
1137 |    ],
1138 |    "source": [
1139 |     "gss['realinc'].describe()"
1140 |    ]
1141 |   },
1142 |   {
1143 |    "cell_type": "code",
1144 |    "execution_count": 31,
1145 |    "metadata": {},
1146 |    "outputs": [
1147 |     {
1148 |      "data": {
1149 |       "text/plain": [
1150 |        "count    62466.000000\n",
1151 |        "mean         1.213340\n",
1152 |        "std          0.585544\n",
1153 |        "min          0.411898\n",
1154 |        "25%          0.918400\n",
1155 |        "50%          1.062100\n",
1156 |        "75%          1.515500\n",
1157 |        "max          8.739876\n",
1158 |        "Name: wtssall, dtype: float64"
1159 |       ]
1160 |      },
1161 |      "execution_count": 31,
1162 |      "metadata": {},
1163 |      "output_type": "execute_result"
1164 |     }
1165 |    ],
1166 |    "source": [
1167 |     "gss['wtssall'].describe()"
1168 |    ]
1169 |   }
1170 |  ],
1171 |  "metadata": {
1172 |   "kernelspec": {
1173 |    "display_name": "Python 3",
1174 |    "language": "python",
1175 |    "name": "python3"
1176 |   },
1177 |   "language_info": {
1178 |    "codemirror_mode": {
1179 |     "name": "ipython",
1180 |     "version": 3
1181 |    },
1182 |    "file_extension": ".py",
1183 |    "mimetype": "text/x-python",
1184 |    "name": "python",
1185 |    "nbconvert_exporter": "python",
1186 |    "pygments_lexer": "ipython3",
1187 |    "version": "3.6.7"
1188 |   }
1189 |  },
1190 |  "nbformat": 4,
1191 |  "nbformat_minor": 1
1192 | }
1193 | 


--------------------------------------------------------------------------------