├── README.md
├── codebook.pdf
├── convert.py
├── data
    ├── USSC_PV_Report_2020.csv.xz
    ├── opafy08nid.csv.xz
    ├── opafy09nid.csv.xz
    ├── opafy10nid.csv.xz
    ├── opafy11nid.csv.xz
    ├── opafy12nid.csv.xz
    ├── opafy13nid.csv.xz
    ├── opafy14nid.csv.xz
    ├── opafy15nid.csv.xz
    ├── opafy16nid.csv.xz
    ├── opafy17nid.csv.xz
    ├── opafy18nid.csv.xz
    ├── opafy19nid.csv.xz
    ├── opafy20nid.csv.xz
    ├── opafy21nid.csv.xz
    ├── opafy22nid.csv.xz
    ├── ussc_sup_fy14.csv.xz
    ├── ussc_sup_fy15.csv.xz
    ├── ussc_sup_fy16.csv.xz
    └── ussc_sup_fy17.csv.xz
└── getdata.sh


/README.md:
--------------------------------------------------------------------------------
 1 | # Converting Sentencing Commission Files into CSVs
 2 | 
 3 | The good news: The United States Sentencing Commission makes
 4 | [_very_ detailed files](https://www.ussc.gov/research/datafiles/commission-datafiles#individual)
 5 | available about sentencing in the US. :tada:
 6 | 
 7 | The bad news: They are in a crazy fixed-width format and include SAS and SPSS scripts to read them into those programs and those programs alone. :scream:
 8 | 
 9 | So what can we do about it? Well, we can write a little converter that converts them all! These files will do that for you.
10 | 
11 | ## I just want the data
12 | 
13 | It turns out the data is small enough that you can upload it to GitHub! However, the files are LZMA (xz) compressed. Here's how you can decompress them.
14 | 
15 | ### First, a warning
16 | 
17 | These files are filled with tons of nulls. The typical file size compressed is around 10MB and uncompressed around 1.5GB. So so so many blank fields. Loading this directly into pandas on a small box will probably make your box sad. Instead, you should really look at the `usecols` kwarg of [pd.read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html).
18 | 
19 | ### Mac
20 | 
21 | If you're using homebrew, just do
22 | 
23 | ```
24 | $ brew install xz
25 | ```
26 | 
27 | Then you can open the files by doing
28 | 
29 | ```
30 | $ unxz [FILENAME].xz
31 | ```
32 | 
33 | ### Debian/Ubuntu
34 | 
35 | First install xz utilities
36 | 
37 | ```
38 | $ sudo apt update && sudo apt install xz-utils
39 | ```
40 | 
41 | Then you should be able to open files thus
42 | 
43 | ```
44 | $ xz -d [FILENAME].xz
45 | ```
46 | 
47 | ### Windows
48 | 
49 | Both 7zip and WinZip will open these files for you. Download and install them at your leisure.
50 | 
51 | ## Requirements
52 | 
53 | This script has only been tested with Python 3 and it assumes you have `click` installed. But this is just for progress bars, so you can comment out those lines if you want.
54 | 
55 | ## Usage
56 | 
57 | First you'll need to get the data from the Sentencing Commission. The script `getdata.sh` gives examples, and will itself download the FY08-22 data files.
58 | 
59 | Next you'll need to point the script `convert.py` at the file. For instance,
60 | 
61 | ```
62 | $ python3 convert.py data/opafy14nid.zip
63 | ```
64 | 
65 | This will leave you a file called `data/opafy14nid.csv` in that folder.
66 | 
67 | Be warned, these files end up being quite large, so you may want to compress them with gzip or xz.
68 | 
69 | ## License
70 | 
71 | MIT
72 | 


--------------------------------------------------------------------------------
/codebook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/codebook.pdf


--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Convert Sentencing Commission files into CSVs.
  3 | 
  4 | Usage:
  5 |   $ python3 convert.py [LIST OF FILES TO CONVERT]
  6 | 
  7 | @author Kevin H. Wilson <khwilson@gmail.com>
  8 | """
  9 | import csv
 10 | import os
 11 | import sys
 12 | import tempfile
 13 | import zipfile
 14 | from pathlib import Path
 15 | from typing import Dict, List, Union
 16 | 
 17 | import click
 18 | 
 19 | 
 20 | def read_columns(filename: Union[str, Path]) -> List[Dict[str, Union[bool, int, str]]]:
 21 |     """
 22 |     Read the column names from the file with the passed name.
 23 |     Assumes it's in a SAS format that begins with INPUT and ends with
 24 |     a semicolon.
 25 | 
 26 |     Args:
 27 |       filename: The file name to read
 28 | 
 29 |     Returns:
 30 |       The list of column names where each element is a dict with key/vals:
 31 |         * name (str): The column name
 32 |         * is_char (bool): Whether the column is a string column
 33 |         * start (int): The (1-delimited) starting position of the column
 34 |         * end (int): The (1-delimited and inclusive) ending position of the column
 35 |     """
 36 |     columns = []
 37 |     with open(filename, "rt") as f:
 38 |         # Search for the line that starts with INPUT
 39 |         for line in f:
 40 |             if line.startswith("INPUT"):
 41 |                 break
 42 | 
 43 |         for line in f:
 44 |             # Kill all the extra whitespace
 45 |             line = line.strip()
 46 | 
 47 |             # Is this the last line?
 48 |             if line.endswith(";"):
 49 |                 # If so, strip the ; and the extra whitespace
 50 |                 last_line = True
 51 |                 line = line[:-1].strip()
 52 |             else:
 53 |                 last_line = False
 54 | 
 55 |             # Parse row into column names
 56 |             i = 0
 57 |             sline = line.split()
 58 |             while i < len(sline):
 59 |                 col_name = sline[i]
 60 |                 i += 1
 61 | 
 62 |                 if sline[i] == "$":
 63 |                     is_char = True
 64 |                     i += 1
 65 |                 else:
 66 |                     is_char = False
 67 | 
 68 |                 field_range = sline[i]
 69 |                 i += 1
 70 | 
 71 |                 # Field ranges are formatted either as # or #-#
 72 |                 sfield_range = field_range.split("-")
 73 |                 if len(sfield_range) == 1:
 74 |                     sfield_range = (sfield_range[0], sfield_range[0])
 75 | 
 76 |                 # Write out the column to the list
 77 |                 columns.append(
 78 |                     {
 79 |                         "name": col_name,
 80 |                         "is_char": is_char,
 81 |                         "start": int(sfield_range[0]),
 82 |                         "end": int(sfield_range[1]),
 83 |                     }
 84 |                 )
 85 | 
 86 |             if last_line:
 87 |                 break
 88 | 
 89 |     return columns
 90 | 
 91 | 
 92 | def convert_file(filename: Union[str, Path]):
 93 |     """
 94 |     Convert a file from the Sentencing Commission format into a CSV.
 95 |     Assumes the file is a ZIP file containing at least the following:
 96 |       - .sas: A file with the same name as `filename` except ending in .sas
 97 |       - .dat: A file with the same name as `filename` except ending in .dat
 98 | 
 99 |     The .dat file is a fixed-width file whose columns are described by the .sas
100 |     file. If you're looking at the .sas file, search for INPUT and LENGTH to
101 |     see the two main parts of the file. There are a _lot_ of columns.
102 | 
103 |     Args:
104 |       filename: The name of the file to convert
105 |     """
106 |     filename = Path(filename)
107 |     with tempfile.TemporaryDirectory() as tmpdir:
108 |         tmpdir = Path(tmpdir)
109 | 
110 |         # Unzip the contents of the file
111 |         with zipfile.ZipFile(filename, "r") as thefile:
112 |             thefile.extractall(tmpdir)
113 | 
114 |         # Read in the column names from the .sas file
115 |         sasfilename = filename.with_suffix(".sas").name
116 |         saspath = tmpdir / sasfilename
117 |         columns = read_columns(saspath)
118 | 
119 |         # Setup the path to the .dat file
120 |         datfilename = filename.with_suffix(".dat").name
121 |         datpath = tmpdir / datfilename
122 | 
123 |         # Open the output file
124 |         outfilename = filename.with_suffix(".csv")
125 |         badlines = []
126 |         with open(outfilename, "wt") as outfile:
127 |             # Write the column headers
128 |             writer = csv.writer(outfile)
129 |             writer.writerow([col["name"] for col in columns])
130 | 
131 |             # Read in the data
132 |             with click.progressbar(length=os.stat(datpath).st_size) as bar:
133 |                 with open(datpath, "rb") as infile:
134 |                     for line in infile:
135 |                         bar.update(len(line))
136 |                         line = line.decode("latin1")
137 | 
138 |                         # Read in a single row
139 |                         readrow = []
140 |                         for col in columns:
141 |                             val = line[col["start"] - 1 : col["end"]].strip()
142 | 
143 |                             # If it's numeric and not missing, format it nicely
144 |                             if val and not col["is_char"]:
145 |                                 if "." in val:
146 |                                     val = float(val)
147 |                                 else:
148 |                                     val = int(float(val))  # Handle 6e+10
149 |                             readrow.append(val)
150 | 
151 |                         # Write out the row
152 |                         writer.writerow(readrow)
153 | 
154 |     if badlines:
155 |         badfilename = filename.with_suffix(".bad")
156 |         with open(badfilename, "wb") as f:
157 |             for line in badlines:
158 |                 f.write(line)
159 | 
160 | 
def main():
    """
    Convert every file named on the command line, in the order given.

    When no files are named, print the usage line to stderr and return
    without converting anything.
    """
    filenames = sys.argv[1:]
    if not filenames:
        print("Usage: python3 convert.py [LIST OF FILES TO CONVERT]", file=sys.stderr)
        return

    for filename in filenames:
        convert_file(filename)


if __name__ == "__main__":
    main()
168 | 


--------------------------------------------------------------------------------
/data/USSC_PV_Report_2020.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/USSC_PV_Report_2020.csv.xz


--------------------------------------------------------------------------------
/data/opafy08nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy08nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy09nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy09nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy10nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy10nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy11nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy11nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy12nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy12nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy13nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy13nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy14nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy14nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy15nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy15nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy16nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy16nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy17nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy17nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy18nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy18nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy19nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy19nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy20nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy20nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy21nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy21nid.csv.xz


--------------------------------------------------------------------------------
/data/opafy22nid.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy22nid.csv.xz


--------------------------------------------------------------------------------
/data/ussc_sup_fy14.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy14.csv.xz


--------------------------------------------------------------------------------
/data/ussc_sup_fy15.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy15.csv.xz


--------------------------------------------------------------------------------
/data/ussc_sup_fy16.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy16.csv.xz


--------------------------------------------------------------------------------
/data/ussc_sup_fy17.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy17.csv.xz


--------------------------------------------------------------------------------
/getdata.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | wget https://www.ussc.gov/sites/default/files/zip/USSC_PV_Report_2020.zip
 4 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy14.zip
 5 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy15.zip
 6 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy16.zip
 7 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy17.zip
 8 | wget https://www.ussc.gov/sites/default/files/zip/opafy22nid.zip
 9 | wget https://www.ussc.gov/sites/default/files/zip/opafy21nid.zip
10 | wget https://www.ussc.gov/sites/default/files/zip/opafy20nid.zip
11 | wget https://www.ussc.gov/sites/default/files/zip/opafy19nid.zip
12 | wget https://www.ussc.gov/sites/default/files/zip/opafy18nid.zip
13 | wget https://www.ussc.gov/sites/default/files/zip/opafy17-nid.zip
14 | wget https://www.ussc.gov/sites/default/files/zip/opafy16-nid.zip
15 | wget https://www.ussc.gov/sites/default/files/zip/opafy15nid.zip
16 | wget https://www.ussc.gov/sites/default/files/zip/opafy14nid.zip
17 | wget https://www.ussc.gov/sites/default/files/zip/opafy13nid.zip
18 | wget https://www.ussc.gov/sites/default/files/zip/opafy12nid.zip
19 | wget https://www.ussc.gov/sites/default/files/zip/opafy11nid.zip
20 | wget https://www.ussc.gov/sites/default/files/zip/opafy10nid.zip
21 | wget https://www.ussc.gov/sites/default/files/zip/opafy09nid.zip
22 | wget https://www.ussc.gov/sites/default/files/zip/opafy08nid.zip
23 | 
24 | mkdir -p data
25 | mv *.zip data
26 | 
27 | for filename in $(ls data/*.zip); do
28 |   echo "Working on ${filename}"
29 |   python3 convert.py "${filename}"
30 |   echo "Converting to xz"
31 |   pv "${filename%????}.csv" | xz --stdout - > "${filename%????}.csv.xz"
32 |   rm "${filename}"
33 | done
34 | 


--------------------------------------------------------------------------------