├── README.md ├── codebook.pdf ├── convert.py ├── data ├── USSC_PV_Report_2020.csv.xz ├── opafy08nid.csv.xz ├── opafy09nid.csv.xz ├── opafy10nid.csv.xz ├── opafy11nid.csv.xz ├── opafy12nid.csv.xz ├── opafy13nid.csv.xz ├── opafy14nid.csv.xz ├── opafy15nid.csv.xz ├── opafy16nid.csv.xz ├── opafy17nid.csv.xz ├── opafy18nid.csv.xz ├── opafy19nid.csv.xz ├── opafy20nid.csv.xz ├── opafy21nid.csv.xz ├── opafy22nid.csv.xz ├── ussc_sup_fy14.csv.xz ├── ussc_sup_fy15.csv.xz ├── ussc_sup_fy16.csv.xz └── ussc_sup_fy17.csv.xz └── getdata.sh /README.md: -------------------------------------------------------------------------------- 1 | # Converting Sentencing Commission Files into CSVs 2 | 3 | The good news: The United States Sentencing Commission makes 4 | [_very_ detailed files](https://www.ussc.gov/research/datafiles/commission-datafiles#individual) 5 | available about sentencing in the US. :tada: 6 | 7 | The bad news: They are in a crazy fixed-width format and include SAS and SPSS scripts to read them into those programs and those programs alone. :scream: 8 | 9 | So what can we do about it? Well, we can write a little converter that converts them all! These files will do that for you. 10 | 11 | ## I just want the data 12 | 13 | It turns out the data is small enough that you can upload it to GitHub! However, it's lzma compressed. Here's how you can decompress them. 14 | 15 | ### First, a warning 16 | 17 | These files are filled with tons of nulls. The typical file size compressed is around 10MB and uncompressed aroung 1.5GB. So so so many blank fields. Loading this directly into pandas on a small box will probably make your box sad. Instead, you should really look at the `usecols` kwarg of [pd.read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html). 18 | 19 | ### Mac 20 | 21 | If you're using homebrew, just do 22 | 23 | ``` 24 | $ brew install xz 25 | ``` 26 | 27 | Then you can open the files by doing 28 | 29 | ``` 30 | $ unxz [FILENAME].xz 31 | ``` 32 | 33 | ### Debian/Ubuntu 34 | 35 | First install xz utilties 36 | 37 | ``` 38 | $ sudo apt update && sudo apt install xz-utils 39 | ``` 40 | 41 | Then you should be able to open files thus 42 | 43 | ``` 44 | $ xz -d [FILENAME].xz 45 | ``` 46 | 47 | ### Windows 48 | 49 | Both 7zip and WinZip will open these files for you. Download and install them at your leisure. 50 | 51 | ## Requirements 52 | 53 | This is script has only been tested with Python 3 and it assumes you have `click` installed. But this is just for progress bars, so you can comment out those lines if you want. 54 | 55 | ## Usage 56 | 57 | First you'll need to get the data from the Sentencing Commission. The script `getdata.sh` gives examples, and will itself download FY08-20's data files. 58 | 59 | Next you'll need to point the script `convert.py` at the file. For instance, 60 | 61 | ``` 62 | $ python3 convert.py data/opafy14nid.zip 63 | ``` 64 | 65 | This will leave you a file called `data/opafy14nid.csv` in that folder. 66 | 67 | Be warned, these files end up being quite large, so you may want to gzip or xzip them. 68 | 69 | ## License 70 | 71 | MIT 72 | -------------------------------------------------------------------------------- /codebook.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/codebook.pdf -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert Sentencing Commission files into CSVs. 3 | 4 | Usage: 5 | $ python3 convert.py [LIST OF FILES TO CONVERT] 6 | 7 | @author Kevin H. Wilson 8 | """ 9 | import csv 10 | import os 11 | import sys 12 | import tempfile 13 | import zipfile 14 | from pathlib import Path 15 | from typing import Dict, List, Union 16 | 17 | import click 18 | 19 | 20 | def read_columns(filename: Union[str, Path]) -> List[Dict[str, Union[bool, int, str]]]: 21 | """ 22 | Read the column names from the file with the passed name. 23 | Assumes it's in a SAS format that begins with INPUT and ends with 24 | a semicolon. 25 | 26 | Args: 27 | filename: The file name to read 28 | 29 | Returns: 30 | The list of column names where each element is a dict with key/vals: 31 | * name (str): The column name 32 | * is_char (bool): Whether the column is a string column 33 | * start (int): The (1-delimited) starting position of the column 34 | * end (int): The (1-delimited and inclusive) ending position of the column 35 | """ 36 | columns = [] 37 | with open(filename, "rt") as f: 38 | # Search for the line that starts with INPUT 39 | for line in f: 40 | if line.startswith("INPUT"): 41 | break 42 | 43 | for line in f: 44 | # Kill all the extra whitespace 45 | line = line.strip() 46 | 47 | # Is this the last line? 48 | if line.endswith(";"): 49 | # If so, strip the ; and the extra whitespace 50 | last_line = True 51 | line = line[:-1].strip() 52 | else: 53 | last_line = False 54 | 55 | # Parse row into column names 56 | i = 0 57 | sline = line.split() 58 | while i < len(sline): 59 | col_name = sline[i] 60 | i += 1 61 | 62 | if sline[i] == "$": 63 | is_char = True 64 | i += 1 65 | else: 66 | is_char = False 67 | 68 | field_range = sline[i] 69 | i += 1 70 | 71 | # Field ranges are formatted either as # or #-# 72 | sfield_range = field_range.split("-") 73 | if len(sfield_range) == 1: 74 | sfield_range = (sfield_range[0], sfield_range[0]) 75 | 76 | # Write out the column to the list 77 | columns.append( 78 | { 79 | "name": col_name, 80 | "is_char": is_char, 81 | "start": int(sfield_range[0]), 82 | "end": int(sfield_range[1]), 83 | } 84 | ) 85 | 86 | if last_line: 87 | break 88 | 89 | return columns 90 | 91 | 92 | def convert_file(filename: Union[str, Path]): 93 | """ 94 | Convert a file from the Sentencing Commission format into a CSV. 95 | Assumes the file is a ZIP file containing at least the following: 96 | - .sas: A file with the same name as `filename` except ending in .sas 97 | - .dat: A file with the same name as `filename` except ending in .dat 98 | 99 | The .dat file is a fixed-width file whose columns are described by the .sas 100 | file. If you're looking at the .sas file, search for INPUT and LENGTH to 101 | see the two main parts of the file. There are a _lot_ of columns. 102 | 103 | Args: 104 | filename: The name of the file to convert 105 | """ 106 | filename = Path(filename) 107 | with tempfile.TemporaryDirectory() as tmpdir: 108 | tmpdir = Path(tmpdir) 109 | 110 | # Unzip the contents of the file 111 | with zipfile.ZipFile(filename, "r") as thefile: 112 | thefile.extractall(tmpdir) 113 | 114 | # Read in the column names from the .sas file 115 | sasfilename = filename.with_suffix(".sas").name 116 | saspath = tmpdir / sasfilename 117 | columns = read_columns(saspath) 118 | 119 | # Setup the path to the .dat file 120 | datfilename = filename.with_suffix(".dat").name 121 | datpath = tmpdir / datfilename 122 | 123 | # Open the output file 124 | outfilename = filename.with_suffix(".csv") 125 | badlines = [] 126 | with open(outfilename, "wt") as outfile: 127 | # Write the column headers 128 | writer = csv.writer(outfile) 129 | writer.writerow([col["name"] for col in columns]) 130 | 131 | # Read in the data 132 | with click.progressbar(length=os.stat(datpath).st_size) as bar: 133 | with open(datpath, "rb") as infile: 134 | for line in infile: 135 | bar.update(len(line)) 136 | line = line.decode("latin1") 137 | 138 | # Read in a single row 139 | readrow = [] 140 | for col in columns: 141 | val = line[col["start"] - 1 : col["end"]].strip() 142 | 143 | # If it's numeric and not missing, format it nicely 144 | if val and not col["is_char"]: 145 | if "." in val: 146 | val = float(val) 147 | else: 148 | val = int(float(val)) # Handle 6e+10 149 | readrow.append(val) 150 | 151 | # Write out the row 152 | writer.writerow(readrow) 153 | 154 | if badlines: 155 | badfilename = filename.with_suffix(".bad") 156 | with open(badfilename, "wb") as f: 157 | for line in badlines: 158 | f.write(line) 159 | 160 | 161 | def main(): 162 | for filename in sys.argv[1:]: 163 | convert_file(filename) 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | -------------------------------------------------------------------------------- /data/USSC_PV_Report_2020.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/USSC_PV_Report_2020.csv.xz -------------------------------------------------------------------------------- /data/opafy08nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy08nid.csv.xz -------------------------------------------------------------------------------- /data/opafy09nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy09nid.csv.xz -------------------------------------------------------------------------------- /data/opafy10nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy10nid.csv.xz -------------------------------------------------------------------------------- /data/opafy11nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy11nid.csv.xz -------------------------------------------------------------------------------- /data/opafy12nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy12nid.csv.xz -------------------------------------------------------------------------------- /data/opafy13nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy13nid.csv.xz -------------------------------------------------------------------------------- /data/opafy14nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy14nid.csv.xz -------------------------------------------------------------------------------- /data/opafy15nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy15nid.csv.xz -------------------------------------------------------------------------------- /data/opafy16nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy16nid.csv.xz -------------------------------------------------------------------------------- /data/opafy17nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy17nid.csv.xz -------------------------------------------------------------------------------- /data/opafy18nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy18nid.csv.xz -------------------------------------------------------------------------------- /data/opafy19nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy19nid.csv.xz -------------------------------------------------------------------------------- /data/opafy20nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy20nid.csv.xz -------------------------------------------------------------------------------- /data/opafy21nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy21nid.csv.xz -------------------------------------------------------------------------------- /data/opafy22nid.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/opafy22nid.csv.xz -------------------------------------------------------------------------------- /data/ussc_sup_fy14.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy14.csv.xz -------------------------------------------------------------------------------- /data/ussc_sup_fy15.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy15.csv.xz -------------------------------------------------------------------------------- /data/ussc_sup_fy16.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy16.csv.xz -------------------------------------------------------------------------------- /data/ussc_sup_fy17.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khwilson/SentencingCommissionDatasets/cc8098a9549b27fa9a7197d721f829cc00bb2283/data/ussc_sup_fy17.csv.xz -------------------------------------------------------------------------------- /getdata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://www.ussc.gov/sites/default/files/zip/USSC_PV_Report_2020.zip 4 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy14.zip 5 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy15.zip 6 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy16.zip 7 | wget https://www.ussc.gov/sites/default/files/zip/ussc_sup_fy17.zip 8 | wget https://www.ussc.gov/sites/default/files/zip/opafy22nid.zip 9 | wget https://www.ussc.gov/sites/default/files/zip/opafy21nid.zip 10 | wget https://www.ussc.gov/sites/default/files/zip/opafy20nid.zip 11 | wget https://www.ussc.gov/sites/default/files/zip/opafy19nid.zip 12 | wget https://www.ussc.gov/sites/default/files/zip/opafy18nid.zip 13 | wget https://www.ussc.gov/sites/default/files/zip/opafy17-nid.zip 14 | wget https://www.ussc.gov/sites/default/files/zip/opafy16-nid.zip 15 | wget https://www.ussc.gov/sites/default/files/zip/opafy15nid.zip 16 | wget https://www.ussc.gov/sites/default/files/zip/opafy14nid.zip 17 | wget https://www.ussc.gov/sites/default/files/zip/opafy13nid.zip 18 | wget https://www.ussc.gov/sites/default/files/zip/opafy12nid.zip 19 | wget https://www.ussc.gov/sites/default/files/zip/opafy11nid.zip 20 | wget https://www.ussc.gov/sites/default/files/zip/opafy10nid.zip 21 | wget https://www.ussc.gov/sites/default/files/zip/opafy09nid.zip 22 | wget https://www.ussc.gov/sites/default/files/zip/opafy08nid.zip 23 | 24 | mkdir -p data 25 | mv *.zip data 26 | 27 | for filename in $(ls data/*.zip); do 28 | echo "Working on ${filename}" 29 | python3 convert.py "${filename}" 30 | echo "Converting to xz" 31 | pv "${filename%????}.csv" | xz --stdout - > "${filename%????}.csv.xz" 32 | rm "${filename}" 33 | done 34 | --------------------------------------------------------------------------------