├── src └── pacer_tools │ ├── code │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ └── rdf │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── make_graph_data_fulton_county.py │ │ │ └── utils.py │ ├── tasks │ │ ├── __init__.py │ │ ├── build_unique_table.py │ │ └── redact_pacer.py │ ├── downloader │ │ ├── __init__.py │ │ └── demo.json │ ├── parsers │ │ ├── __init__.py │ │ ├── parse_all.sh │ │ ├── parse_subset.sh │ │ ├── schemas │ │ │ ├── jel_v1.schema.json │ │ │ ├── docket_entry_v1.schema.json │ │ │ ├── sel_v1.schema.json │ │ │ ├── party_cv_v1.schema.json │ │ │ ├── party_cr_v1.schema.json │ │ │ ├── case_cv_v1.schema.json │ │ │ └── case_cr_v1.schema.json │ │ ├── README.md │ │ └── parse_summary.py │ ├── support │ │ ├── __init__.py │ │ ├── core.py │ │ ├── .gitignore │ │ ├── scales_shell.py │ │ ├── language_tools.py │ │ ├── core_data │ │ │ ├── district_courts_94.csv │ │ │ ├── statey2code.json │ │ │ └── nature_suit.csv │ │ ├── viz_tools.py │ │ ├── counsel_functions.py │ │ ├── settings.py │ │ ├── stats.py │ │ ├── mongo_connector.py │ │ ├── docket_functions.py │ │ ├── court_functions.py │ │ ├── disambiguation_functions.py │ │ ├── README.md │ │ ├── research_tools.py │ │ ├── bundler.py │ │ └── text_functions.py │ └── cli.py │ ├── data │ ├── exclude.csv │ └── annotation │ │ ├── member_lead_links.jsonl │ │ ├── fjc_district_codes.json │ │ ├── statey2code.json │ │ ├── nature_suit.csv │ │ └── district_courts.csv │ ├── __init__.py │ ├── .gitignore │ ├── requirements.yml │ └── requirements.txt ├── demo ├── document_input.csv ├── auth.json └── query_conf.json ├── .gitignore ├── setup.py └── README.md /src/pacer_tools/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/data/exclude.csv: -------------------------------------------------------------------------------- 1 | ucid 2 | -------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/member_lead_links.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/document_input.csv: 
-------------------------------------------------------------------------------- 1 | ucid,doc_no 2 | psc;;1:07-cv-00431,2 3 | -------------------------------------------------------------------------------- /demo/auth.json: -------------------------------------------------------------------------------- 1 | { 2 | "user": "", 3 | "pass": "" 4 | } 5 | -------------------------------------------------------------------------------- /demo/query_conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "case_status": "closed", 3 | "filed_from": "06/29/2007", 4 | "filed_to": "07/01/2007" 5 | } 6 | -------------------------------------------------------------------------------- /src/pacer_tools/code/downloader/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "case_status": "closed", 3 | "filed_from": "06/29/2007", 4 | "filed_to": "07/01/2007" 5 | } 6 | -------------------------------------------------------------------------------- /src/pacer_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from pacer_tools.code.cli import main as cli 2 | import pacer_tools.code.support.data_tools as dtools 3 | import pacer_tools.code.support.fhandle_tools as ftools -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | def std_path(fpath): 4 | ''' Standardise a filepath, returns a Path object''' 5 | if type(fpath) is str: 6 | fpath = Path(fpath.replace('\\','/')) 7 | return fpath 8 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | data/* 3 | **/*.auth 4 | *.pyc 5 | *.key 6 | *.env 7 | noacri.db 8 | **/.ipynb_checkpoints/* 9 | **/_examples/* 10 | **/_misc.ipynb 11 | **/login.auth 12 | **/*.auth 13 | .vscode/* 14 | **/_temp_/* 15 | code/downloader/test/* 16 | **/geckodriver.log 17 | -------------------------------------------------------------------------------- /src/pacer_tools/code/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from pacer_tools.code.downloader.scrapers import scraper 3 | from pacer_tools.code.parsers.parse_pacer import parser 4 | 5 | 6 | @click.group() 7 | def main(): 8 | pass 9 | 10 | main.add_command(scraper) 11 | main.add_command(parser) 12 | 13 | 14 | if __name__ == '__main__': 15 | main() -------------------------------------------------------------------------------- /src/pacer_tools/.gitignore: -------------------------------------------------------------------------------- 1 | **/node_modules/* 2 | 3 | .DS_Store 4 | **/.ipynb_checkpoints/* 5 | **/_examples/* 6 | **/_misc.ipynb 7 | **/fjc_scott.ipynb 8 | **/_temp_/* 9 | **/living_reports/**/.gitignore 10 | noacri.db 11 | .vscode/* 12 | **/geckodriver.log 13 | code/downloader/test/* 14 | code/downloader/logs/* 15 | **/conductor/logs/* 16 | **/nohup*.out 17 | 18 | **/*.auth 19 | *.pyc 20 | *.key 21 | *.env 22 | **/login.auth 23 | **/*.auth 24 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/parse_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # After 
reading the directory path, passes all arguments through to parse_pacer.py; use -f for force and -d for debug (-fd for both) 3 | # Example: bash parse_all.sh ../../data/pacer -fd 4 | # See arguments documented in parse_pacer.py 5 | 6 | dir=$1 7 | shift 8 | for courtdir in $dir/*/; do 9 | echo $courtdir; 10 | python parse_pacer.py $courtdir/html/ $courtdir/json/ "$@"; 11 | done 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | dist/* 3 | 4 | src/*.egg-info 5 | 6 | **/node_modules/* 7 | 8 | .DS_Store 9 | **/.ipynb_checkpoints/* 10 | **/_examples/* 11 | **/_misc.ipynb 12 | **/fjc_scott.ipynb 13 | **/_temp_/* 14 | **/living_reports/**/.gitignore 15 | noacri.db 16 | .vscode/* 17 | **/geckodriver.log 18 | code/downloader/test/* 19 | code/downloader/logs/* 20 | **/conductor/logs/* 21 | **/nohup*.out 22 | 23 | **/*.auth 24 | *.pyc 25 | *.key 26 | *.env 27 | **/login.auth 28 | **/*.auth 29 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/constants.py: -------------------------------------------------------------------------------- 1 | from rdflib import Namespace 2 | 3 | SCALES = Namespace("http://schemas.scales-okn.org/rdf/scales#") 4 | J = Namespace("http://release.niem.gov/niem/domains/jxdm/7.2/#") 5 | NC = Namespace("http://release.niem.gov/niem/niem-core/5.0/#") 6 | FIPS = Namespace("http://release.niem.gov/niem/codes/fips/5.2/#") 7 | NIBRS = Namespace("http://fbi.gov/cjis/nibrs/2023.0/") 8 | OCCS = Namespace("http://release.niem.gov/niem/codes/occs/5.0/#") 9 | TREATMENT = Namespace("http://sail.ua.edu/ruralkg/treatmentprovider/") -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/parse_subset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$1 4 | shift 5 | while getopts s:e: flag; do 6 | case "$flag" in 7 | s) 8 | startdir=${OPTARG} 9 | ;; 10 | e) 11 | enddir=${OPTARG} 12 | ;; 13 | esac 14 | done 15 | 16 | shift 4 17 | for courtdir in $dir/*; do 18 | if [ ! $(basename $courtdir) \< $(basename $startdir) ] && [ ! 
$(basename $courtdir) \> $(basename $enddir) ] 19 | then 20 | echo "Running on ${courtdir}" 21 | python parse_pacer.py $courtdir/html/ $courtdir/json/ "$@"; 22 | else 23 | echo "Skipping ${courtdir}"; 24 | fi 25 | done 26 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/scales_shell.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | sys.path.append(str(Path(__file__).resolve().parents[1])) 8 | 9 | reload = importlib.reload 10 | import_dict = { 11 | 'support.fhandle_tools': 'ftools', 12 | 'support.settings': 'settings', 13 | 'support.data_tools': 'dtools', 14 | 'support.docket_entry_identification':'dei', 15 | 'support.court_functions':'cf', 16 | 'support.judge_functions':'jf', 17 | } 18 | 19 | print('') 20 | for mod, alias in import_dict.items(): 21 | globals().update({alias:importlib.import_module(mod)}) 22 | print(f"Imported {mod} as {alias}") 23 | 24 | dff = dtools.load_unique_files_df() 25 | print(f"Imported unique files df as dff") 26 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/language_tools.py: -------------------------------------------------------------------------------- 1 | def nearest_ent_index(search_phrase, text, ents): 2 | ''' 3 | Identifies nearest entity to a search phrase in a text block. 4 | Ents must be generated from text, search_phrase must be in text. 5 | input: 6 | * search_phrase -- str, regex to search for 7 | * text -- str, document text 8 | * ents -- list, list of spacy entities that should be considered in search 9 | output: 10 | * min_index -- int, index for the entity list of the closest spacy entity to search phrase 11 | ''' 12 | import re 13 | 14 | bspan, espan = re.search(search_phrase, text).span() 15 | #Subtract bspan, then we want the minimum distance that is positive 16 | start_chars = [ent.start_char - bspan for ent in ents] 17 | m = min(i for i in start_chars) 18 | min_index = start_chars.index(m) 19 | return min_index 20 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core_data/district_courts_94.csv: -------------------------------------------------------------------------------- 1 | akd 2 | almd 3 | alnd 4 | alsd 5 | ared 6 | arwd 7 | azd 8 | cacd 9 | caed 10 | cand 11 | casd 12 | cod 13 | ctd 14 | dcd 15 | ded 16 | flmd 17 | flnd 18 | flsd 19 | gamd 20 | gand 21 | gasd 22 | gud 23 | hid 24 | iand 25 | iasd 26 | idd 27 | ilcd 28 | ilnd 29 | ilsd 30 | innd 31 | insd 32 | ksd 33 | kyed 34 | kywd 35 | laed 36 | lamd 37 | lawd 38 | mad 39 | mdd 40 | med 41 | mied 42 | miwd 43 | mnd 44 | moed 45 | mowd 46 | msnd 47 | mssd 48 | mtd 49 | nced 50 | ncmd 51 | ncwd 52 | ndd 53 | ned 54 | nhd 55 | njd 56 | nmd 57 | nmid 58 | nvd 59 | nyed 60 | nynd 61 | nysd 62 | nywd 63 | ohnd 64 | ohsd 65 | oked 66 | oknd 67 | okwd 68 | ord 69 | paed 70 | pamd 71 | pawd 72 | prd 73 | rid 74 | scd 75 | sdd 76 | tned 77 | tnmd 78 | tnwd 79 | txed 80 | txnd 81 | txsd 82 | txwd 83 | utd 84 | vaed 85 | vawd 86 | vid 87 | vtd 88 | waed 89 | wawd 90 | wied 91 | wiwd 92 | wvnd 93 | wvsd 94 | wyd 95 | -------------------------------------------------------------------------------- /src/pacer_tools/requirements.yml: -------------------------------------------------------------------------------- 1 | # Create this environment using `conda env create -f ` 2 
| # Update this environment by activating it and then `conda env update -f ` 3 | name: scales_env 4 | channels: 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - python=3.8.* 9 | - pandas=1.3.* 10 | - spacy=3.2.* 11 | - scikit-learn 12 | - statsmodels 13 | - ipython 14 | - scipy 15 | - seaborn 16 | - click 17 | - numpy 18 | - tqdm 19 | - selenium 20 | - selenium-requests 21 | - simplejson 22 | - xlrd 23 | - lxml 24 | - bs4 25 | - spacy-lookups-data 26 | - psycopg2 27 | - sqlalchemy 28 | - python-dotenv 29 | - xmltodict 30 | - pymongo 31 | - rdflib 32 | - fuzzywuzzy 33 | - toolz 34 | - pip 35 | - pip: 36 | - usaddress 37 | - anyio 38 | - asyncclick 39 | - flashtext 40 | # - psycopg2-binary 41 | # - cenpy 42 | # - html5lib 43 | # - geopandas 44 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/tasks/build_unique_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | import click 5 | 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | from support import settings 8 | from support import data_tools as dtools 9 | 10 | @click.command() 11 | @click.option('--outfile', '-o', default=settings.UNIQUE_FILES_TABLE, show_default=True) 12 | @click.option('--nrows', '-n', default=None) 13 | def main(outfile, nrows): 14 | 15 | if outfile == settings.UNIQUE_FILES_TABLE: 16 | if not click.confirm(f"Overwrite the existing table at {outfile} ?"): 17 | return 18 | 19 | if nrows: 20 | nrows = int(nrows) 21 | 22 | df = dtools.generate_unique_filepaths(outfile, nrows) 23 | print(f"\nUnique filepaths table (with shape {df.shape}) output to {Path(outfile).resolve()}") 24 | 25 | exist_count = df.fpath.map(lambda x: (settings.PROJECT_ROOT/x).exists()).sum() 26 | print(f'\nFile existence check: {exist_count:,} / {len(df):,}') 27 | 28 | if __name__ == '__main__': 29 | main() 30 |
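A minimal smoke-test sketch for this CLI, using click's standard test runner (the import path and output filename are hypothetical, and the script's support imports are assumed to resolve):

from click.testing import CliRunner
from build_unique_table import main  # import path assumption

runner = CliRunner()
result = runner.invoke(main, ['--outfile', 'table_sample.csv', '--nrows', '100'])
print(result.exit_code, result.output)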
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/viz_tools.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | from matplotlib.ticker import PercentFormatter 3 | from matplotlib.ticker import FuncFormatter 4 | 5 | # Palette 6 | def pal(n=5, ind=False, cmap=False): 7 | ''' 8 | Return a blue to orange palette 9 | 10 | Inputs: 11 | n (int) - no. of colours in the palette 12 | ind (int) - the index of the single colour in the palette to return 13 | cmap (bool) - whether to return a cmap 14 | ''' 15 | h_neg, h_pos, s, l = 255, 22, 99, 65 16 | 17 | if cmap: 18 | return sns.diverging_palette(h_neg, h_pos, s, l, as_cmap=True) 19 | 20 | if n == 3: 21 | palette = [pal(4)[i] for i in [0,2,3]] 22 | else: 23 | palette = sns.diverging_palette(h_neg, h_pos, s, l, n=n) 24 | 25 | #If index specified return a tuple of that color 26 | if type(ind)==int: 27 | return tuple(palette[ind]) 28 | # Else return the whole palette 29 | else: 30 | return palette 31 | 32 | # Graph label formatters 33 | fmt_thou = FuncFormatter(lambda x,p: f"{x:,.0f}") 34 | fmt_perc = PercentFormatter 35 |
-------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/fjc_district_codes.json: -------------------------------------------------------------------------------- 1 | {"00": "med", "47": "ohnd", "01": "mad", "48": "ohsd", "02": "nhd", "49": "tned", "03": "rid", "50": "tnmd", "04": "prd", "51": "tnwd", "05": "ctd", "52": "ilnd", "06": "nynd", "53": "ilcd", "07": "nyed", "54": "ilsd", "08": "nysd", "55": "innd", "09": "nywd", "56": "insd", "10": "vtd", "57": "wied", "11": "ded", "58": "wiwd", "12": "njd", "60": "ared", "13": "paed", "61": "arwd", "14": "pamd", "62": "iand", "15": "pawd", "63": "iasd", "16": "mdd", "64": "mnd", "17": "nced", "65": "moed", "18": "ncmd", "66": "mowd", "19": "ncwd", "67": "ned", "20": "scd", "68": "ndd", "22": "vaed", "69": "sdd", "23": "vawd", "7-": "akd", "24": "wvnd", "70": "azd", "25": "wvsd", "71": "cand", "26": "alnd", "72": "caed", "27": "almd", "73": "cacd", "28": "alsd", "74": "casd", "29": "flnd", "75": "hid", "3A": "flmd", "76": "idd", "3C": "flsd", "77": "mtd", "3E": "gand", "78": "nvd", "3G": "gamd", "79": "ord", "3J": "gasd", "80": "waed", "3L": "laed", "81": "wawd", "3N": "lamd", "82": "cod", "36": "lawd", "83": "ksd", "37": "msnd", "84": "nmd", "38": "mssd", "85": "oknd", "39": "txnd", "86": "oked", "40": "txed", "87": "okwd", "41": "txsd", "88": "utd", "42": "txwd", "89": "wyd", "43": "kyed", "90": "dcd", "44": "kywd", "91": "vid", "45": "mied", "93": "gud", "46": "miwd", "94": "nmid"}
-------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/statey2code.json: -------------------------------------------------------------------------------- 1 | { 2 | "alabama": "al", 3 | "alaska": "ak", 4 | "arizona": "az", 5 | "arkansas": "ar", 6 | "california": "ca", 7 | "colorado": "co", 8 | "connecticut": "ct", 9 | "delaware": "de", 10 | "district of columbia": "dc", 11 | "florida": "fl", 12 | "georgia": "ga", 13 | "hawaii": "hi", 14 | "idaho": "id", 15 | "illinois": "il", 16 | "indiana": "in", 17 | "iowa": "ia", 18 | "kansas": "ks", 19 | "kentucky": "ky", 20 | "louisiana": "la", 21 | "maine": "me", 22 | "maryland": "md", 23 | "massachusetts": "ma", 24 | "michigan": "mi", 25 | "minnesota": "mn", 26 | "mississippi": "ms", 27 | "missouri": "mo", 28 | "montana": "mt", 29 | "nebraska": "ne", 30 | "nevada": "nv", 31 | "new hampshire": "nh", 32 | "new jersey": "nj", 33 | "new mexico": "nm", 34 | "new york": "ny", 35 | "north carolina": "nc", 36 | "north dakota": "nd", 37 | "ohio": "oh", 38 | "oklahoma": "ok", 39 | "oregon": "or", 40 | "pennsylvania": "pa", 41 | "rhode island": "ri", 42 | "south carolina": "sc", 43 | "south dakota": "sd", 44 | "tennessee": "tn", 45 | "texas": "tx", 46 | "utah": "ut", 47 | "vermont": "vt", 48 |
"virginia": "va", 49 | "washington": "wa", 50 | "west virginia": "wv", 51 | "wisconsin": "wi", 52 | "wyoming": "wy", 53 | "guam": "gu", 54 | "northern mariana islands": "nmi", 55 | "puerto rico": "pr", 56 | "virgin islands": "vi" 57 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core_data/statey2code.json: -------------------------------------------------------------------------------- 1 | { 2 | "alabama": "al", 3 | "alaska": "ak", 4 | "arizona": "az", 5 | "arkansas": "ar", 6 | "california": "ca", 7 | "colorado": "co", 8 | "connecticut": "ct", 9 | "delaware": "de", 10 | "district of columbia": "dc", 11 | "florida": "fl", 12 | "georgia": "ga", 13 | "hawaii": "hi", 14 | "idaho": "id", 15 | "illinois": "il", 16 | "indiana": "in", 17 | "iowa": "ia", 18 | "kansas": "ks", 19 | "kentucky": "ky", 20 | "louisiana": "la", 21 | "maine": "me", 22 | "maryland": "md", 23 | "massachusetts": "ma", 24 | "michigan": "mi", 25 | "minnesota": "mn", 26 | "mississippi": "ms", 27 | "missouri": "mo", 28 | "montana": "mt", 29 | "nebraska": "ne", 30 | "nevada": "nv", 31 | "new hampshire": "nh", 32 | "new jersey": "nj", 33 | "new mexico": "nm", 34 | "new york": "ny", 35 | "north carolina": "nc", 36 | "north dakota": "nd", 37 | "ohio": "oh", 38 | "oklahoma": "ok", 39 | "oregon": "or", 40 | "pennsylvania": "pa", 41 | "rhode island": "ri", 42 | "south carolina": "sc", 43 | "south dakota": "sd", 44 | "tennessee": "tn", 45 | "texas": "tx", 46 | "utah": "ut", 47 | "vermont": "vt", 48 | "virginia": "va", 49 | "washington": "wa", 50 | "west virginia": "wv", 51 | "wisconsin": "wi", 52 | "wyoming": "wy", 53 | "guam": "gu", 54 | "northern mariana islands": "nmi", 55 | "puerto rico": "pr", 56 | "virgin islands": "vi" 57 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/tasks/redact_pacer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import spacy 5 | import click 6 | import pandas as pd 7 | 8 | import sys 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).resolve().parents[1])) 11 | from support import data_tools as dtools 12 | nlp = spacy.load("en_core_web_trf") 13 | 14 | 15 | 16 | def _redact_file(fpath, outdir_replacement_target, outdir_replacement_text): 17 | is_html = 'html' in fpath 18 | fpath_new = fpath.replace(outdir_replacement_target, outdir_replacement_text) 19 | data = dtools.load_case(fpath=fpath, html=is_html) 20 | 21 | try: 22 | data_redacted = dtools.redact_private_individual_names(data, is_html=is_html, elective_nlp=nlp) 23 | os.makedirs(os.path.dirname(fpath_new), exist_ok=True) 24 | with open(fpath_new, 'w') as f: 25 | if is_html: 26 | f.write(data_redacted) 27 | else: 28 | json.dump(data_redacted, f) 29 | 30 | print(f'Created {fpath_new}') 31 | except Exception as e: 32 | print(f'Error while creating {fpath_new}: {e}') 33 | 34 | 35 | 36 | @click.command() 37 | @click.argument('file_pattern') 38 | @click.argument('outdir_replacement_target') 39 | @click.argument('outdir_replacement_text') 40 | def main(file_pattern, outdir_replacement_target, outdir_replacement_text): 41 | 42 | fpaths = glob.glob(file_pattern) 43 | print(f'Compiled list of {len(fpaths)} files to redact') 44 | for fpath in fpaths: 45 | _redact_file(str(Path(fpath).resolve()), outdir_replacement_target, outdir_replacement_text) 46 | print('Finished redacting') 47 | 48 | if __name__ == '__main__': 49 | main() 
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/counsel_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import json 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | 8 | from support import fhandle_tools as ftools 9 | from support import settings 10 | 11 | 12 | def load_counsel_clusters(): 13 | ''' Simple Loader Function''' 14 | return pd.read_json(settings.COUNSEL_DIS_CLUSTS, lines=True) 15 | 16 | def load_disambiguated_counsels(ucid, as_df=True): 17 | ''' 18 | Load Counsel data (from relevant .jsonl files in the COUNSEL_DIS_DIR) 19 | 20 | Inputs: 21 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 22 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 23 | 24 | Output: 25 | (pd.DataFrame or list of dicts) Disambiguated counsel data for the given ucid(s) 26 | ''' 27 | 28 | # Coerce to an iterable 29 | if type(ucid) is str: 30 | ucid = [ucid] 31 | 32 | ROW_DAT = [] 33 | for each in ucid: 34 | # create filepath 35 | fname = ftools.build_counsel_filename_from_ucid(each) 36 | # load file 37 | results = [] 38 | if fname.exists(): 39 | with open(fname, 'r') as json_file: 40 | json_list = list(json_file) 41 | for json_str in json_list: 42 | results.append(json.loads(json_str)) 43 | 44 | ROW_DAT+=results 45 | 46 | # return dataframe 47 | if ROW_DAT: 48 | if as_df: 49 | COUNSELS = pd.DataFrame(ROW_DAT) 50 | else: 51 | COUNSELS = ROW_DAT 52 | 53 | return COUNSELS 54 | else: 55 | return None 56 | 57 | 58 | def load_firm_clusters(): 59 | return 60 | 61 | def load_disambiguated_firms(ucid, as_df=True): 62 | return
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/settings.py: -------------------------------------------------------------------------------- 1 | ''' 2 | File: settings.py 3 | Author: Adam Pah 4 | Description: Settings file 5 | ''' 6 | import sys 7 | from pathlib import Path 8 | sys.path.append(str(Path(__file__).resolve().parents[1])) 9 | 10 | 11 | PROJECT_ROOT = Path(__file__).resolve().parents[2] 12 | 13 | CORE_DATA = PROJECT_ROOT / 'code'/ 'support' / 'core_data' 14 | DATAPATH = PROJECT_ROOT / 'data' 15 | ANNO_PATH = DATAPATH / 'annotation' 16 | PACER_PATH = DATAPATH / 'pacer' # generate using scrapers.py 17 | 18 | COURTFILE = CORE_DATA / 'district_courts.csv' 19 | DISTRICT_COURTS_94 = CORE_DATA / 'district_courts_94.csv' 20 | STATEY2CODE = CORE_DATA / 'statey2code.json' 21 | NATURE_SUIT = CORE_DATA / 'nature_suit.csv' 22 | JUDGEFILE = CORE_DATA / 'judge_demographics.csv' 23 | BAMAG_JUDGES = CORE_DATA / 'brmag_judges.csv' 24 | BAMAG_POSITIONS = CORE_DATA / 'brmag_positions.csv' 25 | 26 | MEM_DF = DATAPATH / 'member_cases.csv' 27 | LOG_DIR = DATAPATH / 'logs' 28 | EXCLUDE_CASES = DATAPATH / 'exclude.csv' 29 | UNIQUE_FILES_TABLE = DATAPATH / 'unique_docket_filepaths_table.csv' # generate using generate_unique_filepaths in data_tools.py 30 | FJC = DATAPATH / 'fjc' # generate using fjc.gov/research/idb and fjc_functions.py 31 | 32 | MEMBER_LEAD_LINKS = ANNO_PATH / 'member_lead_links.jsonl' 33 | ROLE_MAPPINGS = ANNO_PATH / 'role_mappings.json' 34 | JEL_JSONL = ANNO_PATH / 'judge_disambiguation' / 'JEL.jsonl' # generate using the Research-Materials repo 35 | ONTOLOGY_LABELS = ANNO_PATH / 'ontology' / 'labels.csv' # generate using the scales-nlp repo 36 |
37 | ANNO_PATH_CLAYTON = ANNO_PATH / 'counties' / 'ga_clayton' 38 | NIBRS_CATEGORIES_CLAYTON = ANNO_PATH_CLAYTON / 'nibrs' / 'nibrs_categories.csv' 39 | NIBRS_CROSSWALK_CLAYTON = ANNO_PATH_CLAYTON / 'nibrs' / 'nibrs_crosswalk.csv' 40 | 41 | # included on behalf of make_graph_data_pacer.py 42 | # (in infrastructure_dev, this is a dev/prod switch, but it's not included here because pacer-tools is always prod) 43 | use_datastore = lambda path: path 44 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/jel_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/JEL_v1.json", 4 | "title": "JEL Schema", 5 | "description": "The JEL is the Judge Entity Lookup for all unique judge entities", 6 | "properties": { 7 | "name": { 8 | "type": "string", 9 | "description": "The name of the unique judge entity" 10 | }, 11 | "Presentable_Name": { 12 | "type": "string", 13 | "description": "A human readable entity name with first letters capitalized in each token" 14 | }, 15 | "SJID": { 16 | "type": "string", 17 | "description": "Unique SCALES Judge Identifier for the Parent Entity associated with this location" 18 | }, 19 | "SCALES_Judge_Label": { 20 | "type": "string", 21 | "enum": ["FJC Judge","Magistrate_Judge","Nondescript_Judge","Judicial_Actor","Bankruptcy_Judge","District_Judge"], 22 | "description": "The predicted judge type based on the SCALES disambiguation routine and algorithmic labelling" 23 | }, 24 | "Head_UCIDs": { 25 | "type": "number", 26 | "description": "The total number of unique docket headers this entity existed on from our disambiguation sample" 27 | }, 28 | "Tot_UCIDs": { 29 | "type": "number", 30 | "description": "The total number of unique dockets this entity existed on from our disambiguation sample" 31 | }, 32 | "Full_Name": { 33 | "type": "string", 34 | "description": "If the judge entity is a known Article III judge from the FJC biographical directory, then this is the concatenation of FJC name fields" 35 | }, 36 | "NID": { 37 | "type": "string", 38 | "description": "If the judge entity is a known Article III judge from the FJC biographical directory, then this is the FJC NID field" 39 | } 40 | } 41 | }
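A quick way to sanity-check a JEL record against the schema above (a minimal sketch: it requires the third-party jsonschema package, which is not in requirements.txt, and the record values are hypothetical):

import json
from jsonschema import validate

with open('jel_v1.schema.json') as f:
    schema = json.load(f)

record = {
    'name': 'jane m doe',                     # hypothetical entity
    'Presentable_Name': 'Jane M Doe',
    'SJID': 'SJ000001',                       # hypothetical identifier format
    'SCALES_Judge_Label': 'District_Judge',
    'Head_UCIDs': 120,
    'Tot_UCIDs': 240,
}
validate(instance=record, schema=schema)      # raises ValidationError if the record is malformed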
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/stats.py: -------------------------------------------------------------------------------- 1 | def bootstrap_district_variation(checkdf): 2 | ''' 3 | For each court, does a t-test on the difference between an individual judge and the other judges in the same court 4 | Accounts for uneven sample sizes 5 | 6 | input: 7 | * checkdf - dataframe where each row is a case, columns are: 8 | ['court', 'judge', 'resolution'] 9 | A positive outcome for the procedural ruling ('resolution') is 1 and a negative outcome is 0 10 | standard social science encoding 11 | output: 12 | * scidf - dataframe where each row is a judge, columns are: 13 | ['Judge', 'Diff', 'LB', 'UB', 'sig'] 14 | diff is the actual difference, lb and ub are the confidence bounds, and sig is 1 if the interval doesn't cross zero 15 | ''' 16 | import numpy as np 17 | from scipy import stats 18 | import pandas as pd 19 | 20 | def _identify_sig(row): 21 | if np.sign(row['LB'])==np.sign(row['UB']): 22 | return 1 23 | else: 24 | return 0 25 | 26 | judge_data = [] 27 | courts = [x for x in checkdf.court.unique() if x!='nmid'] 28 | for court in courts: 29 | #Just subset to keep the naming shorter 30 | cdf = checkdf[checkdf.court == court] 31 | #Get the judge list 32 | judges = cdf.judge.unique() 33 | #District differences 34 | for j in judges: 35 | jdf = cdf[cdf.judge==j] 36 | njdf = cdf[cdf.judge!=j] 37 | mu_1 = np.mean(jdf.resolution) 38 | mu_2 = np.mean(njdf.resolution) 39 | s_1 = np.std(jdf.resolution, ddof=1) 40 | s_2 = np.std(njdf.resolution, ddof=1) 41 | diff = (mu_1-mu_2) 42 | #Uneven samples 43 | se = np.sqrt(s_1**2/len(jdf) + s_2**2/len(njdf)) 44 | ndf = (se**2)**2/( (s_1**2/len(jdf))**2/(len(jdf)-1) + (s_2**2/len(njdf))**2/(len(njdf)-1) ) 45 | lb = diff - stats.t.ppf(0.975, ndf)*se 46 | ub = diff + stats.t.ppf(0.975, ndf)*se 47 | 48 | judge_data.append([j, diff, lb, ub]) 49 | 50 | scidf = pd.DataFrame(judge_data, columns = ['Judge', 'Diff', 'LB', 'UB']) 51 | scidf['sig'] = scidf.apply(_identify_sig, axis=1) 52 | return scidf 53 |
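A minimal usage sketch for bootstrap_district_variation (toy data; column names as in the docstring):

import pandas as pd
# from support.stats import bootstrap_district_variation  # import path assumption

toy = pd.DataFrame({
    'court': ['ilnd'] * 6,
    'judge': ['a', 'a', 'a', 'b', 'b', 'b'],
    'resolution': [1, 1, 0, 0, 0, 1],
})
scidf = bootstrap_district_variation(toy)
print(scidf[['Judge', 'Diff', 'sig']])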
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # to update the pypi package: 2 | # (1) rm -r build dist (not strictly necessary, but prevents superfluous uploads of old versions) 3 | # (2) iterate the version in this file 4 | # (3) python setup.py bdist_wheel --universal (check.warn(importable) can be ignored if data_files takes care of the directories in question) 5 | # (4) twine upload dist/* (requires pypi credentials) 6 | 7 | from setuptools import setup, find_packages 8 | from glob import glob 9 | 10 | from pathlib import Path 11 | base_dir = Path(__file__).parent 12 | long_description = (base_dir / "README.md").read_text() 13 | 14 | setup( 15 | name='pacer-tools', 16 | version='0.1.12', 17 | long_description=long_description, 18 | long_description_content_type='text/markdown', 19 | package_dir={'': 'src'}, 20 | packages=find_packages('src'), 21 | install_requires=[ 22 | 'async-generator', 'attrs', 'beautifulsoup4', 'bs4', 23 | 'cchardet', 'cffi', 'chardet', 'charset-normalizer', 24 | 'click', 'configuration-maker', 'cryptography', 25 | 'cssselect', 'feedparser', 'filelock', 'future', 26 | 'geonamescache', 'h11', 'html5lib', 'idna', 27 | 'lxml', 'numpy', 'outcome', 'pandas', 'pathlib', 28 | 'probableparsing', 'pycparser', # 'pymongo', 29 | 'pyOpenSSL', 'PySocks', 30 | 'python-crfsuite', 'python-dateutil', 'python-dotenv', 31 | 'python-Levenshtein', 'pytz', 'rdflib', 'requests', 32 | 'requests-file', 'scipy', 'selenium', 33 | 'selenium-requests', 'sgmllib3k', 'simplejson', 34 | 'six', 'sniffio', 35 | 'sortedcontainers', 'soupsieve', 'tldextract', 36 | 'tqdm', 'trio', 'trio-websocket', 'urllib3', 37 | 'urllib3-secure-extra', 'usaddress', 'webencodings', 38 | 'wsproto', 'xmltodict' 39 | ], 40 | entry_points={ 41 | 'console_scripts': [ 42 | 'pacer-tools = pacer_tools:cli', 43 | ], 44 | }, 45 | data_files=[ 46 | ('pacer_tools', glob('src/pacer_tools/code/support/core_data/*.*')), 47 | ('pacer_tools', glob('src/pacer_tools/data/*.*')), 48 | ('pacer_tools', glob('src/pacer_tools/data/annotation/*.*')), 49 | ('pacer_tools', glob('src/pacer_tools/data/annotation/counties/ga_clayton/nibrs/*.*')), 50 | ], 51 | include_package_data = True, 52 | ) 53 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/mongo_connector.py: -------------------------------------------------------------------------------- 1 | ''' Based on: https://gist.github.com/mangangreg/f84d8899e961c48a8539b813e746eac6 2 | ''' 3 | import os 4 | import sys 5 | import time 6 | from pathlib import Path 7 | 8 | from pymongo import MongoClient 9 | from dotenv import load_dotenv 10 | 11 | HERE = Path(__file__).parent 12 | 13 | class SCALESMongo: 14 | def __init__(self, user=None, password=None, host=None, port=None, database=None, env_file=HERE/'.mongo.env'): 15 | 16 | # Load the env file 17 | load_dotenv(env_file) 18 | 19 | self.user = user or os.getenv('MONGO_USER') 20 | self.password = password or os.getenv('MONGO_PASSWORD') 21 | self.host = host or os.getenv('MONGO_HOST') or 'localhost' 22 | self.port = port or os.getenv('MONGO_PORT') or 27017 23 | self.database = database or os.getenv('MONGO_DATABASE') 24 | 25 | # Build the URI 26 | self.URI = self._constructURI() 27 | 28 | # Initialise connection and db 29 | self.connection = None 30 | self.db = None 31 | 32 | def _constructURI(self): 33 | return f"mongodb://{self.user}:{self.password}@{self.host}:{self.port}" 34 | 35 | def connect(self): 36 | self.connection = MongoClient(self.URI) 37 | self.db = self.connection[self.database] 38 | 39 | class SaneResult: 40 | ''' A sane/readable Pymongo result object ''' 41 | 42 | def __init__(self, res): 43 | self.res = res 44 | self.counts = self.build_counts(res) 45 | self.counts_string = " ".join(f"{k}={v}" for k,v in self.counts.items()).rstrip() 46 | 47 | def __repr__(self): 48 | if not self.res: 49 | return '' 50 | class_str = str(self.res.__class__).strip('<> ') 51 | return f"<{class_str} acknowledged={self.res.acknowledged} {self.counts_string}>" 52 | 53 | def build_counts(self, res): 54 | ''' Find the attributes that contain insert/update count numbers ''' 55 | 56 | counts = {} 57 | for k in dir(res): 58 | if k.endswith('count'): 59 | counts.update({k: res.__getattribute__(k)}) 60 | elif k.endswith('_ids') and not k.startswith('_'): 61 | counts.update({k.split('_ids',maxsplit=1)[0]: len(res.__getattribute__(k))}) 62 | 63 | return counts 64 | 65 | 66 |
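A minimal connection sketch for SCALESMongo (credentials and database name are hypothetical; in practice they come from the .mongo.env file):

from mongo_connector import SCALESMongo  # import path assumption

sm = SCALESMongo(user='scales_user', password='...', database='scales')
sm.connect()
print(sm.db.list_collection_names())     # standard pymongo Database method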
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/docket_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | def restrict_to_court_motions(docket_entries): 3 | ''' 4 | Restricts docket entries to court motions 5 | restriction is based on the idea that entries recording court action end with the clerk's initials in parentheses, which is what re_court_action matches 6 | input: 7 | docket entries list [[date, num, text], ...] 8 | output: 9 | list of indices of the responsive docket entries 10 | ''' 11 | 12 | re_court_action = re.compile(r'\([a-z\, ]{3,20}\)') 13 | responsive = [] 14 | for i, docket_line in enumerate(docket_entries): 15 | #Does the docket line exist, if not do nothing 16 | if len(docket_line) == 3: 17 | try: 18 | search_result = re_court_action.search(docket_line[-1]) 19 | if search_result != None: 20 | responsive.append(i) 21 | except TypeError: 22 | #Not a string 23 | pass 24 | return responsive 25 | 26 | 27 | def checker_notice_of_removal(docket_entries): 28 | ''' 29 | Checks the docket to see if a case has been removed 30 | ''' 31 | removed_case = False 32 | if len(docket_entries) > 0: 33 | for line in docket_entries: 34 | try: 35 | if 'notice of removal' in line[-1].lower(): 36 | removed_case = True 37 | except: 38 | pass 39 | return removed_case 40 | 41 | 42 | def inter_event_series(docket_entries, docket_indices): 43 | ''' 44 | For a given docket, constructs the inter event time series 45 | input: 46 | docket entries list [[date, num, text], ...] 47 | output: 48 | list inter event series in days [0, 2, 3, ....] 49 | ''' 50 | import pandas as pd 51 | import numpy as np 52 | if len(docket_indices) > 0: 53 | df = pd.DataFrame(np.array(docket_entries)[docket_indices], columns=['date','link','desc']) 54 | df['pdate'] = pd.to_datetime(df.date) 55 | inter_event = df['pdate'].diff().dt.days[1:].values.tolist() 56 | else: 57 | return [] 58 | return inter_event 59 | 60 | def find_pattern(docket_entries, pat, rlim=None): 61 | ''' 62 | Binary check for occurrence of pattern in docket, returns True if at least one match 63 | inputs 64 | - docket_entries(list) - docket entries list from case json 65 | - pat (regex) - the pattern to search for 66 | - rlim (int) - the right-hand character index limit to search up to 67 | output 68 | (bool) - Returns true if pattern found on any line 69 | ''' 70 | if not docket_entries or not len(docket_entries): 71 | return False 72 | 73 | # Deal with singleton line 74 | if type(docket_entries)==list and type(docket_entries[0]) != list: 75 | return False 76 | 77 | for line in docket_entries: 78 | 79 | if len(line)==4 and type(line[2])==str: 80 | if re.search(pat, line[2][:rlim], re.I): 81 | return True 82 | 83 | return False
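A minimal sketch of these helpers on a toy docket (entry format as in the docstrings above; dates and clerk initials hypothetical):

# restrict_to_court_motions and checker_notice_of_removal are the functions defined above
entries = [
    ['01/05/2020', 1, 'COMPLAINT filed by Plaintiff. (abc, )'],
    ['01/20/2020', 2, 'NOTICE of removal from Circuit Court. (xyz, )'],
]
print(restrict_to_court_motions(entries))  # -> [0, 1]
print(checker_notice_of_removal(entries))  # -> True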
35 | "type": "number", 36 | "description": "The target node (the preceeding row/document its pointing to)" 37 | }, 38 | { 39 | "type": "object", 40 | "description": "The span, relative to docket_text, where the reference appears", 41 | "properties":{ 42 | "start": { 43 | "type": "number", 44 | "description": "The character index, relative to docket_text, of the start of the reference" 45 | }, 46 | "end": { 47 | "type": "number", 48 | "description": "The character index, relative to docket_text, of the end of the reference" 49 | } 50 | } 51 | } 52 | 53 | ] 54 | } 55 | }, 56 | "$defs": { 57 | "document": { 58 | "type": "object", 59 | "description": "A document associated with a case. Keys in this object are ..", 60 | "propertyNames": "^\\d+$", 61 | "additionalProperties": { 62 | "type": "object", 63 | "description":"", 64 | "properties": { 65 | "url": { 66 | "type": "string", 67 | "description": "The url of the document" 68 | }, 69 | "span": { 70 | "type": "object", 71 | "description": "The span, relative to docket_text, where the reference to the document appears (always null for the 0th document, which is linked from the '#' column", 72 | "properties":{ 73 | "start": { 74 | "type": "number", 75 | "description": "The character index, relative to docket_text, of the start of the reference" 76 | }, 77 | "end": { 78 | "type": "number", 79 | "description": "The character index, relative to docket_text, of the end of the reference" 80 | } 81 | } 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/sel_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/SEL_v1.json", 4 | "title": "SEL Schema", 5 | "description": "SEL files are Spacy Entity Lookup files for a particular docket's judge entities" , 6 | "properties": { 7 | "Entity_Extraction_Method": { 8 | "type": "string", 9 | "enum": ["SPACY JNLP2", "referred_judges", "assigned_judge"], 10 | "description": "The method used to extract and identify an entity at this location. A Spacy Judge NLP Model, or docket key values" 11 | }, 12 | "docket_source": { 13 | "type": "string", 14 | "enum": ["line_entry","case_header", "case_parties"], 15 | "description": "Source from the docket that this entity was pulled from." 
16 | }, 17 | "judge_enum": { 18 | "type": "number", 19 | "description": "If there are multiple entities in a lookup source, we enumerate them pythonically in the order they are listed; null for docket entries" 20 | }, 21 | "party_enum": { 22 | "type": "number", 23 | "description": "The enumerated party number (0-indexed) that the judge entity was tied to on a criminal case; null for civil and docket entries" 24 | }, 25 | "pacer_id": { 26 | "type": "number", 27 | "description": "If the judge in the header metadata was given an ID on the PACER HTML, we replicate that here" 28 | }, 29 | "docket_index": { 30 | "type": "number", 31 | "description": "The docket entry index (0-start) that this entity is located at; null for case_header and case_parties" 32 | }, 33 | "ucid": { 34 | "type": "string", 35 | "description": "Unique case identifier (SCALES internal)" 36 | }, 37 | "cid": { 38 | "type": "string", 39 | "description": "Local court case identifier" 40 | }, 41 | "court": { 42 | "type": "string", 43 | "description": "Abbreviation for the Federal District Court this case is docketed at" 44 | }, 45 | "year": { 46 | "type": "number", 47 | "description": "Approximation of the filing date year for the overall case" 48 | }, 49 | "original_text": { 50 | "type": "string", 51 | "description": "Original string of text found on the docket that our entity was extracted from, padded with preceding and trailing tokens" 52 | }, 53 | "Extracted_Entity": { 54 | "type": "string", 55 | "description": "Specific string of text that represents the entity this row of data references" 56 | }, 57 | "Prefix_Categories": { 58 | "type": "string", 59 | "enum":["assigned_judge","referred_judges", "Bankruptcy_Judge","Circuit_Appeals","District_Judge","Magistrate_Judge","Nondescript_Judge","Judicial_Actor"], 60 | "description": "The categorization of the text preceding the entity. The label corresponds to types of words appearing before the judge entity" 61 | }, 62 | "Transferred_Flag": { 63 | "type": "boolean", 64 | "description": "A boolean flag indicating if the judge entity was immediately preceded by terminology related to case transferrals." 65 | }, 66 | "full_span_start": { 67 | "type": "number", 68 | "description": "Span starting point for the Original Text with respect to the overall text at the particular docket_source-docket_index location" 69 | }, 70 | "full_span_end": { 71 | "type": "number", 72 | "description": "Span ending point for the Original Text with respect to the overall text at the particular docket_source-docket_index location" 73 | }, 74 | "Entity_Span_Start": { 75 | "type": "number", 76 | "description": "Span starting point for the Extracted Entity with respect to the overall text at the particular docket_source-docket_index location" 77 | }, 78 | "Entity_Span_End": { 79 | "type": "number", 80 | "description": "Span ending point for the Extracted Entity with respect to the overall text at the particular docket_source-docket_index location" 81 | }, 82 | "Parent_Entity": { 83 | "type": "string", 84 | "description": "The identified unique entity that the Extracted Entity in this data row refers to."
85 | }, 86 | "SJID": { 87 | "type": "string", 88 | "description": "Unique SCALES Judge Identifier for the Parent Entity associated with this location" 89 | } 90 | } 91 | }
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/court_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import sys 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | sys.path.append(str(Path(__file__).resolve().parents[1])) 9 | from support import settings 10 | 11 | CARDINALS = ['northern', 'southern', 'eastern', 'western', 'central', 'middle'] 12 | STATEY2CODE = json.load(open(settings.STATEY2CODE)) 13 | COURTS_94 = [x.strip() for x in open(settings.DISTRICT_COURTS_94).readlines()] 14 | 15 | re_state_codes = '|'.join(STATEY2CODE.values()) 16 | re_card = '|'.join(x[0].lower() for x in CARDINALS) 17 | re_court_abbrev = rf"^(?P<state>{re_state_codes})(?P<card>({re_card})?)d$" 18 | 19 | # Read in court data 20 | courtdf = pd.read_csv(settings.COURTFILE, index_col=0) 21 | 22 | abbr2name_dict = dict(zip(courtdf.index, courtdf.courtname)) 23 | name2abbr_dict = dict(zip(courtdf.courtname, courtdf.index)) 24 | 25 | # Full name like "Oklahoma Western", useful for fjc 26 | full_name = (courtdf.state +' ' + courtdf.cardinal.fillna('')).str.strip() 27 | fullname2abbr_dict = dict(zip(full_name,courtdf.index)) 28 | 29 | def make_courtname(row): 30 | ''' 31 | Creates a court name that looks like [Cardinal]-[State] 32 | ''' 33 | courtname = '' 34 | 35 | if type(row.cardinal) == str: 36 | courtname += row.cardinal + '-' 37 | 38 | courtname += row.state 39 | courtname = courtname.lower().replace(' ','-') 40 | return courtname 41 | 42 | def abbr2name(abbr): 43 | ''' 44 | Convert court abbreviation to court name 45 | inputs: 46 | abbr - 4-letter court abbreviation e.g. ilnd 47 | outputs: 48 | courtname: the name of the court e.g. northern-illinois 49 | ''' 50 | return abbr2name_dict[abbr] 51 | 52 | def name2abbr(name, ordinal_first=True): 53 | ''' 54 | Convert court name to court abbreviation 55 | inputs: 56 | name - court name e.g. 'northern illinois' or 'northern-illinois' 57 | outputs: 58 | abbr - 4-letter court abbreviation e.g. ilnd 59 | ''' 60 | if 'district' in name and 'columbia' not in name: 61 | name = name.replace('district', '').rstrip() 62 | 63 | # If the ordinal is not first, reverse it: 64 | if not ordinal_first: 65 | nlist = name.split() 66 | if nlist[-1] in CARDINALS: 67 | name = " ".join([nlist[-1], *nlist[:-1]]) 68 | if ' ' in name or '-' not in name: 69 | name = name.lower().replace(' ', '-') 70 | 71 | return name2abbr_dict[name] 72 | 73 | def abbr2full(abbr): 74 | ''' 75 | Convert a court abbreviation to the full title format 76 | Ex. 'txsd' -> "U.S. District Court for the Southern District of Texas" 77 | 78 | Inputs: 79 | abbr (str) - court abbreviation 80 | Outputs: 81 | str 82 | ''' 83 | #Get the court abbreviation cardinal direction and state name from the court dataframe 84 | try: 85 | cardinal = courtdf[courtdf.index.eq(abbr)].cardinal.values[0] 86 | cardinal = cardinal + ' ' if (type(cardinal)==str) else '' 87 | state = courtdf[courtdf.index.eq(abbr)].state.values[0] 88 | 89 | #Make the string 90 | return f"U.S. District Court for the {cardinal}District of {state}" 91 | except IndexError: 92 | print("Error with court abbreviation:", abbr) 93 | return None 94 | 95 | def classify(court_raw): 96 | ''' Classify any district court ''' 97 | court = court_raw.lower() 98 | # Check if it already matches an abbreviation 99 | if re.match(re_court_abbrev, court): 100 | return court 101 | 102 | # Deal with DC separately as 'District' has problematic matching 103 | elif 'columbia' in court: 104 | return 'dcd' 105 | 106 | else: 107 | # Look for state and cardinal words 108 | court = re.sub('[-,]',' ', court).strip() 109 | state = re.search("|".join(STATEY2CODE.keys()), court) 110 | if not state: 111 | return 112 | elif state.group() == 'northern mariana islands': 113 | card_letter = '' 114 | else: 115 | cardinal = re.search("|".join(CARDINALS), court) 116 | if cardinal: 117 | card_letter = cardinal.group()[0] 118 | 119 | # Case with "District Court, N.D. Illinois" 120 | elif not cardinal and 'D.' in court_raw: 121 | #Search for cardinal letter (case sensitive) 122 | match = re.search(r"(?P<card_letter>[A-Z])\.", court_raw.replace("D.",'')) 123 | card_letter = match.groupdict()['card_letter'].lower() if match else '' 124 | else: 125 | card_letter = '' 126 | 127 | 128 | # state code + cardinal letter + 'd' e.g. ilnd 129 | abbrev = f"{STATEY2CODE[state.group()]}{card_letter}d" 130 | return abbrev 131 |
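A few illustrative calls to classify (inputs hypothetical; expected outputs follow from the mappings above):

classify('ilnd')                            # -> 'ilnd' (already a valid abbreviation)
classify('Northern District of Illinois')   # -> 'ilnd'
classify('District of Columbia')            # -> 'dcd'
classify('District Court, N.D. Illinois')   # -> 'ilnd'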
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/disambiguation_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import json 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | 8 | from support import fhandle_tools as ftools 9 | from support import settings 10 | 11 | 12 | def load_CENSUS_cities(): 13 | ''' Simple Loader Function''' 14 | return pd.read_csv(settings.CENSUS_CITIES, encoding="ISO-8859-1") 15 | 16 | def load_AMLAW_100(): 17 | ''' Simple Loader Function''' 18 | return pd.read_csv(settings.AMLAW_100) 19 | 20 | def load_hybrid_350(): 21 | ''' Simple Loader Function''' 22 | return pd.read_csv(settings.HYBRID_FIRMS) 23 | 24 | def load_counsel_clusters(): 25 | ''' Simple Loader Function''' 26 | return pd.read_json(settings.COUNSEL_DIS_CLUSTS, lines=True) 27 | 28 | def load_firm_clusters(): 29 | ''' Simple Loader Function''' 30 | return pd.read_json(settings.FIRM_DIS_CLUSTS, lines=True) 31 | 32 | def load_party_clusters(): 33 | ''' Simple Loader Function''' 34 | return pd.read_json(settings.PARTY_DIS_CLUSTS, lines=True) 35 | 36 | def load_disambiguated_counsels(ucid, as_df=True, collection_location=None): 37 | ''' 38 | Load Counsel data (from relevant .jsonl files in the COUNSEL_DIS_DIR) 39 | 40 | Inputs: 41 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 42 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 43 | 44 | Output: 45 | (pd.DataFrame or list of dicts) Disambiguated counsel data for the given ucid(s) if the counsel appeared multiple times in the corpus 46 | ''' 47 | 48 | # Coerce to an iterable 49 | if type(ucid) is str: 50 | ucid = [ucid] 51 | 52 | ROW_DAT = [] 53 | for each in ucid: 54 | # create filepath 55 | fname = ftools.build_counsel_filename_from_ucid(each, collection_location) 56 | # load file 57 | results = [] 58 | if fname.exists(): 59 | with open(fname, 'r') as json_file: 60 | json_list = list(json_file) 61 | for
json_str in json_list: 62 | results.append(json.loads(json_str)) 63 | 64 | ROW_DAT+=results 65 | 66 | # return dataframe 67 | if ROW_DAT: 68 | if as_df: 69 | COUNSELS = pd.DataFrame(ROW_DAT) 70 | else: 71 | COUNSELS = ROW_DAT 72 | 73 | return COUNSELS 74 | else: 75 | return None 76 | 77 | def load_disambiguated_firms(ucid, as_df=True, collection_location=None): 78 | ''' 79 | Load Firm data (from relevant .jsonl files in the FIRM_DIS_DIR) 80 | 81 | Inputs: 82 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 83 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 84 | 85 | Output: 86 | (pd.DataFrame or list of dicts) Disambiguated firm data for the given ucid(s) if the firm appeared multiple times in the corpus 87 | ''' 88 | 89 | # Coerce to an iterable 90 | if type(ucid) is str: 91 | ucid = [ucid] 92 | 93 | ROW_DAT = [] 94 | for each in ucid: 95 | # create filepath 96 | fname = ftools.build_firm_filename_from_ucid(each, collection_location) 97 | # load file 98 | results = [] 99 | if fname.exists(): 100 | with open(fname, 'r') as json_file: 101 | json_list = list(json_file) 102 | for json_str in json_list: 103 | results.append(json.loads(json_str)) 104 | 105 | ROW_DAT+=results 106 | 107 | # return dataframe 108 | if ROW_DAT: 109 | if as_df: 110 | FIRMS = pd.DataFrame(ROW_DAT) 111 | else: 112 | FIRMS = ROW_DAT 113 | 114 | return FIRMS 115 | else: 116 | return None 117 | 118 | 119 | def load_disambiguated_parties(ucid, as_df=True, collection_location=None): 120 | ''' 121 | Load Party data (from relevant .jsonl files in the PARTY_DIS_DIR) 122 | 123 | Inputs: 124 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 125 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 126 | 127 | Output: 128 | (pd.DataFrame or list of dicts) Disambiguated party data for the given ucid(s) if the party appeared multiple times in the corpus 129 | ''' 130 | 131 | # Coerce to an iterable 132 | if type(ucid) is str: 133 | ucid = [ucid] 134 | 135 | ROW_DAT = [] 136 | for each in ucid: 137 | # create filepath 138 | fname = ftools.build_party_filename_from_ucid(each, collection_location) 139 | # load file 140 | results = [] 141 | if fname.exists(): 142 | with open(fname, 'r') as json_file: 143 | json_list = list(json_file) 144 | for json_str in json_list: 145 | results.append(json.loads(json_str)) 146 | 147 | ROW_DAT+=results 148 | 149 | # return dataframe 150 | if ROW_DAT: 151 | if as_df: 152 | PARTIES = pd.DataFrame(ROW_DAT) 153 | else: 154 | PARTIES = ROW_DAT 155 | 156 | return PARTIES 157 | else: 158 | return None 159 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/party_cv_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/party_cv_v1.schema.json", 4 | "type": "object", 5 | "properties": { 6 | "name": { 7 | "type": "string", 8 | "description": "Name of the party in the case" 9 | }, 10 | "counsel": { 11 | "type": "array", 12 | "description": "An array of the party's lawyers in the case", 13 | "items": { 14 | "type": "object", 15 | "properties": { 16 | "name": { 17 | "type": "string", 18 | "description": "Counsel name" 19 | }, 20 | "office_name": { 21 | "type": "string", 22 | "description": "Name of the counsel's firm or legal office" 23 | }, 24 |
"address": { 25 | "type": "string", 26 | "description": "Counsel address, string delimited by '\\n' newline character" 27 | }, 28 | "phone": { 29 | "type": "string", 30 | "description": "Counsel phone number, e.g. (123) 456-7890" 31 | }, 32 | "fax": { 33 | "type": "null", 34 | "description": "Counsel fax number" 35 | }, 36 | "email": { 37 | "type": "string", 38 | "description": "Counsel email address" 39 | }, 40 | "is_lead_attorney": { 41 | "type": "boolean", 42 | "description": "Whether or not counsel is listed as lead attorney" 43 | }, 44 | "is_pro_hac_vice": { 45 | "type": "boolean", 46 | "description": "Whether or not counsel is listed as pro hac vice" 47 | }, 48 | "is_notice_attorney": { 49 | "type": "boolean", 50 | "description": "Whether or not counsel is listed as attorney to be noticed" 51 | }, 52 | "see_above_for_address": { 53 | "type": "boolean", 54 | "description": "Whether or not address is listed as 'see above for address', meaning address info should be obtained from preceding counsel entries" 55 | }, 56 | "designation": { 57 | "type": "string", 58 | "description": "This counsel's designation within the case (Retained, Government Attorney, Public Defender, etc)" 59 | }, 60 | "bar_status": { 61 | "type": "string", 62 | "description": "This counsel's standing with respect to the general bar (Admitted, Not Admitted, etc)" 63 | }, 64 | "trial_bar_status": { 65 | "type": "string", 66 | "description": "This counsel's standing with respect to the ILND trial bar, when applicable" 67 | }, 68 | "counsel_terminating_date": { 69 | "type": "string", 70 | "description": "The date (if any) that this counsel was terminated from the case" 71 | }, 72 | "raw_info": { 73 | "type": "string", 74 | "description": "The original value of the Pacer counsel field before being parsed out into the above SCALES fields" 75 | }, 76 | "recap_counsel_error": { 77 | "type": "boolean", 78 | "description": "A flag indicating that counsel information is missing due to a Recap error in which lawyers with identical names are merged incorrectly" 79 | } 80 | } 81 | } 82 | }, 83 | "is_pro_se": { 84 | "type": "boolean", 85 | "description": "Whether or not the party is appearing pro se, i.e. representing themselves" 86 | }, 87 | "pro_se_source": { 88 | "type": "string", 89 | "description": "The source that led us to believe this party is pro se ('explicit'=written out in the docket, 'implicit'=address-like info found for a lawyerless party)", 90 | "enum": ["explicit", "implicit"] 91 | }, 92 | "extra_pro_se_info": { 93 | "type": "string", 94 | "description":"For pro se parties, non-subheading party-related text that doesn't fit into counsel buckets (prisoner number, prison name, etc) - newline-delimited" 95 | }, 96 | "terminating_date": { 97 | "type": "string", 98 | "description":"The date (if any) that this party was terminated from the case" 99 | }, 100 | "extra_info": { 101 | "type": "string", 102 | "description":"Any Pacer subheadings for this party (alt names, corporation types, the capacity in which they're appearing, etc) - newline-delimited" 103 | }, 104 | "role": { 105 | "type": "string", 106 | "description": "This party's role in the case, as listed in their Pacer heading (e.g. 
'Defendant', 'Plaintiff', 'Petitioner', 'Appellant'...)" 107 | }, 108 | "party_type": { 109 | "type": "string", 110 | "description": "The broad bucket in which this party's role belongs, ascertained via a hand-coded mapping of the role", 111 | "enum": ["defendant", "plaintiff", "misc", "other_party", "bk_party"] 112 | }, 113 | "pacer_id": { 114 | "type": "number", 115 | "description": "Pacer's intra-case defendant id - always null for civil cases, but retained just in case users mistake it for a universal field" 116 | }, 117 | "recap_party_error": { 118 | "type": "boolean", 119 | "description": "A flag indicating that party information is missing due to a Recap error in which parties with identical names are merged incorrectly" 120 | } 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/nature_suit.csv: -------------------------------------------------------------------------------- 1 | number,name,sub_type,major_type 2 | 110,Insurance,contract,contract 3 | 120,Marine,contract,contract 4 | 130,Miller Act,contract,contract 5 | 140,Negotiable Instrument,contract,contract 6 | 150,Recovery Of Overparyment & Enforcement Of Judgment,contract,contract 7 | 151,Medicare Act,contract,contract 8 | 152,Recovery Of Defaulted Student Loans (Excl. Veterans),contract,contract 9 | 153,Recovery Of Overpayment Of Veteran S Benefits,contract,contract 10 | 160,Stockholders Suits,contract,contract 11 | 190,Other Contract,contract,contract 12 | 195,Contract Product Liability,contract,contract 13 | 196,Franchise,contract,contract 14 | 210,Land Condemnation,real property,real property 15 | 220,Foreclosure,real property,real property 16 | 230,Rent Lease & Ejectment,real property,real property 17 | 240,Torts To Land,real property,real property 18 | 245,Tort Product Liability,real property,real property 19 | 290,All Other Property,real property,real property 20 | 310,Airplane,personal injury,torts 21 | 315,Airplane Product Liability,personal injury,torts 22 | 320,"Assault, Libel, & Slander",personal injury,torts 23 | 330,Federalemployers Liability,personal injury,torts 24 | 340,Marine,personal injury,torts 25 | 345,Marine Product Liability,personal injury,torts 26 | 350,Motor Vehicle,personal injury,torts 27 | 355,Motor Vehicle Product Liability,personal injury,torts 28 | 360,Other Personal Injury,personal injury,torts 29 | 362,Personal Injury- Medical Malpractice,personal injury,torts 30 | 365,Personal Injury- Product Liability,personal injury,torts 31 | 367,Personal Injury - Health Care/Pharmaceutical Personal Injury/Product Liability,personal injury,torts 32 | 368,Asbestos Personal Injury Product Liability,personal injury,torts 33 | 375,False Claims Act,personal injury,torts 34 | 376,376 Qui Tam (31 U.S.C. 
3729(A)),personal injury,torts 35 | 370,Other Fraud,personal property,torts 36 | 371,Truth In Lending,personal property,torts 37 | 380,Other Personal Property Damage,personal property,torts 38 | 385,Property Damage Product Liability,personal property,torts 39 | 422,Appeal 28 Usc 158,bankruptcy,bankruptcy 40 | 423,Withdrawal 28 Usc 157,bankruptcy,bankruptcy 41 | 440,Other Civil Rights,civil rights,civil rights 42 | 441,Voting,civil rights,civil rights 43 | 442,Employment,civil rights,civil rights 44 | 443,Housing/Accommodations,civil rights,civil rights 45 | 444,Welfare,civil rights,civil rights 46 | 445,Amer W/Disabilities-Employment,civil rights,civil rights 47 | 446,Amer W/Disabilities - Other,civil rights,civil rights 48 | 448,Education,civil rights,civil rights 49 | 462,Naturalization Application,immigration,immigration 50 | 463,Habeas Corpus - Alien Detainee,immigration,immigration 51 | 465,Other Immigration Actions,immigration,immigration 52 | 510,Motions To Vacate Sentence,prisoner petitions,prisoner petitions 53 | 530,General,habeas corpus,prisoner petitions 54 | 535,Death Penalty,habeas corpus,prisoner petitions 55 | 540,Mandamus & Other,habeas corpus,prisoner petitions 56 | 550,Civil Rights,habeas corpus,prisoner petitions 57 | 555,Prison Condition,habeas corpus,prisoner petitions 58 | 560,Conditions Of Confinement,civil detainee,prisoner petitions 59 | 610,Agriculture,forfeiture/penalty,forfeiture/penalty 60 | 620,Other Food & Drug,forfeiture/penalty,forfeiture/penalty 61 | 625,Drug Related Seizure Of Property 21 Usc 881 630 Liquor Laws,forfeiture/penalty,forfeiture/penalty 62 | 630,Liquor Laws,forfeiture/penalty,forfeiture/penalty 63 | 640,Rr & Truck,forfeiture/penalty,forfeiture/penalty 64 | 650,Airline Regulations,forfeiture/penalty,forfeiture/penalty 65 | 660,Occupational Safety/Health,forfeiture/penalty,forfeiture/penalty 66 | 690,Other,forfeiture/penalty,forfeiture/penalty 67 | 710,Fair Labor Standards Act,labor,labor 68 | 720,Labor/Management Relations,labor,labor 69 | 730,Labor/Management Reporting & Disclosure Act,labor,labor 70 | 740,Railway Labor Act,labor,labor 71 | 751,Family And Medical Leave Act,labor,labor 72 | 790,Other Labor Litigation,labor,labor 73 | 791,Employee Retirement Income Security Act,labor,labor 74 | 820,Copyrights,property rights,property rights 75 | 830,Patent,property rights,property rights 76 | 835,Patent Abbreviated New Drug Application (Anda),property rights,property rights 77 | 840,Trademark,property rights,property rights 78 | 880,Defend Trade Secrets Act Of 2016 (Dtsa),property rights,property rights 79 | 861,Hia (1395Ff),social security,social security 80 | 862,Black Lung (923),social security,social security 81 | 863,Diwc/Diww (405(G)),social security,social security 82 | 864,Ssid Title Xvi,social security,social security 83 | 865,Rsi (405(G)),social security,social security 84 | 870,Taxes (U.S. 
Plaintiff Or Defendant),federal tax suits,federal tax suits 85 | 871,Irs-Third Party 26 Usc 7609,federal tax suits,federal tax suits 86 | 400,State Reapportionment,other statutes,other statutes 87 | 410,Antitrust,other statutes,other statutes 88 | 430,Banks And Banking,other statutes,other statutes 89 | 450,Commerce,other statutes,other statutes 90 | 460,Deportation,other statutes,other statutes 91 | 470,Racketeer Influenced And Corrupt Organizations,other statutes,other statutes 92 | 480,Consumer Credit,other statutes,other statutes 93 | 485,Telephone Consumer Protection Act (Tcpa),other statutes,other statutes 94 | 490,Cable/Sat Tv,other statutes,other statutes 95 | 810,Selective Service,other statutes,other statutes 96 | 850,Securities/Commodities/Exchange,other statutes,other statutes 97 | 875,Customer Challenge 12 Usc 3410,other statutes,other statutes 98 | 890,Other Statutory Actions,other statutes,other statutes 99 | 891,Agricultural Acts,other statutes,other statutes 100 | 892,Economic Stabilization Act,other statutes,other statutes 101 | 893,Environmental Matters,other statutes,other statutes 102 | 894,Energy Allocation Act,other statutes,other statutes 103 | 895,Freedom Of Information Act,other statutes,other statutes 104 | 896,Arbitration,other statutes,other statutes 105 | 899,Administrative Procedure Act/Review Or Appeal Of Agency Decision,other statutes,other statutes 106 | 900,Appeal Of Fee Determination Under Equal Access To Justice Act,other statutes,other statutes 107 | 950,Constitutionality Of State Statutes,other statutes,other statutes 108 | 990,Other,other statutes,other statutes 109 | 999,Miscellaneous Cases,other statutes,other statutes -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core_data/nature_suit.csv: -------------------------------------------------------------------------------- 1 | number,name,sub_type,major_type 2 | 110,Insurance,contract,contract 3 | 120,Marine,contract,contract 4 | 130,Miller Act,contract,contract 5 | 140,Negotiable Instrument,contract,contract 6 | 150,Recovery Of Overparyment & Enforcement Of Judgment,contract,contract 7 | 151,Medicare Act,contract,contract 8 | 152,Recovery Of Defaulted Student Loans (Excl. 
Veterans),contract,contract 9 | 153,Recovery Of Overpayment Of Veteran S Benefits,contract,contract 10 | 160,Stockholders Suits,contract,contract 11 | 190,Other Contract,contract,contract 12 | 195,Contract Product Liability,contract,contract 13 | 196,Franchise,contract,contract 14 | 210,Land Condemnation,real property,real property 15 | 220,Foreclosure,real property,real property 16 | 230,Rent Lease & Ejectment,real property,real property 17 | 240,Torts To Land,real property,real property 18 | 245,Tort Product Liability,real property,real property 19 | 290,All Other Property,real property,real property 20 | 310,Airplane,personal injury,torts 21 | 315,Airplane Product Liability,personal injury,torts 22 | 320,"Assault, Libel, & Slander",personal injury,torts 23 | 330,Federalemployers Liability,personal injury,torts 24 | 340,Marine,personal injury,torts 25 | 345,Marine Product Liability,personal injury,torts 26 | 350,Motor Vehicle,personal injury,torts 27 | 355,Motor Vehicle Product Liability,personal injury,torts 28 | 360,Other Personal Injury,personal injury,torts 29 | 362,Personal Injury- Medical Malpractice,personal injury,torts 30 | 365,Personal Injury- Product Liability,personal injury,torts 31 | 367,Personal Injury - Health Care/Pharmaceutical Personal Injury/Product Liability,personal injury,torts 32 | 368,Asbestos Personal Injury Product Liability,personal injury,torts 33 | 375,False Claims Act,personal injury,torts 34 | 376,376 Qui Tam (31 U.S.C. 3729(A)),personal injury,torts 35 | 370,Other Fraud,personal property,torts 36 | 371,Truth In Lending,personal property,torts 37 | 380,Other Personal Property Damage,personal property,torts 38 | 385,Property Damage Product Liability,personal property,torts 39 | 422,Appeal 28 Usc 158,bankruptcy,bankruptcy 40 | 423,Withdrawal 28 Usc 157,bankruptcy,bankruptcy 41 | 440,Other Civil Rights,civil rights,civil rights 42 | 441,Voting,civil rights,civil rights 43 | 442,Employment,civil rights,civil rights 44 | 443,Housing/Accommodations,civil rights,civil rights 45 | 444,Welfare,civil rights,civil rights 46 | 445,Amer W/Disabilities-Employment,civil rights,civil rights 47 | 446,Amer W/Disabilities - Other,civil rights,civil rights 48 | 448,Education,civil rights,civil rights 49 | 462,Naturalization Application,immigration,immigration 50 | 463,Habeas Corpus - Alien Detainee,immigration,immigration 51 | 465,Other Immigration Actions,immigration,immigration 52 | 510,Motions To Vacate Sentence,prisoner petitions,prisoner petitions 53 | 530,General,habeas corpus,prisoner petitions 54 | 535,Death Penalty,habeas corpus,prisoner petitions 55 | 540,Mandamus & Other,habeas corpus,prisoner petitions 56 | 550,Civil Rights,habeas corpus,prisoner petitions 57 | 555,Prison Condition,habeas corpus,prisoner petitions 58 | 560,Conditions Of Confinement,civil detainee,prisoner petitions 59 | 610,Agriculture,forfeiture/penalty,forfeiture/penalty 60 | 620,Other Food & Drug,forfeiture/penalty,forfeiture/penalty 61 | 625,Drug Related Seizure Of Property 21 Usc 881 630 Liquor Laws,forfeiture/penalty,forfeiture/penalty 62 | 630,Liquor Laws,forfeiture/penalty,forfeiture/penalty 63 | 640,Rr & Truck,forfeiture/penalty,forfeiture/penalty 64 | 650,Airline Regulations,forfeiture/penalty,forfeiture/penalty 65 | 660,Occupational Safety/Health,forfeiture/penalty,forfeiture/penalty 66 | 690,Other,forfeiture/penalty,forfeiture/penalty 67 | 710,Fair Labor Standards Act,labor,labor 68 | 720,Labor/Management Relations,labor,labor 69 | 730,Labor/Management Reporting & Disclosure 
Act,labor,labor 70 | 740,Railway Labor Act,labor,labor 71 | 751,Family And Medical Leave Act,labor,labor 72 | 790,Other Labor Litigation,labor,labor 73 | 791,Employee Retirement Income Security Act,labor,labor 74 | 820,Copyrights,property rights,property rights 75 | 830,Patent,property rights,property rights 76 | 835,Patent Abbreviated New Drug Application (Anda),property rights,property rights 77 | 840,Trademark,property rights,property rights 78 | 880,Defend Trade Secrets Act Of 2016 (Dtsa),property rights,property rights 79 | 861,Hia (1395Ff),social security,social security 80 | 862,Black Lung (923),social security,social security 81 | 863,Diwc/Diww (405(G)),social security,social security 82 | 864,Ssid Title Xvi,social security,social security 83 | 865,Rsi (405(G)),social security,social security 84 | 870,Taxes (U.S. Plaintiff Or Defendant),federal tax suits,federal tax suits 85 | 871,Irs-Third Party 26 Usc 7609,federal tax suits,federal tax suits 86 | 400,State Reapportionment,other statutes,other statutes 87 | 410,Antitrust,other statutes,other statutes 88 | 430,Banks And Banking,other statutes,other statutes 89 | 450,Commerce,other statutes,other statutes 90 | 460,Deportation,other statutes,other statutes 91 | 470,Racketeer Influenced And Corrupt Organizations,other statutes,other statutes 92 | 480,Consumer Credit,other statutes,other statutes 93 | 485,Telephone Consumer Protection Act (Tcpa),other statutes,other statutes 94 | 490,Cable/Sat Tv,other statutes,other statutes 95 | 810,Selective Service,other statutes,other statutes 96 | 850,Securities/Commodities/Exchange,other statutes,other statutes 97 | 875,Customer Challenge 12 Usc 3410,other statutes,other statutes 98 | 890,Other Statutory Actions,other statutes,other statutes 99 | 891,Agricultural Acts,other statutes,other statutes 100 | 892,Economic Stabilization Act,other statutes,other statutes 101 | 893,Environmental Matters,other statutes,other statutes 102 | 894,Energy Allocation Act,other statutes,other statutes 103 | 895,Freedom Of Information Act,other statutes,other statutes 104 | 896,Arbitration,other statutes,other statutes 105 | 899,Administrative Procedure Act/Review Or Appeal Of Agency Decision,other statutes,other statutes 106 | 900,Appeal Of Fee Determination Under Equal Access To Justice Act,other statutes,other statutes 107 | 950,Constitutionality Of State Statutes,other statutes,other statutes 108 | 990,Other,other statutes,other statutes 109 | 999,Miscellaneous Cases,other statutes,other statutes -------------------------------------------------------------------------------- /src/pacer_tools/code/support/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | Collection of common tools and functions for SCALES project 3 | 4 | # Filehandle Tools 5 | In `fhandle_tools` there are several functions to simplify and unify common transformations of filenames and case names. For full function argument documentation see the docstrings. Below is a summary of some of the most useful methods with usage examples. 
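The examples below assume the relevant functions have been imported; a minimal sketch (the import path follows this repo's package layout):

```python
from pacer_tools.code.support.fhandle_tools import (
    decompose_caseno, clean_case_id, generate_docket_filename)

clean_case_id("1:16-cv-12345-2-ABC-DEF")  # -> "1:16-cv-12345"
```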
6 | 7 | ## Quick lookup 8 | ### Dockets/Cases 9 | | method |input | output| 10 | |--|--|--| 11 | | *decompose_caseno* | `"1:16-cv-12345-2-ABC-DEF"` |`{'office': '1', 'year': '16',...}` | 12 | | *clean_case_id* | `"1:16-cv-12345-2-ABC-DEF"` | `"1:16-cv-12345"` | 13 | |*generate_docket_filename*|`"1:16-cv-12345",def_no=3, ind=2`|`"1:16-cv-12345-3_2.html"`| 14 | 15 | 16 | ### Documents 17 | | method |input | output| 18 | |--|--|--| 19 | |*generate_document_id*|`"ilnd;;1:16-cv-12345", index=3, att_index=10`|`"ilnd;;1-16-cv-12345_3_10"`| 20 | |*generate_document_fname*|`"ilnd;;1-16-cv-12345_3_10", user_hash="12345678"`|`"ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf"`| 21 | |*parse_document_fname*|`"ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf"`| `{'index':3, 'att_index':10, ...`| 22 | 23 | ## Dockets/Cases 24 | **decompose_caseno**(*case_no, pattern=re_case_no_gr*) 25 | 26 | *Takes a PACER-style case no. and returns a dictionary of its decomposed parts.* 27 | ```python 28 | decompose_caseno("1:16-cv-12345-2-ABC-DEF") 29 | >> {'office': '1', 30 | 'year': '16', 31 | 'case_type': 'cv', 32 | 'case_no': '12345', 33 | 'judge_names': ['ABC', 'DEF'], 34 | 'def_no': '2', 35 | 'update_ind': ''} 36 | ``` 37 | 38 | **clean_case_id**(*case_no, allow_def_stub=False*) 39 | 40 | *Takes a case id and cleans off anything that isn't the office, year, case type, and case no. Can also handle filenames.* 41 | ```python 42 | clean_case_id("1:16-cv-12345-2-ABC-DEF") 43 | >> "1:16-cv-12345" 44 | clean_case_id("1-16-cv-12345_1.html") 45 | >> "1:16-cv-12345" 46 | ``` 47 | 48 | **generate_docket_filename**(*case_name, def_no=None, ind=None, ext='html'*) 49 | 50 | *Generate the filename for a docket* 51 | ```python 52 | generate_docket_filename("1:16-cv-12345") 53 | >> "1:16-cv-12345.html" 54 | generate_docket_filename("1:16-cv-12345", def_no=3, ind=2) 55 | >> "1:16-cv-12345-3_2.html" 56 | ``` 57 | 58 | ## Documents 59 | 60 | **generate_document_id**(*ucid, index, att_index=None*) 61 | 62 | *Generate a unique id name for a case document download* 63 | ```python 64 | generate_document_id("ilnd;;1:16-cv-12345", 3) 65 | >> "ilnd;;1-16-cv-12345_3" 66 | generate_document_id("ilnd;;1:16-cv-12345", 3, 10) 67 | >> "ilnd;;1-16-cv-12345_3_10" 68 | ``` 69 | 70 | **generate_document_fname**(*doc_id, user_hash, ext='pdf'*) 71 | 72 | *Generate a unique file name for a case document download* 73 | ```python 74 | generate_document_fname("ilnd;;1-16-cv-12345_3_10", user_hash="12345678") 75 | >> "ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf" 76 | ``` 77 | **parse_document_fname**(*fname*) 78 | 79 | *Parse a document filename, return the component parts as a dict* 80 | ```python 81 | parse_document_fname("ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf") 82 | >> {'doc_id': 'ilnd;;1-16-cv-12345_3_10', 83 | 'index': '3', 84 | 'att_index': '10', 85 | 'user_hash': '12345678', 86 | 'download_time': '210106', 87 | 'ext': 'pdf', 88 | 'ucid': 'ilnd;;1:16-cv-12345'} 89 | ``` 90 | 91 | ## Other 92 | **get_expected_path**(*ucid, ext='json', pacer_path=settings.PACER_PATH, def_no=None*) 93 | 94 | *Find the expected path of the json (or html) file for the case* 95 | ```python 96 | get_expected_path("ilnd;;1:16-cv-12345") 97 | >> "{{abs}}/data/pacer/ilnd/json/1-16-cv-12345.json" 98 | get_expected_path("ilnd;;1:16-cv-12345", ext="html", def_no=2) 99 | >> "{{abs}}/data/pacer/ilnd/html/1-16-cv-12345_2.html" 100 | ``` 101 | **get_pacer_url**(*court, page*) 102 | 103 | *Get a court-specific pacer url for various pages: query, login, logout, docket, document link, 
possible case* 104 | 105 | ```python 106 | get_pacer_url("ilnd", "query") 107 | >>> "https://ecf.ilnd.uscourts.gov/cgi-bin/iquery.pl" 108 | get_pacer_url("txed", "logout") 109 | >>> "https://ecf.txed.uscourts.gov/cgi-bin/login.pl?logout" 110 | ``` 111 | 112 | 113 | # Research Tools (`research_tools.py`) 114 | ## Docket Searcher 115 | ### Description 116 | The docket searcher is a tool to analyze case dockets for events/patterns and build a table of observations. 117 | The tool takes a collection of docket reports, and for each line of each docket report it does the following: 118 | 1. Checks if the text of the docket line matches **basic criteria**. This can be one of two ways: 119 | - The docket line matches the *wide_net* 120 | - The docket line matches the *docket_line_fn* function 121 | 2. If so, checks the line for a variety of patterns (patterns, computed_attrs) and use this to build a row for the result set. 122 | 123 | ### Usage 124 | 125 | ```python 126 | docket_searcher(case_paths, outfile, wide_net, patterns, 127 | computed_attrs={}, rlim=None, line_match_fn=None) 128 | ``` 129 | - **case_paths** (list): a list of filepaths to case data (.json files) that are relative to the project root 130 | - **outfile** (str): the output file (.csv) 131 | - **wide_net** (list): a list of regex patterns 132 | - **patterns** (dict): a dictionary of regex patterns with (variable_name, pattern) pairs 133 | - **computed_attrs** (dict): a dictionary of (variable_name, function) pairs. The functions take two arguments (*docket_line*, *case*) where the *docket_line* is a list and *case* is a parsed case json 134 | - **rlim** (int): a right limit to narrow search within docket entry text 135 | - **line_match_fn** (function): a function to use to instead to check if a line matches the basic criteria. The function takes two arguments (docket_line, case) as above. If line_match_fn is supplied it is used instead of *wide_net* to check basic criteria. 136 | 137 | 138 | ### Example 139 | ```python 140 | import research_tools as rt 141 | 142 | case_paths = [...] 143 | outfile = 'results_table.csv' 144 | wide_net = ['seal', 'protective'] 145 | 146 | patterns = { 147 | 'seal_motion':'(motion|order)( to)? seal', 148 | 'grant_part': 'granting in part motion to seal', 149 | 'deny_part' : 'denying in part motion to seal' 150 | } 151 | 152 | def date_diff(x,y): 153 | return (pd.Timestamp(x) - pd.TimeStamp(y)).days 154 | 155 | computed_attrs = { 156 | 'case_type': lambda dl,c: c['case_type'], 157 | 'days_from_filing': lambda dl,c: date_diff(dl[0], c['filing_date']) 158 | } 159 | 160 | rt.docket_searcher(case_paths, 'res_tab.csv', wide_net, 161 | patterns, computed_attrs) 162 | ``` 163 | 164 | */res_tab.csv* 165 | ``` 166 | fpath,ucid,court,judge,case_type,case_type,days_from_filing,seal_motion,grant_part,deny_part 167 | ,ilnd;;>,ilnd,Judge A,cr,0,1,0,0 168 | ,ilnd;;>,ilnd,Judge A,cr,12,0,1,1 169 | ,ilnd;;>,ilnd,Judge B,cr,2,1,0,0 170 | ,ilnd;;>,ilnd,Judge B,cr,7,0,1,0 171 | ,ilnd;;>,ilnd,Judge B,cr,8,0,0,1 172 | 173 | ``` 174 | 175 | ### Output 176 | The output file has a row for each docket line that meets the basic criteria. 
175 | ### Output 176 | The output file has a row for each docket line that meets the basic criteria. 177 | The output columns are (in the following order): 178 | 179 | - *ucid* 180 | - *court* 181 | - *judge* 182 | - *fpath* 183 | - *date*: the docket line date 184 | - *ind*: the index of the docket line, relative to the docket list in the case json 185 | - *text*: the first 100 characters of the docket line text 186 | 187 | Following the above are: 188 | - all columns generated by **computed_attrs** keys 189 | - all columns from **patterns** keys 190 | 191 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/research_tools.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | import sys 4 | from pathlib import Path 5 | 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | from support import data_tools as dtools 8 | 9 | # Case-level metadata for results 10 | case_metadata = { 11 | 'ucid': lambda case: dtools.ucid(case['download_court'], case['case_id']), 12 | 'court': lambda case: case['download_court'], 13 | 'judge': lambda case: case['judge'], 14 | } 15 | 16 | def pattern_matcher(patterns, text_str): 17 | ''' 18 | Search for a group of patterns in the same string, return spans of matches 19 | 20 | Inputs: 21 | - patterns (dict): key-value pairs of pattern name to pattern value (regex pattern) 22 | - text_str (str): the text to search in 23 | Output: 24 | matches (dict): key-value pairs of (pattern name, match span), 25 | if there is a match the span is a tuple of integers, otherwise it is None 26 | ''' 27 | _get_span_ = lambda match: match.span() if match else None 28 | 29 | return {name: _get_span_(re.search(pattern, text_str,re.I)) for name,pattern in patterns.items()} 30 | 31 | def wide_net_match_line(docket_line, case, wide_net=[], wide_net_fn=None): 32 | ''' 33 | Check a single docket line for a wide net match, or use wide_net_fn if supplied 34 | Inputs: 35 | - docket_line (dict): a single docket entry (from the case's docket array) 36 | - case (json): The case json 37 | - wide_net (list): a list of regex patterns 38 | - wide_net_fn (function): a match function to use instead of wide_net, if supplied 39 | ''' 40 | if wide_net_fn is not None: 41 | return wide_net_fn(docket_line, case) 42 | else: 43 | full_pattern = '|'.join(f"({pat})" for pat in wide_net) 44 | return bool(re.search(full_pattern, docket_line['docket_text'], re.I)) 45 | 46 | def row_builder(docket_line, ind, case, fpath, patterns, computed_attrs={}, rlim=None): 47 | ''' 48 | Function to build an observation row of the result set. 
49 | 50 | Inputs: 51 | - docket_line (dict): The docket entry (with 'date_filed' and 'docket_text' fields, among others) 52 | - ind (int): index of docket_line (relative to dockets list in json) 53 | - case (json): The case json 54 | - fpath (str): file path 55 | - patterns (dict): a dictionary of pattern names and regex patterns 56 | - computed_attrs (dict): A dictionary with attribute names as keys, 57 | and functions taking docket_line and case as values 58 | e.g. {'is2020': lambda dl, c: dl['date_filed'].endswith('2020')} 59 | - rlim (int): right limit to search text 60 | Output: 61 | row (dict) 62 | ''' 63 | row = { 64 | # Case-level metadata 65 | **{k: fn(case) for k,fn in case_metadata.items()}, 66 | 'fpath': fpath, 67 | 'date': docket_line['date_filed'], 68 | 'ind': ind, 69 | 'text': docket_line['docket_text'][:100], 70 | # Computed attributes 71 | **{k: fn(docket_line, case) for k,fn in computed_attrs.items()}, 72 | # Pattern matches 73 | **pattern_matcher(patterns, docket_line['docket_text'][:rlim]), 74 | } 75 | return row 76 | 77 | def get_case_matches(fpath, patterns, wide_net, 78 | computed_attrs={}, rlim=None, wide_net_fn=None, skip_non_matches=False): 79 | ''' 80 | Process a case and return observation rows 81 | 82 | Output: 83 | (list) of observation rows (dicts) 84 | ''' 85 | 86 | case_rows = [] 87 | case = dtools.load_case(fpath) 88 | 89 | for ind, line in enumerate(case['docket']): 90 | 91 | if wide_net_match_line(line, case, wide_net, wide_net_fn): 92 | # Use row builder 93 | row = row_builder(docket_line=line, ind=ind, case=case, fpath=fpath, 94 | patterns=patterns, computed_attrs=computed_attrs, rlim=rlim) 95 | 96 | if skip_non_matches: 97 | # Only add row if at least one pattern match 98 | if not any(v for k,v in row.items() if k in patterns): 99 | continue 100 | 101 | case_rows.append(row) 102 | 103 | return case_rows 104 | 105 | def docket_searcher(case_paths, outfile, patterns, wide_net=[], computed_attrs={}, 106 | rlim=None, wide_net_fn=None, skip_non_matches=False): 107 | ''' 108 | Main function to build results set from criteria 109 | 110 | Inputs: 111 | - case_paths (iterable): list of filepaths 112 | - outfile (str or Path): path to output file (.csv) 113 | - patterns (dict): a dictionary of patterns 114 | - wide_net (list): a list of wide regex patterns to match on docket lines 115 | - computed_attrs (dict): a dictionary of computed attributes 116 | (named functions that take (docket_line, case) inputs) 117 | - rlim (int): right limit on characters in docket text to analyze 118 | - wide_net_fn (function): a function that takes (docket_line, case) arguments, 119 | where docket_line is a dict (from the case['docket'] array) and case is the case dict, 120 | and maps to a boolean; if supplied it will be used to decide on a row match instead of wide_net 121 | - skip_non_matches (bool): Useful for debugging/exploring, if true then 122 | rows that match the wide net but have no pattern matches are not written to outfile 123 | ''' 124 | if (not len(wide_net)) and (wide_net_fn is None): 125 | raise ValueError('Must supply either wide_net or wide_net_fn') 126 | 127 | # Get table column headers 128 | headers = [*case_metadata.keys(), 'fpath', 'date','ind', 'text', *computed_attrs.keys(), *patterns.keys()] 129 | 130 | # Open outfile for writing 131 | with open(outfile, 'w', encoding='utf-8') as rfile: 132 | writer = csv.writer(rfile) 133 | writer.writerow(headers) 134 | 135 | for fpath in case_paths: 136 | case_rows = get_case_matches(fpath, patterns, wide_net, computed_attrs, rlim, wide_net_fn, skip_non_matches) 137 | print(f" 
found {len(case_rows)} rows with matches") 138 | 139 | if len(case_rows): 140 | # Write to file 141 | for row_dict in case_rows: 142 | # Ensure ordered printing by headers 143 | #TODO: make this an append 144 | writer.writerow(row_dict[k] for k in headers) 145 | 146 | 147 | print(f'Docket Searcher complete, results located at {outfile}') 148 | 149 | def make_spacy_spans(row_series, pat_cols): 150 | ''' Convert a row from docket searcher output to a spaCy span-like output 151 | Inputs: 152 | - row_series (pd.Series): a pandas series/row 153 | - pat_cols (list): list of str of column names in row_series that are pattern columns 154 | Output: 155 | (list of dicts) with start, end, label keys 156 | 157 | Example: 158 | 159 | row_series = 160 | ucid ##### 161 | year ###### 162 | pat1 (10,15) 163 | pat2 (30,40) 164 | 165 | pat_cols = ['pat1', 'pat2'] 166 | 167 | output: [ 168 | {'start':10, 'end':15, 'label':'pat1'}, 169 | {'start':30, 'end':40, 'label':'pat2'} 170 | ] 171 | 172 | 173 | ''' 174 | return [{'start':int(v[0]), 'end':int(v[1]), 'label':k} for k,v in row_series[pat_cols].items() if v] 175 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/party_cr_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/party_cr_v1.schema.json", 4 | "type": "object", 5 | "properties": { 6 | "name": { 7 | "type": "string", 8 | "description": "Name of the party in the case" 9 | }, 10 | "counsel": { 11 | "type": "array", 12 | "description": "An array of the party's lawyers in the case", 13 | "items": { 14 | "type": "object", 15 | "properties": { 16 | "name": { 17 | "type": "string", 18 | "description": "Counsel name" 19 | }, 20 | "office_name": { 21 | "type": "string", 22 | "description": "Name of the counsel's firm or legal office" 23 | }, 24 | "address": { 25 | "type": "string", 26 | "description": "Counsel address, string delimited by '\\n' newline character" 27 | }, 28 | "phone": { 29 | "type": "string", 30 | "description": "Counsel phone number, e.g. 
(123) 456-7890" 31 | }, 32 | "fax": { 33 | "type": "null", 34 | "description": "Counsel fax number" 35 | }, 36 | "email": { 37 | "type": "string", 38 | "description": "Counsel email address" 39 | }, 40 | "is_lead_attorney": { 41 | "type": "boolean", 42 | "description": "Whether or not counsel is listed as lead attorney" 43 | }, 44 | "is_pro_hac_vice": { 45 | "type": "boolean", 46 | "description": "Whether or not counsel is listed as pro hac vice" 47 | }, 48 | "is_notice_attorney": { 49 | "type": "boolean", 50 | "description": "Whether or not counsel is listed as attorney to be noticed" 51 | }, 52 | "see_above_for_address": { 53 | "type": "boolean", 54 | "description": "Whether or not address is listed as 'see above for address', meaning address info should be obtained from preceding counsel entries" 55 | }, 56 | "designation": { 57 | "type": "string", 58 | "description": "This counsel's designation within the case (Retained, Government Attorney, Public Defender, etc)" 59 | }, 60 | "bar_status": { 61 | "type": "string", 62 | "description": "This counsel's standing with respect to the general bar (Admitted, Not Admitted, etc)" 63 | }, 64 | "trial_bar_status": { 65 | "type": "string", 66 | "description": "This counsel's standing with respect to the ILND trial bar, when applicable" 67 | }, 68 | "counsel_terminating_date": { 69 | "type": "string", 70 | "description": "The date (if any) that this counsel was terminated from the case" 71 | }, 72 | "raw_info": { 73 | "type": "string", 74 | "description": "The original value of the Pacer counsel field before being parsed out into the above SCALES fields" 75 | }, 76 | "recap_counsel_error": { 77 | "type": "boolean", 78 | "description": "A flag indicating that counsel information is missing due to a Recap error in which lawyers with identical names are merged incorrectly" 79 | } 80 | } 81 | } 82 | }, 83 | "is_pro_se": { 84 | "type": "boolean", 85 | "description": "Whether or not the party is appearing pro se, i.e. representing themselves" 86 | }, 87 | "pro_se_source": { 88 | "type": "string", 89 | "description": "The source that led us to believe this party is pro se ('explicit'=written out in the docket, 'implicit'=address-like info found for a lawyerless party)", 90 | "enum": ["explicit", "implicit"] 91 | }, 92 | "extra_pro_se_info": { 93 | "type": "string", 94 | "description":"For pro se parties, non-subheading party-related text that doesn't fit into counsel buckets (prisoner number, prison name, etc) - newline-delimited" 95 | }, 96 | "terminating_date": { 97 | "type": "string", 98 | "description":"The date (if any) that this party was terminated from the case" 99 | }, 100 | "extra_info": { 101 | "type": "string", 102 | "description":"Any Pacer subheadings for this party (alt names, corporation types, the capacity in which they're appearing, etc) - newline-delimited" 103 | }, 104 | "judge": { 105 | "type": "string", 106 | "description": "The defendant-specific assigned judge" 107 | }, 108 | "appeals_case_ids": { 109 | "type": "array", 110 | "description": "The case ids of any defendant-specific appeals issuing from this case", 111 | "items": { 112 | "type": "string" 113 | } 114 | }, 115 | "role": { 116 | "type": "string", 117 | "description": "This party's role in the case, as listed in their Pacer heading (e.g. 
'Defendant', 'Plaintiff', 'Petitioner', 'Appellant'...)" 118 | }, 119 | "party_type": { 120 | "type": "string", 121 | "description": "The broad bucket in which this party's role belongs, ascertained via a hand-coded mapping of the role", 122 | "enum": ["defendant", "plaintiff", "misc", "other_party", "bk_party"] 123 | }, 124 | "pacer_id": { 125 | "type": "number", 126 | "description": "The intra-case defendant id - e.g. for a defendant listed as 'Jane Doe (2),' the Pacer id is 2" 127 | }, 128 | "referred_judges": { 129 | "type": "array", 130 | "description": "The defendant-specific referred judges listed after 'Referred to:'; only present when the case was referred to a second judge", 131 | "items": { 132 | "type": "string" 133 | } 134 | }, 135 | "pending_counts": { 136 | "type": "array", 137 | "description": "", 138 | "items": { 139 | "$ref": "#/$defs/count" 140 | } 141 | }, 142 | "terminated_counts": { 143 | "type": "array", 144 | "description": "", 145 | "items": { 146 | "$ref": "#/$defs/count" 147 | } 148 | }, 149 | "highest_offense_level_opening": { 150 | "type": "string", 151 | "description": "The degree of the most serious charges against this defendant at the start of the case (felony, misdemeanor, etc)" 152 | }, 153 | "highest_offense_level_terminated": { 154 | "type": "string", 155 | "description": "The degree of the most serious charges against this defendant at the end of the case (felony, misdemeanor, etc)" 156 | }, 157 | "complaints_text": { 158 | "type": "string", 159 | "description": "The primary criminal complaints against this defendant (sometimes supplements the count fields, sometimes duplicates them, and sometimes stands in for them)" 160 | }, 161 | "complaints_disposition": { 162 | "type": "string", 163 | "description": "The disposition of the complaints (prison time, dismissal, etc)" 164 | }, 165 | "recap_party_error": { 166 | "type": "boolean", 167 | "description": "A flag indicating that party information is missing due to a Recap error in which parties with identical names are merged incorrectly" 168 | } 169 | }, 170 | "$defs": { 171 | "count": { 172 | "type": "object", 173 | "description": "A count against a defendant", 174 | "properties": { 175 | "pacer_id": { 176 | "type": "string", 177 | "description": "The intra-defendant count id - e.g. for a count listed as 'FRAUD AND FALSE STATEMENTS(1s-2s),' the Pacer id is 1s-2s" 178 | }, 179 | "text": { 180 | "type": "string", 181 | "description": "The text of the count (e.g. 
'FRAUD AND FALSE STATEMENTS')" 182 | }, 183 | "disposition": { 184 | "type": "string", 185 | "description": "The disposition of the count (prison time, dismissal, etc)" 186 | } 187 | } 188 | } 189 | } 190 | 191 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *In addition to this documentation page, you may also find SCALES's [documentation site](https://docs.scales-okn.org/) helpful, particularly the pages concerning our scraper and parser.* 2 | 3 | # Table of Contents 4 | * [Scraper Tutorial](README.md#scraper-tutorial) 5 | 1) [Getting Started](README.md#1-getting-started) 6 | 2) [Pacer credentials](README.md#2-pacer-credentials) 7 | 3) [Query Scraper](README.md#3-query-scraper) 8 | 4) [Docket Scraper](README.md#4-docket-scraper) 9 | 5) [Document Scraper](README.md#5-document-scraper) 10 | * [Parser Tutorial](README.md#parser-tutorial) 11 | 12 | # Scraper Tutorial 13 | This is a tutorial on how to use the SCALES Scraper tool to download data from PACER. 14 | 15 | The scraper has three modules: 16 | 17 | 1. Query scraper: to download case queries 18 | 2. Docket Scraper: to download case dockets 19 | 3. Document Scraper: to download case documents 20 | 21 | This tutorial will step through the basics of getting set up with the scraping tool and using each module. For more details, see the documentation [here](src/pacer_tools/code/downloader/README.md) 22 | 23 | The tutorial uses the PACER demo site located here: https://dcecf.psc.uscourts.gov/cgi-bin/ShowIndex.pl 24 | 25 | This is a demo version of PACER with demo credentials that can be used for free. It contains a sample of cases from Western District of New York (*nywd*). However because it runs on its own domain with *psc* (PACER Service Center) instead of on the *nywd* PACER site, we will use **psc** as the court abbreviation for this tutorial. 26 | 27 | ## 1. Getting Started 28 | 29 | - Install the package: `pip install pacer-tools` 30 | - Make sure you have a recent version of Firefox installed (80.0+) and [GeckoDriver](https://github.com/mozilla/geckodriver) for Firefox 31 | 32 | **Download folder** 33 | For this tutorial we are going to use the resources in the */demo* directory of this repo and will put our data into */demo/pacer*. The scraper separates out data by district, so it's best to have a subdirectory for each district, named by court abbreviation (e.g. *demo/pacer/ilnd* for Northern District of Illinois). When the scraper runs it will build the necessary structure inside of that subdirectory that it needs to download and house the data from Pacer. 34 | 35 | Since we are using the PACER demo, we will use the court abbreviation it uses which is *psc* (for PACER Service Centre). The scraper will take an `inpath` argument, to which we will pass *demo/pacer/psc*. 36 | 37 | ## 2. Pacer credentials 38 | For most use you will need to put your Pacer login details into a json file. For this tutorial we'll be using the Pacer training site with the login details contained in *demo/auth.json*. When you are running the scraper using your own credentials you can use that file as a template. 39 | 40 | ## 3. Query Scraper 41 | The first thing we'll do with the scraper is download some query results. There is a demo query located at *demo/query_conf.json*. This is a *.json* file that maps search criteria to fields in the Pacer query form. 
42 | To create your own query later you can use the query builder (see the documentation). 43 | 44 | Throughout this tutorial we will be using the scraper command from the PACER-tools command-line utility. Run `pacer-tools scraper` to see the full set of arguments. 45 | 46 | **Running script** 47 | 48 | To use the Query Scraper we just need to run the following: 49 | 50 | pacer-tools scraper --override-time --query-conf demo/query_conf.json demo/pacer/psc 51 | 52 | - The `--override-time` flag overrides the scraper's time restriction (by default it is designed to be run overnight). 53 | - The `--query-conf` option points the scraper to a json config file with the parameters for our query. 54 | 55 | The user will be prompted for the following: 56 | 57 | - **Mode**: for this step we want to choose *query* 58 | - **Court**: for the demo site the court abbreviation we want to enter is *psc* 59 | - **Auth path**: This is the relative path to our PACER login credentials. For this tutorial the demo credentials are in *demo/auth.json* 60 | - **Case limit**: This limits the number of cases downloaded in a single session, to prevent accidental overspending on PACER. For this example let's just enter 50. 61 | 62 | *Note*: 63 | *All of the parameters that the user was prompted for can also be given as arguments to the script. These are all explained in full in the documentation. To avoid the prompting you can instead run:* 64 | 65 | pacer-tools scraper --override-time --query-conf demo/query_conf.json -m query -c psc -a demo/auth.json -cl 50 demo/pacer/psc 66 | 67 | 68 | **Result** 69 | Once these values have all been input, the scraper should launch and download the query results. You should see the following message in the terminal: 70 | 71 | > Query results saved to /psc/queries 72 | 73 | If you navigate to the *psc* folder you will see firstly that a few subfolders have been created to house the data, and secondly that within the *queries* folder there should be a *.html* file that contains the query results. 74 | 75 | 76 | ## 4. Docket Scraper 77 | Next we will take that query results file and download all of the dockets for the listed cases. The Docket Scraper module can take a *.html* query file, which we have just downloaded, as its input. 78 | 79 | **Running script** 80 | To use the Docket Scraper we will run the following: 81 | 82 | pacer-tools scraper -m docket --docket-input demo/pacer/psc/queries/<query_file>.html -c psc -a demo/auth.json -cl 50 --override-time demo/pacer/psc 83 | 84 | - The `--docket-input` option takes the path to the query file. The actual name of the query file (`<query_file>`) will vary, as it includes a timestamp. 85 | 86 | The Docket Scraper (as well as the Document Scraper, which we will look at next) runs asynchronously across multiple Firefox instances, by default two. The number of instances (workers) can be adjusted with the `n-workers` option (see the documentation, and the sketch at the end of this section). 87 | 88 | *Note: the scraper only keeps the civil and criminal cases; to download a specific case type you can use the `--case-type` option.* 89 | 90 | **Result** 91 | Once both browsers have finished and closed, all of the cases from the query results file should be downloaded and can be found in *demo/pacer/psc/html*. 92 | 
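To speed up large docket runs you can raise the worker count mentioned above; a sketch, assuming the option is spelled `--n-workers` on the command line (run `pacer-tools scraper` to confirm the exact spelling):

    pacer-tools scraper -m docket --docket-input demo/pacer/psc/queries/<query_file>.html -c psc -a demo/auth.json -cl 50 --override-time --n-workers 4 demo/pacer/psc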
93 | 94 | 95 | ## 5. Document Scraper 96 | Lastly, we will get the actual documents associated with the docket lines of the cases. The Document Scraper can take a few different types of inputs, including a directory of docket *.html* files (in which case all documents and attachments for every case are downloaded by default); for this tutorial we will instead give it a *.csv* file (*demo/document_input.csv*) listing the specific case and documents to retrieve. 97 | 98 | **Running script** 99 | To use the Document Scraper we run the following: 100 | 101 | pacer-tools scraper -m document -c psc -a demo/auth.json -cl 50 --override-time --document-input demo/document_input.csv demo/pacer/psc 102 | 103 | - There is a default limit of 1000 documents per case. Any case that has more than 1000 documents will be skipped. This limit can be changed with the `--document-limit` option. 104 | 105 | **Result** 106 | The Document Scraper will usually take significantly longer to run than the Docket Scraper, given the volume of documents in most cases. Once the documents have finished downloading they can be found in the *demo/pacer/psc/docs* folder. 107 | 108 | 109 | **Attachments and specific documents** 110 | 111 | 112 | - To skip docket line attachments you can use the `--no-document-att` flag. 113 | - To get specific documents from specific cases, you can use the `--document-input` option to pass a *.csv* file with case ids and the specific documents to retrieve; see the documentation for more. 114 | 115 | 116 | To see more specifics, options, and use cases check out the detailed documentation [here](src/pacer_tools/code/downloader/README.md). 117 | 118 | # Parser Tutorial 119 | 120 | This short section explains how to use the SCALES Parser tool to read HTMLs downloaded from Pacer and convert them into JSON format. The parser takes as its input the results of running the [docket scraper](README.md#4-docket-scraper) - namely, a folder of HTMLs. 121 | 122 | **Running script** 123 | To use the parser on the HTMLs from the docket scraper in the previous tutorial, we will simply run the following: 124 | 125 | pacer-tools parser demo/pacer/psc/html 126 | 127 | **Result** 128 | Once the parser has finished, all the parsed versions of the HTML files can be found in *demo/pacer/psc/json*. 129 | 130 | To see more specifics, options, and details on the JSON schema, check out the detailed documentation [here](src/pacer_tools/code/parsers/README.md). 
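As a quick sanity check on the parser output, you can load one of the JSONs directly; a minimal sketch (the filename is hypothetical - substitute any file the parser wrote; the fields shown follow this repo's case schemas):

```python
import json

# Hypothetical filename - substitute any file from demo/pacer/psc/json
with open('demo/pacer/psc/json/1-07-cv-00431.json') as f:
    case = json.load(f)

print(case['ucid'], case['case_type'], case['filing_date'])
print(len(case['docket']), 'docket entries,', len(case['parties']), 'parties')
```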
131 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/case_cv_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/case_cv_v1.schema.json", 4 | "title": "Civil Case Schema", 5 | "description": "A schema for a PACER civil court case docket report" , 6 | "properties": { 7 | "docket": { 8 | "type": "array", 9 | "description": "Ordered array of entries in the case docket", 10 | "items": { 11 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/docket_entry_v1.schema.json" 12 | } 13 | }, 14 | "parties": { 15 | "type": "array", 16 | "description": "Parties involved in the case", 17 | "items": { 18 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/party_cv_v1.schema.json" 19 | } 20 | }, 21 | "case_id": { 22 | "type": "string", 23 | "description": "Pacer's case ID, which has the form O:YY-TY-##### (where O is a court office code, YY is a year, TY is the case type, and ##### is a numeric identifier associated with this case)" 24 | }, 25 | "case_type": { 26 | "type": "string", 27 | "description": "Abbreviation for the case type ('cv' for civil, 'cr' for criminal, etc)" 28 | }, 29 | "court": { 30 | "type": "string", 31 | "description": "The (lowercase) Pacer court abbreviation (e.g. 'ilnd' for Northern District of Illinois)" 32 | }, 33 | "ucid": { 34 | "type": "string", 35 | "description":"SCALES's case ID (stands for 'unique case id'), generated by connecting the court abbreviation to the Pacer case ID, delimited by a double semicolon (e.g. 'ilnd;;1:16-cv-00001)" 36 | }, 37 | "city": { 38 | "type": "string", 39 | "description": "The city/division within the district where the case appeared; comes from the parenthesis in the header of the docket at the end of the court district (e.g. 
for 'Northern District of Illinois - CM/ECF LIVE, Ver 6.3.1 (Chicago)', the city is 'Chicago')" 40 | }, 41 | "header_case_id": { 42 | "type": "string", 43 | "description": "Similar to case_id, but pulled from the docket itself rather than the filepath; sometimes contains extra information like judge initials and 'All Defendants'" 44 | }, 45 | "filing_date": { 46 | "type": "string", 47 | "description": "The date the case was filed - format: MM/DD/YYYY" 48 | }, 49 | "terminating_date": { 50 | "type": "string", 51 | "description": "The date the case was terminated - format: MM/DD/YYYY" 52 | }, 53 | "case_status": { 54 | "type": "string", 55 | "description": "This will be 'closed' if a terminating date is listed, else 'open'", 56 | "enum": ["open", "closed"] 57 | }, 58 | "judge": { 59 | "type": "string", 60 | "description": "The assigned judge on the case (this is the raw string from the docket header); for criminal cases, this is zeroed out and superseded by defendant-specific judges" 61 | }, 62 | "referred_judges": { 63 | "type": "array", 64 | "description": "The referred judges on the case (only present when the case was referred to a second judge); for criminal cases, this is zeroed out and superseded by defendant-specific referred judges", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "nature_suit": { 70 | "type": "string", 71 | "description": "The nature of suit for the case (this is matched to the list of known NOS where possible for uniformity of spelling/case, and otherwise left as the raw extracted string)" 72 | }, 73 | "jury_demand": { 74 | "type": "string", 75 | "description": "The jury demand specified in the case header" 76 | }, 77 | "cause": { 78 | "type": "string", 79 | "description": "The cause specified in the case header" 80 | }, 81 | "jurisdiction": { 82 | "type": "string", 83 | "description": "The jurisdiction of the case (e.g. 'Federal Question')" 84 | }, 85 | "monetary_demand": { 86 | "type": "string", 87 | "description": "The monetary demand specified in the case header" 88 | }, 89 | "lead_case_pacer_id": { 90 | "type": "string", 91 | "description": "The internal Pacer id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 92 | }, 93 | "lead_case_id": { 94 | "type": "string", 95 | "description": "The case id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 96 | }, 97 | "magistrate_case_ids": { 98 | "type": "array", 99 | "description": "The case ids of any magistrate judge cases (case type 'mj') issuing from this case", 100 | "items": { 101 | "type": "string" 102 | } 103 | }, 104 | "related_cases": { 105 | "type": "array", 106 | "description": "Any case ids listed as 'related cases' in this case's header", 107 | "items": { 108 | "type": "string" 109 | } 110 | }, 111 | "other_courts": { 112 | "type": "array", 113 | "description": "Case IDs provided by Pacer as 'Case in other court'; doesn't pick up all alternate case IDs (e.g. 
appeals court case numbers)", 114 | "items": { 115 | "type": "string" 116 | } 117 | }, 118 | "filed_in_error_text": { 119 | "type": "string", 120 | "description": "The text line in the header, if any, that indicates this case was erroneously filed ('incorrectly filed,' 'not used,' 'do not docket,' etc)" 121 | }, 122 | "case_flags": { 123 | "type": "array", 124 | "description": "The flags in the top right-hand corner of the case docket", 125 | "items": { 126 | "type": "string" 127 | } 128 | }, 129 | "appeals_case_ids": { 130 | "type": "array", 131 | "description": "The case ids of any defendant-specific appeals issuing from this case; for criminal cases, this is zeroed out and superseded by defendant-specific appeals case ids", 132 | "items": { 133 | "type": "string" 134 | } 135 | }, 136 | "case_name": { 137 | "type": "string", 138 | "description": "The title of the case (e.g. 'USA v. Doe')" 139 | }, 140 | "docket_available": { 141 | "type": "boolean", 142 | "description": "True if html docket data is available for this case; False if the source for this case was pre-parsed data (e.g. from Recap)" 143 | }, 144 | "member_case_key": { 145 | "type": "string", 146 | "description": "a UCID-formatted version of lead_case_id (if this case is the lead case, this field will match the 'ucid' field)" 147 | }, 148 | "mdl_code": { 149 | "type": "number", 150 | "description": "The MDL code that this case is part of, if applicable" 151 | }, 152 | "mdl_id_source": { 153 | "type": "string", 154 | "description": "The source that led us to believe this case is part of an MDL", 155 | "enum": ["lead_case_id", "flag"] 156 | }, 157 | "is_mdl": { 158 | "type": "boolean", 159 | "description": "Whether or not this case is part of an MDL; this field will be True if an MDL code was found, or if there is a case flag that starts with 'MDL'" 160 | }, 161 | "is_multi": { 162 | "type": "boolean", 163 | "description": "True if this case is part of an MDL, if it has a lead case id, if a list of member cases appears in the header, or if it has appeared in another court" 164 | }, 165 | "billable_pages": { 166 | "type": "number", 167 | "description": "The number of billable pages on Pacer for this docket report" 168 | }, 169 | "cost": { 170 | "type": "number", 171 | "description": "The cost (in $) of downloading the docket report from Pacer" 172 | }, 173 | "download_timestamp": { 174 | "type": "string", 175 | "description": "The time of download of this case from Pacer - format: MM/DD/YYYY hh:mm:ss" 176 | }, 177 | "n_docket_reports": { 178 | "type": "number", 179 | "description": "The number of docket reports (HTML) used to generate this JSON; will be 1 if only 1 docket report, or >1 if the case was pieced together from multiple updates (i.e. 
multiple HTMLs)" 180 | }, 181 | "source": { 182 | "type": "string", 183 | "description": "A comma-delimited list of sources for this data, generally 'pacer' or 'recap'" 184 | }, 185 | "recap_id": { 186 | "type": "number", 187 | "description": "The Recap id for this case, if applicable" 188 | }, 189 | "download_url": { 190 | "type": "string", 191 | "description": "The url on Pacer from which the docket was downloaded" 192 | }, 193 | "case_pacer_id": { 194 | "type": "string", 195 | "description": "The unique numerical id that Pacer uses internally to identify this document (pulled from Pacer's XML responses to user queries; not visible on the docket sheet itself)" 196 | }, 197 | "summary": { 198 | "type": "object", 199 | "description": "Data from the case summary, if available", 200 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/summary_cv_v1.schema.json" 201 | } 202 | } 203 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/case_cr_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/case_cr_v1.schema.json", 4 | "title": "Civil Case Schema", 5 | "description": "A schema for a PACER criminal court case docket report" , 6 | "properties": { 7 | "docket": { 8 | "type": "array", 9 | "description": "Ordered array of entries in the case docket", 10 | "items": { 11 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/docket_entry_v1.schema.json" 12 | } 13 | }, 14 | "parties": { 15 | "type": "array", 16 | "description": "Parties involved in the case", 17 | "items": { 18 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/party_cr_v1.schema.json" 19 | } 20 | }, 21 | "case_id": { 22 | "type": "string", 23 | "description": "Pacer's case ID, which has the form O:YY-TY-##### (where O is a court office code, YY is a year, TY is the case type, and ##### is a numeric identifier associated with this case)" 24 | }, 25 | "case_type": { 26 | "type": "string", 27 | "description": "Abbreviation for the case type ('cv' for civil, 'cr' for criminal, etc)" 28 | }, 29 | "court": { 30 | "type": "string", 31 | "description": "The (lowercase) Pacer court abbreviation (e.g. 'ilnd' for Northern District of Illinois)" 32 | }, 33 | "ucid": { 34 | "type": "string", 35 | "description":"SCALES's case ID (stands for 'unique case id'), generated by connecting the court abbreviation to the Pacer case ID, delimited by a double semicolon (e.g. 'ilnd;;1:16-cv-00001)" 36 | }, 37 | "city": { 38 | "type": "string", 39 | "description": "The city/division within the district where the case appeared; comes from the parenthesis in the header of the docket at the end of the court district (e.g. 
for 'Northern District of Illinois - CM/ECF LIVE, Ver 6.3.1 (Chicago)', the city is 'Chicago')" 40 | }, 41 | "header_case_id": { 42 | "type": "string", 43 | "description": "Similar to case_id, but pulled from the docket itself rather than the filepath; sometimes contains extra information like judge initials and 'All Defendants'" 44 | }, 45 | "filing_date": { 46 | "type": "string", 47 | "description": "The date the case was filed - format: MM/DD/YYYY" 48 | }, 49 | "terminating_date": { 50 | "type": "string", 51 | "description": "The date the case was terminated - format: MM/DD/YYYY" 52 | }, 53 | "case_status": { 54 | "type": "string", 55 | "description": "This will be 'closed' if a terminating date is listed, else 'open'", 56 | "enum": ["open", "closed"] 57 | }, 58 | "judge": { 59 | "type": "string", 60 | "description": "The assigned judge on the case (this is the raw string from the docket header); for criminal cases, this is zeroed out and superseded by defendant-specific judges" 61 | }, 62 | "referred_judges": { 63 | "type": "array", 64 | "description": "The referred judges on the case (only present when the case was referred to a second judge); for criminal cases, this is zeroed out and superseded by defendant-specific referred judges", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "nature_suit": { 70 | "type": "string", 71 | "description": "The nature of suit for the case (this is matched to the list of known NOS where possible for uniformity of spelling/case, and otherwise left as the raw extracted string)" 72 | }, 73 | "jury_demand": { 74 | "type": "string", 75 | "description": "The jury demand specified in the case header" 76 | }, 77 | "cause": { 78 | "type": "string", 79 | "description": "The cause specified in the case header" 80 | }, 81 | "jurisdiction": { 82 | "type": "string", 83 | "description": "The jurisdiction of the case (e.g. 'Federal Question')" 84 | }, 85 | "monetary_demand": { 86 | "type": "string", 87 | "description": "The monetary demand specified in the case header" 88 | }, 89 | "lead_case_pacer_id": { 90 | "type": "string", 91 | "description": "The internal Pacer id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 92 | }, 93 | "lead_case_id": { 94 | "type": "string", 95 | "description": "The case id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 96 | }, 97 | "magistrate_case_ids": { 98 | "type": "array", 99 | "description": "The case ids of any magistrate judge cases (case type 'mj') issuing from this case", 100 | "items": { 101 | "type": "string" 102 | } 103 | }, 104 | "related_cases": { 105 | "type": "array", 106 | "description": "Any case ids listed as 'related cases' in this case's header", 107 | "items": { 108 | "type": "string" 109 | } 110 | }, 111 | "other_courts": { 112 | "type": "array", 113 | "description": "Case IDs provided by Pacer as 'Case in other court'; doesn't pick up all alternate case IDs (e.g. 
appeals court case numbers)", 114 | "items": { 115 | "type": "string" 116 | } 117 | }, 118 | "filed_in_error_text": { 119 | "type": "string", 120 | "description": "The text line in the header, if any, that indicates this case was erroneously filed ('incorrectly filed,' 'not used,' 'do not docket,' etc)" 121 | }, 122 | "case_flags": { 123 | "type": "array", 124 | "description": "The flags in the top right-hand corner of the case docket", 125 | "items": { 126 | "type": "string" 127 | } 128 | }, 129 | "appeals_case_ids": { 130 | "type": "array", 131 | "description": "The case ids of any defendant-specific appeals issuing from this case; for criminal cases, this is zeroed out and superseded by defendant-specific appeals case ids", 132 | "items": { 133 | "type": "string" 134 | } 135 | }, 136 | "case_name": { 137 | "type": "string", 138 | "description": "The title of the case (e.g. 'USA v. Doe')" 139 | }, 140 | "docket_available": { 141 | "type": "boolean", 142 | "description": "True if html docket data is available for this case; False if the source for this case was pre-parsed data (e.g. from Recap)" 143 | }, 144 | "member_case_key": { 145 | "type": "string", 146 | "description": "a UCID-formatted version of lead_case_id (if this case is the lead case, this field will match the 'ucid' field)" 147 | }, 148 | "mdl_code": { 149 | "type": "number", 150 | "description": "The MDL code that this case is part of, if applicable" 151 | }, 152 | "mdl_id_source": { 153 | "type": "string", 154 | "description": "The source that led us to believe this case is part of an MDL", 155 | "enum": ["lead_case_id", "flag"] 156 | }, 157 | "is_mdl": { 158 | "type": "boolean", 159 | "description": "Whether or not this case is part of an MDL; this field will be True if an MDL code was found, or if there is a case flag that starts with 'MDL'" 160 | }, 161 | "is_multi": { 162 | "type": "boolean", 163 | "description": "True if this case is part of an MDL, if it has a lead case id, if a list of member cases appears in the header, or if it has appeared in another court" 164 | }, 165 | "billable_pages": { 166 | "type": "number", 167 | "description": "The number of billable pages on Pacer for this docket report" 168 | }, 169 | "cost": { 170 | "type": "number", 171 | "description": "The cost (in $) of downloading the docket report from Pacer" 172 | }, 173 | "download_timestamp": { 174 | "type": "string", 175 | "description": "The time of download of this case from Pacer - format: MM/DD/YYYY hh:mm:ss" 176 | }, 177 | "n_docket_reports": { 178 | "type": "number", 179 | "description": "The number of docket reports (HTML) used to generate this JSON; will be 1 if only 1 docket report, or >1 if the case was pieced together from multiple updates (i.e. 
multiple HTMLs)" 180 | }, 181 | "source": { 182 | "type": "string", 183 | "description": "A comma-delimited list of sources for this data, generally 'pacer' or 'recap'" 184 | }, 185 | "recap_id": { 186 | "type": "number", 187 | "description": "The Recap id for this case, if applicable" 188 | }, 189 | "download_url": { 190 | "type": "string", 191 | "description": "The url on Pacer from which the docket was downloaded" 192 | }, 193 | "case_pacer_id": { 194 | "type": "string", 195 | "description": "The unique numerical id that Pacer uses internally to identify this document (pulled from Pacer's XML responses to user queries; not visible on the docket sheet itself)" 196 | }, 197 | "summary": { 198 | "type": "object", 199 | "description": "Data from the case summary, if available", 200 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/summary_cr_v1.schema.json" 201 | } 202 | } 203 | } -------------------------------------------------------------------------------- /src/pacer_tools/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: osx-64 4 | _ipyw_jlab_nb_ext_conf=0.1.0=py37_0 5 | alabaster=0.7.12=py37_0 6 | anaconda=2020.02=py37_0 7 | anaconda-client=1.7.2=py37_0 8 | anaconda-navigator=1.9.12=py37_0 9 | anaconda-project=0.8.4=py_0 10 | applaunchservices=0.2.1=py_0 11 | appnope=0.1.0=py37_0 12 | appscript=1.1.0=py37h1de35cc_0 13 | argh=0.26.2=py37_0 14 | asn1crypto=1.3.0=py37_0 15 | astroid=2.3.3=py37_0 16 | astropy=4.0=py37h1de35cc_0 17 | atomicwrites=1.3.0=py37_1 18 | attrs=19.3.0=py_0 19 | autopep8=1.4.4=py_0 20 | babel=2.8.0=py_0 21 | backcall=0.1.0=py37_0 22 | backports=1.0=py_2 23 | backports.functools_lru_cache=1.6.1=py_0 24 | backports.shutil_get_terminal_size=1.0.0=py37_2 25 | backports.tempfile=1.0=py_1 26 | backports.weakref=1.0.post1=py_1 27 | beautifulsoup4=4.9.3=pyhb0f4dca_0 28 | bitarray=1.2.1=py37h1de35cc_0 29 | bkcharts=0.2=py37_0 30 | blas=1.0=mkl 31 | bleach=3.1.0=py37_0 32 | blosc=1.16.3=hd9629dc_0 33 | bokeh=1.4.0=py37_0 34 | boto=2.49.0=py37_0 35 | bottleneck=1.3.2=py37h776bbcc_0 36 | brotlipy=0.7.0=py38h9ed2024_1003 37 | bzip2=1.0.8=h1de35cc_0 38 | ca-certificates=2020.10.14=0 39 | certifi=2020.6.20=pyhd3eb1b0_3 40 | cffi=1.14.3=py38h2125817_2 41 | chardet=3.0.4=py37_1003 42 | click=7.1.2=py_0 43 | cloudpickle=1.3.0=py_0 44 | clyent=1.2.2=py37_1 45 | colorama=0.4.3=py_0 46 | conda=4.8.4=py37_0 47 | conda-build=3.18.11=py37_0 48 | conda-env=2.6.0=1 49 | conda-package-handling=1.6.0=py37h1de35cc_0 50 | conda-verify=3.4.2=py_1 51 | contextlib2=0.6.0.post1=py_0 52 | cryptography=3.2.1=py38hbcfaee0_1 53 | curl=7.68.0=ha441bb4_0 54 | cycler=0.10.0=py37_0 55 | cython=0.29.15=py37h0a44026_0 56 | cytoolz=0.10.1=py37h1de35cc_0 57 | dask=2.11.0=py_0 58 | dask-core=2.11.0=py_0 59 | dbus=1.13.12=h90a0687_0 60 | decorator=4.4.1=py_0 61 | defusedxml=0.6.0=py_0 62 | diff-match-patch=20181111=py_0 63 | distributed=2.11.0=py37_0 64 | docutils=0.16=py37_0 65 | entrypoints=0.3=py37_0 66 | et_xmlfile=1.0.1=py37_0 67 | expat=2.2.6=h0a44026_0 68 | fastcache=1.1.0=py37h1de35cc_0 69 | filelock=3.0.12=py_0 70 | flake8=3.7.9=py37_0 71 | flask=1.1.1=py_0 72 | freetype=2.9.1=hb4e5f40_0 73 | fsspec=0.6.2=py_0 74 | future=0.18.2=py37_0 75 | get_terminal_size=1.0.0=h7520d66_0 76 | gettext=0.19.8.1=h15daf44_3 77 | gevent=1.4.0=py37h1de35cc_0 78 | glib=2.63.1=hd977a24_0 79 | glob2=0.7=py_0 80 | gmp=6.1.2=hb37e062_1 81 | 
gmpy2=2.0.8=py37h6ef4df4_2 82 | greenlet=0.4.15=py37h1de35cc_0 83 | h5py=2.10.0=py37h3134771_0 84 | hdf5=1.10.4=hfa1e0ec_0 85 | heapdict=1.0.1=py_0 86 | html5lib=1.0.1=py37_0 87 | hypothesis=5.5.4=py_0 88 | icu=58.2=h4b95b61_1 89 | idna=2.10=py_0 90 | imageio=2.6.1=py37_0 91 | imagesize=1.2.0=py_0 92 | importlib_metadata=1.5.0=py37_0 93 | intel-openmp=2019.4=233 94 | intervaltree=3.0.2=py_0 95 | ipykernel=5.1.4=py37h39e3cac_0 96 | ipython=7.12.0=py37h5ca1d4c_0 97 | ipython_genutils=0.2.0=py37_0 98 | ipywidgets=7.5.1=py_0 99 | isort=4.3.21=py37_0 100 | itsdangerous=1.1.0=py37_0 101 | jbig=2.1=h4d881f8_0 102 | jdcal=1.4.1=py_0 103 | jedi=0.14.1=py37_0 104 | jinja2=2.11.1=py_0 105 | joblib=0.14.1=py_0 106 | jpeg=9b=he5867d9_2 107 | json5=0.9.1=py_0 108 | jsonschema=3.2.0=py37_0 109 | jupyter=1.0.0=py37_7 110 | jupyter_client=5.3.4=py37_0 111 | jupyter_console=6.1.0=py_0 112 | jupyter_core=4.6.1=py37_0 113 | jupyterlab=1.2.6=pyhf63ae98_0 114 | jupyterlab_server=1.0.6=py_0 115 | keyring=21.1.0=py37_0 116 | kiwisolver=1.1.0=py37h0a44026_0 117 | krb5=1.17.1=hddcf347_0 118 | lazy-object-proxy=1.4.3=py37h1de35cc_0 119 | libarchive=3.3.3=h786848e_5 120 | libcurl=7.68.0=h051b688_0 121 | libcxx=10.0.0=1 122 | libcxxabi=4.0.1=hcfea43d_1 123 | libedit=3.1.20191231=h1de35cc_1 124 | libffi=3.3=hb1e8313_2 125 | libgfortran=3.0.1=h93005f0_2 126 | libiconv=1.15=hdd342a3_7 127 | liblief=0.9.0=h2a1bed3_2 128 | libpng=1.6.37=ha441bb4_0 129 | libsodium=1.0.16=h3efe00b_0 130 | libspatialindex=1.9.3=h0a44026_0 131 | libssh2=1.8.2=ha12b0ac_0 132 | libtiff=4.1.0=hcb84e12_0 133 | libxml2=2.9.9=hf6e021a_1 134 | libxslt=1.1.33=h33a18ac_0 135 | llvm-openmp=4.0.1=hcfea43d_1 136 | llvmlite=0.31.0=py37h1341992_0 137 | locket=0.2.0=py37_1 138 | lxml=4.5.0=py37hef8c89e_0 139 | lz4-c=1.8.1.2=h1de35cc_0 140 | lzo=2.10=h362108e_2 141 | markupsafe=1.1.1=py37h1de35cc_0 142 | matplotlib=3.1.3=py37_0 143 | matplotlib-base=3.1.3=py37h9aa3819_0 144 | mccabe=0.6.1=py37_1 145 | mistune=0.8.4=py37h1de35cc_0 146 | mkl=2019.4=233 147 | mkl-service=2.3.0=py38hfbe908c_0 148 | mkl_fft=1.2.0=py38hc64f4ea_0 149 | mkl_random=1.1.1=py38h959d312_0 150 | mock=4.0.1=py_0 151 | more-itertools=8.2.0=py_0 152 | mpc=1.1.0=h6ef4df4_1 153 | mpfr=4.0.1=h3018a27_3 154 | mpmath=1.1.0=py37_0 155 | msgpack-python=0.6.1=py37h04f5b5a_1 156 | multipledispatch=0.6.0=py37_0 157 | navigator-updater=0.2.1=py37_0 158 | nbconvert=5.6.1=py37_0 159 | nbformat=5.0.4=py_0 160 | ncurses=6.2=h0a44026_1 161 | networkx=2.4=py_0 162 | nltk=3.4.5=py37_0 163 | nose=1.3.7=py37_2 164 | notebook=6.0.3=py37_0 165 | numba=0.48.0=py37h6c726b0_0 166 | numexpr=2.7.1=py37hce01a72_0 167 | numpy=1.19.2=py38h456fd55_0 168 | numpy-base=1.19.2=py38hcfb5961_0 169 | numpydoc=0.9.2=py_0 170 | olefile=0.46=py37_0 171 | openpyxl=3.0.3=py_0 172 | openssl=1.1.1h=haf1e3a3_0 173 | packaging=20.1=py_0 174 | pandas=1.1.3=py38hb1e8313_0 175 | pandoc=2.2.3.2=0 176 | pandocfilters=1.4.2=py37_1 177 | parso=0.5.2=py_0 178 | partd=1.1.0=py_0 179 | path=13.1.0=py37_0 180 | path.py=12.4.0=0 181 | pathlib2=2.3.5=py37_0 182 | pathtools=0.1.2=py_1 183 | patsy=0.5.1=py37_0 184 | pcre=8.43=h0a44026_0 185 | pep8=1.7.1=py37_0 186 | pexpect=4.8.0=py37_0 187 | pickleshare=0.7.5=py37_0 188 | pillow=7.0.0=py37h4655f20_0 189 | pip=20.2.4=py38hecd8cb5_0 190 | pkginfo=1.5.0.1=py37_0 191 | pluggy=0.13.1=py37_0 192 | ply=3.11=py37_0 193 | prometheus_client=0.7.1=py_0 194 | prompt_toolkit=3.0.3=py_0 195 | psutil=5.6.7=py37h1de35cc_0 196 | ptyprocess=0.6.0=py37_0 197 | py=1.8.1=py_0 198 | py-lief=0.9.0=py37h1413db1_2 199 | 
pycodestyle=2.5.0=py37_0 200 | pycosat=0.6.3=py37h1de35cc_0 201 | pycparser=2.20=py_2 202 | pycrypto=2.6.1=py37h1de35cc_9 203 | pycurl=7.43.0.5=py37ha12b0ac_0 204 | pydocstyle=4.0.1=py_0 205 | pyflakes=2.1.1=py37_0 206 | pygments=2.5.2=py_0 207 | pylint=2.4.4=py37_0 208 | pyodbc=4.0.30=py37h0a44026_0 209 | pyopenssl=19.1.0=pyhd3eb1b0_1 210 | pyparsing=2.4.6=py_0 211 | pyqt=5.9.2=py37h655552a_2 212 | pyrsistent=0.15.7=py37h1de35cc_0 213 | pysocks=1.7.1=py38_1 214 | pytables=3.6.1=py37h5bccee9_0 215 | pytest=5.3.5=py37_0 216 | pytest-arraydiff=0.3=py37h39e3cac_0 217 | pytest-astropy=0.8.0=py_0 218 | pytest-astropy-header=0.1.2=py_0 219 | pytest-doctestplus=0.5.0=py_0 220 | pytest-openfiles=0.4.0=py_0 221 | pytest-remotedata=0.3.2=py37_0 222 | python=3.8.5=h26836e1_1 223 | python-dateutil=2.8.1=py_0 224 | python-jsonrpc-server=0.3.4=py_0 225 | python-language-server=0.31.7=py37_0 226 | python-libarchive-c=2.8=py37_13 227 | python.app=2=py37_10 228 | pytz=2020.1=py_0 229 | pywavelets=1.1.1=py37h1de35cc_0 230 | pyyaml=5.3=py37h1de35cc_0 231 | pyzmq=18.1.1=py37h0a44026_0 232 | qdarkstyle=2.8=py_0 233 | qt=5.9.7=h468cd18_1 234 | qtawesome=0.6.1=py_0 235 | qtconsole=4.6.0=py_1 236 | qtpy=1.9.0=py_0 237 | readline=8.0=h1de35cc_0 238 | requests=2.22.0=py37_1 239 | ripgrep=11.0.2=he32d670_0 240 | rope=0.16.0=py_0 241 | rtree=0.9.3=py37_0 242 | ruamel_yaml=0.15.87=py37h1de35cc_0 243 | scikit-image=0.16.2=py37h6c726b0_0 244 | scikit-learn=0.22.1=py37h27c97d8_0 245 | scipy=1.4.1=py37h9fa6033_0 246 | seaborn=0.10.0=py_0 247 | selenium=3.141.0=py38h1de35cc_1001 248 | send2trash=1.5.0=py37_0 249 | setuptools=50.3.1=py38hecd8cb5_1 250 | simplegeneric=0.8.1=py37_2 251 | simplejson=3.17.0=py37h1de35cc_0 252 | singledispatch=3.4.0.3=py37_0 253 | sip=4.19.8=py37h0a44026_0 254 | six=1.15.0=py38hecd8cb5_0 255 | snappy=1.1.7=he62c110_3 256 | snowballstemmer=2.0.0=py_0 257 | sortedcollections=1.1.2=py37_0 258 | sortedcontainers=2.1.0=py37_0 259 | soupsieve=2.0.1=py_0 260 | sphinx=2.4.0=py_0 261 | sphinxcontrib=1.0=py37_1 262 | sphinxcontrib-applehelp=1.0.1=py_0 263 | sphinxcontrib-devhelp=1.0.1=py_0 264 | sphinxcontrib-htmlhelp=1.0.2=py_0 265 | sphinxcontrib-jsmath=1.0.1=py_0 266 | sphinxcontrib-qthelp=1.0.2=py_0 267 | sphinxcontrib-serializinghtml=1.1.3=py_0 268 | sphinxcontrib-websupport=1.2.0=py_0 269 | spyder=4.0.1=py37_0 270 | spyder-kernels=1.8.1=py37_0 271 | sqlalchemy=1.3.13=py37h1de35cc_0 272 | sqlite=3.33.0=hffcf06c_0 273 | statsmodels=0.11.0=py37h1de35cc_0 274 | sympy=1.5.1=py37_0 275 | tbb=2020.0=h04f5b5a_0 276 | tblib=1.6.0=py_0 277 | terminado=0.8.3=py37_0 278 | testpath=0.4.4=py_0 279 | tk=8.6.10=hb0a8c7a_0 280 | toolz=0.10.0=py_0 281 | tornado=6.0.3=py37h1de35cc_3 282 | tqdm=4.51.0=pyhd3eb1b0_0 283 | traitlets=4.3.3=py37_0 284 | ujson=1.35=py37h1de35cc_0 285 | unicodecsv=0.14.1=py37_0 286 | unixodbc=2.3.7=h1de35cc_0 287 | urllib3=1.25.11=py_0 288 | watchdog=0.10.2=py37h1de35cc_0 289 | wcwidth=0.1.8=py_0 290 | webencodings=0.5.1=py37_1 291 | werkzeug=1.0.0=py_0 292 | wheel=0.35.1=pyhd3eb1b0_0 293 | widgetsnbextension=3.5.1=py37_0 294 | wrapt=1.11.2=py37h1de35cc_0 295 | wurlitzer=2.0.0=py37_0 296 | xlrd=1.2.0=py37_0 297 | xlsxwriter=1.2.7=py_0 298 | xlwings=0.17.1=py37_0 299 | xlwt=1.3.0=py37_0 300 | xmltodict=0.12.0=py_0 301 | xz=5.2.5=h1de35cc_0 302 | yaml=0.1.7=hc338f04_2 303 | yapf=0.28.0=py_0 304 | zeromq=4.3.1=h0a44026_3 305 | zict=1.0.0=py_0 306 | zipp=2.2.0=py_0 307 | zlib=1.2.11=h1de35cc_3 308 | zstd=1.3.7=h5bba6e5_0 309 | 
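To recreate the environment pinned above, a concrete version of the command in the file's header comment might look like the following (the environment name `pacer-tools` is an invented example, and the build strings tie this spec to the osx-64 platform):

```sh
conda create --name pacer-tools --file requirements.txt
```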
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/bundler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Tool for bundling files together 3 | ''' 4 | import re 5 | import sys 6 | import json 7 | import shutil 8 | from pathlib import Path 9 | 10 | import pandas as pd 11 | from bs4 import BeautifulSoup 12 | from tqdm.autonotebook import tqdm 13 | 14 | sys.path.append(str(Path(__file__).resolve().parents[1])) 15 | from support import settings 16 | from support import data_tools as dtools 17 | 18 | def index_style(additional=''): 19 | ''' 20 | Build css style tag 21 | Inputs: 22 | - additional (str): additional str to be inserted on-the-fly 23 | Output: 24 | (str) a valid html-style tag 25 | ''' 26 | base_style = open(settings.STYLE/'bundler_index.css').read().replace('\n','') 27 | return f"<style>{base_style}{additional}</style>" 28 | 29 | def bundler(indf, name, notes=None, overwrite=False, anno_col=None): 30 | ''' 31 | Bundle up a collection of files 32 | Inputs: 33 | - indf (DataFrame): any dataframe with an fpath column to identify files 34 | - name (str): name of directory to bundle into (will be put in /data/{name}) 35 | - notes (str): notes to be injected under the header (html string) 36 | - anno_col (str): name of annotations column if any, column should be valid json string 37 | ''' 38 | df = indf.copy() 39 | # Want to include the index if it's ucid 40 | if df.index.name == 'ucid': 41 | df = df.reset_index() 42 | 43 | if anno_col: 44 | # import pdb;pdb.set_trace() 45 | df[anno_col] = df[anno_col].map(json.loads) 46 | 47 | # Columns needed to generate 48 | if 'fpath' not in df.columns: 49 | raise ValueError('DataFrame must include fpath column to point to file locations') 50 | elif 'ucid' not in df.columns: 51 | raise ValueError('DataFrame must include ucid to identify case') 52 | 53 | # Handle directory 54 | bundle_dir = settings.BUNDLES/name 55 | if bundle_dir.exists(): 56 | if overwrite: 57 | # Delete all files in the directory 58 | for file in bundle_dir.iterdir(): 59 | file.unlink() 60 | else: 61 | raise ValueError(f'The directory {str(bundle_dir)} already exists') 62 | else: 63 | bundle_dir.mkdir(parents=True) 64 | 65 | # Start building html index page with strings 66 | heading = f"<div class='heading'><h1>Data Dump: {name}</h1></div>" 67 | notes = f'''<p>NOTES: {notes}</p>''' if notes else '' 68 | opening = f"<html><head>{index_style()}</head><body>{heading}{notes}" 69 | 70 | # Start building table rows 71 | table_rows = [] 72 | header = [f"<th>{val}</th>" for val in df.columns if val!=anno_col] 73 | table_rows.append("<tr>" + "".join(header) + "</tr>") 74 | 75 | for i,row in tqdm(df.iterrows(), total=len(df)): 76 | # Get filepath 77 | rel_path = row.fpath 78 | if type(rel_path) is str: 79 | rel_path = Path(rel_path.replace('\\','/')) 80 | abs_path = settings.PROJECT_ROOT / rel_path 81 | 82 | # Annotation scenario 83 | if 'pacer' in abs_path.parts and anno_col and row[anno_col]: 84 | # Load the html text and json data to make the annotated docket 85 | hpath = dtools.get_pacer_html(abs_path) 86 | html_text = open(hpath, 'r', encoding='utf-8').read() 87 | json_data = dtools.load_case(row.fpath) 88 | new_html = make_annotated_docket(html_text, json_data, row[anno_col]) 89 | 90 | # Copy the new (annotated) html into the bundle directory 91 | tqdm.write(f"Annotating {row.ucid}") 92 | new_name = row.ucid.replace(':', '-') + '.html' 93 | with open(bundle_dir/new_name, 'w', encoding='utf-8') as wfile: 94 | wfile.write(new_html) 95 | 96 | else: 97 | if 'pacer' in abs_path.parts: 98 | # Get the path to the html file 99 | abs_path = dtools.get_pacer_html(abs_path) 100 | 101 | # Copy the file 102 | tqdm.write(f"Copying {row.ucid}") 103 | new_name = row.ucid.replace(':', '-') + abs_path.suffix 104 | shutil.copyfile(abs_path, bundle_dir/new_name) 105 | 106 | 107 | cells = [f"<td>{v}</td>" for k,v in row.iteritems() if k!=anno_col] 108 | row_string = "<tr>" + "".join(cells) + "</tr>" 109 | table_rows.append(row_string) 110 | 111 | # Finish out the html string for the index 112 | table = f"<table>{''.join(table_rows)}</table>" 113 | closing = "</body></html>" 114 | html = opening + table + closing 115 | 116 | with open(bundle_dir/'_index.html', 'w+') as wfile: 117 | wfile.write(html) 118 | 119 | print(f"\nFiles Successfully bundled into {bundle_dir}") 120 | 121 | def build_new_td(json_text, row_annotations, soup=None, inner_html=False): 122 | ''' 123 | Make a new td cell to replace current docket text td cell, for a single docket entry/row 124 | 125 | Inputs: 126 | - json_text (str): the cleaned docket text from the saved json 127 | - row_annotations (list): a list of dicts of annotation spans for a single docket line 128 | e.g. [{'start': 0, 'end':10, 'label':"SOMETHING"}] 129 | - soup (bs4 instance): soup needed to make a tag, if None will create an empty soup 130 | - inner_html (bool): if true returns inner html as string 131 | Output: 132 | new_td (bs4 object or str): new td cell to be inserted 133 | ''' 134 | if not soup: 135 | soup = BeautifulSoup('','html.parser') 136 | 137 | new_td = soup.new_tag('td') 138 | 139 | # Index pointer to current place in original json 140 | og_pointer = 0 141 | 142 | # Sort annotation by 'start' 143 | row_annotations.sort(key=lambda x: x['start']) 144 | 145 | # Iterate through each annotation and 'swap out' original text for new span 146 | for annot in row_annotations: 147 | 148 | # Get all the text up until this annotation 149 | new_td.append( json_text[ og_pointer: annot['start'] ] ) 150 | 151 | # Build the span html tag, add attributes that allow for styling/highlighting 152 | span_tag = soup.new_tag('span', attrs={'class':"annotation", 'data-label':annot['label']}) 153 | span_tag.string = json_text[annot['start']:annot['end']] 154 | new_td.append(span_tag) 155 | 156 | # Set the pointer to the end index of the annotation 157 | og_pointer = annot['end'] 158 | 159 | # Get the last bit of the docket 160 | new_td.append( json_text[ og_pointer: ] ) 161 | 162 | if inner_html: 163 | return new_td.decode_contents() 164 | else: 165 | return new_td 166 | 167 | def make_annotated_docket(html_text, json_data, case_annotations): 168 | ''' 169 | Main function to build annotated html for a PACER docket 170 | 171 | Inputs: 172 | - html_text (str) 173 | - json_data (dict) 174 | - case_annotations (dict): mapping from row index (int, ordinal index) -> annotation data list of dicts e.g. 
{2: [ {span1},...], 5: [ {span2}, ...]} 175 | 176 | Output: 177 | (str) html source text for annotated html 178 | ''' 179 | 180 | # Make the soup 181 | soup = BeautifulSoup(html_text, 'html.parser') 182 | 183 | docket_table = soup.select('table')[-2] 184 | 185 | for row_index, tr in enumerate(docket_table.select('tr')[1:]): 186 | 187 | # Skip row if no annotation 188 | if row_index not in case_annotations.keys(): 189 | continue 190 | 191 | tr.attrs['class'] = tr.attrs.get('class', '') + ' annotated' 192 | 193 | # Isolate the original td 194 | docket_entry_td = tr.select('td')[2] 195 | 196 | # Gather info for new td 197 | jdata_text = json_data['docket'][row_index]['docket_text'] 198 | row_annotations = case_annotations[row_index] 199 | 200 | # Build and inject new td 201 | new_cell = build_new_td(jdata_text, row_annotations, soup) 202 | docket_entry_td.replace_with(new_cell) 203 | 204 | 205 | # Inject the style.css file into the header 206 | style_tag = soup.new_tag('style') 207 | style_tag.string = open(settings.STYLE/'pacer_docket.css').read().replace('\n','') 208 | soup.head.append(style_tag) 209 | 210 | return re.sub(r"b'|\\n|\\t",'',str(soup)) 211 | 212 | def make_annotated_docket_for_dash(html_text, json_data, case_annotations, range_to_keep): 213 | ''' 214 | Annotate a Pacer docket and return only a specified range of docket lines in JSON format - for use with make_excerpts() 215 | 216 | Inputs: 217 | - html_text (str) 218 | - json_data (dict) 219 | - case_annotations (dict): mapping from row index (within case) -> annotation data dict e.g. {'2': [ {span1}, ... ], '5': [ {span2}, ... ]} 220 | - range_to_keep (range): the range of docket lines (SCALES-indexed) to be returned 221 | 222 | Output: 223 | (dict) the docket excerpt as a JSON 224 | ''' 225 | 226 | # make the preliminary JSON and the soup 227 | new_json = {"case_id": json_data['case_id'], "docket": []} 228 | soup = BeautifulSoup(html_text, 'html.parser') 229 | docket_table = soup.select('table')[-2] 230 | 231 | # check whether each row needs to be inserted into the final JSON 232 | for row_index, tr in enumerate(docket_table.select('tr')[1:]): 233 | if row_index in range_to_keep: 234 | 235 | # check whether this row needs to be annotated 236 | if str(row_index) in case_annotations.keys(): 237 | row_annotations = case_annotations[str(row_index)] 238 | tr.attrs['class'] = tr.attrs.get('class', '') + ' annotated' 239 | else: 240 | row_annotations = []  # a list, not a dict, since build_new_td() sorts it 241 | 242 | # gather remaining info needed for build_new_td() 243 | old_entry = json_data['docket'][row_index] 244 | new_docket_text = old_entry['docket_text'] 245 | 246 | # build & insert new docket entry 247 | new_docket_html = build_new_td(new_docket_text, row_annotations, inner_html=True) 248 | new_entry = { 249 | "date_filed": old_entry['date_filed'], 250 | "ind": old_entry['ind'], 251 | "docket_text": new_docket_text, 252 | "docket_html": new_docket_html 253 | } 254 | new_json['docket'].append(new_entry) 255 | 256 | return new_json
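A minimal usage sketch for `bundler()`, assuming a DataFrame with the required `ucid` and `fpath` columns and assuming the `code/` directory is importable as in this module's own imports (the rows, file paths, and bundle name below are invented for illustration):

```python
import pandas as pd

from support import bundler

# Any DataFrame with 'ucid' and 'fpath' columns will do; these values are hypothetical
df = pd.DataFrame([
    {'ucid': 'ilnd;;1:16-cv-00001', 'fpath': 'data/pacer/ilnd/json/case_a.json'},
    {'ucid': 'ilnd;;1:16-cv-00002', 'fpath': 'data/pacer/ilnd/json/case_b.json'},
])

# Copies each file into settings.BUNDLES/'example_bundle' and writes an '_index.html' table of the rows
bundler.bundler(df, name='example_bundle', notes='Hand-picked example cases', overwrite=True)
```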
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/text_functions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | sys.path.append(str(Path(__file__).resolve().parents[1])) 4 | from support import data_tools as dtools 5 | 6 | def pro_se_identifier(party: dict, threshold: int=85): 7 | """Given a party dict from a SCALES json, identify if the party is PRO SE 8 | (if identified, returns index of counsel block that equals the party) 9 | 10 | Args: 11 | party (dict): SCALES json formatted party dict from a case 12 | threshold (int, optional): fuzzywuzzy fuzz.[MATCH_TYPE] ratio threshold to constitute a match. Defaults to 85. 13 | 14 | Returns: 15 | bool, int or NoneType: returns True if party is pro se, False if not; 16 | second output: 17 | when bool is true - returns index of counsel block that is the pro se party 18 | when bool is false - returns NoneType 19 | 20 | Notes: 21 | This function requires a Bool flag to indicate success. When just an index or None was returned, explicit 22 | type checking was needed to confirm if a match occurred because None and 0 (a plausible counsel index return) 23 | both cast to False -- the result was if type(output)==int: --> use int 24 | """ 25 | 26 | ## -------------Internals---------------- ## 27 | 28 | def _return_success(index_selected): 29 | """format the function output with bool messaging 30 | 31 | Args: 32 | index_selected (int): the integer index of the counsel block that corresponds to the pro se party 33 | 34 | Returns: 35 | bool, int: returns True since the party is pro se; index of counsel block that is the pro se party 36 | """ 37 | # during matching, the COUNSELS are given an attribute that tracks their original index in the counsels list 38 | # pop this if it was created so that the user's original input data remains unchanged upon return 39 | for counsel in COUNSELS: 40 | counsel.pop('original_index', None) 41 | return True, index_selected 42 | def _return_failure(rapid = False): 43 | """format the function output for a failed match 44 | 45 | Args: 46 | rapid (bool, optional): if the function fails early before any data was changed, don't iterate through counsels. Defaults to False. 47 | 48 | Returns: 49 | bool, NoneType: party is not pro se; none since no index matches 50 | """ 51 | 52 | if rapid: 53 | return False, None 54 | 55 | # during matching, the COUNSELS are given an attribute that tracks their original index in the counsels list 56 | # pop this if it was created so that the user's original input data remains unchanged upon return 57 | for counsel in COUNSELS: 58 | counsel.pop('original_index', None) 59 | return False, None 60 | 61 | def _call_fuzzy(party_name, counsels, match_type = 'ratio'): 62 | """call a fuzzy matching run across a list of counsels using the parent function's 63 | threshold, and the specified matching type 64 | 65 | Args: 66 | party_name (str): string of the party name being checked 67 | counsels (list): list of SCALES json formatted counsel dicts, with an added attribute for their original index 68 | match_type (str, optional): which fuzzywuzzy fuzz match should we employ. Defaults to 'ratio'. 69 | 70 | Returns: 71 | tuple or NoneType: the (True, index) success tuple when a match is found, else None 72 | """ 73 | if match_type=='ratio': 74 | fuzzycall = fuzz.ratio 75 | elif match_type == 'token-set': 76 | fuzzycall = fuzz.token_set_ratio 77 | else: 78 | fuzzycall = fuzz.ratio 79 | 80 | if len(party_name) <=8: 81 | fuzzycall = fuzz.partial_token_set_ratio 82 | 83 | # failsafes are used to escape bad fuzzy matches before they happen 84 | # A. USA fuzzies into AUSA and many other generic X of USA roles. In general, we know the USA as a party 85 | # represents itself and that the term USA should not fuzzy into the individual counsel names 86 | # B. material witnesses and parties that are just abbreviations or single letters inadvertently match 87 | # their counsels' middle initials or initialed names i.e. L.W. as a party matched James L. Watson. 
88 | # if a party is a nondescript initial grouping, we do not fuzzy match it 89 | FAILSAFES = [ 90 | lambda party_name: party_name.lower().strip()=='usa', 91 | lambda party_name: all(len(tok.strip())==1 for tok in party_name.replace('.',' ').split()) 92 | ] 93 | for failsafe in FAILSAFES: 94 | if failsafe(party_name): 95 | return None 96 | 97 | matches = [] # start with no matches 98 | for counsel in [c for c in counsels if c['name']]: # only compare counsels that had a name 99 | if len(counsel['name'])<=8: 100 | fuzzycall = fuzz.partial_token_set_ratio 101 | FR = fuzzycall( counsel['name'] , party_name ) # fuzzy match score 102 | if FR >= threshold: 103 | matches.append((counsel, FR)) # if it matches, add to our matches 104 | if matches: 105 | # our threshold is high enough that any match is believable, if there are multiple, take the top one (?) 106 | # 0th index is top score 107 | # [0][0] is the counsel object in the tuple 108 | winner = sorted(matches, key = lambda tups: tups[1], reverse=True)[0][0] 109 | return _return_success(winner['original_index']) 110 | return None 111 | 112 | ## -------------------------------------- ## 113 | 114 | ## EARLY FAILSAFE 115 | # if any json keys are missing or NoneTypes, kick out 116 | if not party['counsel'] or not party['name']: 117 | return _return_failure(rapid=True) 118 | 119 | # will be using this everywhere 120 | COUNSELS = party['counsel'] 121 | # add an attribute once that specifies the enumerated index of each iterable in the list 122 | # this saves us from continued enumeration and any ordering preservation 123 | for original_index, counsel in enumerate(COUNSELS): 124 | counsel['original_index'] = original_index 125 | 126 | #################################### 127 | # CONTROL BLOCK if restrictive pro se flag showed up in json from parse 128 | #################################### 129 | # if the parser already believes this to be a pro-se entry, leverage that as a head start 130 | if any((bool(counsel['is_pro_se']) for counsel in party['counsel'])): 131 | # if only one counsel, hooray no logic return it 132 | if len(COUNSELS)==1: 133 | return _return_success(0) 134 | 135 | # else: need to confirm that there is actually a "PRO SE" and there is only one counsel block that matches the criteria 136 | # looking for a singular "PRO SE" counsel 137 | matches = [] 138 | for counsel in COUNSELS: 139 | if counsel['name']: # IF THERE IS A NAME FOR THE COUNSEL 140 | check = counsel['name'] 141 | if counsel['entity_info'].get('raw_info'): # IF THERE IS ALSO RAW INFO 142 | extra_info = dtools.extra_info_cleaner(counsel['entity_info'].get('raw_info')) 143 | if extra_info: 144 | check += '\n' + extra_info 145 | elif counsel['entity_info'].get('raw_info'): # THERE IS NO NAME, CHECK IF RAW INFO 146 | check = dtools.extra_info_cleaner(counsel['entity_info'].get('raw_info')) 147 | else: # NO NAME, NO RAW INFO.... 
THAT'S WHACK, WE CAN'T COMPARE IT 148 | continue 149 | 150 | # the explicit code that triggered the party level flag 151 | if "PRO SE" in check: 152 | matches.append(counsel) 153 | if len(matches)==1: # if only one counsel is pro se, return their original index 154 | return _return_success(matches[0]['original_index']) 155 | 156 | 157 | #################################### 158 | # CONTROL BLOCK if party exactly represented text in counsels 159 | #################################### 160 | # dockets have whacky spacing on parties but not counsels sometimes -- normalize whitespace and case 161 | space_voider = lambda x: " ".join(x.strip().split()).lower() 162 | 163 | sv_party_name = space_voider( party['name'] ) # normalized party name 164 | sv_counsels = [(space_voider( c['name'] ), c['original_index']) for c in COUNSELS if c['name']] # normalized (name, original_index) tuples 165 | 166 | # if the normalized party appears in normalized counsel names verbatim, trigger and match 167 | if sv_party_name in [name for name, _ in sv_counsels]: 168 | # (efficiency of "in" comparison presumed) 169 | # the match ends up as a (name, original_index) tuple, return the original index for kick out 170 | match = [counsel for counsel in sv_counsels if counsel[0] == sv_party_name][0] 171 | return _return_success(match[1]) 172 | 173 | #################################### 174 | # CONTROL BLOCK TOKEN SET RATIO 175 | #################################### 176 | # final layer is a token set ratio check across the party and counsel names 177 | # fuzzywuzzy normalizes whitespace when generating tokens 178 | # if a party has prefixes, but the counsel form of the name does not, we still have 179 | # a successful token set match since one's tokens are wholly present in the others 180 | # the wrapper below will change match_type internally if a string is shorter than 9 characters 181 | from fuzzywuzzy import fuzz 182 | 183 | fuzzed = _call_fuzzy(party['name'], COUNSELS, match_type="token-set") 184 | if fuzzed: 185 | return fuzzed 186 | 187 | return _return_failure() 188 | 189 | 190 | ################################################ 191 | # Ngram similarity functions 192 | ################################################ 193 | 194 | def ngrams(string, n=3): 195 | import re 196 | string = re.sub(r'[,-./]|\sBD',r'', string) 197 | ngrams = zip(*[string[i:] for i in range(n)]) 198 | return [''.join(ngram) for ngram in ngrams] 199 | 200 | def cossim_top(A, B, ntop, lower_bound=0): 201 | import numpy as np 202 | import sparse_dot_topn.sparse_dot_topn as ct 203 | from scipy.sparse import csr_matrix 204 | # force A and B as a CSR matrix. 
205 | # If they have already been CSR, there is no overhead 206 | A = A.tocsr() 207 | B = B.tocsr() 208 | M, _ = A.shape 209 | _, N = B.shape 210 | 211 | idx_dtype = np.int32 212 | 213 | nnz_max = M*ntop 214 | 215 | indptr = np.zeros(M+1, dtype=idx_dtype) 216 | indices = np.zeros(nnz_max, dtype=idx_dtype) 217 | data = np.zeros(nnz_max, dtype=A.dtype) 218 | 219 | ct.sparse_dot_topn( 220 | M, N, np.asarray(A.indptr, dtype=idx_dtype), 221 | np.asarray(A.indices, dtype=idx_dtype), 222 | A.data, 223 | np.asarray(B.indptr, dtype=idx_dtype), 224 | np.asarray(B.indices, dtype=idx_dtype), 225 | B.data, 226 | ntop, 227 | lower_bound, 228 | indptr, indices, data) 229 | 230 | return csr_matrix((data,indices,indptr),shape=(M,N)) 231 | 232 | def get_matches_df(sparse_matrix): 233 | import pandas as pd 234 | non_zeros = sparse_matrix.nonzero() 235 | return pd.DataFrame({'left_side_idx': non_zeros[0], \ 236 | 'right_side_idx': non_zeros[1], \ 237 | 'similarity': sparse_matrix.data}) 238 | 239 | def swapper(tidx, name_vector): 240 | return name_vector[tidx] 241 | 242 | ################################### 243 | # Basic cosine 244 | ################################### 245 | 246 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/make_graph_data_fulton_county.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script has been included solely in an effort to make our RDF-graph-build process 3 | transparent; it was copy-pasted directly from SCALES's private infrastructure repo, and 4 | has not been tested here! Our assumption is that, because the raw data used for this 5 | portion of our graph comes from a private dataset, nobody besides us will run this 6 | script. If we're incorrect about this assumption, feel free to contact us at 7 | engineering@scales-okn.org. 
8 | """ 9 | 10 | import os 11 | import json 12 | import logging 13 | import sys 14 | import argparse 15 | from pathlib import Path 16 | import utils 17 | from typing import Any, Dict, List 18 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed 19 | 20 | from tqdm import tqdm 21 | from rdflib import Graph, Namespace, Literal, RDF, XSD 22 | 23 | sys.path.append(str(Path.cwd().parents[1].resolve())) 24 | import utils 25 | from constants import SCALES, J, NC, NIBRS, OCCS 26 | from support import settings 27 | 28 | logger = logging.getLogger(__name__) 29 | logger.setLevel(logging.INFO) 30 | _handler = logging.FileHandler("error.log") 31 | _handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) 32 | logger.addHandler(_handler) 33 | 34 | _make_party_uri_fulton = lambda charge_id: utils._make_party_uri(f'ga-fulton-{int(charge_id)}', 0) 35 | 36 | 37 | def _create_graph() -> Graph: 38 | """Create a blank graph and bind standard prefixes.""" 39 | g = Graph() 40 | g.bind("scales", SCALES) 41 | g.bind("j", J) 42 | g.bind("nc", NC) 43 | g.bind("nibrs", NIBRS) 44 | g.bind("occs", OCCS) 45 | g.bind("rdf", RDF) 46 | return g 47 | 48 | def _make_metadata_graph(): 49 | """Take care of a few Fulton-County-Jail-related triples we might want to use at some point.""" 50 | g = _create_graph() 51 | facility_uri = utils._make_generic_uri('Facility', 'ga-fulton-county-jail') 52 | g.add((facility_uri, NC.FacilityName, 'Fulton County Jail')) 53 | g.add((facility_uri, NC.PhysicalAddress, '901 Rice St NW, Atlanta, GA 30318')) 54 | g.add((facility_uri, OCCS.FacilityFunctionCode, '11-13 11 33')) # "Detention Center" (see https://niem.github.io/model/5.0/occs/FacilityFunctionCodeSimpleType/#diagram) 55 | return g 56 | 57 | def _build_docket_subgraph( 58 | g: Graph, 59 | ucid: str, 60 | hearings: List[Dict[str, Any]], 61 | case_uri, 62 | ): 63 | """Convert the list of hearings to a Register-of-Actions style sub-graph.""" 64 | if not hearings: 65 | return 66 | 67 | table_uri = utils._make_generic_uri("DocketTable", f"{ucid}") 68 | g.add((case_uri, J.RegisterOfActions, table_uri)) 69 | g.add((table_uri, RDF.type, J.RegisterOfActions)) 70 | 71 | for idx, hearing in enumerate(hearings): 72 | entry_uri = utils._make_docket_uri(ucid, idx) 73 | g.add((table_uri, J.RegisterAction, entry_uri)) 74 | g.add((entry_uri, RDF.type, J.RegisterAction)) 75 | 76 | # Filing / event date 77 | h_date = hearing.get("hearing_date") 78 | if h_date: 79 | g.add( 80 | ( 81 | entry_uri, 82 | J.RegisterActionDate, 83 | Literal(utils._date_to_xsd(h_date), datatype=XSD.date), 84 | ) 85 | ) 86 | 87 | # Description text (type, result, etc.) 
88 | parts = [hearing.get("hearing_type")] 89 | if hearing.get("result"): 90 | parts.append(f": {hearing['result']}") 91 | if hearing.get("result_type"): 92 | parts.append(f"({hearing['result_type']})") 93 | contents = " ".join([p for p in parts if p]) 94 | if contents: 95 | g.add( 96 | ( 97 | entry_uri, 98 | J.RegisterActionDescriptionText, 99 | Literal(utils._escape_quotes(contents)), 100 | ) 101 | ) 102 | 103 | 104 | def process_json_file(json_path: str): 105 | """Parse a single Fulton-county *charge* JSON and return a list(triples).""" 106 | try: 107 | with open(json_path, "r") as fh: 108 | data = json.load(fh) 109 | except Exception as exc: # pylint: disable=broad-except 110 | logger.error("Error reading %s: %s", json_path, exc) 111 | return None 112 | 113 | g = _create_graph() 114 | 115 | charge_id = data.get("charge_id") 116 | charge_uri = utils._make_generic_uri("Charge", f"ga-fulton-01-{int(charge_id)}") 117 | 118 | g.add((charge_uri, RDF.type, J.Charge)) 119 | desc = data.get("charge_offense_description") 120 | g.add((charge_uri, J.ChargeText, Literal(utils._escape_quotes(desc)))) 121 | 122 | severity = data.get("severity") 123 | g.add((charge_uri, J.ChargeSeverityLevelCode, Literal(severity))) 124 | 125 | # Charge decision / status 126 | # decision = data.get("charge_decision") or {} 127 | # if decision.get("charge_decision"): 128 | # g.add( 129 | # ( 130 | # charge_uri, 131 | # J.ChargeDispositionCategoryText, 132 | # Literal(decision["charge_decision"]), 133 | # ) 134 | # ) 135 | # if decision.get("charge_status"): 136 | # g.add( 137 | # ( 138 | # charge_uri, 139 | # NC.StatusDescriptionText, 140 | # Literal(decision["charge_status"]), 141 | # ) 142 | # ) 143 | # if decision.get("file_date"): 144 | # g.add( 145 | # ( 146 | # charge_uri, 147 | # NC.StartDate, 148 | # Literal( 149 | # utils._date_to_xsd(decision["file_date"]), 150 | # datatype=XSD.date, 151 | # ), 152 | # ) 153 | # ) 154 | # if decision.get("charge_decision_date"): 155 | # g.add( 156 | # ( 157 | # charge_uri, 158 | # NC.EndDate, 159 | # Literal( 160 | # utils._date_to_xsd(decision["charge_decision_date"]), 161 | # datatype=XSD.date, 162 | # ), 163 | # ) 164 | # ) 165 | 166 | case_info = data.get("case") 167 | if case_info: 168 | case_nbr = case_info.get("case_nbr") 169 | ucid = f"ga-fulton-01-{case_nbr}" 170 | case_uri = utils._make_case_uri(ucid) 171 | 172 | g.add((charge_uri, J.ChargeFiledCase, case_uri)) 173 | g.add((case_uri, NC.CaseDocketID, Literal(utils._escape_quotes(case_nbr)))) 174 | g.add((case_uri, RDF.type, NC.CourtCase)) 175 | g.add((case_uri, RDF.type, SCALES.CriminalCase)) 176 | g.add((case_uri, NC.CaseGeneralCategoryText, Literal("criminal"))) 177 | 178 | # Hearings / register of actions 179 | _build_docket_subgraph(g, ucid, case_info.get("hearings", []), case_uri) 180 | 181 | if data.get("bond_type"): 182 | g.add((charge_uri, J.BondType, Literal(data["bond_type"]))) 183 | if data.get("bond_amount"): 184 | try: 185 | amt = float(data["bond_amount"]) 186 | except (TypeError, ValueError): 187 | amt = data["bond_amount"] 188 | g.add((charge_uri, J.BondAmount, Literal(amt, datatype=XSD.float))) 189 | 190 | booking = data.get("booking") 191 | if booking: 192 | booking_uri = utils._make_generic_uri('Booking', f"ga-fulton-{int(booking['jailing_id'])}") 193 | g.add((booking_uri, J.BookingFacility, utils._make_generic_uri('Facility', 'ga-fulton-county-jail'))) 194 | party_uri = _make_party_uri_fulton(charge_id) # we don't create this uri earlier because the booking dict is where the party info resides 
195 | 196 | # Link charge/booking/party 197 | g.add((charge_uri, J.Booking, booking_uri)) 198 | g.add((booking_uri, RDF.type, J.Booking)) 199 | g.add((party_uri, J.PersonCharge, charge_uri)) 200 | g.add((party_uri, RDF.type, J.BookingSubject)) 201 | 202 | # g.add((party_uri, J.ParticipantRoleCategoryText, Literal("defendant"))) # commented this out because not all arrestees become defendants 203 | if booking.get("gender"): 204 | g.add((party_uri, J.PersonSexCode, Literal(booking["gender"]))) 205 | if booking.get("race"): 206 | g.add((party_uri, NC.PersonRaceText, Literal(booking["race"]))) 207 | 208 | if booking.get("booking_date"): 209 | g.add( 210 | ( 211 | booking_uri, 212 | NC.StartDate, 213 | Literal( 214 | utils._date_to_xsd(booking["booking_date"]), datatype=XSD.date 215 | ), 216 | ) 217 | ) 218 | if booking.get("release_date"): 219 | g.add( 220 | ( 221 | booking_uri, 222 | NC.EndDate, 223 | Literal( 224 | utils._date_to_xsd(booking["release_date"]), datatype=XSD.date 225 | ), 226 | ) 227 | ) 228 | 229 | return list(g) 230 | 231 | 232 | def _write_graph_worker(graph: Graph, outdir: Path, file_name=None): 233 | utils._write_graph_to_file(graph, outdir, file_name=file_name) 234 | 235 | 236 | def main(indir: str, outdir: str): 237 | """Read all JSON charge files beneath *indir* and emit Turtle files to *outdir*.""" 238 | indir_p = Path(indir) 239 | outdir_p = Path(outdir) 240 | outdir_p.mkdir(parents=True, exist_ok=True) 241 | 242 | utils._write_graph_to_file(_make_metadata_graph(), outdir, file_name='facility.ttl') 243 | 244 | json_files = [str(f) for f in indir_p.rglob("*.json") if f.is_file()] 245 | logger.info("Discovered %d JSON files in %s", len(json_files), indir) 246 | 247 | record_counter = 0 248 | global_graph = _create_graph() 249 | write_futures = [] 250 | 251 | with ProcessPoolExecutor(max_workers=12) as proc_exec, ThreadPoolExecutor( 252 | max_workers=8 253 | ) as thread_exec: 254 | futures = {proc_exec.submit(process_json_file, jf): jf for jf in json_files} 255 | 256 | with tqdm(total=len(json_files), desc="Processing charges") as pbar: 257 | for future in as_completed(futures): 258 | triples = future.result() 259 | if triples: 260 | for triple in triples: 261 | global_graph.add(triple) 262 | record_counter += 1 263 | 264 | # Flush every 10k records (adjust as needed) 265 | if record_counter >= 10000: 266 | wf = thread_exec.submit( 267 | _write_graph_worker, global_graph, outdir_p 268 | ) 269 | write_futures.append(wf) 270 | record_counter = 0 271 | global_graph = _create_graph() 272 | 273 | pbar.update(1) 274 | 275 | # Final flush 276 | if record_counter: 277 | utils._write_graph_to_file(global_graph, outdir_p) 278 | 279 | # Await parallel writers 280 | for wf in as_completed(write_futures): 281 | try: 282 | wf.result() 283 | except Exception as exc: # pylint: disable=broad-except 284 | logger.error("Error in write operation: %s", exc) 285 | 286 | # entities (added by scott) 287 | utils.process_entities( 288 | (settings.PARTY_DIS_UNIVERSAL,), 289 | outdir, 290 | _make_party_uri_fulton, 291 | ('charge_id',), 292 | filter_funcs={settings.PARTY_DIS_UNIVERSAL: ( 293 | lambda df: df[df.court.eq('ga-fulton')]) 294 | } 295 | ) 296 | 297 | 298 | if __name__ == "__main__": 299 | parser = argparse.ArgumentParser( 300 | description="Parse Fulton-county charge JSON files and emit Turtle graphs", 301 | ) 302 | parser.add_argument("indir", help="Directory containing input JSON files") 303 | parser.add_argument("outdir", help="Directory where TTL files will be written") 304 | args = 
parser.parse_args() 305 | 306 | main(args.indir, args.outdir) 307 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import ast 3 | import time 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | from rdflib import Graph, URIRef 8 | 9 | from constants import SCALES 10 | 11 | manual_offense_mapping = { 12 | "ASSAULT AGGRAVATED": "AGGRAVATED ASSAULT", 13 | "ASSAULT SIMPLE": "SIMPLE ASSAULT", 14 | "INTIMIDATION": "INTIMIDATION", 15 | "DRUG PARAPHERNALIA OFFENSES": "DRUG EQUIPMENT VIOLATIONS", 16 | "EQUIPMENT DRUG": "DRUG EQUIPMENT VIOLATIONS", 17 | "FALSE PRETENSES": "FALSE PRETENSES/SWINDLE/CONFIDENCE GAME", 18 | "SWINDLE": "FALSE PRETENSES/SWINDLE/CONFIDENCE GAME", 19 | "CONFIDENCE GAME": "FALSE PRETENSES/SWINDLE/CONFIDENCE GAME", 20 | "AUTOMATED TELLER MACHINE": "CREDIT CARD/AUTOMATED TELLER MACHINE FRAUD", 21 | "CREDIT CARD FRAUD": "CREDIT CARD/AUTOMATED TELLER MACHINE FRAUD", 22 | "IMPERSONATION": "IMPERSONATION", 23 | "FRAUD WELFARE": "WELFARE FRAUD", 24 | "FRAUD TELEPHONE": "WIRE FRAUD", 25 | "FRAUD IDENTITY THEFT": "IDENTITY THEFT", 26 | "COMPUTER CRIME": "HACKING/COMPUTER INVASION", 27 | "FRAUD HACKING/COMPUTER\nINVASION": "HACKING/COMPUTER INVASION", 28 | "BETTING UNLAWFUL": "BETTING/WAGERING", 29 | "TRANSMITTING WAGERING INFORMATION": "BETTING/WAGERING", 30 | "WAGERING UNLAWFUL": "BETTING/WAGERING", 31 | "GAMBLING PARAPHERNALIA DEVICES EQUIPMENT POSESSION": "GAMBLING EQUIPMENT VIOLATIONS", 32 | "BRIBERY SPORTS": "SPORTS TAMPERING", 33 | "HOMICIDE JUSTIFIABLE": "JUSTIFIABLE HOMICIDE", 34 | "COMMERCIALIZED SEX COMMERCIAL SEX": "HUMAN TRAFFICKING, COMMERCIAL SEX ACTS", 35 | "HUMAN TRAFFICKING\nCOMMERCIAL SEX ACTS": "HUMAN TRAFFICKING, COMMERCIAL SEX ACTS", 36 | "PICKPOCKET": "POCKET-PICKING", 37 | "PURSE-SNATCHING": "PURSE-SNATCHING", 38 | "SHOPLIFTING": "SHOPLIFTING", 39 | "THEFT FROM A BUILDING": "THEFT FROM BUILDING", 40 | "THEFT FROM A COIN-OPERATED\nMACHINE OR DEVICE": "THEFT FROM COIN-OPERATED MACHINE OR DEVICE", 41 | "THEFT FROM A MOTOR VEHICLE": "THEFT FROM MOTOR VEHICLE", 42 | "STRIPPING MOTOR VEHICLE": "THEFT OF MOTOR VEHICLE PARTS OR ACCESSORIES", 43 | "PIMPING": "ASSISTING OR PROMOTING PROSTITUTION", 44 | "TRANSPORTING PERSONS FOR PROSTITUTION": "ASSISTING OR PROMOTING PROSTITUTION", 45 | "FREQUENTING A HOUSE OF\nPROSTITUTION": "PURCHASING PROSTITUTION", 46 | "RAPE": "RAPE", 47 | "SODOMY": "SODOMY", 48 | "SEXUAL ASSAULT WITH AN OBJECT": "SEXUAL ASSAULT WITH AN OBJECT", 49 | "FONDLING": "FONDLING", 50 | "INCEST": "INCEST", 51 | "RAPE STATUTORY": "STATUTORY RAPE", 52 | "EXPLOSIVES": "EXPLOSIVES", 53 | } 54 | 55 | drug_keywords_apd = { 56 | "crack": {"nibrs_code": "A", "nibrs_drug": "Crack Cocaine"}, 57 | "caine": {"nibrs_code": "B", "nibrs_drug": "Cocaine (All forms except crack)"}, 58 | "hash": {"nibrs_code": "C", "nibrs_drug": "Hashish"}, 59 | "roin": {"nibrs_code": "D", "nibrs_drug": "Heroin"}, 60 | "juana": {"nibrs_code": "E", "nibrs_drug": "Marijuana"}, 61 | "morp": {"nibrs_code": "F", "nibrs_drug": "Morphine"}, 62 | # 'opium': {'nibrs_code': 'G', 'nibrs_drug': 'Opium'}, 63 | "narc": {"nibrs_code": "H", "nibrs_drug": "Other Narcotics"}, 64 | "lsd": {"nibrs_code": "I", "nibrs_drug": "LSD"}, 65 | "pcp": {"nibrs_code": "J", "nibrs_drug": "PCP"}, 66 | "halluc": {"nibrs_code": "K", "nibrs_drug": "Other Hallucinogens"}, 67 | "amphe": {"nibrs_code": "L", "nibrs_drug": 
"Amphetamines/Methamphetamines"}, 68 | "stim": {"nibrs_code": "M", "nibrs_drug": "Other Stimulants"}, 69 | "barbit": {"nibrs_code": "N", "nibrs_drug": "Barbiturates"}, 70 | "depress": {"nibrs_code": "O", "nibrs_drug": "Other Depressants"}, 71 | "unknown": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 72 | "drug": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 73 | # 'over 3': 'X' 74 | } 75 | exclusions_apd = () 76 | 77 | drug_keywords_clayton = { 78 | "cocaine": {"nibrs_code": "B", "nibrs_drug": "Cocaine (All forms except crack)"}, 79 | "substance or marijuana": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 80 | "substance/marijuana": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 81 | "marijuana": {"nibrs_code": "E", "nibrs_drug": "Marijuana"}, 82 | "thc": {"nibrs_code": "E", "nibrs_drug": "Marijuana"}, 83 | "ecstacy": {"nibrs_code": "K", "nibrs_drug": "Other Hallucinogens"}, 84 | "amphetamine": {"nibrs_code": "L", "nibrs_drug": "Amphetamines/Methamphetamines"}, 85 | "methaqualone": {"nibrs_code": "O", "nibrs_drug": "Other Depressants"}, 86 | "ephedrine": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 87 | "glue": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 88 | "nitrous": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 89 | "steroid": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 90 | "drug": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 91 | "narcotic": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 92 | "gcsa": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 93 | "substa": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 94 | "medication": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 95 | "morphine, opium, heroin": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 96 | } 97 | exclusions_clayton = ( 98 | "alcohol-drugs", 99 | "drug related object", 100 | "drugs/alcohol or under influence", 101 | "drugs,alcohol", 102 | "drugs, weapons or alcohol", 103 | "dumping", 104 | ) 105 | 106 | 107 | 108 | def process_entities(fpaths, outdir, party_uri_func, fields_needed, filter_funcs={}): 109 | ''' 110 | fpaths: an iterable of filepaths from which to extract entity info 111 | party_uri_func: a function with which to generate a party uri for each dataframe row (not an entity uri, as df.id will be used for these by default) 112 | fields_needed: an iterable of fields that party_uri_func needs (i believe this is more performant than using iterrows) 113 | filters: optionally, a dict that maps each desired filepath to a lambda function that filters a dataframe (e.g. 
to exclude weak keys or select courts in PARTY_DIS_UNIVERSAL) 114 | ''' 115 | g = Graph() 116 | g.bind('scales', SCALES) 117 | 118 | for fpath in fpaths: 119 | df = pd.read_csv(fpath) 120 | filter_func = filter_funcs.get(fpath) 121 | if filter_func: 122 | df = filter_func(df) 123 | if 'id' not in df.columns: 124 | raise Exception(f"process_entities expects {fpath} to contain an 'id' column") 125 | 126 | spids = list(df.id) 127 | field_lists = [list(df[field]) for field in fields_needed] 128 | for i in tqdm(range(len(df)), desc='Processing disambiguated parties'): 129 | values = [lst[i] for lst in field_lists] 130 | g.add((party_uri_func(*values), SCALES.isInstanceOfEntity, _make_generic_uri('PartyEntity', spids[i]))) 131 | 132 | if i and not i%50000: 133 | _write_graph_to_file(g, outdir, infix="entities") 134 | g = Graph() 135 | g.bind('scales', SCALES) 136 | 137 | # TODO merge ids in a more conservative disambiguation file when a more liberal disambiguation file suggests we can 138 | _write_graph_to_file(g, outdir, infix="entities") 139 | 140 | 141 | def parse_drugs(df, charge_col, source, from_cli=False): 142 | results = [] 143 | processed_indices = set() 144 | drug_keywords = {"apd": drug_keywords_apd, "clayton": drug_keywords_clayton}[source] 145 | exclusions = {"apd": exclusions_apd, "clayton": exclusions_clayton}[source] 146 | 147 | for index, row in df.iterrows(): 148 | if index in processed_indices: 149 | continue 150 | arrest_charge = str(row[charge_col]) 151 | for keyword, code in drug_keywords.items(): 152 | if keyword.lower() in arrest_charge.lower() and not any( 153 | x in arrest_charge.lower() for x in exclusions 154 | ): 155 | results.append( 156 | { 157 | "index": index, 158 | charge_col: arrest_charge, 159 | "keyword": keyword, 160 | "nibrs_code": code["nibrs_code"], 161 | "nibrs_drug": code["nibrs_drug"], 162 | } 163 | ) 164 | processed_indices.add(index) 165 | break # exit inner loop once a match is found for this record 166 | 167 | results_df = pd.DataFrame(results) 168 | if from_cli: 169 | results_df.to_csv("apd_drug_arrests.csv", index=False) 170 | print(f"Total arrest records processed: {len(df):,}") 171 | print(f"Total matches found: {len(results_df):,}") 172 | if not results_df.empty: 173 | print("\nNIBRS drugs by match count:\n") 174 | top_drugs = results_df["nibrs_drug"].value_counts().head(10) 175 | for drug, count in top_drugs.items(): 176 | # Find the keyword for this drug 177 | keyword = results_df[results_df["nibrs_drug"] == drug]["keyword"].iloc[ 178 | 0 179 | ] 180 | print(f" {drug}, {keyword}: {count}") 181 | 182 | drug_matches = results_df[results_df["nibrs_drug"] == drug] 183 | top_charges = drug_matches[charge_col].value_counts() 184 | for charge, charge_count in top_charges.items(): 185 | print(f" - {charge}: {charge_count}") 186 | print() 187 | else: 188 | print("No matches found.") 189 | else: 190 | return results_df 191 | 192 | 193 | def _escape_quotes(text): 194 | if text is None: 195 | return None 196 | 197 | text = str(text) 198 | if '"' in text: 199 | return text.replace('"', "'") 200 | return text 201 | 202 | 203 | def _date_to_xsd(date_str): 204 | if not date_str: 205 | return None 206 | 207 | # convert to string and strip whitespace 208 | date_str = str(date_str).strip() 209 | 210 | # 1) Fast-path: leading ISO YYYY-MM-DD (optionally followed by time info) 211 | if len(date_str) >= 10 and date_str[4] == "-" and date_str[7] == "-": 212 | return date_str[:10] 213 | 214 | # 2) Try a list of known patterns via time.strptime 215 | patterns = [ 216 
| "%Y-%m", # '2016-03' 217 | "%m/%d/%Y", # '03/12/2016' 218 | "%d/%m/%Y", # '12/03/2016' (rare) 219 | "%m/%Y", # '03/2016' 220 | ] 221 | 222 | for fmt in patterns: 223 | try: 224 | parsed = time.strptime(date_str, fmt) 225 | # Default missing day/month handled by strptime (defaults to 1) 226 | return time.strftime("%Y-%m-%d", parsed) 227 | except ValueError: 228 | pass 229 | # raise ValueError(f"Invalid date format: {date_str}") 230 | 231 | 232 | def _make_case_uri(ucid): 233 | return URIRef(f"{SCALES}Case/{ucid}") 234 | 235 | def _make_docket_uri(ucid, idx): 236 | return URIRef(f"{SCALES}DocketEntry/{ucid}_de{int(idx)}") 237 | 238 | def _make_charge_uri(ucid, dft_idx, chg_idx): 239 | if type(chg_idx)==str: 240 | chg_idx = re.sub('[ :;,./="]', "", chg_idx) 241 | return URIRef(f"{SCALES}Charge/{ucid}_p{int(dft_idx)}_c{chg_idx}") 242 | 243 | def _make_sentence_uri(ucid, entry_idx, sentence_idx): 244 | return URIRef(f"{SCALES}Sentence/{ucid}_de{int(entry_idx)}_s{int(sentence_idx)}") 245 | 246 | def _make_party_uri(ucid, idx): 247 | return URIRef(f"{SCALES}Party/{ucid}_p{int(idx)}") 248 | 249 | def _make_counsel_uri(ucid, idx): 250 | return URIRef(f"{SCALES}Lawyer/{ucid}_l{int(idx)}") 251 | 252 | def _make_generic_uri(namespace, entity_id): 253 | return URIRef(f"{SCALES}{namespace}/{entity_id}") 254 | 255 | 256 | def _write_graph_to_file(graph, outdir, file_name=None, infix=None): 257 | """Write the current graph to a file with a unique, sortable name.""" 258 | file_name = file_name or f"graph_{infix+'_' if infix else ''}{time.time_ns()}.ttl" 259 | outpath = Path(outdir) / Path(file_name) 260 | outpath.parent.mkdir(parents=True, exist_ok=True) 261 | print(f"Writing TTL to {outpath}") 262 | graph.serialize(destination=str(outpath), format="turtle", encoding="utf-8") 263 | print(f"Wrote TTL to {outpath}") 264 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | A parser that reads HTMLs downloaded from Pacer.gov and breaks them up into JSON format. 3 | 4 | # Usage 5 | To run the parser: 6 | ``` 7 | python parse_pacer.py [OPTIONS] INPUT_DIR 8 | ``` 9 | ### Arguments 10 | - `INPUT_DIR`: Relative path to the folder where HTMLs will be read, e.g. `../../data/pacer/ilnd/html` 11 | 12 | If you are using the parser in conjunction with SCALES's Pacer scraper, you will likely want your input directory to be the scraper-generated `html` folder within your chosen court directory, as outlined [here](../downloader/README.md#directory-structure). Similarly the output and summaries directories will be inferred as the `json` and `summaries` folder within that chosen court directory, but can be overriden by providing values for `output-dir` and `summaries-dir` 13 | 14 | ### Options 15 | - `-o, --output-dir TEXT` *(path)* The folder where the parsed JSONs will be placed into. If none is provided they will placed in `INPUT_DIR/../json/` 16 | - `-s, --summaries-dir TEXT` *(path)* The folder where the scraper will look for accompanying case summaries. the parsed JSONs will be placed into. If none is provided it will deault to `INPUT_DIR/../summaries/`. See more on case summaries [below](#case-summaries) 17 | - `-c, --court TEXT` *(defaults to none)* The standard abbreviation for the district court being parsed, e.g. `ilnd`. 
If not specified, and if using the directory structure mentioned above, the parser will infer the court abbreviation from the parent folder. 18 | - `-d, --debug` *(flag)* Turns off concurrency in the parser. Useful for ensuring that error traces are printed properly. 19 | - `-f, --force-rerun` *(flag)* Tells the parser to process HTMLs even when their corresponding JSONs already exist. Useful for obtaining fresh parses after scraping updates to existing dockets. 20 | - `--force-ucids` *(path)* A path to a .csv file that contains a 'ucid' column. If supplied, the parser will force rerun only on HTMLs that match up with the provided UCIDs (rather than force rerunning on the entire INPATH) 21 | - `-nw, --n-workers INTEGER` *(defaults to 16)* Number of concurrent workers to run simultaneously - i.e., no. of simultaneous parses running. 22 | 
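For example, a typical single-court run over `ilnd` with eight workers, forcing fresh parses of existing dockets (the input path is illustrative):
```
python parse_pacer.py -c ilnd -nw 8 --force-rerun ../../data/pacer/ilnd/html
```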
23 | ### Shell scripts 24 | Two shell scripts, `parse_all.sh` and `parse_subset.sh`, are provided for batch runs across multiple court directories. To run them: 25 | 26 | sh parse_all.sh INPATH [OPTIONS] 27 | sh parse_subset.sh INPATH -s STARTDIR -e ENDDIR [OPTIONS] 28 | 29 | where `INPATH` is the relative path to a parent folder containing multiple court directories, `STARTDIR` and `ENDDIR` define the inclusive alphabetical range of court directories to parse (e.g. `nyed` through `nywd`), and `OPTIONS` are any command-line options you would like to pass through to `parse_pacer.py` (e.g. `--debug`, `--force-rerun`, `--n-workers`). 30 | 31 | *Note: each court directory in the batch must include an HTML folder for input and a JSON folder for output, as is true in the scraper-generated directory structure.* 32 | 33 | 34 | 35 | 36 | # JSON Schema 37 | The following fields are inferred from the filepath: 38 | - `case_id` *(string)* - Pacer's case ID, which has the form O:YY-TY-##### (where O is a court office code, YY is a year, TY is the case type, and ##### is a numeric identifier associated with this case) 39 | - `case_type` *(string)* - usually 'cr' (criminal) or 'cv' (civil); other types are acceptable ('mc', 'bk'...), but they will result in an incomplete parse 40 | - `download_court` *(string)* - read from the command line if passed in with the `-c` option 41 | - `ucid` *(string)* - SCALES's case ID (stands for 'unique case id'), generated by prepending the court abbreviation to the Pacer case ID and used to ensure that cases with identical Pacer IDs from different districts can be distinguished from one another 42 | 43 | The following fields are pulled from the header of the Pacer docket: 44 | - `header_case_id` *(string)* - similar to `case_id`, but pulled from the docket itself rather than the filepath 45 | - `case_name` *(string)* 46 | - `filing_date` *(string)* 47 | - `terminating_date` *(string)* 48 | - `case_status` *(string)* - 'closed' if a terminating date is listed, else 'open' 49 | - `judge` *(string)* 50 | - `referred_judge` *(string)* - only present when the case was referred to a second judge 51 | - `nature_suit` *(string)* - civil cases only 52 | - `jury_demand` *(string)* - civil cases only 53 | - `cause` *(string)* - civil cases only 54 | - `jurisdiction` *(string)* - civil cases only 55 | - `monetary_demand` *(string)* - civil cases only 56 | - `lead_case_id` *(string)* - only present when the case is part of multi-district litigation (MDL) 57 | - `other_court` *(string)* - only present when another case ID is provided by Pacer as 'Case in other court'; doesn't pick up all alternate case IDs (e.g. appeals court case numbers) 58 | - `case_flags` *(list of strings)* - only present when there are flags listed in the upper right corner of the Pacer docket 59 | - `mdl_code` *(integer)* 60 | 61 | The following fields are pulled from the body of the Pacer docket: 62 | - `plaintiffs`, `defendants`, `bankruptcy_parties`, `other_parties`, `misc_participants` *(dictionary)* - each key is the name of a participant in the case, and each value is a dictionary with the following structure: 63 | - `counsel` *(dictionary)* - each key is the name of a lawyer representing this participant, and each value is a dictionary with the following structure: 64 | - `office` *(string)* 65 | - `is_lead_attorney` *(boolean)* 66 | - `is_pro_hac_vice` *(boolean)* 67 | - `additional_info` *(dictionary)* - keys vary according to the information in the docket ('Designation,' 'Bar Status,' etc.) 68 | - `is_pro_se` *(boolean)* 69 | - `roles` *(list of strings)* - 'Plaintiff,' 'Petitioner,' 'Movant,' etc. 70 | - `pending_counts`, `terminated_counts` *(dictionary)* - criminal cases only; each key is the name of a party who was charged with a criminal count, and each value is a list in which each element has the following dictionary structure: 71 | - `counts` *(string)* 72 | - `disposition` *(string)* 73 | - `complaints` *(dictionary)* - certain criminal cases only; each key is the name of a party who was charged with a criminal count, and each value is the statute(s) specified as the basis of the charges 74 | - `docket_available` *(boolean)* 75 | - `docket` *(list of dictionaries)* - contains one item per docket entry, structured as follows: 76 | - `date_filed` *(string)* 77 | - `ind` *(string)* - Pacer's numerical index for this entry (can be an empty string, as not all Pacer entries are numbered) 78 | - `docket_text` *(string)* 79 | - `documents` *(dictionary)* - each key is either a non-zero attachment number or '0' for the main document, and each value is a dictionary with the following structure: 80 | - `url` *(string)* - the Pacer URL for this document 81 | - `span` *(dictionary)* - the starting and ending indices (within `docket_text`) of the hyperlink to the document, formatted as a dictionary with keys `start` and `end` 82 | - `edges` *(list of tuples)* - each element is a three-value tuple (encoded in graph-edge format) representing a hyperlink between two docket entries, with the first value encoding the index of the source entry within `docket`, the second value encoding the index of the target entry, and the third value encoding the starting and ending indices of the hyperlink within `docket_text` (as specified in `span` above)
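As an illustration (all values below are invented), a single `docket` entry whose text links to its main document, one attachment, and an earlier entry might look like:
```
{
  "date_filed": "06/29/2007",
  "ind": "12",
  "docket_text": "MOTION to dismiss by John Doe (Attachments: # 1 Exhibit A) (re: 8)",
  "documents": {
    "0": {"url": "https://ecf.ilnd.uscourts.gov/doc1/...", "span": {"start": 0, "end": 6}},
    "1": {"url": "https://ecf.ilnd.uscourts.gov/doc1/...", "span": {"start": 44, "end": 47}}
  },
  "edges": [[11, 7, {"start": 64, "end": 65}]]
}
```
Here the entry is assumed to sit at index 11 of `docket`, and the '8' in the text is assumed to link to the entry at index 7.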
83 | 84 | The following fields are not pulled directly from the Pacer docket, and are primarily meant for internal use: 85 | - `mdl_id_source` *(string)* - the origin of `mdl_code` (either 'lead_case_id' or 'flags') 86 | - `is_mdl` *(boolean)* - true if `mdl_code` is non-null or if the case has any MDL flags 87 | - `is_multi` *(boolean)* - true if `is_mdl` is true or if any of `lead_case_id`, `member_case_key`, or `other_court` is non-null 88 | - `member_case_key` *(string)* - a UCID-formatted version of `lead_case_id` (or a copy of `ucid` if this case is a lead case); used to write MDL-related data to an external file for improved performance 89 | - `source` *(string)* - used to distinguish between JSONs from this parser and similarly-formatted JSONs from other sources; if generated by this parser, will always be 'pacer' 90 | - `download_url` *(string)* - the URL from which this HTML was downloaded; only present if parsing an HTML from the SCALES scraper 91 | 92 | The following fields are pulled from the 'Transaction Receipt' at the bottom of the Pacer docket: 93 | - `billable_pages` *(integer)* 94 | - `cost` *(float)* 95 | - `download_timestamp` *(string)* 96 | - `n_docket_reports` *(integer)* - the number of times the SCALES scraper has modified this docket (1 if there have never been updates, >1 if new docket entries have been added after the initial download) 97 | - `pacer_case_id` *(integer)* - the unique numerical ID that Pacer uses internally to identify this document (pulled from Pacer's XML responses to user queries; not visible on the docket sheet itself) 98 | 99 | Case summaries: 100 | - `summary` *(object)* - case summary information, fully documented below 101 | 102 | ## Case summaries 103 | Case summaries can be downloaded through the SCALES scraper. They provide some additional information that is not available in the case docket reports. By default the scraper will place any downloaded summaries in the `/summaries` sub-directory of a given court directory. 104 | 105 | When the parser runs, it will also parse any summaries associated with a case, searching for the summary html files in the summaries sub-directory (which can be manually specified with the `--summaries-dir` option). 106 | 
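Summaries can also be parsed programmatically via the pipeline in `parse_summary.py`; a minimal sketch (the summary path is illustrative, and the `support` imports must resolve the same way they do in that script):
```
from pathlib import Path

from parse_summary import SummaryPipeline

html = Path('summaries/1-07-cv-00431.html').read_text()  # illustrative path
# The pipeline maps (raw html, {}) -> (soup, dict of extracted summary fields)
data, summary = SummaryPipeline.process(html, {})
print(summary['case_id'], summary['date_filed'])
```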
107 | The schemas for civil and criminal cases are slightly different, because PACER presents the data in different ways. The main difference is that for criminal cases, each defendant has its own list of plaintiffs, whereas for civil cases there is a single list of all parties in the case (including both plaintiffs and defendants). 108 | 
109 | ### Criminal Schema 110 | - `case_id` (*string*) - the case id e.g. '1:16-cv-00001, All defendants' 111 | - `case_name` (*string*) - e.g. 'USA v. Johnson et al.' 112 | - `date_filed` (*string*) - the case filing date 113 | - `date_terminated` (*string*) - the case terminating date 114 | - `date_of_last_filing` (*string*) - the date of last filing in the case 115 | - `presiding` (*string*) - presiding judge, if any 116 | - `referral` (*string*) - referred judge, if any 117 | - `billable_pages` (*int*) - no. of billable pages (usually just 1) 118 | - `cost` (*float*) - the cost of downloading the case summary (usually 0.10) 119 | - `download_timestamp` (*string*) - the time the case summary was downloaded 120 | - `defendants` (*list of objects*) - a list of defendants in the case. For each defendant there is: 121 | - `plaintiffs` (*list of objects*) - objects containing `role`, `represented_by` and contact fields `fax`, `email` and `phone`. Note: often non-fax-related things end up in the `fax` field e.g. 'US Govt Attorney' 122 | - `name` (*string*) - defendant name 123 | - `ind` (*string*) - defendant index within the case (*should* link back to the docket report) 124 | - `office` (*string*) - the court office 125 | - `county` (*string*) - the court county 126 | - `filed` (*string*) - defendant-specific filing date 127 | - `terminated` (*string*) - defendant-specific termination date 128 | - `reopened` (*string*) - defendant-specific reopening date 129 | - `other_court_case` (*string*) - other associated cases 130 | - `defendant_custody_status` (*string*) - 131 | - `flags` (*list of strings*) - pacer flags that applied to the defendant e.g. ['CLOSED', 'PRO_SE'] 132 | - `pending_status` (*string*) - 133 | - `magistrate_case` (*string*) - previous magistrate case, if any 134 | - `counts` (*list of objects*) - containing `count` (the count reference e.g. '1sss'), `citation`, `offense_level` and `text` (the text associated with the count) 135 | - `complaints` (*list of objects*) - containing `citation`, `offense_level` and `text` (the text associated with the complaint) 136 | 137 | 
138 | ### Civil Schema 139 | - `case_id` (*string*) - the case id e.g. '1:16-cv-00001, All defendants' 140 | - `case_name` (*string*) - e.g. 'USA v. Johnson et al.' 141 | - `date_filed` (*string*) - the case filing date 142 | - `date_terminated` (*string*) - the case terminating date 143 | - `date_of_last_filing` (*string*) - the date of last filing in the case 144 | - `presiding` (*string*) - presiding judge, if any 145 | - `referral` (*string*) - referred judge, if any 146 | - `billable_pages` (*int*) - no. of billable pages (usually just 1) 147 | - `cost` (*float*) - the cost of downloading the case summary (usually 0.10) 148 | - `download_timestamp` (*string*) - the time the case summary was downloaded 149 | - `parties` (*list of objects*) - a list of parties in the case. For each party there is: 150 | - `role` (*string*) - their role in the case e.g. 'Plaintiff', 'Defendant' 151 | - `name` (*string*) - party name 152 | - `represented_by` (*string*) - name of party's representation 153 | - `fax` (*string*) - contact fax no.; note: often non-fax-related things end up in the `fax` field e.g. 'Pro Hac Vice', 'MDL' 154 | - `email` (*string*) - contact email address 155 | - `phone` (*string*) - contact phone no. 156 | 
-------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/parse_summary.py: -------------------------------------------------------------------------------- 
1 | import re 2 | import sys 3 | from pathlib import Path 4 | 
5 | from bs4 import BeautifulSoup 6 | 
7 | sys.path.append(str(Path.cwd().resolve().parents[1])) 8 | from support import data_tools as dtools 9 | from support import fhandle_tools as ftools 10 | 
11 | # Patterns 12 | RE_DEF = '^(?P<name>[\s\S]+?)\s* \((?P<ind>\S+)\)$' 13 | CASE_META_KEYS = ('case_id', 'case_name', 'presiding', 'referral', 'date_filed', 'date_terminated', 'date_of_last_filing') 14 | 15 | 
16 | class Pipeline: 17 | ''' A simple pipeline structure for parsing''' 18 | def __init__(self, pipes): 19 | ''' 20 | Inputs: 21 | - pipes (list): a list of functions that map (data,extracted) -> (data,extracted) 22 | ''' 23 | self.pipes = pipes 24 | 
25 | def process(self, data, extracted): 26 | for fn in self.pipes: 27 | data, extracted = fn(data, extracted) 28 | 
29 | return data, extracted 30 | 
31 | def scrub_bad_tags(data, extracted): 32 | ''' 33 | Remove the tags that are breaking the parsing because of: 34 | 1. Illegal nesting e.g. <u><b>text</u></b> 35 | 2. Unclosed <br> tags 36 | Inputs: 37 | - data (str): raw html string from summary page html 38 | - extracted (dict) 39 | Outputs: 40 | (data,extracted) as above 41 | ''' 42 | 
43 | pat = '<u>|</u>|<b>|</b>|<br>|<br/>
' 44 | data = re.sub(pat,'', data, flags=re.I) 45 | 46 | return data, extracted 47 | 48 | def extract_header(data, extracted): 49 | ''' Extract the header 50 | Inputs: 51 | - data (bs4 object): the page soup 52 | - extracted (dict): 53 | Outputs: 54 | (data, extracted) as above 55 | ''' 56 | soup = data 57 | header = soup.select_one('#cmecfMainContent center') 58 | header_vals = [] 59 | 60 | for el in header.contents: 61 | try: 62 | val = el.text 63 | except: 64 | val = el 65 | val=val.strip() 66 | if val: 67 | header_vals.append(val) 68 | 69 | header_data = { 70 | 'case_id': header_vals[0], 71 | 'case_name': header_vals[1] 72 | } 73 | pairs_start = 2 74 | 75 | if 'presiding' in header_vals[pairs_start]: 76 | header_data['presiding'] = header_vals[pairs_start] 77 | pairs_start += 1 78 | 79 | if 'referr' in header_vals[pairs_start]: 80 | header_data['referral'] = header_vals[pairs_start] 81 | pairs_start += 1 82 | 83 | # Add pairs 84 | for i in range(pairs_start,len(header_vals),2): 85 | try: 86 | key_name = "_".join(header_vals[i].rstrip(':').lower().split()) 87 | header_data[key_name] = header_vals[i+1] 88 | except IndexError: 89 | print('Something unseen in header metadata') 90 | 91 | extracted.update(header_data) 92 | 93 | return data, extracted 94 | 95 | def extract_cell(tag, as_tuple=False): 96 | ''' 97 | Extract metadata from a table cell tag 98 | 99 | Inputs: 100 | - tag (bs4 element): a table cell (td), though if extracting a table will be a tr (see below) 101 | 102 | Outputs: 103 | - (dict/tuple) dict with a single mapping {key:val} if it's a regular cell, or else a dict with 104 | multiple keys and values if the cell is itself a table with multiple fields/rows. Unless as_tuple 105 | is True, then a single (key,val) tuple is returned 106 | ''' 107 | # Recursively deals with cell if it contains a table 108 | if tag.select('table'): 109 | table_data = {} 110 | for tr in tag.select('tr'): 111 | table_data.update( extract_cell(tr)) 112 | return table_data 113 | else: 114 | 115 | key, val = tag.text.split(':',1) 116 | key = '_'.join(key.lower().strip().split()) 117 | val = val.strip().replace(' ',' ') 118 | return {key:val} if not as_tuple else (key,val) 119 | 120 | def grab_parties_table(table): 121 | ''' 122 | Grab info from tables that have rows that look like the following: 123 | 124 | Plaintiff/Defendant: represented by Phone/Fax/email... 125 | 126 | For criminal cases this will be the Plaintiff table (one for each defendant). 127 | For civil cases this will correspond to the entire party table. 128 | 129 | Inputs: 130 | - table (bs4 tag): the tag corresponding to the table to parse 131 | Outputs: 132 | - parties (dict): the parties info from the table 133 | ''' 134 | parties = [] 135 | 136 | rows = [ch for ch in table.children if ch.name=='tr'] 137 | if not len(rows): 138 | tbody = table.select_one('tbody') 139 | 140 | # If there is literally nothing between the
<table> tags 141 | if not tbody: 142 | return parties 143 | else: 144 | rows = [ch for ch in table.select_one('tbody').children if ch.name=='tr'] 145 | 
146 | for i, tr in enumerate(rows): 147 | party = {} 148 | 
149 | cells = [ch for ch in tr.children if ch.name=='td'] 150 | 
151 | # Get party role and name first 152 | role_and_name = cells[0] 153 | role, name = extract_cell(role_and_name, as_tuple=True) 154 | party['role'] = role 155 | party['name'] = name 156 | 
157 | # If no representation info 158 | if len(cells) < 3: 159 | party['represented_by'] = None 160 | 
161 | else: 162 | if cells[1].text.strip() == 'represented by': 163 | party['represented_by'] = cells[2].text.strip() 164 | 
165 | contact = cells[3] 166 | party.update( extract_cell(contact) ) 167 | 
168 | parties.append(party) 169 | 
170 | return parties 171 | 
172 | def get_civil_parties(data, extracted): 173 | ''' 174 | Method to extract main data from summary for civil cases 175 | 
176 | Inputs: 177 | - data (bs4 object): the soup 178 | - extracted (dict): the case extracted data 179 | 
180 | Outputs: 181 | (data, extracted) as above 182 | 
183 | ''' 184 | main_tables_cv = data.select('#cmecfMainContent > table') 185 | 
186 | if len(main_tables_cv) < 2: 187 | raise ValueError 188 | 
189 | case_data = {} 190 | 
191 | meta_table = main_tables_cv[0] 192 | parties_table = main_tables_cv[1] 193 | 
194 | for i, tr in enumerate(ch for ch in meta_table.select('tr') if ch.name=='tr'): 195 | 196 | 
197 | tr_text = tr.text.strip() 198 | 
199 | # Skip blank lines 200 | if not tr_text: 201 | continue 202 | 
203 | else: 204 | key=None 205 | for child in tr.children: 206 | if child.name=='td': 207 | 
208 | # Check if a value present with no key 209 | if not child.select('b') and key is not None: 210 | # Use key from previous iteration: 211 | extracted[key] = child.text 212 | else: 213 | 214 | 
215 | key,val = extract_cell(child, as_tuple=True) 216 | 
217 | extracted[key] = val 218 | 219 | 
220 | # PARTIES 221 | parties = grab_parties_table(parties_table) 222 | extracted['parties'] = parties 223 | 
224 | return data, extracted 225 | 
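# For reference (illustrative input): the defendant header rows parsed below look
# like 'Smith, John (1)'; RE_DEF captures name='Smith, John' and ind='1' from them.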
226 | def get_criminal_def_pla(data, extracted): 227 | ''' 228 | Method to extract main data from summary for criminal cases 229 | 
230 | Inputs: 231 | - data (bs4 object): the soup 232 | - extracted (dict): the case extracted data 233 | 
234 | Outputs: 235 | (data, extracted) as above 236 | 
237 | ''' 238 | 
239 | extracted['defendants'] = [] 240 | 
241 | main_tables = data.select('#cmecfMainContent > table') 242 | 243 | 
244 | if not (len(main_tables) % 2 == 0): 245 | raise ValueError('Imbalanced number of plaintiff/defendant tables') 246 | 
247 | # Iterate over the tables in pairs (defendant info, list of plaintiffs) 248 | for def_ord in range(0, len(main_tables), 2): 249 | 
250 | defendant_table = main_tables[def_ord] 251 | plaintiff_table = main_tables[def_ord+1] 252 | 
253 | defendant = {'counts': [], 'complaints':[], 'plaintiffs':[], } 254 | count_instance, cmplt_instance = None, None 255 | 
256 | for i, tr in enumerate(defendant_table.select('tr')): 257 | 
258 | tr_text = tr.text.strip() 259 | 
260 | # Skip blank lines 261 | if not tr_text: 262 | continue 263 | 
264 | # First row, grab defendant name 265 | elif i==0: 266 | def_text = tr_text 267 | def_match = re.match(RE_DEF, def_text) 268 | match_dict = def_match.groupdict() if def_match else {} 269 | defendant['name'] = (match_dict.get('name') or def_text).strip().replace(' ',' ') 270 | defendant['ind'] = (match_dict.get('ind') or '').strip() 271 | continue 272 | 
273 | # New count row 274 | elif tr_text.startswith('Count:'): 275 | count_instance = {} 276 | for td in tr.select('td'): 277 | count_instance.update( extract_cell(td) ) 278 | 
279 | # If previous line was a new count instance, grab the count text from this line 280 | elif count_instance: 281 | count_instance['text'] = tr_text 282 | defendant['counts'].append(count_instance.copy()) 283 | # Reset count instance 284 | count_instance = None 285 | 
286 | # New complaint row 287 | elif tr_text.startswith('Complaint'): 288 | # Start a new complaint instance 289 | cmplt_instance = {} 290 | 
291 | # Skip the first cell (the 'Complaint' cell, not a k:v pair) 292 | for td in tr.select('td')[1:]: 293 | cmplt_instance.update( extract_cell(td) ) 294 | 
295 | # If previous line was a new complaint instance, grab the complaint text from this line 296 | elif cmplt_instance: 297 | cmplt_instance['text'] = tr_text 298 | defendant['complaints'].append(cmplt_instance.copy()) 299 | # Reset complaint instance 300 | cmplt_instance = None 301 | 
302 | # Magistrate info is split over multiple tds, so just pass the whole row 303 | elif tr_text.startswith('Magistrate'): 304 | defendant.update( extract_cell(tr) ) 305 | 306 | 
307 | # Otherwise it's general data about the defendant's case, grab it 308 | else: 309 | for child in tr.children: 310 | if child.name=='td': 311 | try: 312 | defendant.update( extract_cell(child) ) 313 | except: 314 | if child.text.strip().lower().startswith('complaint'): 315 | defendant.update({'complaint':None}) 316 | 317 | 
318 | # flag/flags 319 | if 'flag' in defendant: 320 | defendant['flags'] = defendant['flag'] 321 | del defendant['flag'] 322 | defendant['flags'] = (defendant.get('flags') or '').split(',') 323 | 
324 | # other court case/cases 325 | if 'other_court_cases' in defendant: 326 | defendant['other_court_case'] = defendant['other_court_cases'] 327 | del defendant['other_court_cases'] 328 | 
329 | # PLAINTIFFS 330 | plaintiffs = grab_parties_table(plaintiff_table) 331 | defendant['plaintiffs'] = plaintiffs 332 | 
333 | extracted['defendants'].append(defendant) 334 | 
335 | return data, extracted 336 | 
337 | def get_main_data(data, extracted): 338 | ''' Get the main data from the summary, dispatching to the civil or criminal parsing function''' 339 | 
340 | case_type = ftools.decompose_caseno(extracted['case_id'])['case_type'] 341 | 
342 | if case_type == 'cv': 343 | data, extracted = get_civil_parties(data,extracted) 344 | 
345 | elif case_type == 'cr': 346 | data, extracted = get_criminal_def_pla(data, extracted) 347 | 
348 | else: 349 | raise ValueError('Only know how to parse cv and cr cases') 350 | 
351 | return data, extracted 352 | 
353 | def ensure_keys(data, extracted): 354 | ''' Guarantee key existence for all case meta keys, even if they weren't found''' 355 | 
356 | for k in CASE_META_KEYS: 357 | extracted[k] = extracted.get(k,'') 358 | 
359 | return data, extracted 360 | 
361 | def get_summary_transaction_data(data, extracted): 362 | ''' Get the transaction data for the summary''' 363 | transaction_data = ftools.parse_transaction_history(str(data)) 364 | extracted['billable_pages'] = int(transaction_data['billable_pages']) if 'billable_pages' in transaction_data.keys() else None 365 | extracted['cost'] = float(transaction_data['cost']) if 'cost' in transaction_data.keys() else None 366 | extracted['download_timestamp'] = transaction_data.get('timestamp','') 367 | 
368 | return data, extracted 369 | 
370 | # This is the complete summary pipeline 371 | # Use the inherited Pipeline.process method to process data 372 | SummaryPipeline = 
Pipeline([ 373 | scrub_bad_tags, 374 | lambda d,e: (BeautifulSoup(d,'html.parser'), e) , 375 | extract_header, 376 | get_main_data, 377 | ensure_keys, 378 | get_summary_transaction_data 379 | ]) -------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/district_courts.csv: -------------------------------------------------------------------------------- 1 | abbreviation,name,circuit,citation_abbreviation,state,cardinal,courtname,homepage,start_date,end_date,count,jurisdiction 2 | dcd,"District Court, District of Columbia",District of Columbia,D.D.C.,District of Columbia,,district-of-columbia,http://www.dcd.uscou…,Unknown,Unknown,30822.0,Federal District 3 | ald,"District Court, D. Alabama",,D. Ala.,Alabama,,alabama,,1820-04-21,1824-03-10,0.0,Federal District 4 | almd,"District Court, M.D. Alabama",Eleventh,M.D. Ala.,Alabama,Middle,middle-alabama,http://www.almd.usco…,Unknown,Unknown,2266.0,Federal District 5 | alnd,"District Court, N.D. Alabama",Eleventh,N.D. Ala.,Alabama,Northern,northern-alabama,http://www.alnd.usco…,Unknown,Unknown,1247.0,Federal District 6 | alsd,"District Court, S.D. Alabama",Eleventh,S.D. Ala.,Alabama,Southern,southern-alabama,http://www.als.uscou…,Unknown,Unknown,724.0,Federal District 7 | akd,"District Court, D. Alaska",Ninth,D. Alaska,Alaska,,alaska,http://www.akd.uscou…,Unknown,Unknown,444.0,Federal District 8 | azd,"District Court, D. Arizona",Ninth,D. Ariz.,Arizona,,arizona,http://www.azd.uscou…,Unknown,Unknown,1286.0,Federal District 9 | ared,"District Court, E.D. Arkansas",Eighth,E.D. Ark.,Arkansas,Eastern,eastern-arkansas,http://www.are.uscou…,Unknown,Unknown,1190.0,Federal District 10 | arwd,"District Court, W.D. Arkansas",Eighth,W.D. Ark.,Arkansas,Western,western-arkansas,http://www.arwd.usco…,Unknown,Unknown,810.0,Federal District 11 | cacd,"District Court, C.D. California",Ninth,C.D. Cal.,California,Central,central-california,http://www.cacd.usco…,Unknown,Unknown,3246.0,Federal District 12 | caed,"District Court, E.D. California",Ninth,E.D. Cal.,California,Eastern,eastern-california,http://www.caed.usco…,Unknown,Unknown,1258.0,Federal District 13 | cand,"District Court, N.D. California",Ninth,N.D. Cal.,California,Northern,northern-california,http://www.cand.usco…,Unknown,Unknown,4669.0,Federal District 14 | casd,"District Court, S.D. California",Ninth,S.D. Cal.,California,Southern,southern-california,http://www.casd.usco…,Unknown,Unknown,1725.0,Federal District 15 | cod,"District Court, D. Colorado",Tenth,D. Colo.,Colorado,,colorado,http://www.cod.uscou…,Unknown,Unknown,3237.0,Federal District 16 | ctd,"District Court, D. Connecticut",Second,D. Conn.,Connecticut,,connecticut,http://www.ctd.uscou…,Unknown,Unknown,4495.0,Federal District 17 | ded,"District Court, D. Delaware",Third,D. Del.,Delaware,,delaware,http://www.ded.uscou…,Unknown,Unknown,3748.0,Federal District 18 | flmd,"District Court, M.D. Florida",Eleventh,M.D. Fla.,Florida,Middle,middle-florida,http://www.flmd.usco…,Unknown,Unknown,2998.0,Federal District 19 | flnd,"District Court, N.D. Florida",Eleventh,N.D. Fla.,Florida,Northern,northern-florida,http://www.flnd.usco…,Unknown,Unknown,536.0,Federal District 20 | flsd,"District Court, S.D. Florida",Eleventh,S.D. Fla.,Florida,Southern,southern-florida,http://www.flsd.usco…,Unknown,Unknown,4193.0,Federal District 21 | gamd,"District Court, M.D. Georgia",Eleventh,M.D. Ga.,Georgia,Middle,middle-georgia,http://www.gamd.usco…,Unknown,Unknown,837.0,Federal District 22 | gand,"District Court, N.D. 
Georgia",Eleventh,N.D. Ga.,Georgia,Northern,northern-georgia,http://www.gand.usco…,Unknown,Unknown,2814.0,Federal District 23 | gasd,"District Court, S.D. Georgia",Eleventh,S.D. Ga.,Georgia,Southern,southern-georgia,http://www.gasd.usco…,Unknown,Unknown,771.0,Federal District 24 | hid,"District Court, D. Hawaii",Ninth,D. Haw.,Hawaii,,hawaii,http://www.hid.uscou…,Unknown,Unknown,1321.0,Federal District 25 | idd,"District Court, D. Idaho",Ninth,D. Idaho,Idaho,,idaho,http://www.id.uscour…,Unknown,Unknown,430.0,Federal District 26 | ilcd,"District Court, C.D. Illinois",Seventh,C.D. Ill.,Illinois,Central,central-illinois,http://www.ilcd.usco…,Unknown,Unknown,1029.0,Federal District 27 | ilnd,"District Court, N.D. Illinois",Seventh,N.D. Ill.,Illinois,Northern,northern-illinois,http://www.ilnd.usco…,Unknown,Unknown,12460.0,Federal District 28 | ilsd,"District Court, S.D. Illinois",Seventh,S.D. Ill.,Illinois,Southern,southern-illinois,http://www.ilsd.usco…,Unknown,Unknown,505.0,Federal District 29 | innd,"District Court, N.D. Indiana",Seventh,N.D. Ind.,Indiana,Northern,northern-indiana,http://www.innd.usco…,4/21/2028,Unknown,1829.0,Federal District 30 | insd,"District Court, S.D. Indiana",Seventh,S.D. Ind.,Indiana,Southern,southern-indiana,http://www.insd.usco…,4/21/2028,Unknown,1387.0,Federal District 31 | indianad,"District Court, D. Indiana",,D. Ind.,Indiana,,indiana,http://www.insd.usco…,1817-03-03,4/21/2028,3.0,Federal District 32 | iand,"District Court, N.D. Iowa",Eighth,N.D. Iowa,Iowa,Northern,northern-iowa,http://www.iand.usco…,Unknown,Unknown,1230.0,Federal District 33 | iasd,"District Court, S.D. Iowa",Eighth,S.D. Iowa,Iowa,Southern,southern-iowa,http://www.iasd.usco…,Unknown,Unknown,1165.0,Federal District 34 | ksd,"District Court, D. Kansas",Tenth,D. Kan.,Kansas,,kansas,http://www.ksd.uscou…,Unknown,Unknown,4748.0,Federal District 35 | kyed,"District Court, E.D. Kentucky",Sixth,E.D. Ky.,Kentucky,Eastern,eastern-kentucky,http://www.kyed.usco…,Unknown,Unknown,863.0,Federal District 36 | kywd,"District Court, W.D. Kentucky",Sixth,W.D. Ky.,Kentucky,Western,western-kentucky,http://www.kywd.usco…,Unknown,Unknown,1011.0,Federal District 37 | laed,"District Court, E.D. Louisiana.",Fifth,E.D. La.,Louisiana,Eastern,eastern-louisiana,http://www.laed.usco…,Unknown,Unknown,2763.0,Federal District 38 | lamd,"District Court, M.D. Louisiana",Fifth,M.D. La.,Louisiana,Middle,middle-louisiana,http://www.lamd.usco…,Unknown,Unknown,706.0,Federal District 39 | lawd,"District Court, W.D. Louisiana",Fifth,W.D. La.,Louisiana,Western,western-louisiana,http://www.lawd.usco…,Unknown,Unknown,1223.0,Federal District 40 | med,"District Court, D. Maine",First,D. Me.,Maine,,maine,http://www.med.uscou…,Unknown,Unknown,2505.0,Federal District 41 | mdd,"District Court, D. Maryland",Fourth,D. Maryland,Maryland,,maryland,https://www.mdd.usco…,Unknown,Unknown,5001.0,Federal District 42 | mad,"District Court, D. Massachusetts",First,D. Mass.,Massachusetts,,massachusetts,http://www.mad.uscou…,Unknown,Unknown,7532.0,Federal District 43 | mied,"District Court, E.D. Michigan",Sixth,E.D. Mich.,Michigan,Eastern,eastern-michigan,http://www.mied.usco…,Unknown,Unknown,5436.0,Federal District 44 | miwd,"District Court, W.D. Michigan",Sixth,W.D. Mich.,Michigan,Western,western-michigan,http://www.miwd.usco…,Unknown,Unknown,1696.0,Federal District 45 | mnd,"District Court, D. Minnesota",Eighth,D. Minnesota,Minnesota,,minnesota,http://www.mnd.uscou…,Unknown,Unknown,3249.0,Federal District 46 | msnd,"District Court, N.D. 
Mississippi",Fifth,N.D. Miss.,Mississippi,Northern,northern-mississippi,http://www.msnd.usco…,Unknown,Unknown,942.0,Federal District 47 | mssd,"District Court, S.D. Mississippi",Fifth,S.D. Miss.,Mississippi,Southern,southern-mississippi,http://www.mssd.usco…,Unknown,Unknown,1696.0,Federal District 48 | moed,"District Court, E.D. Missouri",Eighth,E.D. Mo.,Missouri,Eastern,eastern-missouri,http://www.moed.usco…,Unknown,Unknown,2632.0,Federal District 49 | mowd,"District Court, W.D. Missouri",Eighth,W.D. Mo.,Missouri,Western,western-missouri,http://www.mow.uscou…,Unknown,Unknown,1957.0,Federal District 50 | mtd,"District Court, D. Montana",Ninth,D. Mont.,Montana,,montana,http://www.mtd.uscou…,Unknown,Unknown,816.0,Federal District 51 | ned,"District Court, D. Nebraska",Eighth,D. Neb.,Nebraska,,nebraska,http://www.ned.uscou…,Unknown,Unknown,1165.0,Federal District 52 | nvd,"District Court, D. Nevada",Ninth,D. Nev.,Nevada,,nevada,http://www.nvd.uscou…,Unknown,Unknown,1221.0,Federal District 53 | nhd,"District Court, D. New Hampshire",First,D.N.H.,New Hampshire,,new-hampshire,http://www.nhd.uscou…,Unknown,Unknown,912.0,Federal District 54 | njd,"District Court, D. New Jersey",Third,D.N.J.,New Jersey,,new-jersey,http://www.njd.uscou…,1789-09-24,Unknown,4865.0,Federal District 55 | nmd,"District Court, D. New Mexico",Tenth,D.N.M.,New Mexico,,new-mexico,http://www.nmcourt.f…,Unknown,Unknown,1084.0,Federal District 56 | nyed,"District Court, E.D. New York",Second,E.D.N.Y,New York,Eastern,eastern-new-york,http://www.nyed.usco…,Unknown,Unknown,7792.0,Federal District 57 | nynd,"District Court, N.D. New York",Second,N.D.N.Y.,New York,Northern,northern-new-york,http://www.nynd.usco…,Unknown,Unknown,2578.0,Federal District 58 | nysd,"District Court, S.D. New York",Second,S.D.N.Y.,New York,Southern,southern-new-york,http://www.nysd.usco…,Unknown,Unknown,26704.0,Federal District 59 | nywd,"District Court, W.D. New York",Second,W.D.N.Y.,New York,Western,western-new-york,http://www.nywd.usco…,Unknown,Unknown,2924.0,Federal District 60 | nced,"District Court, E.D. North Carolina",Fourth,E.D.N.C.,North Carolina,Eastern,eastern-north-carolina,http://www.nced.usco…,Unknown,Unknown,1076.0,Federal District 61 | ncmd,"District Court, M.D. North Carolina",Fourth,M.D.N.C.,North Carolina,Middle,middle-north-carolina,http://www.ncmd.usco…,Unknown,Unknown,1083.0,Federal District 62 | ncwd,"District Court, W.D. North Carolina",Fourth,W.D.N.C.,North Carolina,Western,western-north-carolina,http://www.ncwd.usco…,Unknown,Unknown,1061.0,Federal District 63 | ndd,"District Court, D. North Dakota",Eighth,D.N.D.,North Dakota,,north-dakota,http://www.ndd.uscou…,Unknown,Unknown,606.0,Federal District 64 | ohnd,"District Court, N.D. Ohio",Sixth,N.D. Ohio,Ohio,Northern,northern-ohio,http://www.ohnd.usco…,Unknown,Unknown,3293.0,Federal District 65 | ohsd,"District Court, S.D. Ohio",Sixth,S.D. Ohio,Ohio,Southern,southern-ohio,http://www.ohsd.usco…,Unknown,Unknown,2860.0,Federal District 66 | oked,"District Court, E.D. Oklahoma",Tenth,E.D. Okla.,Oklahoma,Eastern,eastern-oklahoma,http://www.oked.usco…,Unknown,Unknown,174.0,Federal District 67 | oknd,"District Court, N.D. Oklahoma",Tenth,N.D. Okla.,Oklahoma,Northern,northern-oklahoma,http://www.oknd.usco…,Unknown,Unknown,485.0,Federal District 68 | okwd,"District Court, W.D. Oklahoma",Tenth,W.D. Okla.,Oklahoma,Western,western-oklahoma,http://www.okwd.usco…,,,,Federal District 69 | ord,"District Court, D. Oregon",Ninth,D. 
Or.,Oregon,,oregon,http://www.ord.uscou…,Unknown,Unknown,1859.0,Federal District 70 | paed,"District Court, E.D. Pennsylvania",Third,E.D. Pa.,Pennsylvania,Eastern,eastern-pennsylvania,http://www.paed.usco…,Unknown,Unknown,10441.0,Federal District 71 | pamd,"District Court, M.D. Pennsylvania",Third,M.D. Pa.,Pennsylvania,Middle,middle-pennsylvania,http://www.pamd.usco…,Unknown,Unknown,2304.0,Federal District 72 | pawd,"District Court, W.D. Pennsylvania",Third,W.D. Pa.,Pennsylvania,Western,western-pennsylvania,http://www.pawd.usco…,Unknown,Unknown,3250.0,Federal District 73 | rid,"District Court, D. Rhode Island",First,D.R.I.,Rhode Island,,rhode-island,http://www.rid.uscou…,Unknown,Unknown,1556.0,Federal District 74 | southcarolinaed,"District Court, E.D. South Carolina",,E.D.S.C.,South Carolina,Eastern,eastern-south-carolina,http://www.scd.uscou…,1823-02-21,1965-10-07,181.0,Federal District 75 | southcarolinawd,"District Court, W.D. South Carolina",,W.D.S.C.,South Carolina,Western,western-south-carolina,http://www.ncwd.usco…,1823-02-21,1965-10-07,95.0,Federal District 76 | scd,"District Court, D. South Carolina",Fourth,D.S.C.,South Carolina,,south-carolina,https://www.scd.uscourts.gov/,,,,Federal District 77 | sdd,"District Court, D. South Dakota",Eighth,D.S.D.,South Dakota,,south-dakota,http://www.sdd.uscou…,Unknown,Unknown,867.0,Federal District 78 | tned,"District Court, E.D. Tennessee",Sixth,E.D. Tenn.,Tennessee,Eastern,eastern-tennessee,http://www.tned.usco…,Unknown,Unknown,1363.0,Federal District 79 | tnmd,"District Court, M.D. Tennessee",Sixth,M.D. Tenn.,Tennessee,Middle,middle-tennessee,http://www.tnmd.usco…,Unknown,Unknown,1147.0,Federal District 80 | tnwd,"District Court, W.D. Tennessee",Sixth,W.D. Tenn.,Tennessee,Western,western-tennessee,http://www.tnwd.usco…,Unknown,Unknown,892.0,Federal District 81 | txed,"District Court, E.D. Texas",Fifth,E.D. Tex.,Texas,Eastern,eastern-texas,http://www.txed.usco…,Unknown,Unknown,1591.0,Federal District 82 | txnd,"District Court, N.D. Texas",Fifth,N.D. Tex.,Texas,Northern,northern-texas,http://www.txnd.usco…,Unknown,Unknown,2412.0,Federal District 83 | txsd,"District Court, S.D. Texas",Fifth,S.D. Tex.,Texas,Southern,southern-texas,http://www.txs.uscou…,Unknown,Unknown,3497.0,Federal District 84 | txwd,"District Court, W.D. Texas",Fifth,W.D. Tex.,Texas,Western,western-texas,http://www.txwd.usco…,Unknown,Unknown,1283.0,Federal District 85 | utd,"District Court, D. Utah",Tenth,D. Utah,Utah,,utah,http://www.utd.uscou…,Unknown,Unknown,1237.0,Federal District 86 | vtd,"District Court, D. Vermont",Second,D. Vt.,Vermont,,vermont,http://www.vtd.uscou…,Unknown,Unknown,668.0,Federal District 87 | vaed,"District Court, E.D. Virginia",Fourth,E.D. Va.,Virginia,Eastern,eastern-virginia,http://www.vaed.usco…,Unknown,Unknown,4530.0,Federal District 88 | vawd,"District Court, W.D. Virginia",Fourth,W.D. Va.,Virginia,Western,western-virginia,http://www.vawd.usco…,Unknown,Unknown,2018.0,Federal District 89 | waed,"District Court, E.D. Washington",Ninth,E.D. Wash.,Washington,Eastern,eastern-washington,http://www.waed.usco…,1905-03-02,Unknown,437.0,Federal District 90 | wawd,"District Court, W.D. Washington",Ninth,W.D. Wash.,Washington,Western,western-washington,http://www.wawd.usco…,1905-03-02,Unknown,1146.0,Federal District 91 | wvnd,"District Court, N.D. West Virginia",Fourth,N.D.W. Va.,West Virginia,Northern,northern-west-virginia,http://www.wvnd.usco…,Unknown,Unknown,466.0,Federal District 92 | wvsd,"District Court, S.D. West Virginia",Fourth,S.D.W.
Va.,West Virginia,Southern,southern-west-virginia,http://www.wvsd.usco…,Unknown,Unknown,1238.0,Federal District 93 | wied,"District Court, E.D. Wisconsin",Seventh,E.D. Wis.,Wisconsin,Eastern,eastern-wisconsin,http://www.wied.usco…,Unknown,Unknown,2740.0,Federal District 94 | wiwd,"District Court, W.D. Wisconsin",Seventh,W.D. Wis.,Wisconsin,Western,western-wisconsin,http://www.wiwd.usco…,Unknown,Unknown,1181.0,Federal District 95 | wyd,"District Court, D. Wyoming",Tenth,D. Wyo.,Wyoming,,wyoming,http://www.wyd.uscou…,Unknown,Unknown,481.0,Federal District 96 | gud,"District Court, D. Guam",Ninth,D. Guam,Guam,,guam,http://www.gud.uscou…,Unknown,Unknown,38.0,Federal District 97 | nmid,"District Court, Northern Mariana Islands",Ninth,N. Mar. I.,Northern Mariana Islands,,northern-mariana-islands,http://www.nmid.usco…,Unknown,Unknown,16.0,Federal District 98 | prd,"District Court, D. Puerto Rico",First,D.P.R.,Puerto Rico,,puerto-rico,http://www.prd.uscou…,Unknown,Unknown,4054.0,Federal District 99 | vid,"District Court, Virgin Islands",Third,D.V.I.,Virgin Islands,,virgin-islands,http://www.vid.uscou…,Unknown,Unknown,656.0,Federal District 
--------------------------------------------------------------------------------