├── src └── pacer_tools │ ├── code │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ └── rdf │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── make_graph_data_fulton_county.py │ │ │ └── utils.py │ ├── tasks │ │ ├── __init__.py │ │ ├── build_unique_table.py │ │ └── redact_pacer.py │ ├── downloader │ │ ├── __init__.py │ │ └── demo.json │ ├── parsers │ │ ├── __init__.py │ │ ├── parse_all.sh │ │ ├── parse_subset.sh │ │ ├── schemas │ │ │ ├── jel_v1.schema.json │ │ │ ├── docket_entry_v1.schema.json │ │ │ ├── sel_v1.schema.json │ │ │ ├── party_cv_v1.schema.json │ │ │ ├── party_cr_v1.schema.json │ │ │ ├── case_cv_v1.schema.json │ │ │ └── case_cr_v1.schema.json │ │ ├── README.md │ │ └── parse_summary.py │ ├── support │ │ ├── __init__.py │ │ ├── core.py │ │ ├── .gitignore │ │ ├── scales_shell.py │ │ ├── language_tools.py │ │ ├── core_data │ │ │ ├── district_courts_94.csv │ │ │ ├── statey2code.json │ │ │ └── nature_suit.csv │ │ ├── viz_tools.py │ │ ├── counsel_functions.py │ │ ├── settings.py │ │ ├── stats.py │ │ ├── mongo_connector.py │ │ ├── docket_functions.py │ │ ├── court_functions.py │ │ ├── disambiguation_functions.py │ │ ├── README.md │ │ ├── research_tools.py │ │ ├── bundler.py │ │ └── text_functions.py │ └── cli.py │ ├── data │ ├── exclude.csv │ └── annotation │ │ ├── member_lead_links.jsonl │ │ ├── fjc_district_codes.json │ │ ├── statey2code.json │ │ ├── nature_suit.csv │ │ └── district_courts.csv │ ├── __init__.py │ ├── .gitignore │ ├── requirements.yml │ └── requirements.txt ├── demo ├── document_input.csv ├── auth.json └── query_conf.json ├── .gitignore ├── setup.py └── README.md /src/pacer_tools/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pacer_tools/data/exclude.csv: -------------------------------------------------------------------------------- 1 | ucid 2 | -------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/member_lead_links.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/document_input.csv: 
-------------------------------------------------------------------------------- 1 | ucid,doc_no 2 | psc;;1:07-cv-00431,2 3 | -------------------------------------------------------------------------------- /demo/auth.json: -------------------------------------------------------------------------------- 1 | { 2 | "user": "", 3 | "pass": "" 4 | } 5 | -------------------------------------------------------------------------------- /demo/query_conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "case_status": "closed", 3 | "filed_from": "06/29/2007", 4 | "filed_to": "07/01/2007" 5 | } 6 | -------------------------------------------------------------------------------- /src/pacer_tools/code/downloader/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "case_status": "closed", 3 | "filed_from": "06/29/2007", 4 | "filed_to": "07/01/2007" 5 | } 6 | -------------------------------------------------------------------------------- /src/pacer_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from pacer_tools.code.cli import main as cli 2 | import pacer_tools.code.support.data_tools as dtools 3 | import pacer_tools.code.support.fhandle_tools as ftools -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | def std_path(fpath): 4 | ''' Standardise a filepath, returns a Path object''' 5 | if type(fpath) is str: 6 | fpath = Path(fpath.replace('\\','/')) 7 | return fpath 8 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | data/* 3 | **/*.auth 4 | *.pyc 5 | *.key 6 | *.env 7 | noacri.db 8 | **/.ipynb_checkpoints/* 9 | **/_examples/* 10 | **/_misc.ipynb 11 | **/login.auth 12 | **/*.auth 13 | .vscode/* 14 | **/_temp_/* 15 | code/downloader/test/* 16 | **/geckodriver.log 17 | -------------------------------------------------------------------------------- /src/pacer_tools/code/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from pacer_tools.code.downloader.scrapers import scraper 3 | from pacer_tools.code.parsers.parse_pacer import parser 4 | 5 | 6 | @click.group() 7 | def main(): 8 | pass 9 | 10 | main.add_command(scraper) 11 | main.add_command(parser) 12 | 13 | 14 | if __name__ == '__main__': 15 | main() -------------------------------------------------------------------------------- /src/pacer_tools/.gitignore: -------------------------------------------------------------------------------- 1 | **/node_modules/* 2 | 3 | .DS_Store 4 | **/.ipynb_checkpoints/* 5 | **/_examples/* 6 | **/_misc.ipynb 7 | **/fjc_scott.ipynb 8 | **/_temp_/* 9 | **/living_reports/**/.gitignore 10 | noacri.db 11 | .vscode/* 12 | **/geckodriver.log 13 | code/downloader/test/* 14 | code/downloader/logs/* 15 | **/conductor/logs/* 16 | **/nohup*.out 17 | 18 | **/*.auth 19 | *.pyc 20 | *.key 21 | *.env 22 | **/login.auth 23 | **/*.auth 24 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/parse_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # After 
reading the directory path, passes all arguments through to parse_pacer.py; use -f for force and -d for debug (-fd for both) 3 | # Example: bash parse_all.sh ../../data/pacer -fd 4 | # See arguments documented in parse_pacer.py 5 | 6 | dir=$1 7 | shift 8 | for courtdir in $dir/*/; do 9 | echo $courtdir; 10 | python parse_pacer.py $courtdir/html/ $courtdir/json/ "$@"; 11 | done 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | dist/* 3 | 4 | src/*.egg-info 5 | 6 | **/node_modules/* 7 | 8 | .DS_Store 9 | **/.ipynb_checkpoints/* 10 | **/_examples/* 11 | **/_misc.ipynb 12 | **/fjc_scott.ipynb 13 | **/_temp_/* 14 | **/living_reports/**/.gitignore 15 | noacri.db 16 | .vscode/* 17 | **/geckodriver.log 18 | code/downloader/test/* 19 | code/downloader/logs/* 20 | **/conductor/logs/* 21 | **/nohup*.out 22 | 23 | **/*.auth 24 | *.pyc 25 | *.key 26 | *.env 27 | **/login.auth 28 | **/*.auth 29 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/constants.py: -------------------------------------------------------------------------------- 1 | from rdflib import Namespace 2 | 3 | SCALES = Namespace("http://schemas.scales-okn.org/rdf/scales#") 4 | J = Namespace("http://release.niem.gov/niem/domains/jxdm/7.2/#") 5 | NC = Namespace("http://release.niem.gov/niem/niem-core/5.0/#") 6 | FIPS = Namespace("http://release.niem.gov/niem/codes/fips/5.2/#") 7 | NIBRS = Namespace("http://fbi.gov/cjis/nibrs/2023.0/") 8 | OCCS = Namespace("http://release.niem.gov/niem/codes/occs/5.0/#") 9 | TREATMENT = Namespace("http://sail.ua.edu/ruralkg/treatmentprovider/") -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/parse_subset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$1 4 | shift 5 | while getopts s:e: flag; do 6 | case "$flag" in 7 | s) 8 | startdir=${OPTARG} 9 | ;; 10 | e) 11 | enddir=${OPTARG} 12 | ;; 13 | esac 14 | done 15 | 16 | shift 4 17 | for courtdir in $dir/*; do 18 | if [ ! $(basename $courtdir) \< $(basename $startdir) ] && [ ! 
$(basename $courtdir) \> $(basename $enddir) ] 19 | then 20 | echo "Running on ${courtdir}" 21 | python parse_pacer.py $courtdir/html/ $courtdir/json/ "$@"; 22 | else 23 | echo "Skipping ${courtdir}"; 24 | fi 25 | done 26 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/scales_shell.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | sys.path.append(str(Path(__file__).resolve().parents[1])) 8 | 9 | reload = importlib.reload 10 | import_dict = { 11 | 'support.fhandle_tools': 'ftools', 12 | 'support.settings': 'settings', 13 | 'support.data_tools': 'dtools', 14 | 'support.docket_entry_identification':'dei', 15 | 'support.court_functions':'cf', 16 | 'support.judge_functions':'jf', 17 | } 18 | 19 | print('') 20 | for mod, alias in import_dict.items(): 21 | globals().update({alias:importlib.import_module(mod)}) 22 | print(f"Imported {mod} as {alias}") 23 | 24 | dff = dtools.load_unique_files_df() 25 | print(f"Imported unique files df as dff") 26 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/language_tools.py: -------------------------------------------------------------------------------- 1 | def nearest_ent_index(search_phrase, text, ents): 2 | ''' 3 | Identifies nearest entity to a search phrase in a text block. 4 | Ents must be generated from text, search_phrase must be in text. 5 | input: 6 | * search_phrase -- str, regex to search for 7 | * text -- str, document text 8 | * ents -- list, list of spacy entities that should be considered in search 9 | output: 10 | * min_index -- int, index for the entity list of the closest spacy entity to search phrase 11 | ''' 12 | import re 13 | 14 | bspan, espan = re.search(search_phrase, text).span() 15 | #Subtract bspan, then we want the minimum distance that is positive 16 | start_chars = [ent.start_char - bspan for ent in ents] 17 | m = min(i for i in start_chars) 18 | min_index = start_chars.index(m) 19 | return min_index 20 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core_data/district_courts_94.csv: -------------------------------------------------------------------------------- 1 | akd 2 | almd 3 | alnd 4 | alsd 5 | ared 6 | arwd 7 | azd 8 | cacd 9 | caed 10 | cand 11 | casd 12 | cod 13 | ctd 14 | dcd 15 | ded 16 | flmd 17 | flnd 18 | flsd 19 | gamd 20 | gand 21 | gasd 22 | gud 23 | hid 24 | iand 25 | iasd 26 | idd 27 | ilcd 28 | ilnd 29 | ilsd 30 | innd 31 | insd 32 | ksd 33 | kyed 34 | kywd 35 | laed 36 | lamd 37 | lawd 38 | mad 39 | mdd 40 | med 41 | mied 42 | miwd 43 | mnd 44 | moed 45 | mowd 46 | msnd 47 | mssd 48 | mtd 49 | nced 50 | ncmd 51 | ncwd 52 | ndd 53 | ned 54 | nhd 55 | njd 56 | nmd 57 | nmid 58 | nvd 59 | nyed 60 | nynd 61 | nysd 62 | nywd 63 | ohnd 64 | ohsd 65 | oked 66 | oknd 67 | okwd 68 | ord 69 | paed 70 | pamd 71 | pawd 72 | prd 73 | rid 74 | scd 75 | sdd 76 | tned 77 | tnmd 78 | tnwd 79 | txed 80 | txnd 81 | txsd 82 | txwd 83 | utd 84 | vaed 85 | vawd 86 | vid 87 | vtd 88 | waed 89 | wawd 90 | wied 91 | wiwd 92 | wvnd 93 | wvsd 94 | wyd 95 | -------------------------------------------------------------------------------- /src/pacer_tools/requirements.yml: -------------------------------------------------------------------------------- 1 | # Create this environment using `conda env create -f ` 2 
| # Update this environment by activating it and then `conda env update -f ` 3 | name: scales_env 4 | channels: 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - python=3.8.* 9 | - pandas=1.3.* 10 | - spacy=3.2.* 11 | - scikit-learn 12 | - statsmodels 13 | - ipython 14 | - scipy 15 | - seaborn 16 | - click 17 | - numpy 18 | - tqdm 19 | - selenium 20 | - selenium-requests 21 | - simplejson 22 | - xlrd 23 | - lxml 24 | - bs4 25 | - spacy-lookups-data 26 | - psycopg2 27 | - sqlalchemy 28 | - python-dotenv 29 | - xmltodict 30 | - pymongo 31 | - rdflib 32 | - fuzzywuzzy 33 | - toolz 34 | - pip 35 | - pip: 36 | - usaddress 37 | - anyio 38 | - asyncclick 39 | - flashtext 40 | # - psycopg2-binary 41 | # - cenpy 42 | # - html5lib 43 | # - geopandas 44 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/tasks/build_unique_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | import click 5 | 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | from support import settings 8 | from support import data_tools as dtools 9 | 10 | @click.command() 11 | @click.option('--outfile', '-o', default=settings.UNIQUE_FILES_TABLE, show_default=True) 12 | @click.option('--nrows', '-n', default=None) 13 | def main(outfile, nrows): 14 | 15 | if outfile == settings.UNIQUE_FILES_TABLE: 16 | if not click.confirm(f"Overwrite the existing table at {outfile} ?"): 17 | return 18 | 19 | if nrows: 20 | nrows = int(nrows) 21 | 22 | df = dtools.generate_unique_filepaths(outfile, nrows) 23 | print(f"\nUnique filepaths table (with shape {df.shape}) output to {Path(outfile).resolve()}") 24 | 25 | exist_count = df.fpath.map(lambda x: (settings.PROJECT_ROOT/x).exists()).sum() 26 | print(f'\nFile existence check: {exist_count:,} / {len(df):,}') 27 | 28 | if __name__ == '__main__': 29 | main() 30 |
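A minimal smoke-test sketch for this CLI, using click's standard test runner (the import path and output filename are hypothetical, and the script's support imports are assumed to resolve):

from click.testing import CliRunner
from build_unique_table import main  # import path assumption

runner = CliRunner()
result = runner.invoke(main, ['--outfile', 'table_sample.csv', '--nrows', '100'])
print(result.exit_code, result.output)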
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/viz_tools.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | from matplotlib.ticker import PercentFormatter 3 | from matplotlib.ticker import FuncFormatter 4 | 5 | # Palette 6 | def pal(n=5, ind=False, cmap=False): 7 | ''' 8 | Return a blue to orange palette 9 | 10 | Inputs: 11 | n (int) - no. of colours in the palette 12 | ind (int) - the index of the single colour in the palette to return 13 | cmap (bool) - whether to return a cmap 14 | ''' 15 | h_neg, h_pos, s, l = 255, 22, 99, 65 16 | 17 | if cmap: 18 | return sns.diverging_palette(h_neg, h_pos, s, l, as_cmap=True) 19 | 20 | if n == 3: 21 | palette = [pal(4)[i] for i in [0,2,3]] 22 | else: 23 | palette = sns.diverging_palette(h_neg, h_pos, s, l, n=n) 24 | 25 | #If index specified return a tuple of that color 26 | if type(ind)==int: 27 | return tuple(palette[ind]) 28 | # Else return the whole palette 29 | else: 30 | return palette 31 | 32 | # Graph label formatters 33 | fmt_thou = FuncFormatter(lambda x,p: f"{x:,.0f}") 34 | fmt_perc = PercentFormatter 35 |
-------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/fjc_district_codes.json: -------------------------------------------------------------------------------- 1 | {"00": "med", "47": "ohnd", "01": "mad", "48": "ohsd", "02": "nhd", "49": "tned", "03": "rid", "50": "tnmd", "04": "prd", "51": "tnwd", "05": "ctd", "52": "ilnd", "06": "nynd", "53": "ilcd", "07": "nyed", "54": "ilsd", "08": "nysd", "55": "innd", "09": "nywd", "56": "insd", "10": "vtd", "57": "wied", "11": "ded", "58": "wiwd", "12": "njd", "60": "ared", "13": "paed", "61": "arwd", "14": "pamd", "62": "iand", "15": "pawd", "63": "iasd", "16": "mdd", "64": "mnd", "17": "nced", "65": "moed", "18": "ncmd", "66": "mowd", "19": "ncwd", "67": "ned", "20": "scd", "68": "ndd", "22": "vaed", "69": "sdd", "23": "vawd", "7-": "akd", "24": "wvnd", "70": "azd", "25": "wvsd", "71": "cand", "26": "alnd", "72": "caed", "27": "almd", "73": "cacd", "28": "alsd", "74": "casd", "29": "flnd", "75": "hid", "3A": "flmd", "76": "idd", "3C": "flsd", "77": "mtd", "3E": "gand", "78": "nvd", "3G": "gamd", "79": "ord", "3J": "gasd", "80": "waed", "3L": "laed", "81": "wawd", "3N": "lamd", "82": "cod", "36": "lawd", "83": "ksd", "37": "msnd", "84": "nmd", "38": "mssd", "85": "oknd", "39": "txnd", "86": "oked", "40": "txed", "87": "okwd", "41": "txsd", "88": "utd", "42": "txwd", "89": "wyd", "43": "kyed", "90": "dcd", "44": "kywd", "91": "vid", "45": "mied", "93": "gud", "46": "miwd", "94": "nmid"}
-------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/statey2code.json: -------------------------------------------------------------------------------- 1 | { 2 | "alabama": "al", 3 | "alaska": "ak", 4 | "arizona": "az", 5 | "arkansas": "ar", 6 | "california": "ca", 7 | "colorado": "co", 8 | "connecticut": "ct", 9 | "delaware": "de", 10 | "district of columbia": "dc", 11 | "florida": "fl", 12 | "georgia": "ga", 13 | "hawaii": "hi", 14 | "idaho": "id", 15 | "illinois": "il", 16 | "indiana": "in", 17 | "iowa": "ia", 18 | "kansas": "ks", 19 | "kentucky": "ky", 20 | "louisiana": "la", 21 | "maine": "me", 22 | "maryland": "md", 23 | "massachusetts": "ma", 24 | "michigan": "mi", 25 | "minnesota": "mn", 26 | "mississippi": "ms", 27 | "missouri": "mo", 28 | "montana": "mt", 29 | "nebraska": "ne", 30 | "nevada": "nv", 31 | "new hampshire": "nh", 32 | "new jersey": "nj", 33 | "new mexico": "nm", 34 | "new york": "ny", 35 | "north carolina": "nc", 36 | "north dakota": "nd", 37 | "ohio": "oh", 38 | "oklahoma": "ok", 39 | "oregon": "or", 40 | "pennsylvania": "pa", 41 | "rhode island": "ri", 42 | "south carolina": "sc", 43 | "south dakota": "sd", 44 | "tennessee": "tn", 45 | "texas": "tx", 46 | "utah": "ut", 47 | "vermont": "vt", 48 |
"virginia": "va", 49 | "washington": "wa", 50 | "west virginia": "wv", 51 | "wisconsin": "wi", 52 | "wyoming": "wy", 53 | "guam": "gu", 54 | "northern mariana islands": "nmi", 55 | "puerto rico": "pr", 56 | "virgin islands": "vi" 57 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core_data/statey2code.json: -------------------------------------------------------------------------------- 1 | { 2 | "alabama": "al", 3 | "alaska": "ak", 4 | "arizona": "az", 5 | "arkansas": "ar", 6 | "california": "ca", 7 | "colorado": "co", 8 | "connecticut": "ct", 9 | "delaware": "de", 10 | "district of columbia": "dc", 11 | "florida": "fl", 12 | "georgia": "ga", 13 | "hawaii": "hi", 14 | "idaho": "id", 15 | "illinois": "il", 16 | "indiana": "in", 17 | "iowa": "ia", 18 | "kansas": "ks", 19 | "kentucky": "ky", 20 | "louisiana": "la", 21 | "maine": "me", 22 | "maryland": "md", 23 | "massachusetts": "ma", 24 | "michigan": "mi", 25 | "minnesota": "mn", 26 | "mississippi": "ms", 27 | "missouri": "mo", 28 | "montana": "mt", 29 | "nebraska": "ne", 30 | "nevada": "nv", 31 | "new hampshire": "nh", 32 | "new jersey": "nj", 33 | "new mexico": "nm", 34 | "new york": "ny", 35 | "north carolina": "nc", 36 | "north dakota": "nd", 37 | "ohio": "oh", 38 | "oklahoma": "ok", 39 | "oregon": "or", 40 | "pennsylvania": "pa", 41 | "rhode island": "ri", 42 | "south carolina": "sc", 43 | "south dakota": "sd", 44 | "tennessee": "tn", 45 | "texas": "tx", 46 | "utah": "ut", 47 | "vermont": "vt", 48 | "virginia": "va", 49 | "washington": "wa", 50 | "west virginia": "wv", 51 | "wisconsin": "wi", 52 | "wyoming": "wy", 53 | "guam": "gu", 54 | "northern mariana islands": "nmi", 55 | "puerto rico": "pr", 56 | "virgin islands": "vi" 57 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/tasks/redact_pacer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import spacy 5 | import click 6 | import pandas as pd 7 | 8 | import sys 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).resolve().parents[1])) 11 | from support import data_tools as dtools 12 | nlp = spacy.load("en_core_web_trf") 13 | 14 | 15 | 16 | def _redact_file(fpath, outdir_replacement_target, outdir_replacement_text): 17 | is_html = 'html' in fpath 18 | fpath_new = fpath.replace(outdir_replacement_target, outdir_replacement_text) 19 | data = dtools.load_case(fpath=fpath, html=is_html) 20 | 21 | try: 22 | data_redacted = dtools.redact_private_individual_names(data, is_html=is_html, elective_nlp=nlp) 23 | os.makedirs(os.path.dirname(fpath_new), exist_ok=True) 24 | with open(fpath_new, 'w') as f: 25 | if is_html: 26 | f.write(data_redacted) 27 | else: 28 | json.dump(data_redacted, f) 29 | 30 | print(f'Created {fpath_new}') 31 | except Exception as e: 32 | print(f'Error while creating {fpath_new}: {e}') 33 | 34 | 35 | 36 | @click.command() 37 | @click.argument('file_pattern') 38 | @click.argument('outdir_replacement_target') 39 | @click.argument('outdir_replacement_text') 40 | def main(file_pattern, outdir_replacement_target, outdir_replacement_text): 41 | 42 | fpaths = glob.glob(file_pattern) 43 | print(f'Compiled list of {len(fpaths)} files to redact') 44 | for fpath in fpaths: 45 | _redact_file(str(Path(fpath).resolve()), outdir_replacement_target, outdir_replacement_text) 46 | print('Finished redacting') 47 | 48 | if __name__ == '__main__': 49 | main() 
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/counsel_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import json 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | 8 | from support import fhandle_tools as ftools 9 | from support import settings 10 | 11 | 12 | def load_counsel_clusters(): 13 | ''' Simple Loader Function''' 14 | return pd.read_json(settings.COUNSEL_DIS_CLUSTS, lines=True) 15 | 16 | def load_disambiguated_counsels(ucid, as_df=True): 17 | ''' 18 | Load Counsel data (from relevant .jsonl files in the COUNSEL_DIS_DIR) 19 | 20 | Inputs: 21 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 22 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 23 | 24 | Output: 25 | (pd.DataFrame or list of dicts) Disambiguated counsel data for the given ucid(s) 26 | ''' 27 | 28 | # Coerce to an iterable 29 | if type(ucid) is str: 30 | ucid = [ucid] 31 | 32 | ROW_DAT = [] 33 | for each in ucid: 34 | # create filepath 35 | fname = ftools.build_counsel_filename_from_ucid(each) 36 | # load file 37 | results = [] 38 | if fname.exists(): 39 | with open(fname, 'r') as json_file: 40 | json_list = list(json_file) 41 | for json_str in json_list: 42 | results.append(json.loads(json_str)) 43 | 44 | ROW_DAT+=results 45 | 46 | # return dataframe 47 | if ROW_DAT: 48 | if as_df: 49 | COUNSELS = pd.DataFrame(ROW_DAT) 50 | else: 51 | COUNSELS = ROW_DAT 52 | 53 | return COUNSELS 54 | else: 55 | return None 56 | 57 | 58 | def load_firm_clusters(): 59 | return 60 | 61 | def load_disambiguated_firms(ucid, as_df=True): 62 | return
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/settings.py: -------------------------------------------------------------------------------- 1 | ''' 2 | File: settings.py 3 | Author: Adam Pah 4 | Description: Settings file 5 | ''' 6 | import sys 7 | from pathlib import Path 8 | sys.path.append(str(Path(__file__).resolve().parents[1])) 9 | 10 | 11 | PROJECT_ROOT = Path(__file__).resolve().parents[2] 12 | 13 | CORE_DATA = PROJECT_ROOT / 'code'/ 'support' / 'core_data' 14 | DATAPATH = PROJECT_ROOT / 'data' 15 | ANNO_PATH = DATAPATH / 'annotation' 16 | PACER_PATH = DATAPATH / 'pacer' # generate using scrapers.py 17 | 18 | COURTFILE = CORE_DATA / 'district_courts.csv' 19 | DISTRICT_COURTS_94 = CORE_DATA / 'district_courts_94.csv' 20 | STATEY2CODE = CORE_DATA / 'statey2code.json' 21 | NATURE_SUIT = CORE_DATA / 'nature_suit.csv' 22 | JUDGEFILE = CORE_DATA / 'judge_demographics.csv' 23 | BAMAG_JUDGES = CORE_DATA / 'brmag_judges.csv' 24 | BAMAG_POSITIONS = CORE_DATA / 'brmag_positions.csv' 25 | 26 | MEM_DF = DATAPATH / 'member_cases.csv' 27 | LOG_DIR = DATAPATH / 'logs' 28 | EXCLUDE_CASES = DATAPATH / 'exclude.csv' 29 | UNIQUE_FILES_TABLE = DATAPATH / 'unique_docket_filepaths_table.csv' # generate using generate_unique_filepaths in data_tools.py 30 | FJC = DATAPATH / 'fjc' # generate using fjc.gov/research/idb and fjc_functions.py 31 | 32 | MEMBER_LEAD_LINKS = ANNO_PATH / 'member_lead_links.jsonl' 33 | ROLE_MAPPINGS = ANNO_PATH / 'role_mappings.json' 34 | JEL_JSONL = ANNO_PATH / 'judge_disambiguation' / 'JEL.jsonl' # generate using the Research-Materials repo 35 | ONTOLOGY_LABELS = ANNO_PATH / 'ontology' / 'labels.csv' # generate using the scales-nlp repo 36 |
37 | ANNO_PATH_CLAYTON = ANNO_PATH / 'counties' / 'ga_clayton' 38 | NIBRS_CATEGORIES_CLAYTON = ANNO_PATH_CLAYTON / 'nibrs' / 'nibrs_categories.csv' 39 | NIBRS_CROSSWALK_CLAYTON = ANNO_PATH_CLAYTON / 'nibrs' / 'nibrs_crosswalk.csv' 40 | 41 | # included on behalf of make_graph_data_pacer.py 42 | # (in infrastructure_dev, this is a dev/prod switch, but it's not included here because pacer-tools is always prod) 43 | use_datastore = lambda path: path 44 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/jel_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/JEL_v1.json", 4 | "title": "JEL Schema", 5 | "description": "The JEL is the Judge Entity Lookup for all unique judge entities", 6 | "properties": { 7 | "name": { 8 | "type": "string", 9 | "description": "The name of the unique judge entity" 10 | }, 11 | "Presentable_Name": { 12 | "type": "string", 13 | "description": "A human readable entity name with first letters capitalized in each token" 14 | }, 15 | "SJID": { 16 | "type": "string", 17 | "description": "Unique SCALES Judge Identifier for the Parent Entity associated with this location" 18 | }, 19 | "SCALES_Judge_Label": { 20 | "type": "string", 21 | "enum": ["FJC Judge","Magistrate_Judge","Nondescript_Judge","Judicial_Actor","Bankruptcy_Judge","District_Judge"], 22 | "description": "The predicted judge type based on the SCALES disambiguation routine and algorithmic labelling" 23 | }, 24 | "Head_UCIDs": { 25 | "type": "number", 26 | "description": "The total number of unique docket headers this entity existed on from our disambiguation sample" 27 | }, 28 | "Tot_UCIDs": { 29 | "type": "number", 30 | "description": "The total number of unique dockets this entity existed on from our disambiguation sample" 31 | }, 32 | "Full_Name": { 33 | "type": "string", 34 | "description": "If the judge entity is a known Article III judge from the FJC biographical directory, then this is the concatenation of FJC name fields" 35 | }, 36 | "NID": { 37 | "type": "string", 38 | "description": "If the judge entity is a known Article III judge from the FJC biographical directory, then this is the FJC NID field" 39 | } 40 | } 41 | }
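A quick way to sanity-check a JEL record against the schema above (a minimal sketch: it requires the third-party jsonschema package, which is not in requirements.txt, and the record values are hypothetical):

import json
from jsonschema import validate

with open('jel_v1.schema.json') as f:
    schema = json.load(f)

record = {
    'name': 'jane m doe',                     # hypothetical entity
    'Presentable_Name': 'Jane M Doe',
    'SJID': 'SJ000001',                       # hypothetical identifier format
    'SCALES_Judge_Label': 'District_Judge',
    'Head_UCIDs': 120,
    'Tot_UCIDs': 240,
}
validate(instance=record, schema=schema)      # raises ValidationError if the record is malformed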
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/stats.py: -------------------------------------------------------------------------------- 1 | def bootstrap_district_variation(checkdf): 2 | ''' 3 | For each court, does a t-test on the difference between an individual judge and the other judges in the same court 4 | Accounts for uneven sample sizes 5 | 6 | input: 7 | * checkdf - dataframe where each row is a case, columns are: 8 | ['court', 'judge', 'resolution'] 9 | A positive outcome for the procedural ruling ('resolution') is 1 and a negative outcome is 0 10 | standard social science encoding 11 | output: 12 | * scidf - dataframe where each row is a judge, columns are: 13 | ['Judge', 'Diff', 'LB', 'UB', 'sig'] 14 | diff is the actual difference, lb and ub are the confidence bounds, and sig is 1 if the interval doesn't cross zero 15 | ''' 16 | import numpy as np 17 | from scipy import stats 18 | import pandas as pd 19 | 20 | def _identify_sig(row): 21 | if np.sign(row['LB'])==np.sign(row['UB']): 22 | return 1 23 | else: 24 | return 0 25 | 26 | judge_data = [] 27 | courts = [x for x in checkdf.court.unique() if x!='nmid'] 28 | for court in courts: 29 | #Just subset to keep the naming shorter 30 | cdf = checkdf[checkdf.court == court] 31 | #Get the judge list 32 | judges = cdf.judge.unique() 33 | #District differences 34 | for j in judges: 35 | jdf = cdf[cdf.judge==j] 36 | njdf = cdf[cdf.judge!=j] 37 | mu_1 = np.mean(jdf.resolution) 38 | mu_2 = np.mean(njdf.resolution) 39 | s_1 = np.std(jdf.resolution, ddof=1) 40 | s_2 = np.std(njdf.resolution, ddof=1) 41 | diff = (mu_1-mu_2) 42 | #Uneven samples 43 | se = np.sqrt(s_1**2/len(jdf) + s_2**2/len(njdf)) 44 | ndf = (se**2)**2/( (s_1**2/len(jdf))**2/(len(jdf)-1) + (s_2**2/len(njdf))**2/(len(njdf)-1) ) 45 | lb = diff - stats.t.ppf(0.975, ndf)*se 46 | ub = diff + stats.t.ppf(0.975, ndf)*se 47 | 48 | judge_data.append([j, diff, lb, ub]) 49 | 50 | scidf = pd.DataFrame(judge_data, columns = ['Judge', 'Diff', 'LB', 'UB']) 51 | scidf['sig'] = scidf.apply(_identify_sig, axis=1) 52 | return scidf 53 |
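A minimal usage sketch for bootstrap_district_variation (toy data; column names as in the docstring):

import pandas as pd
# from support.stats import bootstrap_district_variation  # import path assumption

toy = pd.DataFrame({
    'court': ['ilnd'] * 6,
    'judge': ['a', 'a', 'a', 'b', 'b', 'b'],
    'resolution': [1, 1, 0, 0, 0, 1],
})
scidf = bootstrap_district_variation(toy)
print(scidf[['Judge', 'Diff', 'sig']])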
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # to update the pypi package: 2 | # (1) rm -r build dist (not strictly necessary, but prevents superfluous uploads of old versions) 3 | # (2) iterate the version in this file 4 | # (3) python setup.py bdist_wheel --universal (check.warn(importable) can be ignored if data_files takes care of the directories in question) 5 | # (4) twine upload dist/* (requires pypi credentials) 6 | 7 | from setuptools import setup, find_packages 8 | from glob import glob 9 | 10 | from pathlib import Path 11 | base_dir = Path(__file__).parent 12 | long_description = (base_dir / "README.md").read_text() 13 | 14 | setup( 15 | name='pacer-tools', 16 | version='0.1.12', 17 | long_description=long_description, 18 | long_description_content_type='text/markdown', 19 | package_dir={'': 'src'}, 20 | packages=find_packages('src'), 21 | install_requires=[ 22 | 'async-generator', 'attrs', 'beautifulsoup4', 'bs4', 23 | 'cchardet', 'cffi', 'chardet', 'charset-normalizer', 24 | 'click', 'configuration-maker', 'cryptography', 25 | 'cssselect', 'feedparser', 'filelock', 'future', 26 | 'geonamescache', 'h11', 'html5lib', 'idna', 27 | 'lxml', 'numpy', 'outcome', 'pandas', 'pathlib', 28 | 'probableparsing', 'pycparser', # 'pymongo', 29 | 'pyOpenSSL', 'PySocks', 30 | 'python-crfsuite', 'python-dateutil', 'python-dotenv', 31 | 'python-Levenshtein', 'pytz', 'rdflib', 'requests', 32 | 'requests-file', 'scipy', 'selenium', 33 | 'selenium-requests', 'sgmllib3k', 'simplejson', 34 | 'six', 'sniffio', 35 | 'sortedcontainers', 'soupsieve', 'tldextract', 36 | 'tqdm', 'trio', 'trio-websocket', 'urllib3', 37 | 'urllib3-secure-extra', 'usaddress', 'webencodings', 38 | 'wsproto', 'xmltodict' 39 | ], 40 | entry_points={ 41 | 'console_scripts': [ 42 | 'pacer-tools = pacer_tools:cli', 43 | ], 44 | }, 45 | data_files=[ 46 | ('pacer_tools', glob('src/pacer_tools/code/support/core_data/*.*')), 47 | ('pacer_tools', glob('src/pacer_tools/data/*.*')), 48 | ('pacer_tools', glob('src/pacer_tools/data/annotation/*.*')), 49 | ('pacer_tools', glob('src/pacer_tools/data/annotation/counties/ga_clayton/nibrs/*.*')), 50 | ], 51 | include_package_data = True, 52 | ) 53 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/mongo_connector.py: -------------------------------------------------------------------------------- 1 | ''' Based on: https://gist.github.com/mangangreg/f84d8899e961c48a8539b813e746eac6 2 | ''' 3 | import os 4 | import sys 5 | import time 6 | from pathlib import Path 7 | 8 | from pymongo import MongoClient 9 | from dotenv import load_dotenv 10 | 11 | HERE = Path(__file__).parent 12 | 13 | class SCALESMongo: 14 | def __init__(self, user=None, password=None, host=None, port=None, database=None, env_file=HERE/'.mongo.env'): 15 | 16 | # Load the env file 17 | load_dotenv(env_file) 18 | 19 | self.user = user or os.getenv('MONGO_USER') 20 | self.password = password or os.getenv('MONGO_PASSWORD') 21 | self.host = host or os.getenv('MONGO_HOST') or 'localhost' 22 | self.port = port or os.getenv('MONGO_PORT') or 27017 23 | self.database = database or os.getenv('MONGO_DATABASE') 24 | 25 | # Build the URI 26 | self.URI = self._constructURI() 27 | 28 | # Initialise connection and db 29 | self.connection = None 30 | self.db = None 31 | 32 | def _constructURI(self): 33 | return f"mongodb://{self.user}:{self.password}@{self.host}:{self.port}" 34 | 35 | def connect(self): 36 | self.connection = MongoClient(self.URI) 37 | self.db = self.connection[self.database] 38 | 39 | class SaneResult: 40 | ''' A sane/readable Pymongo result object ''' 41 | 42 | def __init__(self, res): 43 | self.res = res 44 | self.counts = self.build_counts(res) 45 | self.counts_string = " ".join(f"{k}={v}" for k,v in self.counts.items()).rstrip() 46 | 47 | def __repr__(self): 48 | if not self.res: 49 | return '' 50 | class_str = str(self.res.__class__).strip('<> ') 51 | return f"<{class_str} acknowledged={self.res.acknowledged} {self.counts_string}>" 52 | 53 | def build_counts(self, res): 54 | ''' Find the attributes that contain insert/update count numbers ''' 55 | 56 | counts = {} 57 | for k in dir(res): 58 | if k.endswith('count'): 59 | counts.update({k: res.__getattribute__(k)}) 60 | elif k.endswith('_ids') and not k.startswith('_'): 61 | counts.update({k.split('_ids',maxsplit=1)[0]: len(res.__getattribute__(k))}) 62 | 63 | return counts 64 | 65 | 66 |
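A minimal connection sketch for SCALESMongo (credentials and database name are hypothetical; in practice they come from the .mongo.env file):

from mongo_connector import SCALESMongo  # import path assumption

sm = SCALESMongo(user='scales_user', password='...', database='scales')
sm.connect()
print(sm.db.list_collection_names())     # standard pymongo Database method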
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/docket_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | def restrict_to_court_motions(docket_entries): 3 | ''' 4 | Restricts docket entries to court motions 5 | restriction is based on the idea that entries recording court action end with the clerk's initials in parentheses, which is what re_court_action matches 6 | input: 7 | docket entries list [[date, num, text], ...] 8 | output: 9 | list of indices of the responsive docket entries 10 | ''' 11 | 12 | re_court_action = re.compile(r'\([a-z\, ]{3,20}\)') 13 | responsive = [] 14 | for i, docket_line in enumerate(docket_entries): 15 | #Does the docket line exist, if not do nothing 16 | if len(docket_line) == 3: 17 | try: 18 | search_result = re_court_action.search(docket_line[-1]) 19 | if search_result != None: 20 | responsive.append(i) 21 | except TypeError: 22 | #Not a string 23 | pass 24 | return responsive 25 | 26 | 27 | def checker_notice_of_removal(docket_entries): 28 | ''' 29 | Checks the docket to see if a case has been removed 30 | ''' 31 | removed_case = False 32 | if len(docket_entries) > 0: 33 | for line in docket_entries: 34 | try: 35 | if 'notice of removal' in line[-1].lower(): 36 | removed_case = True 37 | except: 38 | pass 39 | return removed_case 40 | 41 | 42 | def inter_event_series(docket_entries, docket_indices): 43 | ''' 44 | For a given docket, constructs the inter event time series 45 | input: 46 | docket entries list [[date, num, text], ...] 47 | output: 48 | list inter event series in days [0, 2, 3, ....] 49 | ''' 50 | import pandas as pd 51 | import numpy as np 52 | if len(docket_indices) > 0: 53 | df = pd.DataFrame(np.array(docket_entries)[docket_indices], columns=['date','link','desc']) 54 | df['pdate'] = pd.to_datetime(df.date) 55 | inter_event = df['pdate'].diff().dt.days[1:].values.tolist() 56 | else: 57 | return [] 58 | return inter_event 59 | 60 | def find_pattern(docket_entries, pat, rlim=None): 61 | ''' 62 | Binary check for occurrence of pattern in docket, returns True if at least one match 63 | inputs 64 | - docket_entries(list) - docket entries list from case json 65 | - pat (regex) - the pattern to search for 66 | - rlim (int) - the right-hand character index limit to search up to 67 | output 68 | (bool) - Returns true if pattern found on any line 69 | ''' 70 | if not docket_entries or not len(docket_entries): 71 | return False 72 | 73 | # Deal with singleton line 74 | if type(docket_entries)==list and type(docket_entries[0]) != list: 75 | return False 76 | 77 | for line in docket_entries: 78 | 79 | if len(line)==4 and type(line[2])==str: 80 | if re.search(pat, line[2][:rlim], re.I): 81 | return True 82 | 83 | return False
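A minimal sketch of these helpers on a toy docket (entry format as in the docstrings above; dates and clerk initials hypothetical):

# restrict_to_court_motions and checker_notice_of_removal are the functions defined above
entries = [
    ['01/05/2020', 1, 'COMPLAINT filed by Plaintiff. (abc, )'],
    ['01/20/2020', 2, 'NOTICE of removal from Circuit Court. (xyz, )'],
]
print(restrict_to_court_motions(entries))  # -> [0, 1]
print(checker_notice_of_removal(entries))  # -> True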
35 | "type": "number", 36 | "description": "The target node (the preceeding row/document its pointing to)" 37 | }, 38 | { 39 | "type": "object", 40 | "description": "The span, relative to docket_text, where the reference appears", 41 | "properties":{ 42 | "start": { 43 | "type": "number", 44 | "description": "The character index, relative to docket_text, of the start of the reference" 45 | }, 46 | "end": { 47 | "type": "number", 48 | "description": "The character index, relative to docket_text, of the end of the reference" 49 | } 50 | } 51 | } 52 | 53 | ] 54 | } 55 | }, 56 | "$defs": { 57 | "document": { 58 | "type": "object", 59 | "description": "A document associated with a case. Keys in this object are ..", 60 | "propertyNames": "^\\d+$", 61 | "additionalProperties": { 62 | "type": "object", 63 | "description":"", 64 | "properties": { 65 | "url": { 66 | "type": "string", 67 | "description": "The url of the document" 68 | }, 69 | "span": { 70 | "type": "object", 71 | "description": "The span, relative to docket_text, where the reference to the document appears (always null for the 0th document, which is linked from the '#' column", 72 | "properties":{ 73 | "start": { 74 | "type": "number", 75 | "description": "The character index, relative to docket_text, of the start of the reference" 76 | }, 77 | "end": { 78 | "type": "number", 79 | "description": "The character index, relative to docket_text, of the end of the reference" 80 | } 81 | } 82 | } 83 | } 84 | } 85 | } 86 | } 87 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/sel_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/SEL_v1.json", 4 | "title": "SEL Schema", 5 | "description": "SEL files are Spacy Entity Lookup files for a particular docket's judge entities" , 6 | "properties": { 7 | "Entity_Extraction_Method": { 8 | "type": "string", 9 | "enum": ["SPACY JNLP2", "referred_judges", "assigned_judge"], 10 | "description": "The method used to extract and identify an entity at this location. A Spacy Judge NLP Model, or docket key values" 11 | }, 12 | "docket_source": { 13 | "type": "string", 14 | "enum": ["line_entry","case_header", "case_parties"], 15 | "description": "Source from the docket that this entity was pulled from." 
16 | }, 17 | "judge_enum": { 18 | "type": "number", 19 | "description": "If there are multiple entities in a lookup source, we enumerate them pythonically in the order they are listed; null for docket entries" 20 | }, 21 | "party_enum": { 22 | "type": "number", 23 | "description": "The enumerated party number (0-indexed) that the judge entity was tied to on a criminal case; null for civil and docket entries" 24 | }, 25 | "pacer_id": { 26 | "type": "number", 27 | "description": "If the judge in the header metadata was given an ID on the PACER HTML, we replicate that here" 28 | }, 29 | "docket_index": { 30 | "type": "number", 31 | "description": "The docket entry index (0-start) that this entity is located at; null for case_header and case_parties" 32 | }, 33 | "ucid": { 34 | "type": "string", 35 | "description": "Unique case identifier (SCALES internal)" 36 | }, 37 | "cid": { 38 | "type": "string", 39 | "description": "Local court case identifier" 40 | }, 41 | "court": { 42 | "type": "string", 43 | "description": "Abbreviation for the Federal District Court this case is docketed at" 44 | }, 45 | "year": { 46 | "type": "number", 47 | "description": "Approximation of the filing date year for the overall case" 48 | }, 49 | "original_text": { 50 | "type": "string", 51 | "description": "Original string of text found on the docket that our entity was extracted from, padded with preceding and trailing tokens" 52 | }, 53 | "Extracted_Entity": { 54 | "type": "string", 55 | "description": "Specific string of text that represents the entity this row of data references" 56 | }, 57 | "Prefix_Categories": { 58 | "type": "string", 59 | "enum":["assigned_judge","referred_judges", "Bankruptcy_Judge","Circuit_Appeals","District_Judge","Magistrate_Judge","Nondescript_Judge","Judicial_Actor"], 60 | "description": "The categorization of the text preceding the entity. The label corresponds to types of words appearing before the judge entity" 61 | }, 62 | "Transferred_Flag": { 63 | "type": "boolean", 64 | "description": "A boolean flag indicating if the judge entity was immediately preceded by terminology related to case transferrals." 65 | }, 66 | "full_span_start": { 67 | "type": "number", 68 | "description": "Span starting point for the Original Text with respect to the overall text at the particular docket_source-docket_index location" 69 | }, 70 | "full_span_end": { 71 | "type": "number", 72 | "description": "Span ending point for the Original Text with respect to the overall text at the particular docket_source-docket_index location" 73 | }, 74 | "Entity_Span_Start": { 75 | "type": "number", 76 | "description": "Span starting point for the Extracted Entity with respect to the overall text at the particular docket_source-docket_index location" 77 | }, 78 | "Entity_Span_End": { 79 | "type": "number", 80 | "description": "Span ending point for the Extracted Entity with respect to the overall text at the particular docket_source-docket_index location" 81 | }, 82 | "Parent_Entity": { 83 | "type": "string", 84 | "description": "The identified unique entity that the Extracted Entity in this data row refers to."
85 | }, 86 | "SJID": { 87 | "type": "string", 88 | "description": "Unique SCALES Judge Identifier for the Parent Entity associated with this location" 89 | } 90 | } 91 | }
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/court_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import sys 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | sys.path.append(str(Path(__file__).resolve().parents[1])) 9 | from support import settings 10 | 11 | CARDINALS = ['northern', 'southern', 'eastern', 'western', 'central', 'middle'] 12 | STATEY2CODE = json.load(open(settings.STATEY2CODE)) 13 | COURTS_94 = [x.strip() for x in open(settings.DISTRICT_COURTS_94).readlines()] 14 | 15 | re_state_codes = '|'.join(STATEY2CODE.values()) 16 | re_card = '|'.join(x[0].lower() for x in CARDINALS) 17 | re_court_abbrev = rf"^(?P<state>{re_state_codes})(?P<card>({re_card})?)d$" 18 | 19 | # Read in court data 20 | courtdf = pd.read_csv(settings.COURTFILE, index_col=0) 21 | 22 | abbr2name_dict = dict(zip(courtdf.index, courtdf.courtname)) 23 | name2abbr_dict = dict(zip(courtdf.courtname, courtdf.index)) 24 | 25 | # Full name like "Oklahoma Western", useful for fjc 26 | full_name = (courtdf.state +' ' + courtdf.cardinal.fillna('')).str.strip() 27 | fullname2abbr_dict = dict(zip(full_name,courtdf.index)) 28 | 29 | def make_courtname(row): 30 | ''' 31 | Creates a court name that looks like [Cardinal]-[State] 32 | ''' 33 | courtname = '' 34 | 35 | if type(row.cardinal) == str: 36 | courtname += row.cardinal + '-' 37 | 38 | courtname += row.state 39 | courtname = courtname.lower().replace(' ','-') 40 | return courtname 41 | 42 | def abbr2name(abbr): 43 | ''' 44 | Convert court abbreviation to court name 45 | inputs: 46 | abbr - 4-letter court abbreviation e.g. ilnd 47 | outputs: 48 | courtname: the name of the court e.g. northern-illinois 49 | ''' 50 | return abbr2name_dict[abbr] 51 | 52 | def name2abbr(name, ordinal_first=True): 53 | ''' 54 | Convert court name to court abbreviation 55 | inputs: 56 | name - court name e.g. 'northern illinois' or 'northern-illinois' 57 | outputs: 58 | abbr - 4-letter court abbreviation e.g. ilnd 59 | ''' 60 | if 'district' in name and 'columbia' not in name: 61 | name = name.replace('district', '').rstrip() 62 | 63 | # If the ordinal is not first, reverse it: 64 | if not ordinal_first: 65 | nlist = name.split() 66 | if nlist[-1] in CARDINALS: 67 | name = " ".join([nlist[-1], *nlist[:-1]]) 68 | if ' ' in name or '-' not in name: 69 | name = name.lower().replace(' ', '-') 70 | 71 | return name2abbr_dict[name] 72 | 73 | def abbr2full(abbr): 74 | ''' 75 | Convert a court abbreviation to the full title format 76 | Ex. 'txsd' -> "U.S. District Court for the Southern District of Texas" 77 | 78 | Inputs: 79 | abbr (str) - court abbreviation 80 | Outputs: 81 | str 82 | ''' 83 | #Get the court abbreviation cardinal direction and state name from the court dataframe 84 | try: 85 | cardinal = courtdf[courtdf.index.eq(abbr)].cardinal.values[0] 86 | cardinal = cardinal + ' ' if (type(cardinal)==str) else '' 87 | state = courtdf[courtdf.index.eq(abbr)].state.values[0] 88 | 89 | #Make the string 90 | return f"U.S. District Court for the {cardinal}District of {state}" 91 | except IndexError: 92 | print("Error with court abbreviation:", abbr) 93 | return None 94 | 95 | def classify(court_raw): 96 | ''' Classify any district court ''' 97 | court = court_raw.lower() 98 | # Check if it already matches an abbreviation 99 | if re.match(re_court_abbrev, court): 100 | return court 101 | 102 | # Deal with DC separately as 'District' has problematic matching 103 | elif 'columbia' in court: 104 | return 'dcd' 105 | 106 | else: 107 | # Look for state and cardinal words 108 | court = re.sub('[-,]',' ', court).strip() 109 | state = re.search("|".join(STATEY2CODE.keys()), court) 110 | if not state: 111 | return 112 | elif state.group() == 'northern mariana islands': 113 | card_letter = '' 114 | else: 115 | cardinal = re.search("|".join(CARDINALS), court) 116 | if cardinal: 117 | card_letter = cardinal.group()[0] 118 | 119 | # Case with "District Court, N.D. Illinois" 120 | elif not cardinal and 'D.' in court_raw: 121 | #Search for cardinal letter (case sensitive) 122 | match = re.search(r"(?P<card_letter>[A-Z])\.", court_raw.replace("D.",'')) 123 | card_letter = match.groupdict()['card_letter'].lower() if match else '' 124 | else: 125 | card_letter = '' 126 | 127 | 128 | # state code + cardinal letter + 'd' e.g. ilnd 129 | abbrev = f"{STATEY2CODE[state.group()]}{card_letter}d" 130 | return abbrev 131 |
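A few illustrative calls to classify (inputs hypothetical; expected outputs follow from the mappings above):

classify('ilnd')                            # -> 'ilnd' (already a valid abbreviation)
classify('Northern District of Illinois')   # -> 'ilnd'
classify('District of Columbia')            # -> 'dcd'
classify('District Court, N.D. Illinois')   # -> 'ilnd'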
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/disambiguation_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import json 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | 8 | from support import fhandle_tools as ftools 9 | from support import settings 10 | 11 | 12 | def load_CENSUS_cities(): 13 | ''' Simple Loader Function''' 14 | return pd.read_csv(settings.CENSUS_CITIES, encoding="ISO-8859-1") 15 | 16 | def load_AMLAW_100(): 17 | ''' Simple Loader Function''' 18 | return pd.read_csv(settings.AMLAW_100) 19 | 20 | def load_hybrid_350(): 21 | ''' Simple Loader Function''' 22 | return pd.read_csv(settings.HYBRID_FIRMS) 23 | 24 | def load_counsel_clusters(): 25 | ''' Simple Loader Function''' 26 | return pd.read_json(settings.COUNSEL_DIS_CLUSTS, lines=True) 27 | 28 | def load_firm_clusters(): 29 | ''' Simple Loader Function''' 30 | return pd.read_json(settings.FIRM_DIS_CLUSTS, lines=True) 31 | 32 | def load_party_clusters(): 33 | ''' Simple Loader Function''' 34 | return pd.read_json(settings.PARTY_DIS_CLUSTS, lines=True) 35 | 36 | def load_disambiguated_counsels(ucid, as_df=True, collection_location=None): 37 | ''' 38 | Load Counsel data (from relevant .jsonl files in the COUNSEL_DIS_DIR) 39 | 40 | Inputs: 41 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 42 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 43 | 44 | Output: 45 | (pd.DataFrame or list of dicts) Disambiguated counsel data for the given ucid(s) if the counsel appeared multiple times in the corpus 46 | ''' 47 | 48 | # Coerce to an iterable 49 | if type(ucid) is str: 50 | ucid = [ucid] 51 | 52 | ROW_DAT = [] 53 | for each in ucid: 54 | # create filepath 55 | fname = ftools.build_counsel_filename_from_ucid(each, collection_location) 56 | # load file 57 | results = [] 58 | if fname.exists(): 59 | with open(fname, 'r') as json_file: 60 | json_list = list(json_file) 61 | for
json_str in json_list: 62 | results.append(json.loads(json_str)) 63 | 64 | ROW_DAT+=results 65 | 66 | # return dataframe 67 | if ROW_DAT: 68 | if as_df: 69 | COUNSELS = pd.DataFrame(ROW_DAT) 70 | else: 71 | COUNSELS = ROW_DAT 72 | 73 | return COUNSELS 74 | else: 75 | return None 76 | 77 | def load_disambiguated_firms(ucid, as_df=True, collection_location=None): 78 | ''' 79 | Load Firm data (from relevant .jsonl files in the FIRM_DIS_DIR) 80 | 81 | Inputs: 82 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 83 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 84 | 85 | Output: 86 | (pd.DataFrame or list of dicts) Disambiguated firm data for the given ucid(s) if the firm appeared multiple times in the corpus 87 | ''' 88 | 89 | # Coerce to an iterable 90 | if type(ucid) is str: 91 | ucid = [ucid] 92 | 93 | ROW_DAT = [] 94 | for each in ucid: 95 | # create filepath 96 | fname = ftools.build_firm_filename_from_ucid(each, collection_location) 97 | # load file 98 | results = [] 99 | if fname.exists(): 100 | with open(fname, 'r') as json_file: 101 | json_list = list(json_file) 102 | for json_str in json_list: 103 | results.append(json.loads(json_str)) 104 | 105 | ROW_DAT+=results 106 | 107 | # return dataframe 108 | if ROW_DAT: 109 | if as_df: 110 | FIRMS = pd.DataFrame(ROW_DAT) 111 | else: 112 | FIRMS = ROW_DAT 113 | 114 | return FIRMS 115 | else: 116 | return None 117 | 118 | 119 | def load_disambiguated_parties(ucid, as_df=True, collection_location=None): 120 | ''' 121 | Load Party data (from relevant .jsonl files in the PARTY_DIS_DIR) 122 | 123 | Inputs: 124 | - ucid (str or iterable): can be a single ucid (str) or any iterable (list / pd.Series) 125 | - as_df (bool): if true returns as type pd.DataFrame, otherwise list of dicts 126 | 127 | Output: 128 | (pd.DataFrame or list of dicts) Disambiguated party data for the given ucid(s) if the party appeared multiple times in the corpus 129 | ''' 130 | 131 | # Coerce to an iterable 132 | if type(ucid) is str: 133 | ucid = [ucid] 134 | 135 | ROW_DAT = [] 136 | for each in ucid: 137 | # create filepath 138 | fname = ftools.build_party_filename_from_ucid(each, collection_location) 139 | # load file 140 | results = [] 141 | if fname.exists(): 142 | with open(fname, 'r') as json_file: 143 | json_list = list(json_file) 144 | for json_str in json_list: 145 | results.append(json.loads(json_str)) 146 | 147 | ROW_DAT+=results 148 | 149 | # return dataframe 150 | if ROW_DAT: 151 | if as_df: 152 | PARTIES = pd.DataFrame(ROW_DAT) 153 | else: 154 | PARTIES = ROW_DAT 155 | 156 | return PARTIES 157 | else: 158 | return None 159 |
-------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/party_cv_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/party_cv_v1.schema.json", 4 | "type": "object", 5 | "properties": { 6 | "name": { 7 | "type": "string", 8 | "description": "Name of the party in the case" 9 | }, 10 | "counsel": { 11 | "type": "array", 12 | "description": "An array of the party's lawyers in the case", 13 | "items": { 14 | "type": "object", 15 | "properties": { 16 | "name": { 17 | "type": "string", 18 | "description": "Counsel name" 19 | }, 20 | "office_name": { 21 | "type": "string", 22 | "description": "Name of the counsel's firm or legal office" 23 | }, 24 |
"address": { 25 | "type": "string", 26 | "description": "Counsel address, string delimited by '\\n' newline character" 27 | }, 28 | "phone": { 29 | "type": "string", 30 | "description": "Counsel phone number, e.g. (123) 456-7890" 31 | }, 32 | "fax": { 33 | "type": "null", 34 | "description": "Counsel fax number" 35 | }, 36 | "email": { 37 | "type": "string", 38 | "description": "Counsel email address" 39 | }, 40 | "is_lead_attorney": { 41 | "type": "boolean", 42 | "description": "Whether or not counsel is listed as lead attorney" 43 | }, 44 | "is_pro_hac_vice": { 45 | "type": "boolean", 46 | "description": "Whether or not counsel is listed as pro hac vice" 47 | }, 48 | "is_notice_attorney": { 49 | "type": "boolean", 50 | "description": "Whether or not counsel is listed as attorney to be noticed" 51 | }, 52 | "see_above_for_address": { 53 | "type": "boolean", 54 | "description": "Whether or not address is listed as 'see above for address', meaning address info should be obtained from preceding counsel entries" 55 | }, 56 | "designation": { 57 | "type": "string", 58 | "description": "This counsel's designation within the case (Retained, Government Attorney, Public Defender, etc)" 59 | }, 60 | "bar_status": { 61 | "type": "string", 62 | "description": "This counsel's standing with respect to the general bar (Admitted, Not Admitted, etc)" 63 | }, 64 | "trial_bar_status": { 65 | "type": "string", 66 | "description": "This counsel's standing with respect to the ILND trial bar, when applicable" 67 | }, 68 | "counsel_terminating_date": { 69 | "type": "string", 70 | "description": "The date (if any) that this counsel was terminated from the case" 71 | }, 72 | "raw_info": { 73 | "type": "string", 74 | "description": "The original value of the Pacer counsel field before being parsed out into the above SCALES fields" 75 | }, 76 | "recap_counsel_error": { 77 | "type": "boolean", 78 | "description": "A flag indicating that counsel information is missing due to a Recap error in which lawyers with identical names are merged incorrectly" 79 | } 80 | } 81 | } 82 | }, 83 | "is_pro_se": { 84 | "type": "boolean", 85 | "description": "Whether or not the party is appearing pro se, i.e. representing themselves" 86 | }, 87 | "pro_se_source": { 88 | "type": "string", 89 | "description": "The source that led us to believe this party is pro se ('explicit'=written out in the docket, 'implicit'=address-like info found for a lawyerless party)", 90 | "enum": ["explicit", "implicit"] 91 | }, 92 | "extra_pro_se_info": { 93 | "type": "string", 94 | "description":"For pro se parties, non-subheading party-related text that doesn't fit into counsel buckets (prisoner number, prison name, etc) - newline-delimited" 95 | }, 96 | "terminating_date": { 97 | "type": "string", 98 | "description":"The date (if any) that this party was terminated from the case" 99 | }, 100 | "extra_info": { 101 | "type": "string", 102 | "description":"Any Pacer subheadings for this party (alt names, corporation types, the capacity in which they're appearing, etc) - newline-delimited" 103 | }, 104 | "role": { 105 | "type": "string", 106 | "description": "This party's role in the case, as listed in their Pacer heading (e.g. 
'Defendant', 'Plaintiff', 'Petitioner', 'Appellant'...)" 107 | }, 108 | "party_type": { 109 | "type": "string", 110 | "description": "The broad bucket in which this party's role belongs, ascertained via a hand-coded mapping of the role", 111 | "enum": ["defendant", "plaintiff", "misc", "other_party", "bk_party"] 112 | }, 113 | "pacer_id": { 114 | "type": "number", 115 | "description": "Pacer's intra-case defendant id - always null for civil cases, but retained just in case users mistake it for a universal field" 116 | }, 117 | "recap_party_error": { 118 | "type": "boolean", 119 | "description": "A flag indicating that party information is missing due to a Recap error in which parties with identical names are merged incorrectly" 120 | } 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/nature_suit.csv: -------------------------------------------------------------------------------- 1 | number,name,sub_type,major_type 2 | 110,Insurance,contract,contract 3 | 120,Marine,contract,contract 4 | 130,Miller Act,contract,contract 5 | 140,Negotiable Instrument,contract,contract 6 | 150,Recovery Of Overparyment & Enforcement Of Judgment,contract,contract 7 | 151,Medicare Act,contract,contract 8 | 152,Recovery Of Defaulted Student Loans (Excl. Veterans),contract,contract 9 | 153,Recovery Of Overpayment Of Veteran S Benefits,contract,contract 10 | 160,Stockholders Suits,contract,contract 11 | 190,Other Contract,contract,contract 12 | 195,Contract Product Liability,contract,contract 13 | 196,Franchise,contract,contract 14 | 210,Land Condemnation,real property,real property 15 | 220,Foreclosure,real property,real property 16 | 230,Rent Lease & Ejectment,real property,real property 17 | 240,Torts To Land,real property,real property 18 | 245,Tort Product Liability,real property,real property 19 | 290,All Other Property,real property,real property 20 | 310,Airplane,personal injury,torts 21 | 315,Airplane Product Liability,personal injury,torts 22 | 320,"Assault, Libel, & Slander",personal injury,torts 23 | 330,Federalemployers Liability,personal injury,torts 24 | 340,Marine,personal injury,torts 25 | 345,Marine Product Liability,personal injury,torts 26 | 350,Motor Vehicle,personal injury,torts 27 | 355,Motor Vehicle Product Liability,personal injury,torts 28 | 360,Other Personal Injury,personal injury,torts 29 | 362,Personal Injury- Medical Malpractice,personal injury,torts 30 | 365,Personal Injury- Product Liability,personal injury,torts 31 | 367,Personal Injury - Health Care/Pharmaceutical Personal Injury/Product Liability,personal injury,torts 32 | 368,Asbestos Personal Injury Product Liability,personal injury,torts 33 | 375,False Claims Act,personal injury,torts 34 | 376,376 Qui Tam (31 U.S.C. 
3729(A)),personal injury,torts 35 | 370,Other Fraud,personal property,torts 36 | 371,Truth In Lending,personal property,torts 37 | 380,Other Personal Property Damage,personal property,torts 38 | 385,Property Damage Product Liability,personal property,torts 39 | 422,Appeal 28 Usc 158,bankruptcy,bankruptcy 40 | 423,Withdrawal 28 Usc 157,bankruptcy,bankruptcy 41 | 440,Other Civil Rights,civil rights,civil rights 42 | 441,Voting,civil rights,civil rights 43 | 442,Employment,civil rights,civil rights 44 | 443,Housing/Accommodations,civil rights,civil rights 45 | 444,Welfare,civil rights,civil rights 46 | 445,Amer W/Disabilities-Employment,civil rights,civil rights 47 | 446,Amer W/Disabilities - Other,civil rights,civil rights 48 | 448,Education,civil rights,civil rights 49 | 462,Naturalization Application,immigration,immigration 50 | 463,Habeas Corpus - Alien Detainee,immigration,immigration 51 | 465,Other Immigration Actions,immigration,immigration 52 | 510,Motions To Vacate Sentence,prisoner petitions,prisoner petitions 53 | 530,General,habeas corpus,prisoner petitions 54 | 535,Death Penalty,habeas corpus,prisoner petitions 55 | 540,Mandamus & Other,habeas corpus,prisoner petitions 56 | 550,Civil Rights,habeas corpus,prisoner petitions 57 | 555,Prison Condition,habeas corpus,prisoner petitions 58 | 560,Conditions Of Confinement,civil detainee,prisoner petitions 59 | 610,Agriculture,forfeiture/penalty,forfeiture/penalty 60 | 620,Other Food & Drug,forfeiture/penalty,forfeiture/penalty 61 | 625,Drug Related Seizure Of Property 21 Usc 881 630 Liquor Laws,forfeiture/penalty,forfeiture/penalty 62 | 630,Liquor Laws,forfeiture/penalty,forfeiture/penalty 63 | 640,Rr & Truck,forfeiture/penalty,forfeiture/penalty 64 | 650,Airline Regulations,forfeiture/penalty,forfeiture/penalty 65 | 660,Occupational Safety/Health,forfeiture/penalty,forfeiture/penalty 66 | 690,Other,forfeiture/penalty,forfeiture/penalty 67 | 710,Fair Labor Standards Act,labor,labor 68 | 720,Labor/Management Relations,labor,labor 69 | 730,Labor/Management Reporting & Disclosure Act,labor,labor 70 | 740,Railway Labor Act,labor,labor 71 | 751,Family And Medical Leave Act,labor,labor 72 | 790,Other Labor Litigation,labor,labor 73 | 791,Employee Retirement Income Security Act,labor,labor 74 | 820,Copyrights,property rights,property rights 75 | 830,Patent,property rights,property rights 76 | 835,Patent Abbreviated New Drug Application (Anda),property rights,property rights 77 | 840,Trademark,property rights,property rights 78 | 880,Defend Trade Secrets Act Of 2016 (Dtsa),property rights,property rights 79 | 861,Hia (1395Ff),social security,social security 80 | 862,Black Lung (923),social security,social security 81 | 863,Diwc/Diww (405(G)),social security,social security 82 | 864,Ssid Title Xvi,social security,social security 83 | 865,Rsi (405(G)),social security,social security 84 | 870,Taxes (U.S. 
Plaintiff Or Defendant),federal tax suits,federal tax suits 85 | 871,Irs-Third Party 26 Usc 7609,federal tax suits,federal tax suits 86 | 400,State Reapportionment,other statutes,other statutes 87 | 410,Antitrust,other statutes,other statutes 88 | 430,Banks And Banking,other statutes,other statutes 89 | 450,Commerce,other statutes,other statutes 90 | 460,Deportation,other statutes,other statutes 91 | 470,Racketeer Influenced And Corrupt Organizations,other statutes,other statutes 92 | 480,Consumer Credit,other statutes,other statutes 93 | 485,Telephone Consumer Protection Act (Tcpa),other statutes,other statutes 94 | 490,Cable/Sat Tv,other statutes,other statutes 95 | 810,Selective Service,other statutes,other statutes 96 | 850,Securities/Commodities/Exchange,other statutes,other statutes 97 | 875,Customer Challenge 12 Usc 3410,other statutes,other statutes 98 | 890,Other Statutory Actions,other statutes,other statutes 99 | 891,Agricultural Acts,other statutes,other statutes 100 | 892,Economic Stabilization Act,other statutes,other statutes 101 | 893,Environmental Matters,other statutes,other statutes 102 | 894,Energy Allocation Act,other statutes,other statutes 103 | 895,Freedom Of Information Act,other statutes,other statutes 104 | 896,Arbitration,other statutes,other statutes 105 | 899,Administrative Procedure Act/Review Or Appeal Of Agency Decision,other statutes,other statutes 106 | 900,Appeal Of Fee Determination Under Equal Access To Justice Act,other statutes,other statutes 107 | 950,Constitutionality Of State Statutes,other statutes,other statutes 108 | 990,Other,other statutes,other statutes 109 | 999,Miscellaneous Cases,other statutes,other statutes -------------------------------------------------------------------------------- /src/pacer_tools/code/support/core_data/nature_suit.csv: -------------------------------------------------------------------------------- 1 | number,name,sub_type,major_type 2 | 110,Insurance,contract,contract 3 | 120,Marine,contract,contract 4 | 130,Miller Act,contract,contract 5 | 140,Negotiable Instrument,contract,contract 6 | 150,Recovery Of Overparyment & Enforcement Of Judgment,contract,contract 7 | 151,Medicare Act,contract,contract 8 | 152,Recovery Of Defaulted Student Loans (Excl. 
Veterans),contract,contract 9 | 153,Recovery Of Overpayment Of Veteran S Benefits,contract,contract 10 | 160,Stockholders Suits,contract,contract 11 | 190,Other Contract,contract,contract 12 | 195,Contract Product Liability,contract,contract 13 | 196,Franchise,contract,contract 14 | 210,Land Condemnation,real property,real property 15 | 220,Foreclosure,real property,real property 16 | 230,Rent Lease & Ejectment,real property,real property 17 | 240,Torts To Land,real property,real property 18 | 245,Tort Product Liability,real property,real property 19 | 290,All Other Property,real property,real property 20 | 310,Airplane,personal injury,torts 21 | 315,Airplane Product Liability,personal injury,torts 22 | 320,"Assault, Libel, & Slander",personal injury,torts 23 | 330,Federalemployers Liability,personal injury,torts 24 | 340,Marine,personal injury,torts 25 | 345,Marine Product Liability,personal injury,torts 26 | 350,Motor Vehicle,personal injury,torts 27 | 355,Motor Vehicle Product Liability,personal injury,torts 28 | 360,Other Personal Injury,personal injury,torts 29 | 362,Personal Injury- Medical Malpractice,personal injury,torts 30 | 365,Personal Injury- Product Liability,personal injury,torts 31 | 367,Personal Injury - Health Care/Pharmaceutical Personal Injury/Product Liability,personal injury,torts 32 | 368,Asbestos Personal Injury Product Liability,personal injury,torts 33 | 375,False Claims Act,personal injury,torts 34 | 376,376 Qui Tam (31 U.S.C. 3729(A)),personal injury,torts 35 | 370,Other Fraud,personal property,torts 36 | 371,Truth In Lending,personal property,torts 37 | 380,Other Personal Property Damage,personal property,torts 38 | 385,Property Damage Product Liability,personal property,torts 39 | 422,Appeal 28 Usc 158,bankruptcy,bankruptcy 40 | 423,Withdrawal 28 Usc 157,bankruptcy,bankruptcy 41 | 440,Other Civil Rights,civil rights,civil rights 42 | 441,Voting,civil rights,civil rights 43 | 442,Employment,civil rights,civil rights 44 | 443,Housing/Accommodations,civil rights,civil rights 45 | 444,Welfare,civil rights,civil rights 46 | 445,Amer W/Disabilities-Employment,civil rights,civil rights 47 | 446,Amer W/Disabilities - Other,civil rights,civil rights 48 | 448,Education,civil rights,civil rights 49 | 462,Naturalization Application,immigration,immigration 50 | 463,Habeas Corpus - Alien Detainee,immigration,immigration 51 | 465,Other Immigration Actions,immigration,immigration 52 | 510,Motions To Vacate Sentence,prisoner petitions,prisoner petitions 53 | 530,General,habeas corpus,prisoner petitions 54 | 535,Death Penalty,habeas corpus,prisoner petitions 55 | 540,Mandamus & Other,habeas corpus,prisoner petitions 56 | 550,Civil Rights,habeas corpus,prisoner petitions 57 | 555,Prison Condition,habeas corpus,prisoner petitions 58 | 560,Conditions Of Confinement,civil detainee,prisoner petitions 59 | 610,Agriculture,forfeiture/penalty,forfeiture/penalty 60 | 620,Other Food & Drug,forfeiture/penalty,forfeiture/penalty 61 | 625,Drug Related Seizure Of Property 21 Usc 881 630 Liquor Laws,forfeiture/penalty,forfeiture/penalty 62 | 630,Liquor Laws,forfeiture/penalty,forfeiture/penalty 63 | 640,Rr & Truck,forfeiture/penalty,forfeiture/penalty 64 | 650,Airline Regulations,forfeiture/penalty,forfeiture/penalty 65 | 660,Occupational Safety/Health,forfeiture/penalty,forfeiture/penalty 66 | 690,Other,forfeiture/penalty,forfeiture/penalty 67 | 710,Fair Labor Standards Act,labor,labor 68 | 720,Labor/Management Relations,labor,labor 69 | 730,Labor/Management Reporting & Disclosure 
Act,labor,labor 70 | 740,Railway Labor Act,labor,labor 71 | 751,Family And Medical Leave Act,labor,labor 72 | 790,Other Labor Litigation,labor,labor 73 | 791,Employee Retirement Income Security Act,labor,labor 74 | 820,Copyrights,property rights,property rights 75 | 830,Patent,property rights,property rights 76 | 835,Patent Abbreviated New Drug Application (Anda),property rights,property rights 77 | 840,Trademark,property rights,property rights 78 | 880,Defend Trade Secrets Act Of 2016 (Dtsa),property rights,property rights 79 | 861,Hia (1395Ff),social security,social security 80 | 862,Black Lung (923),social security,social security 81 | 863,Diwc/Diww (405(G)),social security,social security 82 | 864,Ssid Title Xvi,social security,social security 83 | 865,Rsi (405(G)),social security,social security 84 | 870,Taxes (U.S. Plaintiff Or Defendant),federal tax suits,federal tax suits 85 | 871,Irs-Third Party 26 Usc 7609,federal tax suits,federal tax suits 86 | 400,State Reapportionment,other statutes,other statutes 87 | 410,Antitrust,other statutes,other statutes 88 | 430,Banks And Banking,other statutes,other statutes 89 | 450,Commerce,other statutes,other statutes 90 | 460,Deportation,other statutes,other statutes 91 | 470,Racketeer Influenced And Corrupt Organizations,other statutes,other statutes 92 | 480,Consumer Credit,other statutes,other statutes 93 | 485,Telephone Consumer Protection Act (Tcpa),other statutes,other statutes 94 | 490,Cable/Sat Tv,other statutes,other statutes 95 | 810,Selective Service,other statutes,other statutes 96 | 850,Securities/Commodities/Exchange,other statutes,other statutes 97 | 875,Customer Challenge 12 Usc 3410,other statutes,other statutes 98 | 890,Other Statutory Actions,other statutes,other statutes 99 | 891,Agricultural Acts,other statutes,other statutes 100 | 892,Economic Stabilization Act,other statutes,other statutes 101 | 893,Environmental Matters,other statutes,other statutes 102 | 894,Energy Allocation Act,other statutes,other statutes 103 | 895,Freedom Of Information Act,other statutes,other statutes 104 | 896,Arbitration,other statutes,other statutes 105 | 899,Administrative Procedure Act/Review Or Appeal Of Agency Decision,other statutes,other statutes 106 | 900,Appeal Of Fee Determination Under Equal Access To Justice Act,other statutes,other statutes 107 | 950,Constitutionality Of State Statutes,other statutes,other statutes 108 | 990,Other,other statutes,other statutes 109 | 999,Miscellaneous Cases,other statutes,other statutes -------------------------------------------------------------------------------- /src/pacer_tools/code/support/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | Collection of common tools and functions for SCALES project 3 | 4 | # Filehandle Tools 5 | In `fhandle_tools` there are several functions to simplify and unify common transformations of filenames and case names. For full function argument documentation see the docstrings. Below is a summary of some of the most useful methods with usage examples. 
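The examples below assume the relevant functions have been imported; a minimal sketch (the import path follows this repo's package layout):

```python
from pacer_tools.code.support.fhandle_tools import (
    decompose_caseno, clean_case_id, generate_docket_filename)

clean_case_id("1:16-cv-12345-2-ABC-DEF")  # -> "1:16-cv-12345"
```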
6 | 7 | ## Quick lookup 8 | ### Dockets/Cases 9 | | method |input | output| 10 | |--|--|--| 11 | | *decompose_caseno* | `"1:16-cv-12345-2-ABC-DEF"` |`{'office': '1', 'year': '16',...}` | 12 | | *clean_case_id* | `"1:16-cv-12345-2-ABC-DEF"` | `"1:16-cv-12345"` | 13 | |*generate_docket_filename*|`"1:16-cv-12345",def_no=3, ind=2`|`"1:16-cv-12345-3_2.html"`| 14 | 15 | 16 | ### Documents 17 | | method |input | output| 18 | |--|--|--| 19 | |*generate_document_id*|`"ilnd;;1:16-cv-12345", index=3, att_index=10`|`"ilnd;;1-16-cv-12345_3_10"`| 20 | |*generate_document_fname*|`"ilnd;;1-16-cv-12345_3_10", user_hash="12345678"`|`"ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf"`| 21 | |*parse_document_fname*|`"ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf"`| `{'index':3, 'att_index':10, ...`| 22 | 23 | ## Dockets/Cases 24 | **decompose_caseno**(*case_no, pattern=re_case_no_gr*) 25 | 26 | *Takes a PACER-style case no. and returns a dictionary of its decomposed parts.* 27 | ```python 28 | decompose_caseno("1:16-cv-12345-2-ABC-DEF") 29 | >> {'office': '1', 30 | 'year': '16', 31 | 'case_type': 'cv', 32 | 'case_no': '12345', 33 | 'judge_names': ['ABC', 'DEF'], 34 | 'def_no': '2', 35 | 'update_ind': ''} 36 | ``` 37 | 38 | **clean_case_id**(*case_no, allow_def_stub=False*) 39 | 40 | *Takes a case id and cleans off anything that isn't the office, year, case type, and case no. Can also handle filenames.* 41 | ```python 42 | clean_case_id("1:16-cv-12345-2-ABC-DEF") 43 | >> "1:16-cv-12345" 44 | clean_case_id("1-16-cv-12345_1.html") 45 | >> "1:16-cv-12345" 46 | ``` 47 | 48 | **generate_docket_filename**(*case_name, def_no=None, ind=None, ext='html'*) 49 | 50 | *Generate the filename for a docket* 51 | ```python 52 | generate_docket_filename("1:16-cv-12345") 53 | >> "1:16-cv-12345.html" 54 | generate_docket_filename("1:16-cv-12345", def_no=3, ind=2) 55 | >> "1:16-cv-12345-3_2.html" 56 | ``` 57 | 58 | ## Documents 59 | 60 | **generate_document_id**(*ucid, index, att_index=None*) 61 | 62 | *Generate a unique id name for a case document download* 63 | ```python 64 | generate_document_id("ilnd;;1:16-cv-12345", 3) 65 | >> "ilnd;;1-16-cv-12345_3" 66 | generate_document_id("ilnd;;1:16-cv-12345", 3, 10) 67 | >> "ilnd;;1-16-cv-12345_3_10" 68 | ``` 69 | 70 | **generate_document_fname**(*doc_id, user_hash, ext='pdf'*) 71 | 72 | *Generate a unique file name for a case document download* 73 | ```python 74 | generate_document_fname("ilnd;;1-16-cv-12345_3_10", user_hash="12345678") 75 | >> "ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf" 76 | ``` 77 | **parse_document_fname**(*fname*) 78 | 79 | *Parse a document filename, return the component parts as a dict* 80 | ```python 81 | parse_document_fname("ilnd;;1-16-cv-12345_3_10_u12345678_t210106.pdf") 82 | >> {'doc_id': 'ilnd;;1-16-cv-12345_3_10', 83 | 'index': '3', 84 | 'att_index': '10', 85 | 'user_hash': '12345678', 86 | 'download_time': '210106', 87 | 'ext': 'pdf', 88 | 'ucid': 'ilnd;;1:16-cv-12345'} 89 | ``` 90 | 91 | ## Other 92 | **get_expected_path**(*ucid, ext='json', pacer_path=settings.PACER_PATH, def_no=None*) 93 | 94 | *Find the expected path of the json (or html) file for the case* 95 | ```python 96 | get_expected_path("ilnd;;1:16-cv-12345") 97 | >> "{{abs}}/data/pacer/ilnd/json/1-16-cv-12345.json" 98 | get_expected_path("ilnd;;1:16-cv-12345", ext="html", def_no=2) 99 | >> "{{abs}}/data/pacer/ilnd/html/1-16-cv-12345_2.html" 100 | ``` 101 | **get_pacer_url**(*court, page*) 102 | 103 | *Get a court-specific pacer url for various pages: query, login, logout, docket, document link, 
possible case* 104 | 105 | ```python 106 | get_pacer_url("ilnd", "query") 107 | >>> "https://ecf.ilnd.uscourts.gov/cgi-bin/iquery.pl" 108 | get_pacer_url("txed", "logout") 109 | >>> "https://ecf.txed.uscourts.gov/cgi-bin/login.pl?logout" 110 | ``` 111 | 112 | 113 | # Research Tools (`research_tools.py`) 114 | ## Docket Searcher 115 | ### Description 116 | The docket searcher is a tool to analyze case dockets for events/patterns and build a table of observations. 117 | The tool takes a collection of docket reports, and for each line of each docket report it does the following: 118 | 1. Checks if the text of the docket line matches **basic criteria**. This can be one of two ways: 119 | - The docket line matches the *wide_net* 120 | - The docket line matches the *docket_line_fn* function 121 | 2. If so, checks the line for a variety of patterns (patterns, computed_attrs) and use this to build a row for the result set. 122 | 123 | ### Usage 124 | 125 | ```python 126 | docket_searcher(case_paths, outfile, wide_net, patterns, 127 | computed_attrs={}, rlim=None, line_match_fn=None) 128 | ``` 129 | - **case_paths** (list): a list of filepaths to case data (.json files) that are relative to the project root 130 | - **outfile** (str): the output file (.csv) 131 | - **wide_net** (list): a list of regex patterns 132 | - **patterns** (dict): a dictionary of regex patterns with (variable_name, pattern) pairs 133 | - **computed_attrs** (dict): a dictionary of (variable_name, function) pairs. The functions take two arguments (*docket_line*, *case*) where the *docket_line* is a list and *case* is a parsed case json 134 | - **rlim** (int): a right limit to narrow search within docket entry text 135 | - **line_match_fn** (function): a function to use to instead to check if a line matches the basic criteria. The function takes two arguments (docket_line, case) as above. If line_match_fn is supplied it is used instead of *wide_net* to check basic criteria. 136 | 137 | 138 | ### Example 139 | ```python 140 | import research_tools as rt 141 | 142 | case_paths = [...] 143 | outfile = 'results_table.csv' 144 | wide_net = ['seal', 'protective'] 145 | 146 | patterns = { 147 | 'seal_motion':'(motion|order)( to)? seal', 148 | 'grant_part': 'granting in part motion to seal', 149 | 'deny_part' : 'denying in part motion to seal' 150 | } 151 | 152 | def date_diff(x,y): 153 | return (pd.Timestamp(x) - pd.TimeStamp(y)).days 154 | 155 | computed_attrs = { 156 | 'case_type': lambda dl,c: c['case_type'], 157 | 'days_from_filing': lambda dl,c: date_diff(dl[0], c['filing_date']) 158 | } 159 | 160 | rt.docket_searcher(case_paths, 'res_tab.csv', wide_net, 161 | patterns, computed_attrs) 162 | ``` 163 | 164 | */res_tab.csv* 165 | ``` 166 | fpath,ucid,court,judge,case_type,case_type,days_from_filing,seal_motion,grant_part,deny_part 167 | ,ilnd;;>,ilnd,Judge A,cr,0,1,0,0 168 | ,ilnd;;>,ilnd,Judge A,cr,12,0,1,1 169 | ,ilnd;;>,ilnd,Judge B,cr,2,1,0,0 170 | ,ilnd;;>,ilnd,Judge B,cr,7,0,1,0 171 | ,ilnd;;>,ilnd,Judge B,cr,8,0,0,1 172 | 173 | ``` 174 | 175 | ### Output 176 | The output file has a row for each docket line that meets the basic criteria. 
175 | ### Output 176 | The output file has a row for each docket line that meets the basic criteria. 177 | The output columns are (in the following order): 178 | 179 | - *ucid* 180 | - *court* 181 | - *judge* 182 | - *fpath* 183 | - *date*: the docket line date 184 | - *ind*: the index of the docket line, relative to the docket list in the case json 185 | - *text*: the first 100 characters of the docket line text 186 | 187 | Following the above are: 188 | - all columns generated by **computed_attrs** keys 189 | - all columns from **patterns** keys 190 | 191 | -------------------------------------------------------------------------------- /src/pacer_tools/code/support/research_tools.py: -------------------------------------------------------------------------------- 1 | import re 2 | import csv 3 | import sys 4 | from pathlib import Path 5 | 6 | sys.path.append(str(Path(__file__).resolve().parents[1])) 7 | from support import data_tools as dtools 8 | 9 | # Case-level metadata for results 10 | case_metadata = { 11 | 'ucid': lambda case: dtools.ucid(case['download_court'], case['case_id']), 12 | 'court': lambda case: case['download_court'], 13 | 'judge': lambda case: case['judge'], 14 | } 15 | 16 | def pattern_matcher(patterns, text_str): 17 | ''' 18 | Search for a group of patterns in the same string, return spans of matches 19 | 20 | Inputs: 21 | - patterns (dict): key-value pairs of pattern name to pattern value (regex pattern) 22 | - text_str (str): the text to search in 23 | Output: 24 | matches (dict): key-value pairs of (pattern name, match span), 25 | if there is a match the span is a tuple of integers, otherwise it is None 26 | ''' 27 | _get_span_ = lambda match: match.span() if match else None 28 | 29 | return {name: _get_span_(re.search(pattern, text_str,re.I)) for name,pattern in patterns.items()} 30 | 31 | def wide_net_match_line(docket_line, case, wide_net=[], wide_net_fn=None): 32 | ''' 33 | Check a single docket line for a wide net match, or use wide_net_fn if supplied 34 | Inputs: 35 | - docket_line (dict): a single docket entry (from the case's docket array) 36 | - case (json): The case json 37 | - wide_net (list): a list of regex patterns 38 | - wide_net_fn (function): a match function to use instead of wide_net, if supplied 39 | ''' 40 | if wide_net_fn is not None: 41 | return wide_net_fn(docket_line, case) 42 | else: 43 | full_pattern = '|'.join(f"({pat})" for pat in wide_net) 44 | return bool(re.search(full_pattern, docket_line['docket_text'], re.I)) 45 | 46 | def row_builder(docket_line, ind, case, fpath, patterns, computed_attrs={}, rlim=None): 47 | ''' 48 | Function to build an observation row of the result set. 
49 | 50 | Inputs: 51 | - docket_line (dict): The docket entry (with 'date_filed' and 'docket_text' fields, among others) 52 | - ind (int): index of docket_line (relative to dockets list in json) 53 | - case (json): The case json 54 | - fpath (str): file path 55 | - patterns (dict): a dictionary of pattern names and regex patterns 56 | - computed_attrs (dict): A dictionary with attribute names as keys, 57 | and functions taking docket_line and case as values 58 | e.g. {'is2020': lambda dl, c: dl['date_filed'].endswith('2020')} 59 | - rlim (int): right limit to search text 60 | Output: 61 | row (dict) 62 | ''' 63 | row = { 64 | # Case-level metadata 65 | **{k: fn(case) for k,fn in case_metadata.items()}, 66 | 'fpath': fpath, 67 | 'date': docket_line['date_filed'], 68 | 'ind': ind, 69 | 'text': docket_line['docket_text'][:100], 70 | # Computed attributes 71 | **{k: fn(docket_line, case) for k,fn in computed_attrs.items()}, 72 | # Pattern matches 73 | **pattern_matcher(patterns, docket_line['docket_text'][:rlim]), 74 | } 75 | return row 76 | 77 | def get_case_matches(fpath, patterns, wide_net, 78 | computed_attrs={}, rlim=None, wide_net_fn=None, skip_non_matches=False): 79 | ''' 80 | Process a case and return observation rows 81 | 82 | Output: 83 | (list) of observation rows (dicts) 84 | ''' 85 | 86 | case_rows = [] 87 | case = dtools.load_case(fpath) 88 | 89 | for ind, line in enumerate(case['docket']): 90 | 91 | if wide_net_match_line(line, case, wide_net, wide_net_fn): 92 | # Use row builder 93 | row = row_builder(docket_line=line, ind=ind, case=case, fpath=fpath, 94 | patterns=patterns, computed_attrs=computed_attrs, rlim=rlim) 95 | 96 | if skip_non_matches: 97 | # Only add row if at least one pattern match 98 | if not any(v for k,v in row.items() if k in patterns): 99 | continue 100 | 101 | case_rows.append(row) 102 | 103 | return case_rows 104 | 105 | def docket_searcher(case_paths, outfile, patterns, wide_net=[], computed_attrs={}, 106 | rlim=None, wide_net_fn=None, skip_non_matches=False): 107 | ''' 108 | Main function to build results set from criteria 109 | 110 | Inputs: 111 | - case_paths (iterable): list of filepaths 112 | - outfile (str or Path): path to output file (.csv) 113 | - patterns (dict): a dictionary of patterns 114 | - wide_net (list): a list of wide regex patterns to match on docket lines 115 | - computed_attrs (dict): a dictionary of computed attributes 116 | (named functions that take (docket_line, case) inputs) 117 | - rlim (int): right limit on characters in docket text to analyze 118 | - wide_net_fn (function): a function that takes (docket_line, case) arguments, 119 | where docket_line is a dict (from the case['docket'] array) and case is the case dict, 120 | and maps to a boolean; if supplied it will be used to decide on a row match instead of wide_net 121 | - skip_non_matches (bool): Useful for debugging/exploring, if true then 122 | rows that match the wide net but have no pattern matches are not written to outfile 123 | ''' 124 | if (not len(wide_net)) and (wide_net_fn is None): 125 | raise ValueError('Must supply either wide_net or wide_net_fn') 126 | 127 | # Get table column headers 128 | headers = [*case_metadata.keys(), 'fpath', 'date','ind', 'text', *computed_attrs.keys(), *patterns.keys()] 129 | 130 | # Open outfile for writing 131 | with open(outfile, 'w', encoding='utf-8') as rfile: 132 | writer = csv.writer(rfile) 133 | writer.writerow(headers) 134 | 135 | for fpath in case_paths: 136 | case_rows = get_case_matches(fpath, patterns, wide_net, computed_attrs, rlim, wide_net_fn, skip_non_matches) 137 | print(f" 
found {len(case_rows)} rows with matches") 138 | 139 | if len(case_rows): 140 | # Write to file 141 | for row_dict in case_rows: 142 | # Ensure ordered printing by headers 143 | #TODO: make this an append 144 | writer.writerow(row_dict[k] for k in headers) 145 | 146 | 147 | print(f'Docket Searcher complete, results located at {outfile}') 148 | 149 | def make_spacy_spans(row_series, pat_cols): 150 | ''' Convert a row from docket searcher output to a spaCy span-like output 151 | Inputs: 152 | - row_series (pd.Series): a pandas series/row 153 | - pat_cols (list): list of str of column names in row_series that are pattern columns 154 | Output: 155 | (list of dicts) with start, end, label keys 156 | 157 | Example: 158 | 159 | row_series = 160 | ucid ##### 161 | year ###### 162 | pat1 (10,15) 163 | pat2 (30,40) 164 | 165 | pat_cols = ['pat1', 'pat2'] 166 | 167 | output: [ 168 | {'start':10, 'end':15, 'label':'pat1'}, 169 | {'start':30, 'end':40, 'label':'pat2'} 170 | ] 171 | 172 | 173 | ''' 174 | return [{'start':int(v[0]), 'end':int(v[1]), 'label':k} for k,v in row_series[pat_cols].items() if v] 175 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/party_cr_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/party_cr_v1.schema.json", 4 | "type": "object", 5 | "properties": { 6 | "name": { 7 | "type": "string", 8 | "description": "Name of the party in the case" 9 | }, 10 | "counsel": { 11 | "type": "array", 12 | "description": "An array of the party's lawyers in the case", 13 | "items": { 14 | "type": "object", 15 | "properties": { 16 | "name": { 17 | "type": "string", 18 | "description": "Counsel name" 19 | }, 20 | "office_name": { 21 | "type": "string", 22 | "description": "Name of the counsel's firm or legal office" 23 | }, 24 | "address": { 25 | "type": "string", 26 | "description": "Counsel address, string delimited by '\\n' newline character" 27 | }, 28 | "phone": { 29 | "type": "string", 30 | "description": "Counsel phone number, e.g. 
(123) 456-7890" 31 | }, 32 | "fax": { 33 | "type": "null", 34 | "description": "Counsel fax number" 35 | }, 36 | "email": { 37 | "type": "string", 38 | "description": "Counsel email address" 39 | }, 40 | "is_lead_attorney": { 41 | "type": "boolean", 42 | "description": "Whether or not counsel is listed as lead attorney" 43 | }, 44 | "is_pro_hac_vice": { 45 | "type": "boolean", 46 | "description": "Whether or not counsel is listed as pro hac vice" 47 | }, 48 | "is_notice_attorney": { 49 | "type": "boolean", 50 | "description": "Whether or not counsel is listed as attorney to be noticed" 51 | }, 52 | "see_above_for_address": { 53 | "type": "boolean", 54 | "description": "Whether or not address is listed as 'see above for address', meaning address info should be obtained from preceding counsel entries" 55 | }, 56 | "designation": { 57 | "type": "string", 58 | "description": "This counsel's designation within the case (Retained, Government Attorney, Public Defender, etc)" 59 | }, 60 | "bar_status": { 61 | "type": "string", 62 | "description": "This counsel's standing with respect to the general bar (Admitted, Not Admitted, etc)" 63 | }, 64 | "trial_bar_status": { 65 | "type": "string", 66 | "description": "This counsel's standing with respect to the ILND trial bar, when applicable" 67 | }, 68 | "counsel_terminating_date": { 69 | "type": "string", 70 | "description": "The date (if any) that this counsel was terminated from the case" 71 | }, 72 | "raw_info": { 73 | "type": "string", 74 | "description": "The original value of the Pacer counsel field before being parsed out into the above SCALES fields" 75 | }, 76 | "recap_counsel_error": { 77 | "type": "boolean", 78 | "description": "A flag indicating that counsel information is missing due to a Recap error in which lawyers with identical names are merged incorrectly" 79 | } 80 | } 81 | } 82 | }, 83 | "is_pro_se": { 84 | "type": "boolean", 85 | "description": "Whether or not the party is appearing pro se, i.e. representing themselves" 86 | }, 87 | "pro_se_source": { 88 | "type": "string", 89 | "description": "The source that led us to believe this party is pro se ('explicit'=written out in the docket, 'implicit'=address-like info found for a lawyerless party)", 90 | "enum": ["explicit", "implicit"] 91 | }, 92 | "extra_pro_se_info": { 93 | "type": "string", 94 | "description":"For pro se parties, non-subheading party-related text that doesn't fit into counsel buckets (prisoner number, prison name, etc) - newline-delimited" 95 | }, 96 | "terminating_date": { 97 | "type": "string", 98 | "description":"The date (if any) that this party was terminated from the case" 99 | }, 100 | "extra_info": { 101 | "type": "string", 102 | "description":"Any Pacer subheadings for this party (alt names, corporation types, the capacity in which they're appearing, etc) - newline-delimited" 103 | }, 104 | "judge": { 105 | "type": "string", 106 | "description": "The defendant-specific assigned judge" 107 | }, 108 | "appeals_case_ids": { 109 | "type": "array", 110 | "description": "The case ids of any defendant-specific appeals issuing from this case", 111 | "items": { 112 | "type": "string" 113 | } 114 | }, 115 | "role": { 116 | "type": "string", 117 | "description": "This party's role in the case, as listed in their Pacer heading (e.g. 
'Defendant', 'Plaintiff', 'Petitioner', 'Appellant'...)" 118 | }, 119 | "party_type": { 120 | "type": "string", 121 | "description": "The broad bucket in which this party's role belongs, ascertained via a hand-coded mapping of the role", 122 | "enum": ["defendant", "plaintiff", "misc", "other_party", "bk_party"] 123 | }, 124 | "pacer_id": { 125 | "type": "number", 126 | "description": "The intra-case defendant id - e.g. for a defendant listed as 'Jane Doe (2),' the Pacer id is 2" 127 | }, 128 | "referred_judges": { 129 | "type": "array", 130 | "description": "The defendant-specific referred judges listed after 'Referred to:'; only present when the case was referred to a second judge", 131 | "items": { 132 | "type": "string" 133 | } 134 | }, 135 | "pending_counts": { 136 | "type": "array", 137 | "description": "", 138 | "items": { 139 | "$ref": "#/$defs/count" 140 | } 141 | }, 142 | "terminated_counts": { 143 | "type": "array", 144 | "description": "", 145 | "items": { 146 | "$ref": "#/$defs/count" 147 | } 148 | }, 149 | "highest_offense_level_opening": { 150 | "type": "string", 151 | "description": "The degree of the most serious charges against this defendant at the start of the case (felony, misdemeanor, etc)" 152 | }, 153 | "highest_offense_level_terminated": { 154 | "type": "string", 155 | "description": "The degree of the most serious charges against this defendant at the end of the case (felony, misdemeanor, etc)" 156 | }, 157 | "complaints_text": { 158 | "type": "string", 159 | "description": "The primary criminal complaints against this defendant (sometimes supplements the count fields, sometimes duplicates them, and sometimes stands in for them)" 160 | }, 161 | "complaints_disposition": { 162 | "type": "string", 163 | "description": "The disposition of the complaints (prison time, dismissal, etc)" 164 | }, 165 | "recap_party_error": { 166 | "type": "boolean", 167 | "description": "A flag indicating that party information is missing due to a Recap error in which parties with identical names are merged incorrectly" 168 | } 169 | }, 170 | "$defs": { 171 | "count": { 172 | "type": "object", 173 | "description": "A count against a defendant", 174 | "properties": { 175 | "pacer_id": { 176 | "type": "string", 177 | "description": "The intra-defendant count id - e.g. for a count listed as 'FRAUD AND FALSE STATEMENTS(1s-2s),' the Pacer id is 1s-2s" 178 | }, 179 | "text": { 180 | "type": "string", 181 | "description": "The text of the count (e.g. 
'FRAUD AND FALSE STATEMENTS')" 182 | }, 183 | "disposition": { 184 | "type": "string", 185 | "description": "The disposition of the count (prison time, dismissal, etc)" 186 | } 187 | } 188 | } 189 | } 190 | 191 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *In addition to this documentation page, you may also find SCALES's [documentation site](https://docs.scales-okn.org/) helpful, particularly the pages concerning our scraper and parser.* 2 | 3 | # Table of Contents 4 | * [Scraper Tutorial](README.md#scraper-tutorial) 5 | 1) [Getting Started](README.md#1-getting-started) 6 | 2) [Pacer credentials](README.md#2-pacer-credentials) 7 | 3) [Query Scraper](README.md#3-query-scraper) 8 | 4) [Docket Scraper](README.md#4-docket-scraper) 9 | 5) [Document Scraper](README.md#5-document-scraper) 10 | * [Parser Tutorial](README.md#parser-tutorial) 11 | 12 | # Scraper Tutorial 13 | This is a tutorial on how to use the SCALES Scraper tool to download data from PACER. 14 | 15 | The scraper has three modules: 16 | 17 | 1. Query scraper: to download case queries 18 | 2. Docket Scraper: to download case dockets 19 | 3. Document Scraper: to download case documents 20 | 21 | This tutorial will step through the basics of getting set up with the scraping tool and using each module. For more details, see the documentation [here](src/pacer_tools/code/downloader/README.md) 22 | 23 | The tutorial uses the PACER demo site located here: https://dcecf.psc.uscourts.gov/cgi-bin/ShowIndex.pl 24 | 25 | This is a demo version of PACER with demo credentials that can be used for free. It contains a sample of cases from Western District of New York (*nywd*). However because it runs on its own domain with *psc* (PACER Service Center) instead of on the *nywd* PACER site, we will use **psc** as the court abbreviation for this tutorial. 26 | 27 | ## 1. Getting Started 28 | 29 | - Install the package: `pip install pacer-tools` 30 | - Make sure you have a recent version of Firefox installed (80.0+) and [GeckoDriver](https://github.com/mozilla/geckodriver) for Firefox 31 | 32 | **Download folder** 33 | For this tutorial we are going to use the resources in the */demo* directory of this repo and will put our data into */demo/pacer*. The scraper separates out data by district, so it's best to have a subdirectory for each district, named by court abbreviation (e.g. *demo/pacer/ilnd* for Northern District of Illinois). When the scraper runs it will build the necessary structure inside of that subdirectory that it needs to download and house the data from Pacer. 34 | 35 | Since we are using the PACER demo, we will use the court abbreviation it uses which is *psc* (for PACER Service Centre). The scraper will take an `inpath` argument, to which we will pass *demo/pacer/psc*. 36 | 37 | ## 2. Pacer credentials 38 | For most use you will need to put your Pacer login details into a json file. For this tutorial we'll be using the Pacer training site with the login details contained in *demo/auth.json*. When you are running the scraper using your own credentials you can use that file as a template. 39 | 40 | ## 3. Query Scraper 41 | The first thing we'll do with the scraper is download some query results. There is a demo query located at *demo/query_conf.json*. This is a *.json* file that maps search criteria to fields in the Pacer query form. 
42 | To create your own query later you can use the query builder (see the documentation). 43 | 44 | Throughout this tutorial we will be using the scraper command from the PACER-tools command-line utility. Run `pacer-tools scraper` to see the full set of arguments. 45 | 46 | **Running script** 47 | 48 | To use the Query Scraper we just need to run the following: 49 | 50 | pacer-tools scraper --override-time --query-conf demo/query_conf.json demo/pacer/psc 51 | 52 | - The `--override-time` flag overrides the scraper's time restriction (by default it is designed to be run overnight). 53 | - The `--query-conf` option points the scraper to a json config file with the parameters for our query. 54 | 55 | The user will be prompted for the following: 56 | 57 | - **Mode**: for this step we want to choose *query* 58 | - **Court**: for the demo site the court abbreviation we want to enter is *psc* 59 | - **Auth path**: This is the relative path to our PACER login credentials. For this tutorial the demo credentials are in *demo/auth.json* 60 | - **Case limit**: This limits the number of cases downloaded in a single session, to prevent accidental overspending on PACER. For this example let's just enter 50. 61 | 62 | *Note*: 63 | *All of the parameters that the user was prompted for can also be given as arguments to the script. These are all explained in full in the documentation. To avoid the prompting you can instead run:* 64 | 65 | pacer-tools scraper --override-time --query-conf demo/query_conf.json -m query -c psc -a demo/auth.json -cl 50 demo/pacer/psc 66 | 67 | 68 | **Result** 69 | Once these values have all been input, the scraper should launch and download the query results. You should see the following message in the terminal: 70 | 71 | > Query results saved to /psc/queries 72 | 73 | If you navigate to the *psc* folder you will see firstly that a few subfolders have been created to house the data, and secondly that within the *queries* folder there should be a *.html* file that contains the query results. 74 | 75 | 76 | ## 4. Docket Scraper 77 | Next we will take that query results file and download all of the dockets for the listed cases. The Docket Scraper module can take a *.html* query file, which we have just downloaded, as its input. 78 | 79 | **Running script** 80 | To use the Docket Scraper we will run the following: 81 | 82 | pacer-tools scraper -m docket --docket-input demo/pacer/psc/queries/<query_file>.html -c psc -a demo/auth.json -cl 50 --override-time demo/pacer/psc 83 | 84 | - The `--docket-input` option takes the path to the query file. The actual name of the query file (`<query_file>`) will vary, as it includes a timestamp. 85 | 86 | The Docket Scraper (as well as the Document Scraper, which we will look at next) runs asynchronously across multiple Firefox instances, by default two. The number of instances (workers) can be adjusted with the `n-workers` option (see the documentation, and the sketch at the end of this section). 87 | 88 | *Note: the scraper only keeps the civil and criminal cases; to download a specific case type you can use the `--case-type` option.* 89 | 90 | **Result** 91 | Once both browsers have finished and closed, all of the cases from the query results file should be downloaded and can be found in *demo/pacer/psc/html*. 92 | 
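To speed up large docket runs you can raise the worker count mentioned above; a sketch, assuming the option is spelled `--n-workers` on the command line (run `pacer-tools scraper` to confirm the exact spelling):

    pacer-tools scraper -m docket --docket-input demo/pacer/psc/queries/<query_file>.html -c psc -a demo/auth.json -cl 50 --override-time --n-workers 4 demo/pacer/psc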
93 | 94 | 95 | ## 5. Document Scraper 96 | Lastly, we will get the actual documents associated with the docket lines of the cases. The Document Scraper can take a few different types of inputs, including a directory of docket *.html* files (in which case all documents and attachments for every case are downloaded by default); for this tutorial we will instead give it a *.csv* file (*demo/document_input.csv*) listing the specific case and documents to retrieve. 97 | 98 | **Running script** 99 | To use the Document Scraper we run the following: 100 | 101 | pacer-tools scraper -m document -c psc -a demo/auth.json -cl 50 --override-time --document-input demo/document_input.csv demo/pacer/psc 102 | 103 | - There is a default limit of 1000 documents per case. Any case that has more than 1000 documents will be skipped. This limit can be changed with the `--document-limit` option. 104 | 105 | **Result** 106 | The Document Scraper will usually take significantly longer to run than the Docket Scraper, given the volume of documents in most cases. Once the documents have finished downloading they can be found in the *demo/pacer/psc/docs* folder. 107 | 108 | 109 | **Attachments and specific documents** 110 | 111 | 112 | - To skip docket line attachments you can use the `--no-document-att` flag. 113 | - To get specific documents from specific cases, you can use the `--document-input` option to pass a *.csv* file with case ids and the specific documents to retrieve; see the documentation for more. 114 | 115 | 116 | To see more specifics, options, and use cases check out the detailed documentation [here](src/pacer_tools/code/downloader/README.md). 117 | 118 | # Parser Tutorial 119 | 120 | This short section explains how to use the SCALES Parser tool to read HTMLs downloaded from Pacer and convert them into JSON format. The parser takes as its input the results of running the [docket scraper](README.md#4-docket-scraper) - namely, a folder of HTMLs. 121 | 122 | **Running script** 123 | To use the parser on the HTMLs from the docket scraper in the previous tutorial, we will simply run the following: 124 | 125 | pacer-tools parser demo/pacer/psc/html 126 | 127 | **Result** 128 | Once the parser has finished, all the parsed versions of the HTML files can be found in *demo/pacer/psc/json*. 129 | 130 | To see more specifics, options, and details on the JSON schema, check out the detailed documentation [here](src/pacer_tools/code/parsers/README.md). 
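As a quick sanity check on the parser output, you can load one of the JSONs directly; a minimal sketch (the filename is hypothetical - substitute any file the parser wrote; the fields shown follow this repo's case schemas):

```python
import json

# Hypothetical filename - substitute any file from demo/pacer/psc/json
with open('demo/pacer/psc/json/1-07-cv-00431.json') as f:
    case = json.load(f)

print(case['ucid'], case['case_type'], case['filing_date'])
print(len(case['docket']), 'docket entries,', len(case['parties']), 'parties')
```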
131 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/case_cv_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/case_cv_v1.schema.json", 4 | "title": "Civil Case Schema", 5 | "description": "A schema for a PACER civil court case docket report" , 6 | "properties": { 7 | "docket": { 8 | "type": "array", 9 | "description": "Ordered array of entries in the case docket", 10 | "items": { 11 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/docket_entry_v1.schema.json" 12 | } 13 | }, 14 | "parties": { 15 | "type": "array", 16 | "description": "Parties involved in the case", 17 | "items": { 18 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/party_cv_v1.schema.json" 19 | } 20 | }, 21 | "case_id": { 22 | "type": "string", 23 | "description": "Pacer's case ID, which has the form O:YY-TY-##### (where O is a court office code, YY is a year, TY is the case type, and ##### is a numeric identifier associated with this case)" 24 | }, 25 | "case_type": { 26 | "type": "string", 27 | "description": "Abbreviation for the case type ('cv' for civil, 'cr' for criminal, etc)" 28 | }, 29 | "court": { 30 | "type": "string", 31 | "description": "The (lowercase) Pacer court abbreviation (e.g. 'ilnd' for Northern District of Illinois)" 32 | }, 33 | "ucid": { 34 | "type": "string", 35 | "description":"SCALES's case ID (stands for 'unique case id'), generated by connecting the court abbreviation to the Pacer case ID, delimited by a double semicolon (e.g. 'ilnd;;1:16-cv-00001)" 36 | }, 37 | "city": { 38 | "type": "string", 39 | "description": "The city/division within the district where the case appeared; comes from the parenthesis in the header of the docket at the end of the court district (e.g. 
for 'Northern District of Illinois - CM/ECF LIVE, Ver 6.3.1 (Chicago)', the city is 'Chicago')" 40 | }, 41 | "header_case_id": { 42 | "type": "string", 43 | "description": "Similar to case_id, but pulled from the docket itself rather than the filepath; sometimes contains extra information like judge initials and 'All Defendants'" 44 | }, 45 | "filing_date": { 46 | "type": "string", 47 | "description": "The date the case was filed - format: MM/DD/YYYY" 48 | }, 49 | "terminating_date": { 50 | "type": "string", 51 | "description": "The date the case was terminated - format: MM/DD/YYYY" 52 | }, 53 | "case_status": { 54 | "type": "string", 55 | "description": "This will be 'closed' if a terminating date is listed, else 'open'", 56 | "enum": ["open", "closed"] 57 | }, 58 | "judge": { 59 | "type": "string", 60 | "description": "The assigned judge on the case (this is the raw string from the docket header); for criminal cases, this is zeroed out and superseded by defendant-specific judges" 61 | }, 62 | "referred_judges": { 63 | "type": "array", 64 | "description": "The referred judges on the case (only present when the case was referred to a second judge); for criminal cases, this is zeroed out and superseded by defendant-specific referred judges", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "nature_suit": { 70 | "type": "string", 71 | "description": "The nature of suit for the case (this is matched to the list of known NOS where possible for uniformity of spelling/case, and otherwise left as the raw extracted string)" 72 | }, 73 | "jury_demand": { 74 | "type": "string", 75 | "description": "The jury demand specified in the case header" 76 | }, 77 | "cause": { 78 | "type": "string", 79 | "description": "The cause specified in the case header" 80 | }, 81 | "jurisdiction": { 82 | "type": "string", 83 | "description": "The jurisdiction of the case (e.g. 'Federal Question')" 84 | }, 85 | "monetary_demand": { 86 | "type": "string", 87 | "description": "The monetary demand specified in the case header" 88 | }, 89 | "lead_case_pacer_id": { 90 | "type": "string", 91 | "description": "The internal Pacer id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 92 | }, 93 | "lead_case_id": { 94 | "type": "string", 95 | "description": "The case id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 96 | }, 97 | "magistrate_case_ids": { 98 | "type": "array", 99 | "description": "The case ids of any magistrate judge cases (case type 'mj') issuing from this case", 100 | "items": { 101 | "type": "string" 102 | } 103 | }, 104 | "related_cases": { 105 | "type": "array", 106 | "description": "Any case ids listed as 'related cases' in this case's header", 107 | "items": { 108 | "type": "string" 109 | } 110 | }, 111 | "other_courts": { 112 | "type": "array", 113 | "description": "Case IDs provided by Pacer as 'Case in other court'; doesn't pick up all alternate case IDs (e.g. 
appeals court case numbers)", 114 | "items": { 115 | "type": "string" 116 | } 117 | }, 118 | "filed_in_error_text": { 119 | "type": "string", 120 | "description": "The text line in the header, if any, that indicates this case was erroneously filed ('incorrectly filed,' 'not used,' 'do not docket,' etc)" 121 | }, 122 | "case_flags": { 123 | "type": "array", 124 | "description": "The flags in the top right-hand corner of the case docket", 125 | "items": { 126 | "type": "string" 127 | } 128 | }, 129 | "appeals_case_ids": { 130 | "type": "array", 131 | "description": "The case ids of any defendant-specific appeals issuing from this case; for criminal cases, this is zeroed out and superseded by defendant-specific appeals case ids", 132 | "items": { 133 | "type": "string" 134 | } 135 | }, 136 | "case_name": { 137 | "type": "string", 138 | "description": "The title of the case (e.g. 'USA v. Doe')" 139 | }, 140 | "docket_available": { 141 | "type": "boolean", 142 | "description": "True if html docket data is available for this case; False if the source for this case was pre-parsed data (e.g. from Recap)" 143 | }, 144 | "member_case_key": { 145 | "type": "string", 146 | "description": "a UCID-formatted version of lead_case_id (if this case is the lead case, this field will match the 'ucid' field)" 147 | }, 148 | "mdl_code": { 149 | "type": "number", 150 | "description": "The MDL code that this case is part of, if applicable" 151 | }, 152 | "mdl_id_source": { 153 | "type": "string", 154 | "description": "The source that led us to believe this case is part of an MDL", 155 | "enum": ["lead_case_id", "flag"] 156 | }, 157 | "is_mdl": { 158 | "type": "boolean", 159 | "description": "Whether or not this case is part of an MDL; this field will be True if an MDL code was found, or if there is a case flag that starts with 'MDL'" 160 | }, 161 | "is_multi": { 162 | "type": "boolean", 163 | "description": "True if this case is part of an MDL, if it has a lead case id, if a list of member cases appears in the header, or if it has appeared in another court" 164 | }, 165 | "billable_pages": { 166 | "type": "number", 167 | "description": "The number of billable pages on Pacer for this docket report" 168 | }, 169 | "cost": { 170 | "type": "number", 171 | "description": "The cost (in $) of downloading the docket report from Pacer" 172 | }, 173 | "download_timestamp": { 174 | "type": "string", 175 | "description": "The time of download of this case from Pacer - format: MM/DD/YYYY hh:mm:ss" 176 | }, 177 | "n_docket_reports": { 178 | "type": "number", 179 | "description": "The number of docket reports (HTML) used to generate this JSON; will be 1 if only 1 docket report, or >1 if the case was pieced together from multiple updates (i.e. 
multiple HTMLs)" 180 | }, 181 | "source": { 182 | "type": "string", 183 | "description": "A comma-delimited list of sources for this data, generally 'pacer' or 'recap'" 184 | }, 185 | "recap_id": { 186 | "type": "number", 187 | "description": "The Recap id for this case, if applicable" 188 | }, 189 | "download_url": { 190 | "type": "string", 191 | "description": "The url on Pacer from which the docket was downloaded" 192 | }, 193 | "case_pacer_id": { 194 | "type": "string", 195 | "description": "The unique numerical id that Pacer uses internally to identify this document (pulled from Pacer's XML responses to user queries; not visible on the docket sheet itself)" 196 | }, 197 | "summary": { 198 | "type": "object", 199 | "description": "Data from the case summary, if available", 200 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/summary_cv_v1.schema.json" 201 | } 202 | } 203 | } -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/schemas/case_cr_v1.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "$id": "http://github.com/scales-okn/PACER-tools/schemas/case_cr_v1.schema.json", 4 | "title": "Civil Case Schema", 5 | "description": "A schema for a PACER criminal court case docket report" , 6 | "properties": { 7 | "docket": { 8 | "type": "array", 9 | "description": "Ordered array of entries in the case docket", 10 | "items": { 11 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/docket_entry_v1.schema.json" 12 | } 13 | }, 14 | "parties": { 15 | "type": "array", 16 | "description": "Parties involved in the case", 17 | "items": { 18 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/party_cr_v1.schema.json" 19 | } 20 | }, 21 | "case_id": { 22 | "type": "string", 23 | "description": "Pacer's case ID, which has the form O:YY-TY-##### (where O is a court office code, YY is a year, TY is the case type, and ##### is a numeric identifier associated with this case)" 24 | }, 25 | "case_type": { 26 | "type": "string", 27 | "description": "Abbreviation for the case type ('cv' for civil, 'cr' for criminal, etc)" 28 | }, 29 | "court": { 30 | "type": "string", 31 | "description": "The (lowercase) Pacer court abbreviation (e.g. 'ilnd' for Northern District of Illinois)" 32 | }, 33 | "ucid": { 34 | "type": "string", 35 | "description":"SCALES's case ID (stands for 'unique case id'), generated by connecting the court abbreviation to the Pacer case ID, delimited by a double semicolon (e.g. 'ilnd;;1:16-cv-00001)" 36 | }, 37 | "city": { 38 | "type": "string", 39 | "description": "The city/division within the district where the case appeared; comes from the parenthesis in the header of the docket at the end of the court district (e.g. 
for 'Northern District of Illinois - CM/ECF LIVE, Ver 6.3.1 (Chicago)', the city is 'Chicago')" 40 | }, 41 | "header_case_id": { 42 | "type": "string", 43 | "description": "Similar to case_id, but pulled from the docket itself rather than the filepath; sometimes contains extra information like judge initials and 'All Defendants'" 44 | }, 45 | "filing_date": { 46 | "type": "string", 47 | "description": "The date the case was filed - format: MM/DD/YYYY" 48 | }, 49 | "terminating_date": { 50 | "type": "string", 51 | "description": "The date the case was terminated - format: MM/DD/YYYY" 52 | }, 53 | "case_status": { 54 | "type": "string", 55 | "description": "This will be 'closed' if a terminating date is listed, else 'open'", 56 | "enum": ["open", "closed"] 57 | }, 58 | "judge": { 59 | "type": "string", 60 | "description": "The assigned judge on the case (this is the raw string from the docket header); for criminal cases, this is zeroed out and superseded by defendant-specific judges" 61 | }, 62 | "referred_judges": { 63 | "type": "array", 64 | "description": "The referred judges on the case (only present when the case was referred to a second judge); for criminal cases, this is zeroed out and superseded by defendant-specific referred judges", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "nature_suit": { 70 | "type": "string", 71 | "description": "The nature of suit for the case (this is matched to the list of known NOS where possible for uniformity of spelling/case, and otherwise left as the raw extracted string)" 72 | }, 73 | "jury_demand": { 74 | "type": "string", 75 | "description": "The jury demand specified in the case header" 76 | }, 77 | "cause": { 78 | "type": "string", 79 | "description": "The cause specified in the case header" 80 | }, 81 | "jurisdiction": { 82 | "type": "string", 83 | "description": "The jurisdiction of the case (e.g. 'Federal Question')" 84 | }, 85 | "monetary_demand": { 86 | "type": "string", 87 | "description": "The monetary demand specified in the case header" 88 | }, 89 | "lead_case_pacer_id": { 90 | "type": "string", 91 | "description": "The internal Pacer id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 92 | }, 93 | "lead_case_id": { 94 | "type": "string", 95 | "description": "The case id of the lead case, when the case is part of multi-district litigation (MDL) or a consolidated case" 96 | }, 97 | "magistrate_case_ids": { 98 | "type": "array", 99 | "description": "The case ids of any magistrate judge cases (case type 'mj') issuing from this case", 100 | "items": { 101 | "type": "string" 102 | } 103 | }, 104 | "related_cases": { 105 | "type": "array", 106 | "description": "Any case ids listed as 'related cases' in this case's header", 107 | "items": { 108 | "type": "string" 109 | } 110 | }, 111 | "other_courts": { 112 | "type": "array", 113 | "description": "Case IDs provided by Pacer as 'Case in other court'; doesn't pick up all alternate case IDs (e.g. 
appeals court case numbers)", 114 | "items": { 115 | "type": "string" 116 | } 117 | }, 118 | "filed_in_error_text": { 119 | "type": "string", 120 | "description": "The text line in the header, if any, that indicates this case was erroneously filed ('incorrectly filed,' 'not used,' 'do not docket,' etc)" 121 | }, 122 | "case_flags": { 123 | "type": "array", 124 | "description": "The flags in the top right-hand corner of the case docket", 125 | "items": { 126 | "type": "string" 127 | } 128 | }, 129 | "appeals_case_ids": { 130 | "type": "array", 131 | "description": "The case ids of any defendant-specific appeals issuing from this case; for criminal cases, this is zeroed out and superseded by defendant-specific appeals case ids", 132 | "items": { 133 | "type": "string" 134 | } 135 | }, 136 | "case_name": { 137 | "type": "string", 138 | "description": "The title of the case (e.g. 'USA v. Doe')" 139 | }, 140 | "docket_available": { 141 | "type": "boolean", 142 | "description": "True if html docket data is available for this case; False if the source for this case was pre-parsed data (e.g. from Recap)" 143 | }, 144 | "member_case_key": { 145 | "type": "string", 146 | "description": "a UCID-formatted version of lead_case_id (if this case is the lead case, this field will match the 'ucid' field)" 147 | }, 148 | "mdl_code": { 149 | "type": "number", 150 | "description": "The MDL code that this case is part of, if applicable" 151 | }, 152 | "mdl_id_source": { 153 | "type": "string", 154 | "description": "The source that led us to believe this case is part of an MDL", 155 | "enum": ["lead_case_id", "flag"] 156 | }, 157 | "is_mdl": { 158 | "type": "boolean", 159 | "description": "Whether or not this case is part of an MDL; this field will be True if an MDL code was found, or if there is a case flag that starts with 'MDL'" 160 | }, 161 | "is_multi": { 162 | "type": "boolean", 163 | "description": "True if this case is part of an MDL, if it has a lead case id, if a list of member cases appears in the header, or if it has appeared in another court" 164 | }, 165 | "billable_pages": { 166 | "type": "number", 167 | "description": "The number of billable pages on Pacer for this docket report" 168 | }, 169 | "cost": { 170 | "type": "number", 171 | "description": "The cost (in $) of downloading the docket report from Pacer" 172 | }, 173 | "download_timestamp": { 174 | "type": "string", 175 | "description": "The time of download of this case from Pacer - format: MM/DD/YYYY hh:mm:ss" 176 | }, 177 | "n_docket_reports": { 178 | "type": "number", 179 | "description": "The number of docket reports (HTML) used to generate this JSON; will be 1 if only 1 docket report, or >1 if the case was pieced together from multiple updates (i.e. 
multiple HTMLs)" 180 | }, 181 | "source": { 182 | "type": "string", 183 | "description": "A comma-delimited list of sources for this data, generally 'pacer' or 'recap'" 184 | }, 185 | "recap_id": { 186 | "type": "number", 187 | "description": "The Recap id for this case, if applicable" 188 | }, 189 | "download_url": { 190 | "type": "string", 191 | "description": "The url on Pacer from which the docket was downloaded" 192 | }, 193 | "case_pacer_id": { 194 | "type": "string", 195 | "description": "The unique numerical id that Pacer uses internally to identify this document (pulled from Pacer's XML responses to user queries; not visible on the docket sheet itself)" 196 | }, 197 | "summary": { 198 | "type": "object", 199 | "description": "Data from the case summary, if available", 200 | "$ref": "http://github.com/scales-okn/PACER-tools/schemas/summary_cr_v1.schema.json" 201 | } 202 | } 203 | } -------------------------------------------------------------------------------- /src/pacer_tools/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: osx-64 4 | _ipyw_jlab_nb_ext_conf=0.1.0=py37_0 5 | alabaster=0.7.12=py37_0 6 | anaconda=2020.02=py37_0 7 | anaconda-client=1.7.2=py37_0 8 | anaconda-navigator=1.9.12=py37_0 9 | anaconda-project=0.8.4=py_0 10 | applaunchservices=0.2.1=py_0 11 | appnope=0.1.0=py37_0 12 | appscript=1.1.0=py37h1de35cc_0 13 | argh=0.26.2=py37_0 14 | asn1crypto=1.3.0=py37_0 15 | astroid=2.3.3=py37_0 16 | astropy=4.0=py37h1de35cc_0 17 | atomicwrites=1.3.0=py37_1 18 | attrs=19.3.0=py_0 19 | autopep8=1.4.4=py_0 20 | babel=2.8.0=py_0 21 | backcall=0.1.0=py37_0 22 | backports=1.0=py_2 23 | backports.functools_lru_cache=1.6.1=py_0 24 | backports.shutil_get_terminal_size=1.0.0=py37_2 25 | backports.tempfile=1.0=py_1 26 | backports.weakref=1.0.post1=py_1 27 | beautifulsoup4=4.9.3=pyhb0f4dca_0 28 | bitarray=1.2.1=py37h1de35cc_0 29 | bkcharts=0.2=py37_0 30 | blas=1.0=mkl 31 | bleach=3.1.0=py37_0 32 | blosc=1.16.3=hd9629dc_0 33 | bokeh=1.4.0=py37_0 34 | boto=2.49.0=py37_0 35 | bottleneck=1.3.2=py37h776bbcc_0 36 | brotlipy=0.7.0=py38h9ed2024_1003 37 | bzip2=1.0.8=h1de35cc_0 38 | ca-certificates=2020.10.14=0 39 | certifi=2020.6.20=pyhd3eb1b0_3 40 | cffi=1.14.3=py38h2125817_2 41 | chardet=3.0.4=py37_1003 42 | click=7.1.2=py_0 43 | cloudpickle=1.3.0=py_0 44 | clyent=1.2.2=py37_1 45 | colorama=0.4.3=py_0 46 | conda=4.8.4=py37_0 47 | conda-build=3.18.11=py37_0 48 | conda-env=2.6.0=1 49 | conda-package-handling=1.6.0=py37h1de35cc_0 50 | conda-verify=3.4.2=py_1 51 | contextlib2=0.6.0.post1=py_0 52 | cryptography=3.2.1=py38hbcfaee0_1 53 | curl=7.68.0=ha441bb4_0 54 | cycler=0.10.0=py37_0 55 | cython=0.29.15=py37h0a44026_0 56 | cytoolz=0.10.1=py37h1de35cc_0 57 | dask=2.11.0=py_0 58 | dask-core=2.11.0=py_0 59 | dbus=1.13.12=h90a0687_0 60 | decorator=4.4.1=py_0 61 | defusedxml=0.6.0=py_0 62 | diff-match-patch=20181111=py_0 63 | distributed=2.11.0=py37_0 64 | docutils=0.16=py37_0 65 | entrypoints=0.3=py37_0 66 | et_xmlfile=1.0.1=py37_0 67 | expat=2.2.6=h0a44026_0 68 | fastcache=1.1.0=py37h1de35cc_0 69 | filelock=3.0.12=py_0 70 | flake8=3.7.9=py37_0 71 | flask=1.1.1=py_0 72 | freetype=2.9.1=hb4e5f40_0 73 | fsspec=0.6.2=py_0 74 | future=0.18.2=py37_0 75 | get_terminal_size=1.0.0=h7520d66_0 76 | gettext=0.19.8.1=h15daf44_3 77 | gevent=1.4.0=py37h1de35cc_0 78 | glib=2.63.1=hd977a24_0 79 | glob2=0.7=py_0 80 | gmp=6.1.2=hb37e062_1 81 | 
gmpy2=2.0.8=py37h6ef4df4_2 82 | greenlet=0.4.15=py37h1de35cc_0 83 | h5py=2.10.0=py37h3134771_0 84 | hdf5=1.10.4=hfa1e0ec_0 85 | heapdict=1.0.1=py_0 86 | html5lib=1.0.1=py37_0 87 | hypothesis=5.5.4=py_0 88 | icu=58.2=h4b95b61_1 89 | idna=2.10=py_0 90 | imageio=2.6.1=py37_0 91 | imagesize=1.2.0=py_0 92 | importlib_metadata=1.5.0=py37_0 93 | intel-openmp=2019.4=233 94 | intervaltree=3.0.2=py_0 95 | ipykernel=5.1.4=py37h39e3cac_0 96 | ipython=7.12.0=py37h5ca1d4c_0 97 | ipython_genutils=0.2.0=py37_0 98 | ipywidgets=7.5.1=py_0 99 | isort=4.3.21=py37_0 100 | itsdangerous=1.1.0=py37_0 101 | jbig=2.1=h4d881f8_0 102 | jdcal=1.4.1=py_0 103 | jedi=0.14.1=py37_0 104 | jinja2=2.11.1=py_0 105 | joblib=0.14.1=py_0 106 | jpeg=9b=he5867d9_2 107 | json5=0.9.1=py_0 108 | jsonschema=3.2.0=py37_0 109 | jupyter=1.0.0=py37_7 110 | jupyter_client=5.3.4=py37_0 111 | jupyter_console=6.1.0=py_0 112 | jupyter_core=4.6.1=py37_0 113 | jupyterlab=1.2.6=pyhf63ae98_0 114 | jupyterlab_server=1.0.6=py_0 115 | keyring=21.1.0=py37_0 116 | kiwisolver=1.1.0=py37h0a44026_0 117 | krb5=1.17.1=hddcf347_0 118 | lazy-object-proxy=1.4.3=py37h1de35cc_0 119 | libarchive=3.3.3=h786848e_5 120 | libcurl=7.68.0=h051b688_0 121 | libcxx=10.0.0=1 122 | libcxxabi=4.0.1=hcfea43d_1 123 | libedit=3.1.20191231=h1de35cc_1 124 | libffi=3.3=hb1e8313_2 125 | libgfortran=3.0.1=h93005f0_2 126 | libiconv=1.15=hdd342a3_7 127 | liblief=0.9.0=h2a1bed3_2 128 | libpng=1.6.37=ha441bb4_0 129 | libsodium=1.0.16=h3efe00b_0 130 | libspatialindex=1.9.3=h0a44026_0 131 | libssh2=1.8.2=ha12b0ac_0 132 | libtiff=4.1.0=hcb84e12_0 133 | libxml2=2.9.9=hf6e021a_1 134 | libxslt=1.1.33=h33a18ac_0 135 | llvm-openmp=4.0.1=hcfea43d_1 136 | llvmlite=0.31.0=py37h1341992_0 137 | locket=0.2.0=py37_1 138 | lxml=4.5.0=py37hef8c89e_0 139 | lz4-c=1.8.1.2=h1de35cc_0 140 | lzo=2.10=h362108e_2 141 | markupsafe=1.1.1=py37h1de35cc_0 142 | matplotlib=3.1.3=py37_0 143 | matplotlib-base=3.1.3=py37h9aa3819_0 144 | mccabe=0.6.1=py37_1 145 | mistune=0.8.4=py37h1de35cc_0 146 | mkl=2019.4=233 147 | mkl-service=2.3.0=py38hfbe908c_0 148 | mkl_fft=1.2.0=py38hc64f4ea_0 149 | mkl_random=1.1.1=py38h959d312_0 150 | mock=4.0.1=py_0 151 | more-itertools=8.2.0=py_0 152 | mpc=1.1.0=h6ef4df4_1 153 | mpfr=4.0.1=h3018a27_3 154 | mpmath=1.1.0=py37_0 155 | msgpack-python=0.6.1=py37h04f5b5a_1 156 | multipledispatch=0.6.0=py37_0 157 | navigator-updater=0.2.1=py37_0 158 | nbconvert=5.6.1=py37_0 159 | nbformat=5.0.4=py_0 160 | ncurses=6.2=h0a44026_1 161 | networkx=2.4=py_0 162 | nltk=3.4.5=py37_0 163 | nose=1.3.7=py37_2 164 | notebook=6.0.3=py37_0 165 | numba=0.48.0=py37h6c726b0_0 166 | numexpr=2.7.1=py37hce01a72_0 167 | numpy=1.19.2=py38h456fd55_0 168 | numpy-base=1.19.2=py38hcfb5961_0 169 | numpydoc=0.9.2=py_0 170 | olefile=0.46=py37_0 171 | openpyxl=3.0.3=py_0 172 | openssl=1.1.1h=haf1e3a3_0 173 | packaging=20.1=py_0 174 | pandas=1.1.3=py38hb1e8313_0 175 | pandoc=2.2.3.2=0 176 | pandocfilters=1.4.2=py37_1 177 | parso=0.5.2=py_0 178 | partd=1.1.0=py_0 179 | path=13.1.0=py37_0 180 | path.py=12.4.0=0 181 | pathlib2=2.3.5=py37_0 182 | pathtools=0.1.2=py_1 183 | patsy=0.5.1=py37_0 184 | pcre=8.43=h0a44026_0 185 | pep8=1.7.1=py37_0 186 | pexpect=4.8.0=py37_0 187 | pickleshare=0.7.5=py37_0 188 | pillow=7.0.0=py37h4655f20_0 189 | pip=20.2.4=py38hecd8cb5_0 190 | pkginfo=1.5.0.1=py37_0 191 | pluggy=0.13.1=py37_0 192 | ply=3.11=py37_0 193 | prometheus_client=0.7.1=py_0 194 | prompt_toolkit=3.0.3=py_0 195 | psutil=5.6.7=py37h1de35cc_0 196 | ptyprocess=0.6.0=py37_0 197 | py=1.8.1=py_0 198 | py-lief=0.9.0=py37h1413db1_2 199 | 
pycodestyle=2.5.0=py37_0 200 | pycosat=0.6.3=py37h1de35cc_0 201 | pycparser=2.20=py_2 202 | pycrypto=2.6.1=py37h1de35cc_9 203 | pycurl=7.43.0.5=py37ha12b0ac_0 204 | pydocstyle=4.0.1=py_0 205 | pyflakes=2.1.1=py37_0 206 | pygments=2.5.2=py_0 207 | pylint=2.4.4=py37_0 208 | pyodbc=4.0.30=py37h0a44026_0 209 | pyopenssl=19.1.0=pyhd3eb1b0_1 210 | pyparsing=2.4.6=py_0 211 | pyqt=5.9.2=py37h655552a_2 212 | pyrsistent=0.15.7=py37h1de35cc_0 213 | pysocks=1.7.1=py38_1 214 | pytables=3.6.1=py37h5bccee9_0 215 | pytest=5.3.5=py37_0 216 | pytest-arraydiff=0.3=py37h39e3cac_0 217 | pytest-astropy=0.8.0=py_0 218 | pytest-astropy-header=0.1.2=py_0 219 | pytest-doctestplus=0.5.0=py_0 220 | pytest-openfiles=0.4.0=py_0 221 | pytest-remotedata=0.3.2=py37_0 222 | python=3.8.5=h26836e1_1 223 | python-dateutil=2.8.1=py_0 224 | python-jsonrpc-server=0.3.4=py_0 225 | python-language-server=0.31.7=py37_0 226 | python-libarchive-c=2.8=py37_13 227 | python.app=2=py37_10 228 | pytz=2020.1=py_0 229 | pywavelets=1.1.1=py37h1de35cc_0 230 | pyyaml=5.3=py37h1de35cc_0 231 | pyzmq=18.1.1=py37h0a44026_0 232 | qdarkstyle=2.8=py_0 233 | qt=5.9.7=h468cd18_1 234 | qtawesome=0.6.1=py_0 235 | qtconsole=4.6.0=py_1 236 | qtpy=1.9.0=py_0 237 | readline=8.0=h1de35cc_0 238 | requests=2.22.0=py37_1 239 | ripgrep=11.0.2=he32d670_0 240 | rope=0.16.0=py_0 241 | rtree=0.9.3=py37_0 242 | ruamel_yaml=0.15.87=py37h1de35cc_0 243 | scikit-image=0.16.2=py37h6c726b0_0 244 | scikit-learn=0.22.1=py37h27c97d8_0 245 | scipy=1.4.1=py37h9fa6033_0 246 | seaborn=0.10.0=py_0 247 | selenium=3.141.0=py38h1de35cc_1001 248 | send2trash=1.5.0=py37_0 249 | setuptools=50.3.1=py38hecd8cb5_1 250 | simplegeneric=0.8.1=py37_2 251 | simplejson=3.17.0=py37h1de35cc_0 252 | singledispatch=3.4.0.3=py37_0 253 | sip=4.19.8=py37h0a44026_0 254 | six=1.15.0=py38hecd8cb5_0 255 | snappy=1.1.7=he62c110_3 256 | snowballstemmer=2.0.0=py_0 257 | sortedcollections=1.1.2=py37_0 258 | sortedcontainers=2.1.0=py37_0 259 | soupsieve=2.0.1=py_0 260 | sphinx=2.4.0=py_0 261 | sphinxcontrib=1.0=py37_1 262 | sphinxcontrib-applehelp=1.0.1=py_0 263 | sphinxcontrib-devhelp=1.0.1=py_0 264 | sphinxcontrib-htmlhelp=1.0.2=py_0 265 | sphinxcontrib-jsmath=1.0.1=py_0 266 | sphinxcontrib-qthelp=1.0.2=py_0 267 | sphinxcontrib-serializinghtml=1.1.3=py_0 268 | sphinxcontrib-websupport=1.2.0=py_0 269 | spyder=4.0.1=py37_0 270 | spyder-kernels=1.8.1=py37_0 271 | sqlalchemy=1.3.13=py37h1de35cc_0 272 | sqlite=3.33.0=hffcf06c_0 273 | statsmodels=0.11.0=py37h1de35cc_0 274 | sympy=1.5.1=py37_0 275 | tbb=2020.0=h04f5b5a_0 276 | tblib=1.6.0=py_0 277 | terminado=0.8.3=py37_0 278 | testpath=0.4.4=py_0 279 | tk=8.6.10=hb0a8c7a_0 280 | toolz=0.10.0=py_0 281 | tornado=6.0.3=py37h1de35cc_3 282 | tqdm=4.51.0=pyhd3eb1b0_0 283 | traitlets=4.3.3=py37_0 284 | ujson=1.35=py37h1de35cc_0 285 | unicodecsv=0.14.1=py37_0 286 | unixodbc=2.3.7=h1de35cc_0 287 | urllib3=1.25.11=py_0 288 | watchdog=0.10.2=py37h1de35cc_0 289 | wcwidth=0.1.8=py_0 290 | webencodings=0.5.1=py37_1 291 | werkzeug=1.0.0=py_0 292 | wheel=0.35.1=pyhd3eb1b0_0 293 | widgetsnbextension=3.5.1=py37_0 294 | wrapt=1.11.2=py37h1de35cc_0 295 | wurlitzer=2.0.0=py37_0 296 | xlrd=1.2.0=py37_0 297 | xlsxwriter=1.2.7=py_0 298 | xlwings=0.17.1=py37_0 299 | xlwt=1.3.0=py37_0 300 | xmltodict=0.12.0=py_0 301 | xz=5.2.5=h1de35cc_0 302 | yaml=0.1.7=hc338f04_2 303 | yapf=0.28.0=py_0 304 | zeromq=4.3.1=h0a44026_3 305 | zict=1.0.0=py_0 306 | zipp=2.2.0=py_0 307 | zlib=1.2.11=h1de35cc_3 308 | zstd=1.3.7=h5bba6e5_0 309 | 
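To recreate the environment pinned above, a concrete version of the command in the file's header comment might look like the following (the environment name `pacer-tools` is an invented example, and the build strings tie this spec to the osx-64 platform):

```sh
conda create --name pacer-tools --file requirements.txt
```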
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/bundler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Tool for bundling files together 3 | ''' 4 | import re 5 | import sys 6 | import json 7 | import shutil 8 | from pathlib import Path 9 | 10 | import pandas as pd 11 | from bs4 import BeautifulSoup 12 | from tqdm.autonotebook import tqdm 13 | 14 | sys.path.append(str(Path(__file__).resolve().parents[1])) 15 | from support import settings 16 | from support import data_tools as dtools 17 | 18 | def index_style(additional=''): 19 | ''' 20 | Build css style tag 21 | Inputs: 22 | - additional (str): additional str to be inserted on-the-fly 23 | Output: 24 | (str) a valid html-style tag 25 | ''' 26 | base_style = open(settings.STYLE/'bundler_index.css').read().replace('\n','') 27 | return f"<style>{base_style}{additional}</style>" 28 | 29 | def bundler(indf, name, notes=None, overwrite=False, anno_col=None): 30 | ''' 31 | Bundle up a collection of files 32 | Inputs: 33 | - indf (DataFrame): any dataframe with an fpath column to identify files 34 | - name (str): name of directory to bundle into (will be put in /data/{name}) 35 | - notes (str): notes to be injected under the header (html string) 36 | - anno_col (str): name of annotations column if any, column should be valid json string 37 | ''' 38 | df = indf.copy() 39 | # Want to include the index if it's ucid 40 | if df.index.name == 'ucid': 41 | df = df.reset_index() 42 | 43 | if anno_col: 44 | # import pdb;pdb.set_trace() 45 | df[anno_col] = df[anno_col].map(json.loads) 46 | 47 | # Columns needed to generate 48 | if 'fpath' not in df.columns: 49 | raise ValueError('DataFrame must include fpath column to point to file locations') 50 | elif 'ucid' not in df.columns: 51 | raise ValueError('DataFrame must include ucid to identify case') 52 | 53 | # Handle directory 54 | bundle_dir = settings.BUNDLES/name 55 | if bundle_dir.exists(): 56 | if overwrite: 57 | # Delete all files in the directory 58 | for file in bundle_dir.iterdir(): 59 | file.unlink() 60 | else: 61 | raise ValueError(f'The directory {str(bundle_dir)} already exists') 62 | else: 63 | bundle_dir.mkdir(parents=True) 64 | 65 | # Start building html index page with strings 66 | heading = f"<div class='heading'><h1>Data Dump: {name}</h1></div>" 67 | notes = f'''<p>NOTES: {notes}</p>''' if notes else '' 68 | opening = f"<html><head>{index_style()}</head><body>{heading}{notes}" 69 | 70 | # Start building table rows 71 | table_rows = [] 72 | header = [f"<th>{val}</th>" for val in df.columns if val!=anno_col] 73 | table_rows.append("<tr>" + "".join(header) + "</tr>") 74 | 75 | for i,row in tqdm(df.iterrows(), total=len(df)): 76 | # Get filepath 77 | rel_path = row.fpath 78 | if type(rel_path) is str: 79 | rel_path = Path(rel_path.replace('\\','/')) 80 | abs_path = settings.PROJECT_ROOT / rel_path 81 | 82 | # Annotation scenario 83 | if 'pacer' in abs_path.parts and anno_col and row[anno_col]: 84 | # Load the html text and json data to make the annotated docket 85 | hpath = dtools.get_pacer_html(abs_path) 86 | html_text = open(hpath, 'r', encoding='utf-8').read() 87 | json_data = dtools.load_case(row.fpath) 88 | new_html = make_annotated_docket(html_text, json_data, row[anno_col]) 89 | 90 | # Copy the new (annotated) html into the bundle directory 91 | tqdm.write(f"Annotating {row.ucid}") 92 | new_name = row.ucid.replace(':', '-') + '.html' 93 | with open(bundle_dir/new_name, 'w', encoding='utf-8') as wfile: 94 | wfile.write(new_html) 95 | 96 | else: 97 | if 'pacer' in abs_path.parts: 98 | # Get the path to the html file 99 | abs_path = dtools.get_pacer_html(abs_path) 100 | 101 | # Copy the file 102 | tqdm.write(f"Copying {row.ucid}") 103 | new_name = row.ucid.replace(':', '-') + abs_path.suffix 104 | shutil.copyfile(abs_path, bundle_dir/new_name) 105 | 106 | 107 | cells = [f"<td>{v}</td>" for k,v in row.iteritems() if k!=anno_col] 108 | row_string = "<tr>" + "".join(cells) + "</tr>" 109 | table_rows.append(row_string) 110 | 111 | # Finish out the html string for the index 112 | table = f"<table>{''.join(table_rows)}</table>" 113 | closing = "</body></html>" 114 | html = opening + table + closing 115 | 116 | with open(bundle_dir/'_index.html', 'w+') as wfile: 117 | wfile.write(html) 118 | 119 | print(f"\nFiles Successfully bundled into {bundle_dir}") 120 | 121 | def build_new_td(json_text, row_annotations, soup=None, inner_html=False): 122 | ''' 123 | Make a new td cell to replace current docket text td cell, for a single docket entry/row 124 | 125 | Inputs: 126 | - json_text (str): the cleaned docket text from the saved json 127 | - row_annotations (list): a list of dicts of annotation spans for a single docket line 128 | e.g. [{'start': 0, 'end':10, 'label':"SOMETHING"}] 129 | - soup (bs4 instance): soup needed to make a tag, if None will create an empty soup 130 | - inner_html (bool): if true returns inner html as string 131 | Output: 132 | new_td (bs4 object or str): new td cell to be inserted 133 | ''' 134 | if not soup: 135 | soup = BeautifulSoup('','html.parser') 136 | 137 | new_td = soup.new_tag('td') 138 | 139 | # Index pointer to current place in original json 140 | og_pointer = 0 141 | 142 | # Sort annotation by 'start' 143 | row_annotations.sort(key=lambda x: x['start']) 144 | 145 | # Iterate through each annotation and 'swap out' original text for new span 146 | for annot in row_annotations: 147 | 148 | # Get all the text up until this annotation 149 | new_td.append( json_text[ og_pointer: annot['start'] ] ) 150 | 151 | # Build the span html tag, add attributes that allow for styling/highlighting 152 | span_tag = soup.new_tag('span', attrs={'class':"annotation", 'data-label':annot['label']}) 153 | span_tag.string = json_text[annot['start']:annot['end']] 154 | new_td.append(span_tag) 155 | 156 | # Set the pointer to the end index of the annotation 157 | og_pointer = annot['end'] 158 | 159 | # Get the last bit of the docket 160 | new_td.append( json_text[ og_pointer: ] ) 161 | 162 | if inner_html: 163 | return new_td.decode_contents() 164 | else: 165 | return new_td 166 | 167 | def make_annotated_docket(html_text, json_data, case_annotations): 168 | ''' 169 | Main function to build annotated html for a PACER docket 170 | 171 | Inputs: 172 | - html_text (str) 173 | - json_data (dict) 174 | - case_annotations (dict): mapping from row index (int, ordinal index) -> annotation data list of dicts e.g. 
{2: [ {span1},...], 5: [ {span2}, ...]} 175 | 176 | Output: 177 | (str) html source text for annotated html 178 | ''' 179 | 180 | # Make the soup 181 | soup = BeautifulSoup(html_text, 'html.parser') 182 | 183 | docket_table = soup.select('table')[-2] 184 | 185 | for row_index, tr in enumerate(docket_table.select('tr')[1:]): 186 | 187 | # Skip row if no annotation 188 | if row_index not in case_annotations.keys(): 189 | continue 190 | 191 | tr.attrs['class'] = tr.attrs.get('class', '') + ' annotated' 192 | 193 | # Isolate the original td 194 | docket_entry_td = tr.select('td')[2] 195 | 196 | # Gather info for new td 197 | jdata_text = json_data['docket'][row_index]['docket_text'] 198 | row_annotations = case_annotations[row_index] 199 | 200 | # Build and inject new td 201 | new_cell = build_new_td(jdata_text, row_annotations, soup) 202 | docket_entry_td.replace_with(new_cell) 203 | 204 | 205 | # Inject the style.css file into the header 206 | style_tag = soup.new_tag('style') 207 | style_tag.string = open(settings.STYLE/'pacer_docket.css').read().replace('\n','') 208 | soup.head.append(style_tag) 209 | 210 | return re.sub(r"b'|\\n|\\t",'',str(soup)) 211 | 212 | def make_annotated_docket_for_dash(html_text, json_data, case_annotations, range_to_keep): 213 | ''' 214 | Annotate a Pacer docket and return only a specified range of docket lines in JSON format - for use with make_excerpts() 215 | 216 | Inputs: 217 | - html_text (str) 218 | - json_data (dict) 219 | - case_annotations (dict): mapping from row index (within case) -> annotation data dict e.g. {'2': [ {span1}, ... ], '5': [ {span2}, ... ]} 220 | - range_to_keep (range): the range of docket lines (SCALES-indexed) to be returned 221 | 222 | Output: 223 | (dict) the docket excerpt as a JSON 224 | ''' 225 | 226 | # make the preliminary JSON and the soup 227 | new_json = {"case_id": json_data['case_id'], "docket": []} 228 | soup = BeautifulSoup(html_text, 'html.parser') 229 | docket_table = soup.select('table')[-2] 230 | 231 | # check whether each row needs to be inserted into the final JSON 232 | for row_index, tr in enumerate(docket_table.select('tr')[1:]): 233 | if row_index in range_to_keep: 234 | 235 | # check whether this row needs to be annotated 236 | if str(row_index) in case_annotations.keys(): 237 | row_annotations = case_annotations[str(row_index)] 238 | tr.attrs['class'] = tr.attrs.get('class', '') + ' annotated' 239 | else: 240 | row_annotations = []  # a list, not a dict, since build_new_td() sorts it 241 | 242 | # gather remaining info needed for build_new_td() 243 | old_entry = json_data['docket'][row_index] 244 | new_docket_text = old_entry['docket_text'] 245 | 246 | # build & insert new docket entry 247 | new_docket_html = build_new_td(new_docket_text, row_annotations, inner_html=True) 248 | new_entry = { 249 | "date_filed": old_entry['date_filed'], 250 | "ind": old_entry['ind'], 251 | "docket_text": new_docket_text, 252 | "docket_html": new_docket_html 253 | } 254 | new_json['docket'].append(new_entry) 255 | 256 | return new_json
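A minimal usage sketch for `bundler()`, assuming a DataFrame with the required `ucid` and `fpath` columns and assuming the `code/` directory is importable as in this module's own imports (the rows, file paths, and bundle name below are invented for illustration):

```python
import pandas as pd

from support import bundler

# Any DataFrame with 'ucid' and 'fpath' columns will do; these values are hypothetical
df = pd.DataFrame([
    {'ucid': 'ilnd;;1:16-cv-00001', 'fpath': 'data/pacer/ilnd/json/case_a.json'},
    {'ucid': 'ilnd;;1:16-cv-00002', 'fpath': 'data/pacer/ilnd/json/case_b.json'},
])

# Copies each file into settings.BUNDLES/'example_bundle' and writes an '_index.html' table of the rows
bundler.bundler(df, name='example_bundle', notes='Hand-picked example cases', overwrite=True)
```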
-------------------------------------------------------------------------------- /src/pacer_tools/code/support/text_functions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | sys.path.append(str(Path(__file__).resolve().parents[1])) 4 | from support import data_tools as dtools 5 | 6 | def pro_se_identifier(party: dict, threshold: int=85): 7 | """Given a party dict from a SCALES json, identify if the party is PRO SE 8 | (if identified, returns index of counsel block that equals the party) 9 | 10 | Args: 11 | party (dict): SCALES json formatted party dict from a case 12 | threshold (int, optional): fuzzywuzzy fuzz.[MATCH_TYPE] ratio threshold to constitute a match. Defaults to 85. 13 | 14 | Returns: 15 | bool, int or NoneType: returns True if party is pro se, False if not; 16 | second output: 17 | when bool is true - returns index of counsel block that is the pro se party 18 | when bool is false - returns NoneType 19 | 20 | Notes: 21 | This function requires a Bool flag to indicate success. When just an index or None was returned, explicit 22 | type checking was needed to confirm if a match occurred because None and 0 (a plausible counsel index return) 23 | both cast to False -- the result was if type(output)==int: --> use int 24 | """ 25 | 26 | ## -------------Internals---------------- ## 27 | 28 | def _return_success(index_selected): 29 | """format the function output with bool messaging 30 | 31 | Args: 32 | index_selected (int): the integer index of the counsel block that corresponds to the pro se party 33 | 34 | Returns: 35 | bool, int: returns True since the party is pro se; index of counsel block that is the pro se party 36 | """ 37 | # during matching, the COUNSELS are given an attribute that tracks their original index in the counsels list 38 | # pop this if it was created so that the user's original input data remains unchanged upon return 39 | for counsel in COUNSELS: 40 | counsel.pop('original_index', None) 41 | return True, index_selected 42 | def _return_failure(rapid = False): 43 | """format the function output for a failed match 44 | 45 | Args: 46 | rapid (bool, optional): if the function fails early before any data was changed, don't iterate through counsels. Defaults to False. 47 | 48 | Returns: 49 | bool, NoneType: party is not pro se; none since no index matches 50 | """ 51 | 52 | if rapid: 53 | return False, None 54 | 55 | # during matching, the COUNSELS are given an attribute that tracks their original index in the counsels list 56 | # pop this if it was created so that the user's original input data remains unchanged upon return 57 | for counsel in COUNSELS: 58 | counsel.pop('original_index', None) 59 | return False, None 60 | 61 | def _call_fuzzy(party_name, counsels, match_type = 'ratio'): 62 | """call a fuzzy matching run across a list of counsels using the parent function's 63 | threshold, and the specified matching type 64 | 65 | Args: 66 | party_name (str): string of the party name being checked 67 | counsels (list): list of SCALES json formatted counsel dicts, with an added attribute for their original index 68 | match_type (str, optional): which fuzzywuzzy fuzz match should we employ. Defaults to 'ratio'. 69 | 70 | Returns: 71 | tuple or NoneType: the (True, index) success tuple when a match is found, else None 72 | """ 73 | if match_type=='ratio': 74 | fuzzycall = fuzz.ratio 75 | elif match_type == 'token-set': 76 | fuzzycall = fuzz.token_set_ratio 77 | else: 78 | fuzzycall = fuzz.ratio 79 | 80 | if len(party_name) <=8: 81 | fuzzycall = fuzz.partial_token_set_ratio 82 | 83 | # failsafes are used to escape bad fuzzy matches before they happen 84 | # A. USA fuzzies into AUSA and many other generic X of USA roles. In general, we know the USA as a party 85 | # represents itself and that the term USA should not fuzzy into the individual counsel names 86 | # B. material witnesses and parties that are just abbreviations or single letters inadvertently match 87 | # their counsels' middle initials or initialed names i.e. L.W. as a party matched James L. Watson. 
88 | # if a party is a nondescript initial grouping, we do not fuzzy match it 89 | FAILSAFES = [ 90 | lambda party_name: party_name.lower().strip()=='usa', 91 | lambda party_name: all(len(tok.strip())==1 for tok in party_name.replace('.',' ').split()) 92 | ] 93 | for failsafe in FAILSAFES: 94 | if failsafe(party_name): 95 | return None 96 | 97 | matches = [] # start with no matches 98 | for counsel in [c for c in counsels if c['name']]: # only compare counsels that had a name 99 | if len(counsel['name'])<=8: 100 | fuzzycall = fuzz.partial_token_set_ratio 101 | FR = fuzzycall( counsel['name'] , party_name ) # fuzzy match score 102 | if FR >= threshold: 103 | matches.append((counsel, FR)) # if it matches, add to our matches 104 | if matches: 105 | # our threshold is high enough that any match is believable, if there are multiple, take the top one (?) 106 | # 0th index is top score 107 | # [0][0] is the counsel object in the tuple 108 | winner = sorted(matches, key = lambda tups: tups[1], reverse=True)[0][0] 109 | return _return_success(winner['original_index']) 110 | return None 111 | 112 | ## -------------------------------------- ## 113 | 114 | ## EARLY FAILSAFE 115 | # if any json keys are missing or NoneTypes, kick out 116 | if not party['counsel'] or not party['name']: 117 | return _return_failure(rapid=True) 118 | 119 | # will be using this everywhere 120 | COUNSELS = party['counsel'] 121 | # add an attribute once that specifies the enumerated index of each iterable in the list 122 | # this saves us from continued enumeration and any ordering preservation 123 | for original_index, counsel in enumerate(COUNSELS): 124 | counsel['original_index'] = original_index 125 | 126 | #################################### 127 | # CONTROL BLOCK if restrictive pro se flag showed up in json from parse 128 | #################################### 129 | # if the parser already believes this to be a pro-se entry, leverage that as a head start 130 | if any((bool(counsel['is_pro_se']) for counsel in party['counsel'])): 131 | # if only one counsel, hooray no logic return it 132 | if len(COUNSELS)==1: 133 | return _return_success(0) 134 | 135 | # else: need to confirm that there is actually a "PRO SE" and there is only one counsel block that matches the criteria 136 | # looking for a singular "PRO SE" counsel 137 | matches = [] 138 | for counsel in COUNSELS: 139 | if counsel['name']: # IF THERE IS A NAME FOR THE COUNSEL 140 | check = counsel['name'] 141 | if counsel['entity_info'].get('raw_info'): # IF THERE IS ALSO RAW INFO 142 | extra_info = dtools.extra_info_cleaner(counsel['entity_info'].get('raw_info')) 143 | if extra_info: 144 | check += '\n' + extra_info 145 | elif counsel['entity_info'].get('raw_info'): # THERE IS NO NAME, CHECK IF RAW INFO 146 | check = dtools.extra_info_cleaner(counsel['entity_info'].get('raw_info')) 147 | else: # NO NAME, NO RAW INFO.... 
THAT'S WHACK, WE CAN'T COMPARE IT 148 | continue 149 | 150 | # the explicit code that triggered the party level flag 151 | if "PRO SE" in check: 152 | matches.append(counsel) 153 | if len(matches)==1: # if only one counsel is pro se, return their original index 154 | return _return_success(matches[0]['original_index']) 155 | 156 | 157 | #################################### 158 | # CONTROL BLOCK if party exactly represented text in counsels 159 | #################################### 160 | # dockets have whacky spacing on parties but not counsels sometimes -- normalize whitespace and case 161 | space_voider = lambda x: " ".join(x.strip().split()).lower() 162 | 163 | sv_party_name = space_voider( party['name'] ) # normalized party name 164 | sv_counsels = [(space_voider( c['name'] ), c['original_index']) for c in COUNSELS if c['name']] # normalized (name, original_index) tuples 165 | 166 | # if the normalized party appears in normalized counsel names verbatim, trigger and match 167 | if sv_party_name in [name for name, _ in sv_counsels]: 168 | # (efficiency of "in" comparison presumed) 169 | # the match ends up as a (name, original_index) tuple, return the original index for kick out 170 | match = [counsel for counsel in sv_counsels if counsel[0] == sv_party_name][0] 171 | return _return_success(match[1]) 172 | 173 | #################################### 174 | # CONTROL BLOCK TOKEN SET RATIO 175 | #################################### 176 | # final layer is a token set ratio check across the party and counsel names 177 | # fuzzywuzzy normalizes whitespace when generating tokens 178 | # if a party has prefixes, but the counsel form of the name does not, we still have 179 | # a successful token set match since one's tokens are wholly present in the others 180 | # the wrapper below will change match_type internally if a string is shorter than 9 characters 181 | from fuzzywuzzy import fuzz 182 | 183 | fuzzed = _call_fuzzy(party['name'], COUNSELS, match_type="token-set") 184 | if fuzzed: 185 | return fuzzed 186 | 187 | return _return_failure() 188 | 189 | 190 | ################################################ 191 | # Ngram similarity functions 192 | ################################################ 193 | 194 | def ngrams(string, n=3): 195 | import re 196 | string = re.sub(r'[,-./]|\sBD',r'', string) 197 | ngrams = zip(*[string[i:] for i in range(n)]) 198 | return [''.join(ngram) for ngram in ngrams] 199 | 200 | def cossim_top(A, B, ntop, lower_bound=0): 201 | import numpy as np 202 | import sparse_dot_topn.sparse_dot_topn as ct 203 | from scipy.sparse import csr_matrix 204 | # force A and B as a CSR matrix. 
205 | # If they have already been CSR, there is no overhead 206 | A = A.tocsr() 207 | B = B.tocsr() 208 | M, _ = A.shape 209 | _, N = B.shape 210 | 211 | idx_dtype = np.int32 212 | 213 | nnz_max = M*ntop 214 | 215 | indptr = np.zeros(M+1, dtype=idx_dtype) 216 | indices = np.zeros(nnz_max, dtype=idx_dtype) 217 | data = np.zeros(nnz_max, dtype=A.dtype) 218 | 219 | ct.sparse_dot_topn( 220 | M, N, np.asarray(A.indptr, dtype=idx_dtype), 221 | np.asarray(A.indices, dtype=idx_dtype), 222 | A.data, 223 | np.asarray(B.indptr, dtype=idx_dtype), 224 | np.asarray(B.indices, dtype=idx_dtype), 225 | B.data, 226 | ntop, 227 | lower_bound, 228 | indptr, indices, data) 229 | 230 | return csr_matrix((data,indices,indptr),shape=(M,N)) 231 | 232 | def get_matches_df(sparse_matrix): 233 | import pandas as pd 234 | non_zeros = sparse_matrix.nonzero() 235 | return pd.DataFrame({'left_side_idx': non_zeros[0], \ 236 | 'right_side_idx': non_zeros[1], \ 237 | 'similarity': sparse_matrix.data}) 238 | 239 | def swapper(tidx, name_vector): 240 | return name_vector[tidx] 241 | 242 | ################################### 243 | # Basic cosine 244 | ################################### 245 | 246 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/make_graph_data_fulton_county.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script has been included solely in an effort to make our RDF-graph-build process 3 | transparent; it was copy-pasted directly from SCALES's private infrastructure repo, and 4 | has not been tested here! Our assumption is that, because the raw data used for this 5 | portion of our graph comes from a private dataset, nobody besides us will run this 6 | script. If we're incorrect about this assumption, feel free to contact us at 7 | engineering@scales-okn.org. 
8 | """ 9 | 10 | import os 11 | import json 12 | import logging 13 | import sys 14 | import argparse 15 | from pathlib import Path 16 | import utils 17 | from typing import Any, Dict, List 18 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed 19 | 20 | from tqdm import tqdm 21 | from rdflib import Graph, Namespace, Literal, RDF, XSD 22 | 23 | sys.path.append(str(Path.cwd().parents[1].resolve())) 24 | import utils 25 | from constants import SCALES, J, NC, NIBRS, OCCS 26 | from support import settings 27 | 28 | logger = logging.getLogger(__name__) 29 | logger.setLevel(logging.INFO) 30 | _handler = logging.FileHandler("error.log") 31 | _handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) 32 | logger.addHandler(_handler) 33 | 34 | _make_party_uri_fulton = lambda charge_id: utils._make_party_uri(f'ga-fulton-{int(charge_id)}', 0) 35 | 36 | 37 | def _create_graph() -> Graph: 38 | """Create a blank graph and bind standard prefixes.""" 39 | g = Graph() 40 | g.bind("scales", SCALES) 41 | g.bind("j", J) 42 | g.bind("nc", NC) 43 | g.bind("nibrs", NIBRS) 44 | g.bind("occs", OCCS) 45 | g.bind("rdf", RDF) 46 | return g 47 | 48 | def _make_metadata_graph(): 49 | """Take care of a few Fulton-County-Jail-related triples we might want to use at some point.""" 50 | g = _create_graph() 51 | facility_uri = utils._make_generic_uri('Facility', 'ga-fulton-county-jail') 52 | g.add((facility_uri, NC.FacilityName, 'Fulton County Jail')) 53 | g.add((facility_uri, NC.PhysicalAddress, '901 Rice St NW, Atlanta, GA 30318')) 54 | g.add((facility_uri, OCCS.FacilityFunctionCode, '11-13 11 33')) # "Detention Center" (see https://niem.github.io/model/5.0/occs/FacilityFunctionCodeSimpleType/#diagram) 55 | return g 56 | 57 | def _build_docket_subgraph( 58 | g: Graph, 59 | ucid: str, 60 | hearings: List[Dict[str, Any]], 61 | case_uri, 62 | ): 63 | """Convert the list of hearings to a Register-of-Actions style sub-graph.""" 64 | if not hearings: 65 | return 66 | 67 | table_uri = utils._make_generic_uri("DocketTable", f"{ucid}") 68 | g.add((case_uri, J.RegisterOfActions, table_uri)) 69 | g.add((table_uri, RDF.type, J.RegisterOfActions)) 70 | 71 | for idx, hearing in enumerate(hearings): 72 | entry_uri = utils._make_docket_uri(ucid, idx) 73 | g.add((table_uri, J.RegisterAction, entry_uri)) 74 | g.add((entry_uri, RDF.type, J.RegisterAction)) 75 | 76 | # Filing / event date 77 | h_date = hearing.get("hearing_date") 78 | if h_date: 79 | g.add( 80 | ( 81 | entry_uri, 82 | J.RegisterActionDate, 83 | Literal(utils._date_to_xsd(h_date), datatype=XSD.date), 84 | ) 85 | ) 86 | 87 | # Description text (type, result, etc.) 
88 | parts = [hearing.get("hearing_type")] 89 | if hearing.get("result"): 90 | parts.append(f": {hearing['result']}") 91 | if hearing.get("result_type"): 92 | parts.append(f"({hearing['result_type']})") 93 | contents = " ".join([p for p in parts if p]) 94 | if contents: 95 | g.add( 96 | ( 97 | entry_uri, 98 | J.RegisterActionDescriptionText, 99 | Literal(utils._escape_quotes(contents)), 100 | ) 101 | ) 102 | 103 | 104 | def process_json_file(json_path: str): 105 | """Parse a single Fulton-county *charge* JSON and return a list(triples).""" 106 | try: 107 | with open(json_path, "r") as fh: 108 | data = json.load(fh) 109 | except Exception as exc: # pylint: disable=broad-except 110 | logger.error("Error reading %s: %s", json_path, exc) 111 | return None 112 | 113 | g = _create_graph() 114 | 115 | charge_id = data.get("charge_id") 116 | charge_uri = utils._make_generic_uri("Charge", f"ga-fulton-01-{int(charge_id)}") 117 | 118 | g.add((charge_uri, RDF.type, J.Charge)) 119 | desc = data.get("charge_offense_description") 120 | g.add((charge_uri, J.ChargeText, Literal(utils._escape_quotes(desc)))) 121 | 122 | severity = data.get("severity") 123 | g.add((charge_uri, J.ChargeSeverityLevelCode, Literal(severity))) 124 | 125 | # Charge decision / status 126 | # decision = data.get("charge_decision") or {} 127 | # if decision.get("charge_decision"): 128 | # g.add( 129 | # ( 130 | # charge_uri, 131 | # J.ChargeDispositionCategoryText, 132 | # Literal(decision["charge_decision"]), 133 | # ) 134 | # ) 135 | # if decision.get("charge_status"): 136 | # g.add( 137 | # ( 138 | # charge_uri, 139 | # NC.StatusDescriptionText, 140 | # Literal(decision["charge_status"]), 141 | # ) 142 | # ) 143 | # if decision.get("file_date"): 144 | # g.add( 145 | # ( 146 | # charge_uri, 147 | # NC.StartDate, 148 | # Literal( 149 | # utils._date_to_xsd(decision["file_date"]), 150 | # datatype=XSD.date, 151 | # ), 152 | # ) 153 | # ) 154 | # if decision.get("charge_decision_date"): 155 | # g.add( 156 | # ( 157 | # charge_uri, 158 | # NC.EndDate, 159 | # Literal( 160 | # utils._date_to_xsd(decision["charge_decision_date"]), 161 | # datatype=XSD.date, 162 | # ), 163 | # ) 164 | # ) 165 | 166 | case_info = data.get("case") 167 | if case_info: 168 | case_nbr = case_info.get("case_nbr") 169 | ucid = f"ga-fulton-01-{case_nbr}" 170 | case_uri = utils._make_case_uri(ucid) 171 | 172 | g.add((charge_uri, J.ChargeFiledCase, case_uri)) 173 | g.add((case_uri, NC.CaseDocketID, Literal(utils._escape_quotes(case_nbr)))) 174 | g.add((case_uri, RDF.type, NC.CourtCase)) 175 | g.add((case_uri, RDF.type, SCALES.CriminalCase)) 176 | g.add((case_uri, NC.CaseGeneralCategoryText, Literal("criminal"))) 177 | 178 | # Hearings / register of actions 179 | _build_docket_subgraph(g, ucid, case_info.get("hearings", []), case_uri) 180 | 181 | if data.get("bond_type"): 182 | g.add((charge_uri, J.BondType, Literal(data["bond_type"]))) 183 | if data.get("bond_amount"): 184 | try: 185 | amt = float(data["bond_amount"]) 186 | except (TypeError, ValueError): 187 | amt = data["bond_amount"] 188 | g.add((charge_uri, J.BondAmount, Literal(amt, datatype=XSD.float))) 189 | 190 | booking = data.get("booking") 191 | if booking: 192 | booking_uri = utils._make_generic_uri('Booking', f"ga-fulton-{int(booking['jailing_id'])}") 193 | g.add((booking_uri, J.BookingFacility, utils._make_generic_uri('Facility', 'ga-fulton-county-jail'))) 194 | party_uri = _make_party_uri_fulton(charge_id) # we don't create this uri earlier because the booking dict is where the party info resides 
195 | 196 | # Link charge/booking/party 197 | g.add((charge_uri, J.Booking, booking_uri)) 198 | g.add((booking_uri, RDF.type, J.Booking)) 199 | g.add((party_uri, J.PersonCharge, charge_uri)) 200 | g.add((party_uri, RDF.type, J.BookingSubject)) 201 | 202 | # g.add((party_uri, J.ParticipantRoleCategoryText, Literal("defendant"))) # commented this out because not all arrestees become defendants 203 | if booking.get("gender"): 204 | g.add((party_uri, J.PersonSexCode, Literal(booking["gender"]))) 205 | if booking.get("race"): 206 | g.add((party_uri, NC.PersonRaceText, Literal(booking["race"]))) 207 | 208 | if booking.get("booking_date"): 209 | g.add( 210 | ( 211 | booking_uri, 212 | NC.StartDate, 213 | Literal( 214 | utils._date_to_xsd(booking["booking_date"]), datatype=XSD.date 215 | ), 216 | ) 217 | ) 218 | if booking.get("release_date"): 219 | g.add( 220 | ( 221 | booking_uri, 222 | NC.EndDate, 223 | Literal( 224 | utils._date_to_xsd(booking["release_date"]), datatype=XSD.date 225 | ), 226 | ) 227 | ) 228 | 229 | return list(g) 230 | 231 | 232 | def _write_graph_worker(graph: Graph, outdir: Path, file_name=None): 233 | utils._write_graph_to_file(graph, outdir, file_name=file_name) 234 | 235 | 236 | def main(indir: str, outdir: str): 237 | """Read all JSON charge files beneath *indir* and emit Turtle files to *outdir*.""" 238 | indir_p = Path(indir) 239 | outdir_p = Path(outdir) 240 | outdir_p.mkdir(parents=True, exist_ok=True) 241 | 242 | utils._write_graph_to_file(_make_metadata_graph(), outdir, file_name='facility.ttl') 243 | 244 | json_files = [str(f) for f in indir_p.rglob("*.json") if f.is_file()] 245 | logger.info("Discovered %d JSON files in %s", len(json_files), indir) 246 | 247 | record_counter = 0 248 | global_graph = _create_graph() 249 | write_futures = [] 250 | 251 | with ProcessPoolExecutor(max_workers=12) as proc_exec, ThreadPoolExecutor( 252 | max_workers=8 253 | ) as thread_exec: 254 | futures = {proc_exec.submit(process_json_file, jf): jf for jf in json_files} 255 | 256 | with tqdm(total=len(json_files), desc="Processing charges") as pbar: 257 | for future in as_completed(futures): 258 | triples = future.result() 259 | if triples: 260 | for triple in triples: 261 | global_graph.add(triple) 262 | record_counter += 1 263 | 264 | # Flush every 10k records (adjust as needed) 265 | if record_counter >= 10000: 266 | wf = thread_exec.submit( 267 | _write_graph_worker, global_graph, outdir_p 268 | ) 269 | write_futures.append(wf) 270 | record_counter = 0 271 | global_graph = _create_graph() 272 | 273 | pbar.update(1) 274 | 275 | # Final flush 276 | if record_counter: 277 | utils._write_graph_to_file(global_graph, outdir_p) 278 | 279 | # Await parallel writers 280 | for wf in as_completed(write_futures): 281 | try: 282 | wf.result() 283 | except Exception as exc: # pylint: disable=broad-except 284 | logger.error("Error in write operation: %s", exc) 285 | 286 | # entities (added by scott) 287 | utils.process_entities( 288 | (settings.PARTY_DIS_UNIVERSAL,), 289 | outdir, 290 | _make_party_uri_fulton, 291 | ('charge_id',), 292 | filter_funcs={settings.PARTY_DIS_UNIVERSAL: ( 293 | lambda df: df[df.court.eq('ga-fulton')]) 294 | } 295 | ) 296 | 297 | 298 | if __name__ == "__main__": 299 | parser = argparse.ArgumentParser( 300 | description="Parse Fulton-county charge JSON files and emit Turtle graphs", 301 | ) 302 | parser.add_argument("indir", help="Directory containing input JSON files") 303 | parser.add_argument("outdir", help="Directory where TTL files will be written") 304 | args = 
parser.parse_args() 305 | 306 | main(args.indir, args.outdir) 307 | -------------------------------------------------------------------------------- /src/pacer_tools/code/db/rdf/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import ast 3 | import time 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | from rdflib import Graph, URIRef 8 | 9 | from constants import SCALES 10 | 11 | manual_offense_mapping = { 12 | "ASSAULT AGGRAVATED": "AGGRAVATED ASSAULT", 13 | "ASSAULT SIMPLE": "SIMPLE ASSAULT", 14 | "INTIMIDATION": "INTIMIDATION", 15 | "DRUG PARAPHERNALIA OFFENSES": "DRUG EQUIPMENT VIOLATIONS", 16 | "EQUIPMENT DRUG": "DRUG EQUIPMENT VIOLATIONS", 17 | "FALSE PRETENSES": "FALSE PRETENSES/SWINDLE/CONFIDENCE GAME", 18 | "SWINDLE": "FALSE PRETENSES/SWINDLE/CONFIDENCE GAME", 19 | "CONFIDENCE GAME": "FALSE PRETENSES/SWINDLE/CONFIDENCE GAME", 20 | "AUTOMATED TELLER MACHINE": "CREDIT CARD/AUTOMATED TELLER MACHINE FRAUD", 21 | "CREDIT CARD FRAUD": "CREDIT CARD/AUTOMATED TELLER MACHINE FRAUD", 22 | "IMPERSONATION": "IMPERSONATION", 23 | "FRAUD WELFARE": "WELFARE FRAUD", 24 | "FRAUD TELEPHONE": "WIRE FRAUD", 25 | "FRAUD IDENTITY THEFT": "IDENTITY THEFT", 26 | "COMPUTER CRIME": "HACKING/COMPUTER INVASION", 27 | "FRAUD HACKING/COMPUTER\nINVASION": "HACKING/COMPUTER INVASION", 28 | "BETTING UNLAWFUL": "BETTING/WAGERING", 29 | "TRANSMITTING WAGERING INFORMATION": "BETTING/WAGERING", 30 | "WAGERING UNLAWFUL": "BETTING/WAGERING", 31 | "GAMBLING PARAPHERNALIA DEVICES EQUIPMENT POSESSION": "GAMBLING EQUIPMENT VIOLATIONS", 32 | "BRIBERY SPORTS": "SPORTS TAMPERING", 33 | "HOMICIDE JUSTIFIABLE": "JUSTIFIABLE HOMICIDE", 34 | "COMMERCIALIZED SEX COMMERCIAL SEX": "HUMAN TRAFFICKING, COMMERCIAL SEX ACTS", 35 | "HUMAN TRAFFICKING\nCOMMERCIAL SEX ACTS": "HUMAN TRAFFICKING, COMMERCIAL SEX ACTS", 36 | "PICKPOCKET": "POCKET-PICKING", 37 | "PURSE-SNATCHING": "PURSE-SNATCHING", 38 | "SHOPLIFTING": "SHOPLIFTING", 39 | "THEFT FROM A BUILDING": "THEFT FROM BUILDING", 40 | "THEFT FROM A COIN-OPERATED\nMACHINE OR DEVICE": "THEFT FROM COIN-OPERATED MACHINE OR DEVICE", 41 | "THEFT FROM A MOTOR VEHICLE": "THEFT FROM MOTOR VEHICLE", 42 | "STRIPPING MOTOR VEHICLE": "THEFT OF MOTOR VEHICLE PARTS OR ACCESSORIES", 43 | "PIMPING": "ASSISTING OR PROMOTING PROSTITUTION", 44 | "TRANSPORTING PERSONS FOR PROSTITUTION": "ASSISTING OR PROMOTING PROSTITUTION", 45 | "FREQUENTING A HOUSE OF\nPROSTITUTION": "PURCHASING PROSTITUTION", 46 | "RAPE": "RAPE", 47 | "SODOMY": "SODOMY", 48 | "SEXUAL ASSAULT WITH AN OBJECT": "SEXUAL ASSAULT WITH AN OBJECT", 49 | "FONDLING": "FONDLING", 50 | "INCEST": "INCEST", 51 | "RAPE STATUTORY": "STATUTORY RAPE", 52 | "EXPLOSIVES": "EXPLOSIVES", 53 | } 54 | 55 | drug_keywords_apd = { 56 | "crack": {"nibrs_code": "A", "nibrs_drug": "Crack Cocaine"}, 57 | "caine": {"nibrs_code": "B", "nibrs_drug": "Cocaine (All forms except crack)"}, 58 | "hash": {"nibrs_code": "C", "nibrs_drug": "Hashish"}, 59 | "roin": {"nibrs_code": "D", "nibrs_drug": "Heroin"}, 60 | "juana": {"nibrs_code": "E", "nibrs_drug": "Marijuana"}, 61 | "morp": {"nibrs_code": "F", "nibrs_drug": "Morphine"}, 62 | # 'opium': {'nibrs_code': 'G', 'nibrs_drug': 'Opium'}, 63 | "narc": {"nibrs_code": "H", "nibrs_drug": "Other Narcotics"}, 64 | "lsd": {"nibrs_code": "I", "nibrs_drug": "LSD"}, 65 | "pcp": {"nibrs_code": "J", "nibrs_drug": "PCP"}, 66 | "halluc": {"nibrs_code": "K", "nibrs_drug": "Other Hallucinogens"}, 67 | "amphe": {"nibrs_code": "L", "nibrs_drug": 
"Amphetamines/Methamphetamines"}, 68 | "stim": {"nibrs_code": "M", "nibrs_drug": "Other Stimulants"}, 69 | "barbit": {"nibrs_code": "N", "nibrs_drug": "Barbiturates"}, 70 | "depress": {"nibrs_code": "O", "nibrs_drug": "Other Depressants"}, 71 | "unknown": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 72 | "drug": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 73 | # 'over 3': 'X' 74 | } 75 | exclusions_apd = () 76 | 77 | drug_keywords_clayton = { 78 | "cocaine": {"nibrs_code": "B", "nibrs_drug": "Cocaine (All forms except crack)"}, 79 | "substance or marijuana": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 80 | "substance/marijuana": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 81 | "marijuana": {"nibrs_code": "E", "nibrs_drug": "Marijuana"}, 82 | "thc": {"nibrs_code": "E", "nibrs_drug": "Marijuana"}, 83 | "ecstacy": {"nibrs_code": "K", "nibrs_drug": "Other Hallucinogens"}, 84 | "amphetamine": {"nibrs_code": "L", "nibrs_drug": "Amphetamines/Methamphetamines"}, 85 | "methaqualone": {"nibrs_code": "O", "nibrs_drug": "Other Depressants"}, 86 | "ephedrine": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 87 | "glue": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 88 | "nitrous": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 89 | "steroid": {"nibrs_code": "P", "nibrs_drug": "Other Drugs"}, 90 | "drug": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 91 | "narcotic": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 92 | "gcsa": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 93 | "substa": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 94 | "medication": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 95 | "morphine, opium, heroin": {"nibrs_code": "U", "nibrs_drug": "Unknown Drug Type"}, 96 | } 97 | exclusions_clayton = ( 98 | "alcohol-drugs", 99 | "drug related object", 100 | "drugs/alcohol or under influence", 101 | "drugs,alcohol", 102 | "drugs, weapons or alcohol", 103 | "dumping", 104 | ) 105 | 106 | 107 | 108 | def process_entities(fpaths, outdir, party_uri_func, fields_needed, filter_funcs={}): 109 | ''' 110 | fpaths: an iterable of filepaths from which to extract entity info 111 | party_uri_func: a function with which to generate a party uri for each dataframe row (not an entity uri, as df.id will be used for these by default) 112 | fields_needed: an iterable of fields that party_uri_func needs (i believe this is more performant than using iterrows) 113 | filters: optionally, a dict that maps each desired filepath to a lambda function that filters a dataframe (e.g. 
to exclude weak keys or select courts in PARTY_DIS_UNIVERSAL) 114 | ''' 115 | g = Graph() 116 | g.bind('scales', SCALES) 117 | 118 | for fpath in fpaths: 119 | df = pd.read_csv(fpath) 120 | filter_func = filter_funcs.get(fpath) 121 | if filter_func: 122 | df = filter_func(df) 123 | if 'id' not in df.columns: 124 | raise Exception(f"process_entities expects {fpath} to contain an 'id' column") 125 | 126 | spids = list(df.id) 127 | field_lists = [list(df[field]) for field in fields_needed] 128 | for i in tqdm(range(len(df)), desc='Processing disambiguated parties'): 129 | values = [lst[i] for lst in field_lists] 130 | g.add((party_uri_func(*values), SCALES.isInstanceOfEntity, _make_generic_uri('PartyEntity', spids[i]))) 131 | 132 | if i and not i%50000: 133 | _write_graph_to_file(g, outdir, infix="entities") 134 | g = Graph() 135 | g.bind('scales', SCALES) 136 | 137 | # TODO merge ids in a more conservative disambiguation file when a more liberal disambiguation file suggests we can 138 | _write_graph_to_file(g, outdir, infix="entities") 139 | 140 | 141 | def parse_drugs(df, charge_col, source, from_cli=False): 142 | results = [] 143 | processed_indices = set() 144 | drug_keywords = {"apd": drug_keywords_apd, "clayton": drug_keywords_clayton}[source] 145 | exclusions = {"apd": exclusions_apd, "clayton": exclusions_clayton}[source] 146 | 147 | for index, row in df.iterrows(): 148 | if index in processed_indices: 149 | continue 150 | arrest_charge = str(row[charge_col]) 151 | for keyword, code in drug_keywords.items(): 152 | if keyword.lower() in arrest_charge.lower() and not any( 153 | x in arrest_charge.lower() for x in exclusions 154 | ): 155 | results.append( 156 | { 157 | "index": index, 158 | charge_col: arrest_charge, 159 | "keyword": keyword, 160 | "nibrs_code": code["nibrs_code"], 161 | "nibrs_drug": code["nibrs_drug"], 162 | } 163 | ) 164 | processed_indices.add(index) 165 | break # exit inner loop once a match is found for this record 166 | 167 | results_df = pd.DataFrame(results) 168 | if from_cli: 169 | results_df.to_csv("apd_drug_arrests.csv", index=False) 170 | print(f"Total arrest records processed: {len(df):,}") 171 | print(f"Total matches found: {len(results_df):,}") 172 | if not results_df.empty: 173 | print("\nNIBRS drugs by match count:\n") 174 | top_drugs = results_df["nibrs_drug"].value_counts().head(10) 175 | for drug, count in top_drugs.items(): 176 | # Find the keyword for this drug 177 | keyword = results_df[results_df["nibrs_drug"] == drug]["keyword"].iloc[ 178 | 0 179 | ] 180 | print(f" {drug}, {keyword}: {count}") 181 | 182 | drug_matches = results_df[results_df["nibrs_drug"] == drug] 183 | top_charges = drug_matches[charge_col].value_counts() 184 | for charge, charge_count in top_charges.items(): 185 | print(f" - {charge}: {charge_count}") 186 | print() 187 | else: 188 | print("No matches found.") 189 | else: 190 | return results_df 191 | 192 | 193 | def _escape_quotes(text): 194 | if text is None: 195 | return None 196 | 197 | text = str(text) 198 | if '"' in text: 199 | return text.replace('"', "'") 200 | return text 201 | 202 | 203 | def _date_to_xsd(date_str): 204 | if not date_str: 205 | return None 206 | 207 | # convert to string and strip whitespace 208 | date_str = str(date_str).strip() 209 | 210 | # 1) Fast-path: leading ISO YYYY-MM-DD (optionally followed by time info) 211 | if len(date_str) >= 10 and date_str[4] == "-" and date_str[7] == "-": 212 | return date_str[:10] 213 | 214 | # 2) Try a list of known patterns via time.strptime 215 | patterns = [ 216 
| "%Y-%m", # '2016-03' 217 | "%m/%d/%Y", # '03/12/2016' 218 | "%d/%m/%Y", # '12/03/2016' (rare) 219 | "%m/%Y", # '03/2016' 220 | ] 221 | 222 | for fmt in patterns: 223 | try: 224 | parsed = time.strptime(date_str, fmt) 225 | # Default missing day/month handled by strptime (defaults to 1) 226 | return time.strftime("%Y-%m-%d", parsed) 227 | except ValueError: 228 | pass 229 | # raise ValueError(f"Invalid date format: {date_str}") 230 | 231 | 232 | def _make_case_uri(ucid): 233 | return URIRef(f"{SCALES}Case/{ucid}") 234 | 235 | def _make_docket_uri(ucid, idx): 236 | return URIRef(f"{SCALES}DocketEntry/{ucid}_de{int(idx)}") 237 | 238 | def _make_charge_uri(ucid, dft_idx, chg_idx): 239 | if type(chg_idx)==str: 240 | chg_idx = re.sub('[ :;,./="]', "", chg_idx) 241 | return URIRef(f"{SCALES}Charge/{ucid}_p{int(dft_idx)}_c{chg_idx}") 242 | 243 | def _make_sentence_uri(ucid, entry_idx, sentence_idx): 244 | return URIRef(f"{SCALES}Sentence/{ucid}_de{int(entry_idx)}_s{int(sentence_idx)}") 245 | 246 | def _make_party_uri(ucid, idx): 247 | return URIRef(f"{SCALES}Party/{ucid}_p{int(idx)}") 248 | 249 | def _make_counsel_uri(ucid, idx): 250 | return URIRef(f"{SCALES}Lawyer/{ucid}_l{int(idx)}") 251 | 252 | def _make_generic_uri(namespace, entity_id): 253 | return URIRef(f"{SCALES}{namespace}/{entity_id}") 254 | 255 | 256 | def _write_graph_to_file(graph, outdir, file_name=None, infix=None): 257 | """Write the current graph to a file with a unique, sortable name.""" 258 | file_name = file_name or f"graph_{infix+'_' if infix else ''}{time.time_ns()}.ttl" 259 | outpath = Path(outdir) / Path(file_name) 260 | outpath.parent.mkdir(parents=True, exist_ok=True) 261 | print(f"Writing TTL to {outpath}") 262 | graph.serialize(destination=str(outpath), format="turtle", encoding="utf-8") 263 | print(f"Wrote TTL to {outpath}") 264 | -------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | A parser that reads HTMLs downloaded from Pacer.gov and breaks them up into JSON format. 3 | 4 | # Usage 5 | To run the parser: 6 | ``` 7 | python parse_pacer.py [OPTIONS] INPUT_DIR 8 | ``` 9 | ### Arguments 10 | - `INPUT_DIR`: Relative path to the folder where HTMLs will be read, e.g. `../../data/pacer/ilnd/html` 11 | 12 | If you are using the parser in conjunction with SCALES's Pacer scraper, you will likely want your input directory to be the scraper-generated `html` folder within your chosen court directory, as outlined [here](../downloader/README.md#directory-structure). Similarly the output and summaries directories will be inferred as the `json` and `summaries` folder within that chosen court directory, but can be overriden by providing values for `output-dir` and `summaries-dir` 13 | 14 | ### Options 15 | - `-o, --output-dir TEXT` *(path)* The folder where the parsed JSONs will be placed into. If none is provided they will placed in `INPUT_DIR/../json/` 16 | - `-s, --summaries-dir TEXT` *(path)* The folder where the scraper will look for accompanying case summaries. the parsed JSONs will be placed into. If none is provided it will deault to `INPUT_DIR/../summaries/`. See more on case summaries [below](#case-summaries) 17 | - `-c, --court TEXT` *(defaults to none)* The standard abbreviation for the district court being parsed, e.g. `ilnd`. 
If not specified, and if using the directory structure mentioned above, the parser will infer the court abbreviation from the parent folder. 18 | - `-d, --debug` *(flag)* Turns off concurrency in the parser. Useful for ensuring that error traces are printed properly. 19 | - `-f, --force-rerun` *(flag)* Tells the parser to process HTMLs even when their corresponding JSONs already exist. Useful for obtaining fresh parses after scraping updates to existing dockets. 20 | - `--force-ucids` *(path)* A path to a .csv file that contains a 'ucid' column. If supplied, the parser will force rerun only on HTMLs that match up with the provided UCIDs (rather than force rerunning on the entire INPATH) 21 | - `-nw, --n-workers INTEGER` *(defaults to 16)* Number of concurrent workers to run simultaneously - i.e., no. of simultaneous parses running. 22 | 
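For example, a typical single-court run over `ilnd` with eight workers, forcing fresh parses of existing dockets (the input path is illustrative):
```
python parse_pacer.py -c ilnd -nw 8 --force-rerun ../../data/pacer/ilnd/html
```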
23 | ### Shell scripts 24 | Two shell scripts, `parse_all.sh` and `parse_subset.sh`, are provided for batch runs across multiple court directories. To run them: 25 | 26 | sh parse_all.sh INPATH [OPTIONS] 27 | sh parse_subset.sh INPATH -s STARTDIR -e ENDDIR [OPTIONS] 28 | 29 | where `INPATH` is the relative path to a parent folder containing multiple court directories, `STARTDIR` and `ENDDIR` define the inclusive alphabetical range of court directories to parse (e.g. `nyed` through `nywd`), and `OPTIONS` are any command-line options you would like to pass through to `parse_pacer.py` (e.g. `--debug`, `--force-rerun`, `--n-workers`). 30 | 31 | *Note: each court directory in the batch must include an HTML folder for input and a JSON folder for output, as is true in the scraper-generated directory structure.* 32 | 33 | 34 | 35 | 36 | # JSON Schema 37 | The following fields are inferred from the filepath: 38 | - `case_id` *(string)* - Pacer's case ID, which has the form O:YY-TY-##### (where O is a court office code, YY is a year, TY is the case type, and ##### is a numeric identifier associated with this case) 39 | - `case_type` *(string)* - usually 'cr' (criminal) or 'cv' (civil); other types are acceptable ('mc', 'bk'...), but they will result in an incomplete parse 40 | - `download_court` *(string)* - read from the command line if passed in with the `-c` option 41 | - `ucid` *(string)* - SCALES's case ID (stands for 'unique case id'), generated by prepending the court abbreviation to the Pacer case ID and used to ensure that cases with identical Pacer IDs from different districts can be distinguished from one another 42 | 43 | The following fields are pulled from the header of the Pacer docket: 44 | - `header_case_id` *(string)* - similar to `case_id`, but pulled from the docket itself rather than the filepath 45 | - `case_name` *(string)* 46 | - `filing_date` *(string)* 47 | - `terminating_date` *(string)* 48 | - `case_status` *(string)* - 'closed' if a terminating date is listed, else 'open' 49 | - `judge` *(string)* 50 | - `referred_judge` *(string)* - only present when the case was referred to a second judge 51 | - `nature_suit` *(string)* - civil cases only 52 | - `jury_demand` *(string)* - civil cases only 53 | - `cause` *(string)* - civil cases only 54 | - `jurisdiction` *(string)* - civil cases only 55 | - `monetary_demand` *(string)* - civil cases only 56 | - `lead_case_id` *(string)* - only present when the case is part of multi-district litigation (MDL) 57 | - `other_court` *(string)* - only present when another case ID is provided by Pacer as 'Case in other court'; doesn't pick up all alternate case IDs (e.g. appeals court case numbers) 58 | - `case_flags` *(list of strings)* - only present when there are flags listed in the upper right corner of the Pacer docket 59 | - `mdl_code` *(integer)* 60 | 61 | The following fields are pulled from the body of the Pacer docket: 62 | - `plaintiffs`, `defendants`, `bankruptcy_parties`, `other_parties`, `misc_participants` *(dictionary)* - each key is the name of a participant in the case, and each value is a dictionary with the following structure: 63 | - `counsel` *(dictionary)* - each key is the name of a lawyer representing this participant, and each value is a dictionary with the following structure: 64 | - `office` *(string)* 65 | - `is_lead_attorney` *(boolean)* 66 | - `is_pro_hac_vice` *(boolean)* 67 | - `additional_info` *(dictionary)* - keys vary according to the information in the docket ('Designation,' 'Bar Status,' etc.) 68 | - `is_pro_se` *(boolean)* 69 | - `roles` *(list of strings)* - 'Plaintiff,' 'Petitioner,' 'Movant,' etc. 70 | - `pending_counts`, `terminated_counts` *(dictionary)* - criminal cases only; each key is the name of a party who was charged with a criminal count, and each value is a list in which each element has the following dictionary structure: 71 | - `counts` *(string)* 72 | - `disposition` *(string)* 73 | - `complaints` *(dictionary)* - certain criminal cases only; each key is the name of a party who was charged with a criminal count, and each value is the statute(s) specified as the basis of the charges 74 | - `docket_available` *(boolean)* 75 | - `docket` *(list of dictionaries)* - contains one item per docket entry, structured as follows: 76 | - `date_filed` *(string)* 77 | - `ind` *(string)* - Pacer's numerical index for this entry (can be an empty string, as not all Pacer entries are numbered) 78 | - `docket_text` *(string)* 79 | - `documents` *(dictionary)* - each key is either a non-zero attachment number or '0' for the main document, and each value is a dictionary with the following structure: 80 | - `url` *(string)* - the Pacer URL for this document 81 | - `span` *(dictionary)* - the starting and ending indices (within `docket_text`) of the hyperlink to the document, formatted as a dictionary with keys `start` and `end` 82 | - `edges` *(list of tuples)* - each element is a three-value tuple (encoded in graph-edge format) representing a hyperlink between two docket entries, with the first value encoding the index of the source entry within `docket`, the second value encoding the index of the target entry, and the third value encoding the starting and ending indices of the hyperlink within `docket_text` (as specified in `span` above)
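As an illustration (all values below are invented), a single `docket` entry whose text links to its main document, one attachment, and an earlier entry might look like:
```
{
  "date_filed": "06/29/2007",
  "ind": "12",
  "docket_text": "MOTION to dismiss by John Doe (Attachments: # 1 Exhibit A) (re: 8)",
  "documents": {
    "0": {"url": "https://ecf.ilnd.uscourts.gov/doc1/...", "span": {"start": 0, "end": 6}},
    "1": {"url": "https://ecf.ilnd.uscourts.gov/doc1/...", "span": {"start": 44, "end": 47}}
  },
  "edges": [[11, 7, {"start": 64, "end": 65}]]
}
```
Here the entry is assumed to sit at index 11 of `docket`, and the '8' in the text is assumed to link to the entry at index 7.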
83 | 84 | The following fields are not pulled directly from the Pacer docket, and are primarily meant for internal use: 85 | - `mdl_id_source` *(string)* - the origin of `mdl_code` (either 'lead_case_id' or 'flags') 86 | - `is_mdl` *(boolean)* - true if `mdl_code` is non-null or if the case has any MDL flags 87 | - `is_multi` *(boolean)* - true if `is_mdl` is true or if any of `lead_case_id`, `member_case_key`, or `other_court` is non-null 88 | - `member_case_key` *(string)* - a UCID-formatted version of `lead_case_id` (or a copy of `ucid` if this case is a lead case); used to write MDL-related data to an external file for improved performance 89 | - `source` *(string)* - used to distinguish between JSONs from this parser and similarly-formatted JSONs from other sources; if generated by this parser, will always be 'pacer' 90 | - `download_url` *(string)* - the URL from which this HTML was downloaded; only present if parsing an HTML from the SCALES scraper 91 | 92 | The following fields are pulled from the 'Transaction Receipt' at the bottom of the Pacer docket: 93 | - `billable_pages` *(integer)* 94 | - `cost` *(float)* 95 | - `download_timestamp` *(string)* 96 | - `n_docket_reports` *(integer)* - the number of times the SCALES scraper has modified this docket (1 if there have never been updates, >1 if new docket entries have been added after the initial download) 97 | - `pacer_case_id` *(integer)* - the unique numerical ID that Pacer uses internally to identify this document (pulled from Pacer's XML responses to user queries; not visible on the docket sheet itself) 98 | 99 | Case summaries: 100 | - `summary` *(object)* - case summary information, fully documented below 101 | 102 | ## Case summaries 103 | Case summaries can be downloaded through the SCALES scraper. They provide some additional information that is not available in the case docket reports. By default the scraper will place any downloaded summaries in the `/summaries` sub-directory of a given court directory. 104 | 105 | When the parser runs, it will also parse any summaries associated with a case, searching for the summary html files in the summaries sub-directory (which can be manually specified with the `--summaries-dir` option). 106 | 
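Summaries can also be parsed programmatically via the pipeline in `parse_summary.py`; a minimal sketch (the summary path is illustrative, and the `support` imports must resolve the same way they do in that script):
```
from pathlib import Path

from parse_summary import SummaryPipeline

html = Path('summaries/1-07-cv-00431.html').read_text()  # illustrative path
# The pipeline maps (raw html, {}) -> (soup, dict of extracted summary fields)
data, summary = SummaryPipeline.process(html, {})
print(summary['case_id'], summary['date_filed'])
```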
107 | The schemas for civil and criminal cases are slightly different, because PACER presents the data in different ways. The main difference is that for criminal cases, each defendant has its own list of plaintiffs, whereas for civil cases there is a single list of all parties in the case (including both plaintiffs and defendants). 108 | 
109 | ### Criminal Schema 110 | - `case_id` (*string*) - the case id e.g. '1:16-cv-00001, All defendants' 111 | - `case_name` (*string*) - e.g. 'USA v. Johnson et al.' 112 | - `date_filed` (*string*) - the case filing date 113 | - `date_terminated` (*string*) - the case terminating date 114 | - `date_of_last_filing` (*string*) - the date of last filing in the case 115 | - `presiding` (*string*) - presiding judge, if any 116 | - `referral` (*string*) - referred judge, if any 117 | - `billable_pages` (*int*) - no. of billable pages (usually just 1) 118 | - `cost` (*float*) - the cost of downloading the case summary (usually 0.10) 119 | - `download_timestamp` (*string*) - the time the case summary was downloaded 120 | - `defendants` (*list of objects*) - a list of defendants in the case. For each defendant there is: 121 | - `plaintiffs` (*list of objects*) - objects containing `role`, `represented_by` and contact fields `fax`, `email` and `phone`. Note: often non-fax-related things end up in the `fax` field e.g. 'US Govt Attorney' 122 | - `name` (*string*) - defendant name 123 | - `ind` (*string*) - defendant index within the case (*should* link back to the docket report) 124 | - `office` (*string*) - the court office 125 | - `county` (*string*) - the court county 126 | - `filed` (*string*) - defendant-specific filing date 127 | - `terminated` (*string*) - defendant-specific termination date 128 | - `reopened` (*string*) - defendant-specific reopening date 129 | - `other_court_case` (*string*) - other associated cases 130 | - `defendant_custody_status` (*string*) - 131 | - `flags` (*list of strings*) - pacer flags that applied to the defendant e.g. ['CLOSED', 'PRO_SE'] 132 | - `pending_status` (*string*) - 133 | - `magistrate_case` (*string*) - previous magistrate case, if any 134 | - `counts` (*list of objects*) - containing `count` (the count reference e.g. '1sss'), `citation`, `offense_level` and `text` (the text associated with the count) 135 | - `complaints` (*list of objects*) - containing `citation`, `offense_level` and `text` (the text associated with the complaint) 136 | 137 | 
138 | ### Civil Schema 139 | - `case_id` (*string*) - the case id e.g. '1:16-cv-00001, All defendants' 140 | - `case_name` (*string*) - e.g. 'USA v. Johnson et al.' 141 | - `date_filed` (*string*) - the case filing date 142 | - `date_terminated` (*string*) - the case terminating date 143 | - `date_of_last_filing` (*string*) - the date of last filing in the case 144 | - `presiding` (*string*) - presiding judge, if any 145 | - `referral` (*string*) - referred judge, if any 146 | - `billable_pages` (*int*) - no. of billable pages (usually just 1) 147 | - `cost` (*float*) - the cost of downloading the case summary (usually 0.10) 148 | - `download_timestamp` (*string*) - the time the case summary was downloaded 149 | - `parties` (*list of objects*) - a list of parties in the case. For each party there is: 150 | - `role` (*string*) - their role in the case e.g. 'Plaintiff', 'Defendant' 151 | - `name` (*string*) - party name 152 | - `represented_by` (*string*) - name of party's representation 153 | - `fax` (*string*) - contact fax no.; note: often non-fax-related things end up in the `fax` field e.g. 'Pro Hac Vice', 'MDL' 154 | - `email` (*string*) - contact email address 155 | - `phone` (*string*) - contact phone no. 156 | 
-------------------------------------------------------------------------------- /src/pacer_tools/code/parsers/parse_summary.py: -------------------------------------------------------------------------------- 
1 | import re 2 | import sys 3 | from pathlib import Path 4 | 
5 | from bs4 import BeautifulSoup 6 | 
7 | sys.path.append(str(Path.cwd().resolve().parents[1])) 8 | from support import data_tools as dtools 9 | from support import fhandle_tools as ftools 10 | 
11 | # Patterns 12 | RE_DEF = '^(?P<name>[\s\S]+?)\s* \((?P<ind>\S+)\)$' 13 | CASE_META_KEYS = ('case_id', 'case_name', 'presiding', 'referral', 'date_filed', 'date_terminated', 'date_of_last_filing') 14 | 15 | 
16 | class Pipeline: 17 | ''' A simple pipeline structure for parsing''' 18 | def __init__(self, pipes): 19 | ''' 20 | Inputs: 21 | - pipes (list): a list of functions that map (data,extracted) -> (data,extracted) 22 | ''' 23 | self.pipes = pipes 24 | 
25 | def process(self, data, extracted): 26 | for fn in self.pipes: 27 | data, extracted = fn(data, extracted) 28 | 
29 | return data, extracted 30 | 
31 | def scrub_bad_tags(data, extracted): 32 | ''' 33 | Remove the tags that are breaking the parsing because of: 34 | 1. Illegal nesting e.g. <u><b>text</u></b> 35 | 2. Unclosed <br> tags 36 | Inputs: 37 | - data (str): raw html string from summary page html 38 | - extracted (dict) 39 | Outputs: 40 | (data,extracted) as above 41 | ''' 42 | 
43 | pat = '<u>|</u>|<b>|</b>|<br>|<br/>
' 44 | data = re.sub(pat,'', data, flags=re.I) 45 | 46 | return data, extracted 47 | 48 | def extract_header(data, extracted): 49 | ''' Extract the header 50 | Inputs: 51 | - data (bs4 object): the page soup 52 | - extracted (dict): 53 | Outputs: 54 | (data, extracted) as above 55 | ''' 56 | soup = data 57 | header = soup.select_one('#cmecfMainContent center') 58 | header_vals = [] 59 | 60 | for el in header.contents: 61 | try: 62 | val = el.text 63 | except: 64 | val = el 65 | val=val.strip() 66 | if val: 67 | header_vals.append(val) 68 | 69 | header_data = { 70 | 'case_id': header_vals[0], 71 | 'case_name': header_vals[1] 72 | } 73 | pairs_start = 2 74 | 75 | if 'presiding' in header_vals[pairs_start]: 76 | header_data['presiding'] = header_vals[pairs_start] 77 | pairs_start += 1 78 | 79 | if 'referr' in header_vals[pairs_start]: 80 | header_data['referral'] = header_vals[pairs_start] 81 | pairs_start += 1 82 | 83 | # Add pairs 84 | for i in range(pairs_start,len(header_vals),2): 85 | try: 86 | key_name = "_".join(header_vals[i].rstrip(':').lower().split()) 87 | header_data[key_name] = header_vals[i+1] 88 | except IndexError: 89 | print('Something unseen in header metadata') 90 | 91 | extracted.update(header_data) 92 | 93 | return data, extracted 94 | 95 | def extract_cell(tag, as_tuple=False): 96 | ''' 97 | Extract metadata from a table cell tag 98 | 99 | Inputs: 100 | - tag (bs4 element): a table cell (td), though if extracting a table will be a tr (see below) 101 | 102 | Outputs: 103 | - (dict/tuple) dict with a single mapping {key:val} if it's a regular cell, or else a dict with 104 | multiple keys and values if the cell is itself a table with multiple fields/rows. Unless as_tuple 105 | is True, then a single (key,val) tuple is returned 106 | ''' 107 | # Recursively deals with cell if it contains a table 108 | if tag.select('table'): 109 | table_data = {} 110 | for tr in tag.select('tr'): 111 | table_data.update( extract_cell(tr)) 112 | return table_data 113 | else: 114 | 115 | key, val = tag.text.split(':',1) 116 | key = '_'.join(key.lower().strip().split()) 117 | val = val.strip().replace(' ',' ') 118 | return {key:val} if not as_tuple else (key,val) 119 | 120 | def grab_parties_table(table): 121 | ''' 122 | Grab info from tables that have rows that look like the following: 123 | 124 | Plaintiff/Defendant: represented by Phone/Fax/email... 125 | 126 | For criminal cases this will be the Plaintiff table (one for each defendant). 127 | For civil cases this will correspond to the entire party table. 128 | 129 | Inputs: 130 | - table (bs4 tag): the tag corresponding to the table to parse 131 | Outputs: 132 | - parties (dict): the parties info from the table 133 | ''' 134 | parties = [] 135 | 136 | rows = [ch for ch in table.children if ch.name=='tr'] 137 | if not len(rows): 138 | tbody = table.select_one('tbody') 139 | 140 | # If there is literally nothing between the
<table> tags 141 | if not tbody: 142 | return parties 143 | else: 144 | rows = [ch for ch in table.select_one('tbody').children if ch.name=='tr'] 145 | 
146 | for i, tr in enumerate(rows): 147 | party = {} 148 | 
149 | cells = [ch for ch in tr.children if ch.name=='td'] 150 | 
151 | # Get party role and name first 152 | role_and_name = cells[0] 153 | role, name = extract_cell(role_and_name, as_tuple=True) 154 | party['role'] = role 155 | party['name'] = name 156 | 
157 | # If no representation info 158 | if len(cells) < 3: 159 | party['represented_by'] = None 160 | 
161 | else: 162 | if cells[1].text.strip() == 'represented by': 163 | party['represented_by'] = cells[2].text.strip() 164 | 
165 | contact = cells[3] 166 | party.update( extract_cell(contact) ) 167 | 
168 | parties.append(party) 169 | 
170 | return parties 171 | 
172 | def get_civil_parties(data, extracted): 173 | ''' 174 | Method to extract main data from summary for civil cases 175 | 
176 | Inputs: 177 | - data (bs4 object): the soup 178 | - extracted (dict): the case extracted data 179 | 
180 | Outputs: 181 | (data, extracted) as above 182 | 
183 | ''' 184 | main_tables_cv = data.select('#cmecfMainContent > table') 185 | 
186 | if len(main_tables_cv) < 2: 187 | raise ValueError 188 | 
189 | case_data = {} 190 | 
191 | meta_table = main_tables_cv[0] 192 | parties_table = main_tables_cv[1] 193 | 
194 | for i, tr in enumerate(ch for ch in meta_table.select('tr') if ch.name=='tr'): 195 | 196 | 
197 | tr_text = tr.text.strip() 198 | 
199 | # Skip blank lines 200 | if not tr_text: 201 | continue 202 | 
203 | else: 204 | key=None 205 | for child in tr.children: 206 | if child.name=='td': 207 | 
208 | # Check if a value present with no key 209 | if not child.select('b') and key is not None: 210 | # Use key from previous iteration: 211 | extracted[key] = child.text 212 | else: 213 | 214 | 
215 | key,val = extract_cell(child, as_tuple=True) 216 | 
217 | extracted[key] = val 218 | 219 | 
220 | # PARTIES 221 | parties = grab_parties_table(parties_table) 222 | extracted['parties'] = parties 223 | 
224 | return data, extracted 225 | 
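# For reference (illustrative input): the defendant header rows parsed below look
# like 'Smith, John (1)'; RE_DEF captures name='Smith, John' and ind='1' from them.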
226 | def get_criminal_def_pla(data, extracted): 227 | ''' 228 | Method to extract main data from summary for criminal cases 229 | 
230 | Inputs: 231 | - data (bs4 object): the soup 232 | - extracted (dict): the case extracted data 233 | 
234 | Outputs: 235 | (data, extracted) as above 236 | 
237 | ''' 238 | 
239 | extracted['defendants'] = [] 240 | 
241 | main_tables = data.select('#cmecfMainContent > table') 242 | 243 | 
244 | if not (len(main_tables) % 2 == 0): 245 | raise ValueError('Imbalanced number of plaintiff/defendant tables') 246 | 
247 | # Iterate over the tables in pairs (defendant info, list of plaintiffs) 248 | for def_ord in range(0, len(main_tables), 2): 249 | 
250 | defendant_table = main_tables[def_ord] 251 | plaintiff_table = main_tables[def_ord+1] 252 | 
253 | defendant = {'counts': [], 'complaints':[], 'plaintiffs':[], } 254 | count_instance, cmplt_instance = None, None 255 | 
256 | for i, tr in enumerate(defendant_table.select('tr')): 257 | 
258 | tr_text = tr.text.strip() 259 | 
260 | # Skip blank lines 261 | if not tr_text: 262 | continue 263 | 
264 | # First row, grab defendant name 265 | elif i==0: 266 | def_text = tr_text 267 | def_match = re.match(RE_DEF, def_text) 268 | match_dict = def_match.groupdict() if def_match else {} 269 | defendant['name'] = (match_dict.get('name') or def_text).strip().replace(' ',' ') 270 | defendant['ind'] = (match_dict.get('ind') or '').strip() 271 | continue 272 | 
273 | # New count row 274 | elif tr_text.startswith('Count:'): 275 | count_instance = {} 276 | for td in tr.select('td'): 277 | count_instance.update( extract_cell(td) ) 278 | 
279 | # If previous line was a new count instance, grab the count text from this line 280 | elif count_instance: 281 | count_instance['text'] = tr_text 282 | defendant['counts'].append(count_instance.copy()) 283 | # Reset count instance 284 | count_instance = None 285 | 
286 | # New complaint row 287 | elif tr_text.startswith('Complaint'): 288 | # Start a new complaint instance 289 | cmplt_instance = {} 290 | 
291 | # Skip the first cell (the 'Complaint' cell, not a k:v pair) 292 | for td in tr.select('td')[1:]: 293 | cmplt_instance.update( extract_cell(td) ) 294 | 
295 | # If previous line was a new complaint instance, grab the complaint text from this line 296 | elif cmplt_instance: 297 | cmplt_instance['text'] = tr_text 298 | defendant['complaints'].append(cmplt_instance.copy()) 299 | # Reset complaint instance 300 | cmplt_instance = None 301 | 
302 | # Magistrate info is split over multiple tds, so just pass the whole row 303 | elif tr_text.startswith('Magistrate'): 304 | defendant.update( extract_cell(tr) ) 305 | 306 | 
307 | # Otherwise it's general data about the defendant's case, grab it 308 | else: 309 | for child in tr.children: 310 | if child.name=='td': 311 | try: 312 | defendant.update( extract_cell(child) ) 313 | except: 314 | if child.text.strip().lower().startswith('complaint'): 315 | defendant.update({'complaint':None}) 316 | 317 | 
318 | # flag/flags 319 | if 'flag' in defendant: 320 | defendant['flags'] = defendant['flag'] 321 | del defendant['flag'] 322 | defendant['flags'] = (defendant.get('flags') or '').split(',') 323 | 
324 | # other court case/cases 325 | if 'other_court_cases' in defendant: 326 | defendant['other_court_case'] = defendant['other_court_cases'] 327 | del defendant['other_court_cases'] 328 | 
329 | # PLAINTIFFS 330 | plaintiffs = grab_parties_table(plaintiff_table) 331 | defendant['plaintiffs'] = plaintiffs 332 | 
333 | extracted['defendants'].append(defendant) 334 | 
335 | return data, extracted 336 | 
337 | def get_main_data(data, extracted): 338 | ''' Get the main data from the summary, dispatching to the civil or criminal parsing function''' 339 | 
340 | case_type = ftools.decompose_caseno(extracted['case_id'])['case_type'] 341 | 
342 | if case_type == 'cv': 343 | data, extracted = get_civil_parties(data,extracted) 344 | 
345 | elif case_type == 'cr': 346 | data, extracted = get_criminal_def_pla(data, extracted) 347 | 
348 | else: 349 | raise ValueError('Only know how to parse cv and cr cases') 350 | 
351 | return data, extracted 352 | 
353 | def ensure_keys(data, extracted): 354 | ''' Guarantee key existence for all case meta keys, even if they weren't found''' 355 | 
356 | for k in CASE_META_KEYS: 357 | extracted[k] = extracted.get(k,'') 358 | 
359 | return data, extracted 360 | 
361 | def get_summary_transaction_data(data, extracted): 362 | ''' Get the transaction data for the summary''' 363 | transaction_data = ftools.parse_transaction_history(str(data)) 364 | extracted['billable_pages'] = int(transaction_data['billable_pages']) if 'billable_pages' in transaction_data.keys() else None 365 | extracted['cost'] = float(transaction_data['cost']) if 'cost' in transaction_data.keys() else None 366 | extracted['download_timestamp'] = transaction_data.get('timestamp','') 367 | 
368 | return data, extracted 369 | 
370 | # This is the complete summary pipeline 371 | # Use the inherited Pipeline.process method to process data 372 | SummaryPipeline = 
Pipeline([ 373 | scrub_bad_tags, 374 | lambda d,e: (BeautifulSoup(d,'html.parser'), e) , 375 | extract_header, 376 | get_main_data, 377 | ensure_keys, 378 | get_summary_transaction_data 379 | ]) -------------------------------------------------------------------------------- /src/pacer_tools/data/annotation/district_courts.csv: -------------------------------------------------------------------------------- 1 | abbreviation,name,circuit,citation_abbreviation,state,cardinal,courtname,homepage,start_date,end_date,count,jurisdiction 2 | dcd,"District Court, District of Columbia",District of Columbia,D.D.C.,District of Columbia,,district-of-columbia,http://www.dcd.uscou…,Unknown,Unknown,30822.0,Federal District 3 | ald,"District Court, D. Alabama",,D. Ala.,Alabama,,alabama,,1820-04-21,1824-03-10,0.0,Federal District 4 | almd,"District Court, M.D. Alabama",Eleventh,M.D. Ala.,Alabama,Middle,middle-alabama,http://www.almd.usco…,Unknown,Unknown,2266.0,Federal District 5 | alnd,"District Court, N.D. Alabama",Eleventh,N.D. Ala.,Alabama,Northern,northern-alabama,http://www.alnd.usco…,Unknown,Unknown,1247.0,Federal District 6 | alsd,"District Court, S.D. Alabama",Eleventh,S.D. Ala.,Alabama,Southern,southern-alabama,http://www.als.uscou…,Unknown,Unknown,724.0,Federal District 7 | akd,"District Court, D. Alaska",Ninth,D. Alaska,Alaska,,alaska,http://www.akd.uscou…,Unknown,Unknown,444.0,Federal District 8 | azd,"District Court, D. Arizona",Ninth,D. Ariz.,Arizona,,arizona,http://www.azd.uscou…,Unknown,Unknown,1286.0,Federal District 9 | ared,"District Court, E.D. Arkansas",Eighth,E.D. Ark.,Arkansas,Eastern,eastern-arkansas,http://www.are.uscou…,Unknown,Unknown,1190.0,Federal District 10 | arwd,"District Court, W.D. Arkansas",Eighth,W.D. Ark.,Arkansas,Western,western-arkansas,http://www.arwd.usco…,Unknown,Unknown,810.0,Federal District 11 | cacd,"District Court, C.D. California",Ninth,C.D. Cal.,California,Central,central-california,http://www.cacd.usco…,Unknown,Unknown,3246.0,Federal District 12 | caed,"District Court, E.D. California",Ninth,E.D. Cal.,California,Eastern,eastern-california,http://www.caed.usco…,Unknown,Unknown,1258.0,Federal District 13 | cand,"District Court, N.D. California",Ninth,N.D. Cal.,California,Northern,northern-california,http://www.cand.usco…,Unknown,Unknown,4669.0,Federal District 14 | casd,"District Court, S.D. California",Ninth,S.D. Cal.,California,Southern,southern-california,http://www.casd.usco…,Unknown,Unknown,1725.0,Federal District 15 | cod,"District Court, D. Colorado",Tenth,D. Colo.,Colorado,,colorado,http://www.cod.uscou…,Unknown,Unknown,3237.0,Federal District 16 | ctd,"District Court, D. Connecticut",Second,D. Conn.,Connecticut,,connecticut,http://www.ctd.uscou…,Unknown,Unknown,4495.0,Federal District 17 | ded,"District Court, D. Delaware",Third,D. Del.,Delaware,,delaware,http://www.ded.uscou…,Unknown,Unknown,3748.0,Federal District 18 | flmd,"District Court, M.D. Florida",Eleventh,M.D. Fla.,Florida,Middle,middle-florida,http://www.flmd.usco…,Unknown,Unknown,2998.0,Federal District 19 | flnd,"District Court, N.D. Florida",Eleventh,N.D. Fla.,Florida,Northern,northern-florida,http://www.flnd.usco…,Unknown,Unknown,536.0,Federal District 20 | flsd,"District Court, S.D. Florida",Eleventh,S.D. Fla.,Florida,Southern,southern-florida,http://www.flsd.usco…,Unknown,Unknown,4193.0,Federal District 21 | gamd,"District Court, M.D. Georgia",Eleventh,M.D. Ga.,Georgia,Middle,middle-georgia,http://www.gamd.usco…,Unknown,Unknown,837.0,Federal District 22 | gand,"District Court, N.D. 
Georgia",Eleventh,N.D. Ga.,Georgia,Northern,northern-georgia,http://www.gand.usco…,Unknown,Unknown,2814.0,Federal District 23 | gasd,"District Court, S.D. Georgia",Eleventh,S.D. Ga.,Georgia,Southern,southern-georgia,http://www.gasd.usco…,Unknown,Unknown,771.0,Federal District 24 | hid,"District Court, D. Hawaii",Ninth,D. Haw.,Hawaii,,hawaii,http://www.hid.uscou…,Unknown,Unknown,1321.0,Federal District 25 | idd,"District Court, D. Idaho",Ninth,D. Idaho,Idaho,,idaho,http://www.id.uscour…,Unknown,Unknown,430.0,Federal District 26 | ilcd,"District Court, C.D. Illinois",Seventh,C.D. Ill.,Illinois,Central,central-illinois,http://www.ilcd.usco…,Unknown,Unknown,1029.0,Federal District 27 | ilnd,"District Court, N.D. Illinois",Seventh,N.D. Ill.,Illinois,Northern,northern-illinois,http://www.ilnd.usco…,Unknown,Unknown,12460.0,Federal District 28 | ilsd,"District Court, S.D. Illinois",Seventh,S.D. Ill.,Illinois,Southern,southern-illinois,http://www.ilsd.usco…,Unknown,Unknown,505.0,Federal District 29 | innd,"District Court, N.D. Indiana",Seventh,N.D. Ind.,Indiana,Northern,northern-indiana,http://www.innd.usco…,4/21/2028,Unknown,1829.0,Federal District 30 | insd,"District Court, S.D. Indiana",Seventh,S.D. Ind.,Indiana,Southern,southern-indiana,http://www.insd.usco…,4/21/2028,Unknown,1387.0,Federal District 31 | indianad,"District Court, D. Indiana",,D. Ind.,Indiana,,indiana,http://www.insd.usco…,1817-03-03,4/21/2028,3.0,Federal District 32 | iand,"District Court, N.D. Iowa",Eighth,N.D. Iowa,Iowa,Northern,northern-iowa,http://www.iand.usco…,Unknown,Unknown,1230.0,Federal District 33 | iasd,"District Court, S.D. Iowa",Eighth,S.D. Iowa,Iowa,Southern,southern-iowa,http://www.iasd.usco…,Unknown,Unknown,1165.0,Federal District 34 | ksd,"District Court, D. Kansas",Tenth,D. Kan.,Kansas,,kansas,http://www.ksd.uscou…,Unknown,Unknown,4748.0,Federal District 35 | kyed,"District Court, E.D. Kentucky",Sixth,E.D. Ky.,Kentucky,Eastern,eastern-kentucky,http://www.kyed.usco…,Unknown,Unknown,863.0,Federal District 36 | kywd,"District Court, W.D. Kentucky",Sixth,W.D. Ky.,Kentucky,Western,western-kentucky,http://www.kywd.usco…,Unknown,Unknown,1011.0,Federal District 37 | laed,"District Court, E.D. Louisiana.",Fifth,E.D. La.,Louisiana,Eastern,eastern-louisiana,http://www.laed.usco…,Unknown,Unknown,2763.0,Federal District 38 | lamd,"District Court, M.D. Louisiana",Fifth,M.D. La.,Louisiana,Middle,middle-louisiana,http://www.lamd.usco…,Unknown,Unknown,706.0,Federal District 39 | lawd,"District Court, W.D. Louisiana",Fifth,W.D. La.,Louisiana,Western,western-louisiana,http://www.lawd.usco…,Unknown,Unknown,1223.0,Federal District 40 | med,"District Court, D. Maine",First,D. Me.,Maine,,maine,http://www.med.uscou…,Unknown,Unknown,2505.0,Federal District 41 | mdd,"District Court, D. Maryland",Fourth,D. Maryland,Maryland,,maryland,https://www.mdd.usco…,Unknown,Unknown,5001.0,Federal District 42 | mad,"District Court, D. Massachusetts",First,D. Mass.,Massachusetts,,massachusetts,http://www.mad.uscou…,Unknown,Unknown,7532.0,Federal District 43 | mied,"District Court, E.D. Michigan",Sixth,E.D. Mich.,Michigan,Eastern,eastern-michigan,http://www.mied.usco…,Unknown,Unknown,5436.0,Federal District 44 | miwd,"District Court, W.D. Michigan",Sixth,W.D. Mich.,Michigan,Western,western-michigan,http://www.miwd.usco…,Unknown,Unknown,1696.0,Federal District 45 | mnd,"District Court, D. Minnesota",Eighth,D. Minnesota,Minnesota,,minnesota,http://www.mnd.uscou…,Unknown,Unknown,3249.0,Federal District 46 | msnd,"District Court, N.D. 
Mississippi",Fifth,N.D. Miss.,Mississippi,Northern,northern-mississippi,http://www.msnd.usco…,Unknown,Unknown,942.0,Federal District 47 | mssd,"District Court, S.D. Mississippi",Fifth,S.D. Miss.,Mississippi,Southern,southern-mississippi,http://www.mssd.usco…,Unknown,Unknown,1696.0,Federal District 48 | moed,"District Court, E.D. Missouri",Eighth,E.D. Mo.,Missouri,Eastern,eastern-missouri,http://www.moed.usco…,Unknown,Unknown,2632.0,Federal District 49 | mowd,"District Court, W.D. Missouri",Eighth,W.D. Mo.,Missouri,Western,western-missouri,http://www.mow.uscou…,Unknown,Unknown,1957.0,Federal District 50 | mtd,"District Court, D. Montana",Ninth,D. Mont.,Montana,,montana,http://www.mtd.uscou…,Unknown,Unknown,816.0,Federal District 51 | ned,"District Court, D. Nebraska",Eighth,D. Neb.,Nebraska,,nebraska,http://www.ned.uscou…,Unknown,Unknown,1165.0,Federal District 52 | nvd,"District Court, D. Nevada",Ninth,D. Nev.,Nevada,,nevada,http://www.nvd.uscou…,Unknown,Unknown,1221.0,Federal District 53 | nhd,"District Court, D. New Hampshire",First,D.N.H.,New Hampshire,,new-hampshire,http://www.nhd.uscou…,Unknown,Unknown,912.0,Federal District 54 | njd,"District Court, D. New Jersey",Third,D.N.J.,New Jersey,,new-jersey,http://www.njd.uscou…,1789-09-24,Unknown,4865.0,Federal District 55 | nmd,"District Court, D. New Mexico",Tenth,D.N.M.,New Mexico,,new-mexico,http://www.nmcourt.f…,Unknown,Unknown,1084.0,Federal District 56 | nyed,"District Court, E.D. New York",Second,E.D.N.Y,New York,Eastern,eastern-new-york,http://www.nyed.usco…,Unknown,Unknown,7792.0,Federal District 57 | nynd,"District Court, N.D. New York",Second,N.D.N.Y.,New York,Northern,northern-new-york,http://www.nynd.usco…,Unknown,Unknown,2578.0,Federal District 58 | nysd,"District Court, S.D. New York",Second,S.D.N.Y.,New York,Southern,southern-new-york,http://www.nysd.usco…,Unknown,Unknown,26704.0,Federal District 59 | nywd,"District Court, W.D. New York",Second,W.D.N.Y.,New York,Western,western-new-york,http://www.nywd.usco…,Unknown,Unknown,2924.0,Federal District 60 | nced,"District Court, E.D. North Carolina",Fourth,E.D.N.C.,North Carolina,Eastern,eastern-north-carolina,http://www.nced.usco…,Unknown,Unknown,1076.0,Federal District 61 | ncmd,"District Court, M.D. North Carolina",Fourth,M.D.N.C.,North Carolina,Middle,middle-north-carolina,http://www.ncmd.usco…,Unknown,Unknown,1083.0,Federal District 62 | ncwd,"District Court, W.D. North Carolina",Fourth,W.D.N.C.,North Carolina,Western,western-north-carolina,http://www.ncwd.usco…,Unknown,Unknown,1061.0,Federal District 63 | ndd,"District Court, D. North Dakota",Eighth,D.N.D.,North Dakota,,north-dakota,http://www.ndd.uscou…,Unknown,Unknown,606.0,Federal District 64 | ohnd,"District Court, N.D. Ohio",Sixth,N.D. Ohio,Ohio,Northern,northern-ohio,http://www.ohnd.usco…,Unknown,Unknown,3293.0,Federal District 65 | ohsd,"District Court, S.D. Ohio",Sixth,S.D. Ohio,Ohio,Southern,southern-ohio,http://www.ohsd.usco…,Unknown,Unknown,2860.0,Federal District 66 | oked,"District Court, E.D. Oklahoma",Tenth,E.D. Okla.,Oklahoma,Eastern,eastern-oklahoma,http://www.oked.usco…,Unknown,Unknown,174.0,Federal District 67 | oknd,"District Court, N.D. Oklahoma",Tenth,N.D. Okla.,Oklahoma,Northern,northern-oklahoma,http://www.oknd.usco…,Unknown,Unknown,485.0,Federal District 68 | okwd,"District Court, W.D. Oklahoma",Tenth,W.D. Okla.,Oklahoma,Western,western-oklahoma,http://www.okwd.usco…,,,,Federal District 69 | ord,"District Court, D. Oregon",Ninth,D. 
Or.,Oregon,,oregon,http://www.ord.uscou…,Unknown,Unknown,1859.0,Federal District 70 | paed,"District Court, E.D. Pennsylvania",Third,E.D. Pa.,Pennsylvania,Eastern,eastern-pennsylvania,http://www.paed.usco…,Unknown,Unknown,10441.0,Federal District 71 | pamd,"District Court, M.D. Pennsylvania",Third,M.D. Pa.,Pennsylvania,Middle,middle-pennsylvania,http://www.pamd.usco…,Unknown,Unknown,2304.0,Federal District 72 | pawd,"District Court, W.D. Pennsylvania",Third,W.D. Pa.,Pennsylvania,Western,western-pennsylvania,http://www.pawd.usco…,Unknown,Unknown,3250.0,Federal District 73 | rid,"District Court, D. Rhode Island",First,D.R.I.,Rhode Island,,rhode-island,http://www.rid.uscou…,Unknown,Unknown,1556.0,Federal District 74 | southcarolinaed,"District Court, E.D. South Carolina",,E.D.S.C.,South Carolina,Eastern,eastern-south-carolina,http://www.scd.uscou…,1823-02-21,1965-10-07,181.0,Federal District 75 | southcarolinawd,"District Court, W.D. South Carolina",,W.D.S.C.,South Carolina,Western,western-south-carolina,http://www.ncwd.usco…,1823-02-21,1965-10-07,95.0,Federal District 76 | scd,"District Court, D. South Carolina",Fourth,D.S.C.,South Carolina,,south-carolina,https://www.scd.uscourts.gov/,,,,Federal District 77 | sdd,"District Court, D. South Dakota",Eighth,D.S.D.,South Dakota,,south-dakota,http://www.sdd.uscou…,Unknown,Unknown,867.0,Federal District 78 | tned,"District Court, E.D. Tennessee",Sixth,E.D. Tenn.,Tennessee,Eastern,eastern-tennessee,http://www.tned.usco…,Unknown,Unknown,1363.0,Federal District 79 | tnmd,"District Court, M.D. Tennessee",Sixth,M.D. Tenn.,Tennessee,Middle,middle-tennessee,http://www.tnmd.usco…,Unknown,Unknown,1147.0,Federal District 80 | tnwd,"District Court, W.D. Tennessee",Sixth,W.D. Tenn.,Tennessee,Western,western-tennessee,http://www.tnwd.usco…,Unknown,Unknown,892.0,Federal District 81 | txed,"District Court, E.D. Texas",Fifth,E.D. Tex.,Texas,Eastern,eastern-texas,http://www.txed.usco…,Unknown,Unknown,1591.0,Federal District 82 | txnd,"District Court, N.D. Texas",Fifth,N.D. Tex.,Texas,Northern,northern-texas,http://www.txnd.usco…,Unknown,Unknown,2412.0,Federal District 83 | txsd,"District Court, S.D. Texas",Fifth,S.D. Tex.,Texas,Southern,southern-texas,http://www.txs.uscou…,Unknown,Unknown,3497.0,Federal District 84 | txwd,"District Court, W.D. Texas",Fifth,W.D. Tex.,Texas,Western,western-texas,http://www.txwd.usco…,Unknown,Unknown,1283.0,Federal District 85 | utd,"District Court, D. Utah",Tenth,D. Utah,Utah,,utah,http://www.utd.uscou…,Unknown,Unknown,1237.0,Federal District 86 | vtd,"District Court, D. Vermont",Second,D. Vt.,Vermont,,vermont,http://www.vtd.uscou…,Unknown,Unknown,668.0,Federal District 87 | vaed,"District Court, E.D. Virginia",Fourth,E.D. Va.,Virginia,Eastern,eastern-virginia,http://www.vaed.usco…,Unknown,Unknown,4530.0,Federal District 88 | vawd,"District Court, W.D. Virginia",Fourth,W.D. Va.,Virginia,Western,western-virginia,http://www.vawd.usco…,Unknown,Unknown,2018.0,Federal District 89 | waed,"District Court, E.D. Washington",Ninth,E.D. Wash.,Washington,Eastern,eastern-washington,http://www.waed.usco…,1905-03-02,Unknown,437.0,Federal District 90 | wawd,"District Court, W.D. Washington",Ninth,W.D. Wash.,Washington,Western,western-washington,http://www.wawd.usco…,1905-03-02,Unknown,1146.0,Federal District 91 | wvnd,"District Court, N.D. West Virginia",Fourth,N.D.W. Va.,West Virginia,Northern,northern-west-virginia,http://www.wvnd.usco…,Unknown,Unknown,466.0,Federal District 92 | wvsd,"District Court, S.D. West Virginia",Fourth,S.D.W.
Va.,West Virginia,Southern,southern-west-virginia,http://www.wvsd.usco…,Unknown,Unknown,1238.0,Federal District 93 | wied,"District Court, E.D. Wisconsin",Seventh,E.D. Wis.,Wisconsin,Eastern,eastern-wisconsin,http://www.wied.usco…,Unknown,Unknown,2740.0,Federal District 94 | wiwd,"District Court, W.D. Wisconsin",Seventh,W.D. Wis.,Wisconsin,Western,western-wisconsin,http://www.wiwd.usco…,Unknown,Unknown,1181.0,Federal District 95 | wyd,"District Court, D. Wyoming",Tenth,D. Wyo.,Wyoming,,wyoming,http://www.wyd.uscou…,Unknown,Unknown,481.0,Federal District 96 | gud,"District Court, D. Guam",Ninth,D. Guam,Guam,,guam,http://www.gud.uscou…,Unknown,Unknown,38.0,Federal District 97 | nmid,"District Court, Northern Mariana Islands",Ninth,N. Mar. I.,Northern Mariana Islands,,northern-mariana-islands,http://www.nmid.usco…,Unknown,Unknown,16.0,Federal District 98 | prd,"District Court, D. Puerto Rico",First,D.P.R.,Puerto Rico,,puerto-rico,http://www.prd.uscou…,Unknown,Unknown,4054.0,Federal District 99 | vid,"District Court, Virgin Islands",Third,D.V.I.,Virgin Islands,,virgin-islands,http://www.vid.uscou…,Unknown,Unknown,656.0,Federal District 
--------------------------------------------------------------------------------