├── P0_download_bulk.py ├── LICENSE ├── README.md └── P1_extract_text.py /P0_download_bulk.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | save_dest = Path("data/source") 5 | save_dest.mkdir(exist_ok=True, parents=True) 6 | 7 | 8 | def download_set(name): 9 | 10 | url = f"https://www.courtlistener.com/api/bulk-data/{name}/all.tar" 11 | f_save = save_dest / f"{name}_all.tar" 12 | 13 | if not f_save.exists(): 14 | os.system(f"curl {url} -o {f_save}") 15 | 16 | 17 | # Dockets look like detailed meta information, not text 18 | # datasets = ['dockets', 'opinions'] 19 | 20 | datasets = ["opinions"] 21 | 22 | for name in datasets: 23 | download_set(name) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022 Travis Hoppe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The-Pile-FreeLaw
2 |
3 | Download, parse, and filter data from [Court Listener](https://www.courtlistener.com/api/bulk-info/), part of the FreeLaw project. Data-ready for [The-Pile](https://github.com/EleutherAI/The-Pile).
4 |
5 | While data exists for multiple modalities (e.g. dockets, people, ...), we focus on court opinions for language modeling.
6 |
7 | ### Pile-V2 Statistics
8 |
9 | ✔ Saved to data/FreeLaw_Opinions.jsonl.zst
10 | ℹ Date completed 12/15/2021
11 | ℹ Saved 4,940,710 opinions (1,378,695 added, 38.7% growth)
12 | ℹ Uncompressed filesize 69,470,526,055
13 | ℹ Compressed filesize 20,975,955,178
14 | ℹ sha256sum 8a38c34f181aa121c3a7360ad63e3e8c0b1ea0913de08a4bf1b68b3eabae3e66
15 |
16 | Additional columns to find free text were added [`html_columbia`,
17 | `html_anon_2020`, `html_with_citations`, `xml_harvard`] which account
18 | for the larger increase from V1 to V2. 
19 |
20 | ### Pile-V1 Statistics
21 |
22 | ✔ Saved to data/FreeLaw_Opinions.jsonl.zst
23 | ℹ Saved 3,562,015 opinions
24 | ℹ Uncompressed filesize 56,138,746,490
25 | ℹ Compressed filesize 17,013,175,549
26 | ℹ sha256sum 7d7ba907cf397e8585bb3ef148b3e9678edbf142b2247460f907c16aecbaed2d
--------------------------------------------------------------------------------
/P1_extract_text.py:
--------------------------------------------------------------------------------
1 | import tarfile
2 | from tqdm import tqdm
3 | from pathlib import Path
4 | import io
5 | import json
6 | import tempfile
7 | import bs4
8 | import jsonlines
9 | from dspipe import Pipe
10 | from wasabi import msg
11 |
12 | f_tar = "data/source/opinions_all.tar"  # produced by P0_download_bulk.py
13 | f_save = "data/FreeLaw_Opinions.jsonl"
14 |
15 |
16 | def jurisdiction_iter():  # yield (jurisdiction name, BytesIO of its nested tar)
17 |
18 |     # Handle nested tars
19 |     with tarfile.open(f_tar, "r") as ALL:
20 |
21 |         for packet in ALL:
22 |
23 |             with tempfile.TemporaryDirectory() as tmp_dir:
24 |                 ALL.extract(packet, path=tmp_dir)  # NOTE(review): extracts an untrusted tar member to disk; ALL.extractfile(packet) would avoid both the tmpdir round-trip and path-traversal concerns
25 |                 f_subtar = Path(tmp_dir) / packet.name
26 |
27 |                 with open(f_subtar, "rb") as FIN:
28 |                     raw = FIN.read()  # whole nested tar buffered in memory before the tmpdir is removed
29 |
30 |                 yield packet.name, io.BytesIO(raw)
31 |
32 | """
33 | with tarfile.open(f_subtar, "r") as TAR:
34 |     for k, subpacket in enumerate(TAR):
35 |
36 |         js = TAR.extractfile(subpacket).read()
37 |         yield packet.name, subpacket.name, js
38 | """
39 |
40 |
41 | def idempotent(x):  # plain_text needs no conversion; used as the no-op converter below
42 |     return x
43 |
44 |
45 | def html2text(x):  # strip HTML/XML markup, keep only the visible text
46 |     soup = bs4.BeautifulSoup(x, "lxml")
47 |     return soup.get_text()
48 |
49 |
50 | field_order = [  # candidate text fields paired with their text converters
51 |     ("plain_text", idempotent),
52 |     ("html", html2text),
53 |     ("html_lawbox", html2text),
54 |     ("html_columbia", html2text),
55 |     ("html_anon_2020", html2text),
56 |     ("html_with_citations", html2text),
57 |     ("xml_harvard", html2text),
58 | ]
59 |
60 | error_str = (  # upstream failure placeholder; records whose html equals this are skipped
61 |     "Unable to extract the content from this file. Please try reading the original."
62 | ) 63 | 64 | 65 | def parse_json(item): 66 | 67 | name0, name1, raw_str = item 68 | js = json.loads(raw_str) 69 | 70 | text = None 71 | 72 | if "html" in js and js["html"] == error_str: 73 | return None 74 | 75 | for k, func in field_order: 76 | if k in js and isinstance(js[k], str) and len(js[k]): 77 | text = func(js[k]) 78 | 79 | if text is None: 80 | msg.fail(f"No text found for {name0} {name1}") 81 | return None 82 | 83 | meta = {} 84 | meta["case_jurisdiction"] = name0 85 | meta["case_ID"] = name1 86 | meta["date_created"] = js["date_created"] 87 | 88 | data = {"meta": meta, "text": text} 89 | 90 | return data 91 | 92 | 93 | def data_iter(name0, tar_bytes): 94 | 95 | with tarfile.open(fileobj=tar_bytes) as TAR: 96 | for k, subpacket in enumerate(TAR): 97 | raw_json = TAR.extractfile(subpacket).read() 98 | name1 = subpacket.name 99 | yield name0, name1, raw_json 100 | 101 | 102 | with jsonlines.open(f_save, "w") as FOUT: 103 | 104 | for name0, tar_bytes in jurisdiction_iter(): 105 | msg.info(f"Processing {name0}") 106 | 107 | for js in Pipe(data_iter(name0, tar_bytes))(parse_json, -1): 108 | 109 | if js is None: 110 | continue 111 | 112 | FOUT.write(js) 113 | --------------------------------------------------------------------------------