├── P0_download_bulk.py ├── LICENSE ├── README.md └── P1_extract_text.py /P0_download_bulk.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | save_dest = Path("data/source") 5 | save_dest.mkdir(exist_ok=True, parents=True) 6 | 7 | 8 | def download_set(name): 9 | 10 | url = f"https://www.courtlistener.com/api/bulk-data/{name}/all.tar" 11 | f_save = save_dest / f"{name}_all.tar" 12 | 13 | if not f_save.exists(): 14 | os.system(f"curl {url} -o {f_save}") 15 | 16 | 17 | # Dockets look like detailed meta information, not text 18 | # datasets = ['dockets', 'opinions'] 19 | 20 | datasets = ["opinions"] 21 | 22 | for name in datasets: 23 | download_set(name) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022 Travis Hoppe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The-Pile-FreeLaw
2 |
3 | Download, parse, and filter data from [Court Listener](https://www.courtlistener.com/api/bulk-info/), part of the FreeLaw project. Data-ready for [The-Pile](https://github.com/EleutherAI/The-Pile).
4 |
5 | While data exists for multiple modalities (e.g. dockets, people, ...), we focus on court opinions for language modeling.
6 |
7 | ### Pile-V2 Statistics
8 |
9 | ✔ Saved to data/FreeLaw_Opinions.jsonl.zst
10 | ℹ Date completed 12/15/2021
11 | ℹ Saved 4,940,710 opinions (1,378,695 added, 38.7% growth)
12 | ℹ Uncompressed filesize 69,470,526,055
13 | ℹ Compressed filesize 20,975,955,178
14 | ℹ sha256sum 8a38c34f181aa121c3a7360ad63e3e8c0b1ea0913de08a4bf1b68b3eabae3e66
15 |
16 | Additional columns to find free text were added [`html_columbia`,
17 | `html_anon_2020`, `html_with_citations`, `xml_harvard`] which account
18 | for the larger increase from V1 to V2. 
19 |
20 | ### Pile-V1 Statistics
21 |
22 | ✔ Saved to data/FreeLaw_Opinions.jsonl.zst
23 | ℹ Saved 3,562,015 opinions
24 | ℹ Uncompressed filesize 56,138,746,490
25 | ℹ Compressed filesize 17,013,175,549
26 | ℹ sha256sum 7d7ba907cf397e8585bb3ef148b3e9678edbf142b2247460f907c16aecbaed2d
--------------------------------------------------------------------------------
/P1_extract_text.py:
--------------------------------------------------------------------------------
1 | import tarfile
2 | from tqdm import tqdm
3 | from pathlib import Path
4 | import io
5 | import json
6 | import tempfile
7 | import bs4
8 | import jsonlines
9 | from dspipe import Pipe
10 | from wasabi import msg
11 |
12 | f_tar = "data/source/opinions_all.tar"  # produced by P0_download_bulk.py
13 | f_save = "data/FreeLaw_Opinions.jsonl"
14 |
15 |
16 | def jurisdiction_iter():  # yield (jurisdiction name, BytesIO of its nested tar)
17 |
18 |     # Handle nested tars
19 |     with tarfile.open(f_tar, "r") as ALL:
20 |
21 |         for packet in ALL:
22 |
23 |             with tempfile.TemporaryDirectory() as tmp_dir:
24 |                 ALL.extract(packet, path=tmp_dir)  # NOTE(review): extracts an untrusted tar member to disk; ALL.extractfile(packet) would avoid both the tmpdir round-trip and path-traversal concerns
25 |                 f_subtar = Path(tmp_dir) / packet.name
26 |
27 |                 with open(f_subtar, "rb") as FIN:
28 |                     raw = FIN.read()  # whole nested tar buffered in memory before the tmpdir is removed
29 |
30 |                 yield packet.name, io.BytesIO(raw)
31 |
32 | """
33 | with tarfile.open(f_subtar, "r") as TAR:
34 |     for k, subpacket in enumerate(TAR):
35 |
36 |         js = TAR.extractfile(subpacket).read()
37 |         yield packet.name, subpacket.name, js
38 | """
39 |
40 |
41 | def idempotent(x):  # plain_text needs no conversion; used as the no-op converter below
42 |     return x
43 |
44 |
45 | def html2text(x):  # strip HTML/XML markup, keep only the visible text
46 |     soup = bs4.BeautifulSoup(x, "lxml")
47 |     return soup.get_text()
48 |
49 |
50 | field_order = [  # candidate text fields paired with their text converters
51 |     ("plain_text", idempotent),
52 |     ("html", html2text),
53 |     ("html_lawbox", html2text),
54 |     ("html_columbia", html2text),
55 |     ("html_anon_2020", html2text),
56 |     ("html_with_citations", html2text),
57 |     ("xml_harvard", html2text),
58 | ]
59 |
60 | error_str = (  # upstream failure placeholder; records whose html equals this are skipped
61 |     "Unable to extract the content from this file. Please try reading the original."
62 | ) 63 | 64 | 65 | def parse_json(item): 66 | 67 | name0, name1, raw_str = item 68 | js = json.loads(raw_str) 69 | 70 | text = None 71 | 72 | if "html" in js and js["html"] == error_str: 73 | return None 74 | 75 | for k, func in field_order: 76 | if k in js and isinstance(js[k], str) and len(js[k]): 77 | text = func(js[k]) 78 | 79 | if text is None: 80 | msg.fail(f"No text found for {name0} {name1}") 81 | return None 82 | 83 | meta = {} 84 | meta["case_jurisdiction"] = name0 85 | meta["case_ID"] = name1 86 | meta["date_created"] = js["date_created"] 87 | 88 | data = {"meta": meta, "text": text} 89 | 90 | return data 91 | 92 | 93 | def data_iter(name0, tar_bytes): 94 | 95 | with tarfile.open(fileobj=tar_bytes) as TAR: 96 | for k, subpacket in enumerate(TAR): 97 | raw_json = TAR.extractfile(subpacket).read() 98 | name1 = subpacket.name 99 | yield name0, name1, raw_json 100 | 101 | 102 | with jsonlines.open(f_save, "w") as FOUT: 103 | 104 | for name0, tar_bytes in jurisdiction_iter(): 105 | msg.info(f"Processing {name0}") 106 | 107 | for js in Pipe(data_iter(name0, tar_bytes))(parse_json, -1): 108 | 109 | if js is None: 110 | continue 111 | 112 | FOUT.write(js) 113 | --------------------------------------------------------------------------------