├── .github ├── FUNDING.yml └── workflows │ ├── pytest.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── citeurl ├── __init__.py ├── authority.py ├── citation.py ├── citator.py ├── cli.py ├── mdx.py ├── regex_mods.py ├── templates │ ├── caselaw.yaml │ ├── general federal law.yaml │ ├── secondary sources.yaml │ ├── specific federal laws.yaml │ └── state law.yaml ├── tokens.py └── web │ ├── __init__.py │ ├── citeurl.js │ ├── logo.svg │ ├── makejs.py │ ├── resources.py │ ├── server.py │ └── style.css ├── docs ├── 404.html ├── assets │ ├── _mkdocstrings.css │ ├── favicon.png │ ├── images │ │ └── favicon.png │ ├── javascripts │ │ ├── bundle.d6f25eb3.min.js │ │ ├── bundle.d6f25eb3.min.js.map │ │ ├── lunr │ │ │ ├── min │ │ │ │ ├── lunr.ar.min.js │ │ │ │ ├── lunr.da.min.js │ │ │ │ ├── lunr.de.min.js │ │ │ │ ├── lunr.du.min.js │ │ │ │ ├── lunr.el.min.js │ │ │ │ ├── lunr.es.min.js │ │ │ │ ├── lunr.fi.min.js │ │ │ │ ├── lunr.fr.min.js │ │ │ │ ├── lunr.he.min.js │ │ │ │ ├── lunr.hi.min.js │ │ │ │ ├── lunr.hu.min.js │ │ │ │ ├── lunr.hy.min.js │ │ │ │ ├── lunr.it.min.js │ │ │ │ ├── lunr.ja.min.js │ │ │ │ ├── lunr.jp.min.js │ │ │ │ ├── lunr.kn.min.js │ │ │ │ ├── lunr.ko.min.js │ │ │ │ ├── lunr.multi.min.js │ │ │ │ ├── lunr.nl.min.js │ │ │ │ ├── lunr.no.min.js │ │ │ │ ├── lunr.pt.min.js │ │ │ │ ├── lunr.ro.min.js │ │ │ │ ├── lunr.ru.min.js │ │ │ │ ├── lunr.sa.min.js │ │ │ │ ├── lunr.stemmer.support.min.js │ │ │ │ ├── lunr.sv.min.js │ │ │ │ ├── lunr.ta.min.js │ │ │ │ ├── lunr.te.min.js │ │ │ │ ├── lunr.th.min.js │ │ │ │ ├── lunr.tr.min.js │ │ │ │ ├── lunr.vi.min.js │ │ │ │ └── lunr.zh.min.js │ │ │ ├── tinyseg.js │ │ │ └── wordcut.js │ │ └── workers │ │ │ ├── search.6ce7567c.min.js │ │ │ └── search.6ce7567c.min.js.map │ ├── logo.svg │ └── stylesheets │ │ ├── main.8c3ca2c6.min.css │ │ ├── main.8c3ca2c6.min.css.map │ │ ├── palette.06af60db.min.css │ │ └── palette.06af60db.min.css.map ├── frontends │ └── index.html ├── index.html ├── library │ 
└── index.html ├── objects.inv ├── search │ └── search_index.json ├── sitemap.xml ├── sitemap.xml.gz └── template-yamls │ └── index.html ├── docs_source ├── assets │ ├── favicon.png │ └── logo.svg ├── frontends.md ├── index.md ├── library.md └── template-yamls.md ├── mkdocs.yml ├── setup.py └── tests ├── __init__.py ├── _test_links.py ├── test_core.py ├── test_js.py ├── test_mdx.py └── test_templates.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: simonsherred 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | name: Run Tests 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | 10 | - name: Set up Python 3.9 11 | uses: actions/setup-python@v2 12 | with: 13 | python-version: 3.9 14 | 15 | - name: Install CiteURL 16 | run: python3 -m pip install citeurl 17 | 18 | - name: Install test dependencies 19 | run: python3 -m pip install pytest pytest-cov pytest-github-actions-annotate-failures quickjs markdown 20 | 21 | - name: Test with pytest 22 | run: python3 -m pytest . 
--exitfirst --verbose --failed-first --cov=citeurl 23 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: push 12 | 13 | jobs: 14 | pypi-publish: 15 | name: upload release to PyPI 16 | runs-on: ubuntu-latest 17 | environment: 18 | name: pypi 19 | url: https://pypi.org/p/citeurl 20 | permissions: 21 | id-token: write 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Python 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: '3.x' 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install build 32 | - name: Build package 33 | run: python -m build 34 | - name: publish package distributions 35 | uses: pypa/gh-action-pypi-publish@release/v1 36 | 37 | # deploy: 38 | # runs-on: ubuntu-latest 39 | # steps: 40 | # - uses: actions/checkout@v2 41 | # - name: Set up Python 42 | # uses: actions/setup-python@v2 43 | # with: 44 | # python-version: '3.x' 45 | # - name: Install dependencies 46 | # run: | 47 | # python -m pip install --upgrade pip 48 | # pip install build 49 | # - name: Build package 50 | # run: python -m build 51 | # - name: Publish package 52 | # uses: pypa/gh-action-pypi-#publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 53 | # with: 54 | # user: __token__ 55 | # password: ${{ secrets.PYPI_API_TOKEN }} 56 | 57 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sh 2 | *.egg-info/ 3 | **/__pycache__/* 4 | **/.pytest_cache/* 5 | **/.mypy_cache/* 6 | **.sync-conflict 7 | build/ 8 | demo/ 9 | dist/ 10 | custom_templates/ 11 | .coverage 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Simon Raindrum Sherred 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.md 2 | include README.md 3 | graft citeurl 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | CiteURL is an extensible tool that parses legal citations and makes links to websites where you can read the relevant law for free. It can be used to quickly look up a reference, or to insert a hyperlink for every long- or short-form citation found in a longer text. 5 | 6 | If you want to quickly try it out, it's available as a web app at [citation.link](https://www.citation.link). 7 | 8 | --- 9 | 10 | Here's a sample of the links CiteURL can make: 11 | 12 | > Federal law provides that courts should award prevailing civil rights plaintiffs reasonable attorneys fees, 42 USC § 1988(b), and, by discretion, expert fees, id. at (c). This is because the importance of civil rights litigation cannot be measured by a damages judgment. See Riverside v. Rivera, 477 U.S. 561 (1986). But Evans v. Jeff D. upheld a settlement where the plaintiffs got everything they wanted, on condition that they waive attorneys' fees. 475 U.S. 717 (1986). This ruling lets savvy defendants create a wedge between plaintiffs and their attorneys, discouraging civil rights suits and undermining the court's logic in Riverside, 477 U.S. at 574-78. 13 | 14 | --- 15 | 16 | By default, CiteURL supports Bluebook-style citations to [over 130 sources](https://github.com/raindrum/citeurl/blob/main/citeurl/templates) of U.S. law, including: 17 | 18 | - most reported state and federal court opinions 19 | - the U.S. Code and Code of Federal Regulations 20 | - the U.S. Constitution and all state constitutions 21 | - the codified laws for every state and territory except Arkansas, Georgia, Guam, and Puerto Rico. 22 | 23 | You can also add more sources of law by [writing your own citation templates](https://raindrum.github.io/citeurl/template-yamls/) in YAML format. 
24 | 25 | ## Installation 26 | 27 | To install just enough to make CiteURL work, run this command: 28 | 29 | ```bash 30 | python3 -m pip install citeurl 31 | ``` 32 | 33 | Substitute `citeurl[full]` for `citeurl` if you want to install the optional dependencies `flask` and `appdirs`, necessary for hosting citeurl as a website and reading custom templates from the user's home directory. 34 | 35 | 36 | 37 | ## Usage 38 | 39 | CiteURL provides four command-line tools: 40 | 41 | - `citeurl process`: Parse a text and insert an HTML hyperlink for every citation it contains, including shortform citations. 42 | - `citeurl lookup`: Look up a single citation and display information about it. 43 | - `citeurl host`: Host an instance of CiteURL as a web app like [citation.link](https://www.citation.link). 44 | - `citeurl makejs`: Export an instance of CiteURL's lookup feature as JavaScript or a static web page. More info is available [here](https://raindrum.github.io/citeurl/frontends#javascript). 45 | 46 | Each command has its own command-line arguments you can view with the `-h` option. They all share the `-t` option, which allows you to load a list of custom [citation templates](https://raindrum.github.io/citeurl/template-yamls/) in YAML form. 
47 | 48 | Here are a few common use cases: 49 | 50 | ```bash 51 | # Process a court opinion and output a version where each citation is hyperlinked: 52 | citeurl process -i INPUT_FILE.html -o OUTPUT_FILE.html 53 | ``` 54 | 55 | ```bash 56 | # Look up a single citation and open it directly in a browser 57 | citeurl lookup "42 USC 1983" -b 58 | ``` 59 | 60 | ```bash 61 | # List the top ten authorities cited in a text, from most citations to least: 62 | cat INPUT_FILE.html | citeurl process -a 10 63 | ``` 64 | 65 | ```bash 66 | # Host a lookup tool with custom templates, and serve it on the local network: 67 | citeurl host -t PATH_TO_YOUR_TEMPLATES.YAML -s 68 | ``` 69 | 70 | CiteURL is also available in a few other forms besides the command-line tool: 71 | 72 | - [citation.link](https://www.citation.link), the web app 73 | - [a flexible Python library](https://raindrum.github.io/citeurl/library) 74 | - [an extension](https://raindrum.github.io/citeurl/frontends#markdown-extension) to [Python-Markdown](https://python-markdown.github.io/) 75 | - [a desktop search provider](https://extensions.gnome.org/extension/4225/gnome-citeurl-search-provider/) for Linux users with the GNOME shell 76 | 77 | ## Credits 78 | 79 | Many thanks to these websites, which CiteURL's default templates frequently link to: 80 | 81 | - Harvard's [Caselaw Access Project](https://cite.case.law/) - for most court cases 82 | - [CourtListener](https://www.courtlistener.com/) - for other court cases 83 | - Cornell's [Legal Information Institute](https://www.law.cornell.edu/) - for the U.S. 
# Token names that are too specific to tell one authority from another.
# Kept as immutable tuples so the defaults can never be mutated by accident;
# each call receives its own fresh list copy.
_AUTHORITY_IGNORED_TOKENS = (
    'subsection',
    'subdivision',
    'clause',
    'pincite',
    'paragraph',
)
# NOTE(review): list_authorities has historically used a slightly different
# default (no 'subdivision'). Preserved as-is; confirm whether intentional.
_LISTING_IGNORED_TOKENS = ('subsection', 'clause', 'pincite', 'paragraph')


class Authority:
    """
    A distinct source of law (e.g. one statute section or one court
    case) together with every citation that has been matched to it.

    Attributes:
        template: the template of the citation the authority was built from
        ignored_tokens: token names that do not distinguish authorities
        tokens: the model citation's tokens, up to (but excluding) the
            first ignored one
        citations: the citations known to refer to this authority
    """

    def __init__(
        self,
        model_cite: 'Citation',
        ignored_tokens=None,
    ):
        """
        Build an authority from a model citation.

        Arguments:
            model_cite: a citation object exposing `template`, `tokens`,
                `text` and `parent`. Its `template` attribute is read
                immediately, so a plain string will not work here.
            ignored_tokens: names of tokens that are too specific to
                identify the authority. Defaults to subsection,
                subdivision, clause, pincite and paragraph.
        """
        if ignored_tokens is None:
            ignored_tokens = list(_AUTHORITY_IGNORED_TOKENS)
        self.template = model_cite.template
        self.ignored_tokens = ignored_tokens
        self.tokens = {}
        for key, value in model_cite.tokens.items():
            # Stop at the first ignored token rather than skipping it:
            # tokens after it are dropped too. This matches the assumption
            # used in `name` that optional tokens come after mandatory ones.
            if key in ignored_tokens:
                break
            self.tokens[key] = value
        self.citations = [model_cite]

    def __str__(self):
        return self.name

    def __repr__(self):
        count = len(self.citations)
        # "1 cite", but "0 cites" / "2 cites"
        return f'{self.name} ({count} cite{"" if count == 1 else "s"})'

    def __contains__(self, cite: 'Citation'):
        """
        Whether the citation is a reference to this authority, i.e. it
        comes from a template with the same name and agrees with every
        token stored on the authority. A severable token also counts as
        agreeing when the citation's value starts with the authority's.
        """
        if cite.template.name != self.template.name:
            return False
        for key, value in self.tokens.items():
            counterpart_value = cite.tokens.get(key)
            if counterpart_value == value:
                continue
            if (
                self.template.tokens[key].severable
                # both sides must be strings, or startswith() would raise
                and isinstance(value, str)
                and isinstance(counterpart_value, str)
                and counterpart_value.startswith(value)
            ):
                continue
            return False
        return True

    @cached_property
    def name(self):
        """
        Human-readable name of the authority.

        Uses the template's name_builder when there is one. Otherwise the
        name is reverse-engineered from the text of the longform citation
        at the root of the first citation's parent chain.
        """
        # build a name the proper way if a name_builder is defined
        if self.template.name_builder:
            return self.template.name_builder(self.tokens)

        # otherwise use regex to reverse-engineer a name.
        # first find a longform citation to use as a starting point
        base_cite = self.citations[0]
        while base_cite.parent:
            base_cite = base_cite.parent

        # only tokens that have a string value in the longform citation
        # can be located in its text (re.escape rejects None)
        usable = [
            token for token in self.tokens
            if isinstance(base_cite.tokens.get(token), str)
        ]

        # next construct a regex pattern to pull out the relevant
        # tokens, along with the non-token text preceding each one.
        # the non-token "prelude" text lets us replace the text of each
        # token only when it appears in the proper context. Each new
        # segment is nested inside the previous optional group.
        pattern = ''
        for token in usable:
            regex = re.escape(base_cite.tokens[token])
            segment = f"""((?P<{token}_prelude>.*?)(?P<{token}>{regex}))?"""
            if pattern:
                pattern = pattern[:-2] + segment + ')?'
            else:
                pattern = segment

        # every group is optional, so this always yields a match object
        match = re.match(pattern, base_cite.text)
        matched = [token for token in usable if match.group(token)]

        # slice off all the text after the last relevant token. This is
        # to remove things like subsections, etc. It assumes that all
        # the optional tokens (subsection, pincite, etc) appear *after*
        # all the mandatory ones. If no token could be located at all,
        # keep the whole text instead of slicing at a bogus index.
        if matched:
            base_cite_text = base_cite.text[:match.end(matched[-1])]
        else:
            base_cite_text = base_cite.text

        # for each token, replace the value from the longform citation
        # with the corresponding value for *this* authority
        for token in matched:
            replacement = self.tokens[token]
            if not isinstance(replacement, str):
                continue
            prelude = match.group(f'{token}_prelude') or ''
            old_value = prelude + match.group(token)
            new_value = prelude + replacement
            base_cite_text = base_cite_text.replace(old_value, new_value)
        return base_cite_text

    @cached_property
    def URL(self):
        """
        URL produced by the template's URL_builder from this authority's
        tokens, with spaces percent-encoded, or None when the template
        has no URL_builder. A falsy builder result is returned unchanged.
        """
        if not self.template.URL_builder:
            return None
        url = self.template.URL_builder(self.tokens)
        if url:
            url = url.replace(' ', '%20')
        return url


def list_authorities(
    cites: 'list[Citation]',
    ignored_tokens=None,
    known_authorities: 'list[Authority]' = None,
    sort_by_cites: bool = True,
) -> 'list[Authority]':
    """
    Get a list of all the authorities that appear in the given list of
    citations. An authority represents a distinct section of law or
    court case. Two citations to the same authority can have different
    tokens, as long as those tokens are in the list of ignored_tokens.

    Arguments:
        cites: the citations to group
        ignored_tokens: token names that do not distinguish authorities.
            Defaults to subsection, clause, pincite and paragraph.
        known_authorities: authorities to start from. The list itself is
            not modified, but matching citations are appended to the
            `citations` of the Authority objects it contains.
        sort_by_cites: whether to order the result from most-cited to
            least-cited

    Returns:
        the known authorities plus any newly discovered ones
    """
    if ignored_tokens is None:
        ignored_tokens = list(_LISTING_IGNORED_TOKENS)
    authorities = list(known_authorities or [])
    for cite in cites:
        for authority in authorities:
            if cite in authority:
                authority.citations.append(cite)
                break
        else:
            # no existing authority matched, so start a new one
            authorities.append(Authority(cite, ignored_tokens))
    if sort_by_cites:
        authorities.sort(key=lambda x: -len(x.citations))
    return authorities
def get_next_child(self, span: tuple=None):
    """
    Return the first shortform or idform citation that follows this
    one in the source text, or None if there is none.

    Arguments:
        span: optional (start,) or (start, end) tuple limiting where to
            look. Defaults to everything after the end of this citation.
    """
    # NOTE(review): unlike get_idform_cite and get_shortform_cites, a
    # SyntaxError raised while building the child is not caught here and
    # will reach the caller. Left unchanged; confirm whether callers
    # depend on that.
    try:
        match = next(match_regexes(
            regexes = self.shortform_regexes + self.idform_regexes,
            text = self.source_text,
            span = span if span else (self.span[1], ),
        ))
        return Citation(match=match, template=self.template, parent=self)
    except StopIteration:
        return None

def __str__(self):
    return str(self.text)

def __repr__(self):
    # The repr is simply the matched text. A second, more detailed
    # return statement used to follow this one but could never run.
    return str(self.text)

def __contains__(self, other_cite):
    """
    Returns True if both citations are from templates with the same
    name, their tokens are not identical, and every token that has a
    value in this citation has the same value in the other one.
    Severable tokens are considered a match if the other token's
    value *starts with* this one's.
    """
    if (
        other_cite.template.name != self.template.name
        or other_cite.tokens == self.tokens
    ):
        return False
    for key, value in self.tokens.items():
        if not value:
            # tokens this citation leaves unset place no constraint
            continue
        other_value = other_cite.tokens.get(key)
        if other_value == value:
            continue
        if (
            self.template.tokens[key].severable
            # a missing or non-string value can never be a more
            # specific version of this one
            and isinstance(value, str)
            and isinstance(other_value, str)
            and other_value.startswith(value)
        ):
            continue
        return False
    return True
class CitationPostprocessor(Postprocessor):
    """
    Python-Markdown postprocessor that passes the rendered text through
    insert_links so that detected citations become hyperlinks.
    """

    def __init__(
        self,
        citator,
        attributes: dict,
        redundant_links: bool,
        URL_optional: bool,
        break_id_on_regex: str,
        ignore_markup: bool,
    ):
        """
        Arguments:
            citator: the citator handed to insert_links
            attributes: forwarded to insert_links as `attrs`
            redundant_links: forwarded to insert_links unchanged
            URL_optional: forwarded to insert_links unchanged
            break_id_on_regex: if truthy, compiled as a regex and
                forwarded to insert_links as `id_breaks`; otherwise
                `id_breaks` is None
            ignore_markup: forwarded to insert_links unchanged
        """
        super().__init__()
        self.citator = citator
        self.attributes = attributes
        self.redundant_links = redundant_links
        self.URL_optional = URL_optional
        # Store the flag itself. A trailing comma here previously turned
        # it into a one-element tuple, which is always truthy, so passing
        # ignore_markup=False had no effect.
        self.ignore_markup = ignore_markup

        if break_id_on_regex:
            self.id_breaks = re.compile(break_id_on_regex)
        else:
            self.id_breaks = None

    def run(self, text):
        """Return `text` with citation links inserted by insert_links."""
        return insert_links(
            text = text,
            attrs = self.attributes,
            redundant_links = self.redundant_links,
            URL_optional = self.URL_optional,
            id_breaks = self.id_breaks,
            ignore_markup = self.ignore_markup,
            citator = self.citator,
        )
| self.config = { 59 | 'custom_templates': [ 60 | [], 61 | 'List of paths to YAML files containing additional citation' 62 | + 'templates to load. - Default: []', 63 | ], 64 | 'use_defaults': [ 65 | True, 66 | "Load CiteURL's default citation templates? - Default: True" 67 | ], 68 | 'redundant_links': [ 69 | False, 70 | ( 71 | "Whether to insert links links whose URLs are identical " 72 | "to the previous URL" 73 | ) 74 | ], 75 | 'URL_optional': [ 76 | False, 77 | ( 78 | "Whether to add elements for citations that have " 79 | "no URL" 80 | ) 81 | ], 82 | 'break_id_on_regex': [ 83 | None, 84 | "Anywhere this string (parsed as regex) appears in the text, " 85 | + "chains of citations like 'id.' will be interrupted. Note " 86 | + "that this is based on the output HTML, *not* the original " 87 | + f"Markdown text." 88 | ], 89 | 'attributes': [ 90 | {'class': 'citation'}, 91 | ("A dictionary of attributes (besides href) that the inserted" 92 | + " links should have. - Default: '{'class': 'citation'}'") 93 | ], 94 | 'ignore_markup': [ 95 | True, 96 | ( 97 | "Whether to detect citations even when they are " 98 | 'interrupted by inline markup, like "Id. at 32. 
' 99 | 'Default: True' 100 | ) 101 | ], 102 | } 103 | super(CiteURLExtension, self).__init__(**kwargs) 104 | 105 | def extendMarkdown(self, md): 106 | global CITATOR 107 | if not CITATOR: 108 | if self.config['use_defaults'][0]: 109 | CITATOR = _get_default_citator() 110 | else: 111 | CITATOR = Citator(defaults=None) 112 | for path in self.config['custom_templates'][0] or []: 113 | CITATOR.load_yaml(Path(path).read_text()) 114 | 115 | md.postprocessors.register( 116 | CitationPostprocessor( 117 | CITATOR, 118 | self.config['attributes'][0], 119 | self.config['redundant_links'][0], 120 | self.config['URL_optional'][0], 121 | self.config['break_id_on_regex'][0], 122 | self.config['ignore_markup'][0], 123 | ), 124 | "CiteURL", 125 | 1 126 | ) 127 | 128 | def makeExtension(**kwargs): 129 | return CiteURLExtension(**kwargs) 130 | -------------------------------------------------------------------------------- /citeurl/regex_mods.py: -------------------------------------------------------------------------------- 1 | # python standard imports 2 | from typing import Iterable 3 | import re 4 | 5 | def process_pattern( 6 | pattern: str, 7 | replacements: dict[str, str], 8 | token_prefix: str = None, 9 | add_word_breaks: bool = False, 10 | ): 11 | """ 12 | For a given regex pattern, find all the places that a key in the 13 | replacements dict appears, enclosed in curly braces. Replace each 14 | one with the corresponding value, enclosed in parentheses. 15 | 16 | If token_prefix is provided, it will only replace placeholders that 17 | start with that prefix, e.g. the 'same' in "{same volume}" or 18 | "{same reporter}". 19 | 20 | If add_word_breaks is True, a mandatory word break will be added at 21 | the beginning and end of the pattern. 
22 | """ 23 | for key, value in replacements.items(): 24 | if not value: 25 | continue 26 | if token_prefix: 27 | marker = '{%s %s}' % (token_prefix, key) 28 | else: 29 | marker = '{%s}' % key 30 | if not (value.startswith('(') and value.endswith(')')): 31 | value = f'({value})' 32 | value = fr'{value}(?=\W|$)' 33 | pattern = pattern.replace(marker, value) 34 | if add_word_breaks: 35 | pattern = rf'(? Iterable: 40 | """ 41 | For a given text and set of regex Pattern objects, generate each 42 | non-overlapping match found for any regex. Regexes earlier in 43 | the list take priority over later ones, such that a span of text 44 | that matches the first regex cannot also match the second. 45 | """ 46 | start = span[0] 47 | if len(span) > 1: 48 | end = span[1] 49 | else: 50 | end = None 51 | 52 | keep_trying = True 53 | while keep_trying: 54 | span = (start, end) if end else (start,) 55 | matches = [] 56 | for regex in regexes: 57 | match = regex.search(text, *span) 58 | if match: 59 | matches.append(match) 60 | if matches: 61 | matches.sort(key=lambda x: (x.span()[0], -len(x.group()))) 62 | start = matches[0].span()[1] 63 | yield matches[0] 64 | else: 65 | keep_trying = False 66 | -------------------------------------------------------------------------------- /citeurl/templates/general federal law.yaml: -------------------------------------------------------------------------------- 1 | U.S. Constitution: 2 | meta: 3 | name regex: (United States|U\.? ?S\.?) ?Const(itution|\.?) 4 | tokens: 5 | article: 6 | regex: &cardinals_to_20 '\d|[ivIV]{1,3}|[Oo]ne|[Tt](wo|hree|welve|hirteen)|[Ff](our(teen)?|ive|ifteen)|[Ss](ix(teen)?|even(teen)?)|[Ee]ight(een)?|[Nn]ine(teen)?|[Tt]en|[Ee]leven' 7 | edits: [number style: digit] 8 | section: 9 | regex: \d+|[IVXivx]+ 10 | edits: [number style: digit] 11 | clause: {regex: '\d+'} 12 | patterns: 13 | - - '{name regex},? ' 14 | - &art_sec_cl '[Aa]rt(icle|\.?) ?{article}(,? ([Ss]ec(tions?|t?s?\.?)|§§?) ?{section}(,? [Cc]l(ause|\.?) 
?{clause})?)?' 15 | - [*art_sec_cl, ' [Oo]f [Tt]he {name regex}'] 16 | shortform patterns: 17 | - [&standalone_art_sec_cl '[Aa]rt(icle|\.?) ?{article}(?! of )(,? ([Ss]ec(tions?|t?s?\.?)|§§?) ?{section}(?! of )(,? [Cc]l(ause|\.?) ?{clause}(?! of ))?)?'] 18 | idform patterns: &sec_cl_idforms 19 | - '([Ii]d\. (at )?)?([Ss]ec(tions?|t?s?\.?)|§§?) ?{section},? [Cc]l(ause|\.?) ?{clause}?(?! of )' 20 | - '([Ii]d\. (at )?)?[Cc]l(ause|\.?) ?{clause}(?! of )' 21 | #broad patterns: 22 | # - ['^', *standalone_art_sec_cl] 23 | name builder: 24 | parts: 25 | - 'Article {article}' 26 | - ', Section {section}' 27 | - ', Clause {clause}' 28 | - ' of the U.S. Constitution' 29 | edits: 30 | - token: article 31 | number style: roman 32 | URL builder: 33 | parts: 34 | - https://constitution.congress.gov/browse/ 35 | - article-{article} 36 | - '#{article_roman}_S{section}' 37 | - _C{clause} 38 | edits: 39 | - token: article 40 | number style: roman 41 | output: article_roman 42 | 43 | 44 | U.S. Constitution Amendments: 45 | meta: 46 | name regex: (United States|U\.? ?S\.?) ?Const(itution|\.?) 47 | tokens: 48 | amendment: 49 | regex: &ordinals_to_20 '\d{1,2}(st|nd|rd|th)?|[xivXIV]+|([Tt]wenty(-| )?)?([Ff]irst|[Ss]econd|[Tt]hird|[Ff]ourth|[Ff]ifth|[Ss]ixth|[Ss]eventh|[Ee]ighth|[Nn]inth)|[Tt]enth|[Ee]leventh|[Tt]welfth|([Tt]hir|[Ff]our|[Ff]if|[Ss]ix|[Ss]even|[Ee]igh|[Nn]ine)teenth|[Tt]wentieth' 50 | edits: [number style: digit] 51 | section: 52 | regex: \d{1,2}|[ivxIVX]{1,5} 53 | edits: [number style: digit] 54 | clause: {regex: '\d+'} 55 | patterns: 56 | - - '{name regex},? ' 57 | - &amdt_sec_cl '[Aa]m(end(ment|\.)|dt?\.?) ?{amendment}(,? ([Ss]ec(tions?|s?\.?)|§§?) ?{section}(,? [Cc]l(ause|\.?) ?{clause})?)?' 58 | - - &sec_cl_of_the_amdt '(([Ss]ec(tions?|t?s?\.?)|§§?) ?{section}(,? [Cc]l(ause|\.?) ?{clause})? of )?([Tt]he )?{amendment} [Aa]mendment' 59 | - ' ([Oo]f|[Tt]o) the {name regex}' 60 | broad patterns: 61 | - &bare_amdt_sec_cl [*amdt_sec_cl, '(?! 
of)'] 62 | - [*sec_cl_of_the_amdt, '(?! to)'] 63 | shortform patterns: 64 | - *bare_amdt_sec_cl 65 | idform patterns: *sec_cl_idforms 66 | name builder: 67 | parts: 68 | - 'Section {section}' 69 | - ', Clause {clause} of ' 70 | - 'The {amendment} Amendment to the U.S. Constitution' 71 | edits: 72 | - token: section 73 | lookup: {'.+': ''} 74 | output: has_section 75 | - token: amendment 76 | number style: ordinal 77 | - token: amendment 78 | case: title 79 | URL builder: 80 | parts: 81 | - https://constitution.congress.gov/browse/ 82 | - amendment-{amendment}/ 83 | - '#{amendment}_S{section}' 84 | - _C{clause} 85 | edits: 86 | - token: article 87 | number style: roman 88 | output: article_roman 89 | 90 | 91 | U.S. Code: 92 | meta: {name regex: 'U\. ?S\. ?C(ode|\.)( ?A(nn(otated|\.)|\.)| ?S(erv(ice|\.)|\.?))?|USC[AS]?|United States Code'} 93 | tokens: &title_sec_subsec_tokens 94 | title: {regex: \d+} 95 | section: {regex: '\d[\w.-]*\w|\d'} 96 | subsection: &subsec_token 97 | regex: '(\(([A-Za-z]{1,5}|\d{1,3})\))+' 98 | severable: yes 99 | patterns: &title_sec_subsec_patterns 100 | - - &title ([Tt]itle )?{title} 101 | - ',? {name regex}(,? )?(' 102 | - &section_sign ((§|§|§){1,2}|[Ss]ec(tions?|t?s?\.)) 103 | - ')? ?{section}' 104 | - &subsec '(((,? )?sub(sections?|divisions?|(sec|d(iv)?)?s?\.))? ?{subsection})?' 105 | - ['[Tt]itle {title},? (', *section_sign, ')? ?{section}', *subsec, ' of the {name regex}'] 106 | - ['(', *section_sign, ')? ?{section}', *subsec, ' of [Tt]itle {title} of the {name regex}'] 107 | idform patterns: &id_sec_subsec 108 | - '[Ii]d\.,?( at)?( §§?)? ?{section}( ?{subsection})?' 109 | - '((§|§|§){1,2}|[Ss]ec(tions?|t?s?\.)) {section}( ?{subsection})?(?! of)' 110 | - '[Ii]d\.,? at {subsection}' 111 | shortform pattern: [*section_sign, '{same section}(?! of )', *subsec, '(?! of )'] 112 | name builder: 113 | parts: 114 | - '{title} U.S.C. 
§ {section}' 115 | - '{subsection}' 116 | URL builder: 117 | parts: 118 | - https://www.law.cornell.edu/uscode/text/{title}/{section} 119 | - '#{subsection}' 120 | edits: 121 | - token: subsection 122 | sub: ['\)\(', '_'] 123 | - token: subsection 124 | sub: ['[()]', ''] 125 | 126 | 127 | U.S. Public Laws: 128 | tokens: 129 | congress: {regex: \d+} 130 | law: {regex: \d+} 131 | pattern: Pub(\.?|lic) ?L(\.?|aw) ?(No\.?)? ?{congress}[–‑-]{law} 132 | name builder: 133 | parts: ['Public Law No. {congress}–{law}'] 134 | URL builder: 135 | parts: ['https://uscode.house.gov/statutes/pl/{congress}/{law}.pdf'] 136 | 137 | 138 | U.S. Statutes at Large: 139 | tokens: &vol_page_pin_tokens 140 | volume: {regex: \d+} 141 | page: 142 | regex: '\d([,\d]*\d)?' 143 | edits: [sub: [',', '']] 144 | pincite: {regex: \d+} 145 | pattern: '{volume} Stat\.? {page}(,? {pincite})?' 146 | idform pattern: '[Ii]d\.,? at {pincite}' 147 | URL builder: 148 | parts: ['https://www.govinfo.gov/content/pkg/STATUTE-{volume}/html/STATUTE-{volume}-Pg{page}.htm'] 149 | name builder: 150 | parts: ['{volume} Stat. {page}', ', {pincite}'] 151 | 152 | 153 | Federal Register: 154 | tokens: *vol_page_pin_tokens 155 | pattern: '{volume} (Fed\. ?Reg\.|F\.? ?R\.?) {page}(,? {pincite})?' 156 | idform pattern: '[Ii]d\.,? at {pincite}' 157 | URL builder: 158 | parts: ['https://www.govinfo.gov/link/fr/{volume}/{page}'] 159 | name builder: 160 | parts: ['{volume} FR {page}', ', {pincite}'] 161 | 162 | 163 | Code of Federal Regulations: 164 | inherit: U.S. Code 165 | meta: {name regex: 'C\.? ?F\.? ?R\.?|Code of Federal Regulations'} 166 | name builder: 167 | parts: 168 | - '{title} C.F.R. § {section}' 169 | - '{subsection}' 170 | URL builder: 171 | parts: 172 | - https://ecfr.federalregister.gov/cfr-reference?cfr%5Bdate%5D=current&cfr%5Breference%5D={title} CFR {section} 173 | - '#p-{section}{subsection}' 174 | 175 | 176 | Federal Rules of Civil Procedure: 177 | meta: 178 | acronym: frcp 179 | name: Fed. R. Civ. P. 
180 | name regex: Civ(il|\.) ?P(rocedure|(roc?)?\.)|C\.? ?P\.? 181 | tokens: 182 | rule: 183 | regex: '\d+(\.\d+)?[a-z]?' 184 | subsection: *subsec_token 185 | patterns: 186 | - (Fed(eral|\.) ?R(ules?|\.)|F\.? ?R\.?)( of)? ?{name regex}( [Rr]ule)? {rule}( ?{subsection})? 187 | - '[Rr]ule {rule}( ?{subsection})? [Oo]f [Tt]he Fed(eral|\.) Rules of {name regex}' 188 | idform patterns: 189 | - '([Ii]d\.,? (at )?)?[Rr]ule {rule}( ?{subsection}?)?(?! of)' 190 | - '[Ii]d\.,? (at )?{subsection}' 191 | shortform pattern: ['[Rr]ule {same rule}(?! of )', *subsec, '(?! of )'] 192 | name builder: 193 | parts: ['{name} {rule}', '{subsection}'] 194 | URL builder: 195 | parts: 196 | - https://www.law.cornell.edu/rules/{acronym}/rule_{rule} 197 | - '#rule_{rule}_{subsection}' 198 | edits: 199 | - token: subsection 200 | sub: ['\)\(', '_'] 201 | - token: subsection 202 | sub: ['\W', ''] 203 | 204 | 205 | Federal Rules of Appellate Procedure: 206 | inherit: Federal Rules of Civil Procedure 207 | meta: 208 | acronym: frap 209 | name: Fed. R. App. Proc. 210 | name regex: App(ellate|\.) ?P(rocedure|(roc?)?\.)|A\.? ?P\.? 211 | 212 | 213 | Federal Rules of Criminal Procedure: 214 | inherit: Federal Rules of Civil Procedure 215 | meta: 216 | acronym: frcrmp 217 | name: Fed. R. Crim. Proc. 218 | name regex: Crim(inal|\.) ?P(rocedure|(roc?)?\.)|Cr\.? ?P\.? 219 | 220 | 221 | Federal Rules of Evidence: 222 | inherit: Federal Rules of Civil Procedure 223 | meta: 224 | acronym: fre 225 | name: Fed. R. Evid. 226 | name regex: Evid(ence|\.)|E\.? 227 | -------------------------------------------------------------------------------- /citeurl/templates/secondary sources.yaml: -------------------------------------------------------------------------------- 1 | Model Penal Code: 2 | meta: 3 | name: Model Penal Code 4 | name regex: 'M\.? ?P\.? ?C\.?|Model Pen(al|\.) 
Code' 5 | tokens: &art_sec_subsec_tokens 6 | article: {regex: \d+} 7 | section: {regex: \d+} 8 | subsection: 9 | regex: '(\(\w{1,4}\))+' 10 | severable: yes 11 | pattern: '{name regex} § {article}\.{section}( ?{subsection})?' 12 | name builder: 13 | parts: ['{name} § {article}.{section}', '{subsection}'] 14 | 15 | 16 | Revised Model Business Corporation Act: 17 | inherit: Model Penal Code 18 | meta: 19 | name: Revised Model Business Corporation Act 20 | name regex: Rev(ised|\.) ?Model Bus(iness|\.) ?Corp(orations?|s?\.) Act|R\.M\.B\.C\.A|RMBCA 21 | 22 | 23 | Uniform Commercial Code: 24 | tokens: *art_sec_subsec_tokens 25 | pattern: '(U\.? ?C\.? ?C\.?|Uniform Com(mercial|m?\.) Code) (§§? ?)?{article}[–‑-]{section}( ?{subsection})?' 26 | name builder: 27 | parts: ['UCC § {article}-{section}', '{subsection}'] 28 | URL builder: 29 | parts: 30 | - https://www.law.cornell.edu/ucc/{article}/{article}-{section} 31 | - '#{article}-{section}{subsection}' 32 | edits: # only use the first part of the subsection 33 | - token: subsection 34 | sub: ['\).+', ')'] 35 | -------------------------------------------------------------------------------- /citeurl/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raindrum/citeurl/a929a29d7b72b5bf376d90f48759a5496ecaf799/citeurl/web/__init__.py -------------------------------------------------------------------------------- /citeurl/web/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 152 | -------------------------------------------------------------------------------- /citeurl/web/makejs.py: -------------------------------------------------------------------------------- 1 | # python standard imports 2 | from argparse import ArgumentParser 3 | from json import dumps 4 | from pathlib import Path 5 | from re import sub 6 | 7 | # internal imports 8 | from .. 
import Citator 9 | from .resources import unify_regex, format_page, sources_table 10 | from .resources import VERSION 11 | 12 | COPYRIGHT_MESSAGE = f""" 13 | // This script was made with CiteURL {VERSION}, an extensible framework 14 | // to turn legal references into URLs. 15 | // 16 | // The "templates" variable below holds the data necessary to 17 | // turn each kind of citation into a URL. Some or all of the templates may 18 | // have been made by a third party and are not part of CiteURL itself. 19 | // 20 | // CiteURL is copyright of Simon Raindrum Sherred under the MIT License. 21 | // See https://raindrum.github.io/citeurl for more info. 22 | """ 23 | 24 | _dir = Path(__file__).parent.absolute() 25 | BASE_JS_PATH = _dir / 'citeurl.js' 26 | 27 | PAGE = """ 28 |Paste a legal citation here, and you can 30 | go somewhere else on the Web to read what it refereneces:
31 | 40 |This static instance of ' 155 | 'CiteURL ' 156 | 'supports the following types of citation:
Source of Law | 40 |Website | 41 |Citation Format | 42 | 43 | 44 | {rows} 45 | 46 |
---|
Paste a legal citation here, and you can 25 | go somewhere else on the Web to read what it references:
26 | 30 |This web app is powered by 37 | CiteURL, 38 | an open-source tool that recognizes legal citations and generates links to 39 | publicly-available websites where you can read the cited documents.
40 |"Legal citations" here means formal references to U.S. federal and state 41 | laws and court opinions. This is not a search engine. If you enter something 42 | ambiguous, like the name of a court case, nothing will happen.
Instead, citations generally need to follow 44 | Bluebook style, 45 | so that CiteURL can recognize the relevant info (volume, page number, section, 46 | etc.) and turn it into a URL. But just because you know how to cite something 47 | doesn't mean it'll work. Below, you will find the kinds of citation this app 48 | can recognize, along with the websites it will send you to if it does:
49 | {table} 50 |By the way, CiteURL can also detect multiple citations in a longer text, 51 | and insert hyperlinks for each one! Feel free to try that out 52 | here.
53 |Paste some text into the box below and click "Parse" to process 60 | the text and find every supported citation 61 | it contains.
62 | 69 | {output} 70 | """ 71 | 72 | INFO_PAGE = """ 73 |Sorry, "{query}" isn't a citation I recognize.
82 | 83 | """ 84 | 85 | ERROR_501 = """ 86 |Sorry, I can tell that's a {template} citation but I don't have a 88 | link for it.
89 | 90 | """ 91 | 92 | ERROR_413 = """ 93 |Sorry, that's too many characters for the server to process.
95 | 96 | """ 97 | 98 | ######################################################################## 99 | # Routes 100 | ######################################################################## 101 | 102 | @_APP.route('/Sorry, I couldn't find any citations in that.
" 163 | ) 164 | 165 | output = insert_links( 166 | text = given_text, 167 | redundant_links = True, 168 | ignore_markup = False, 169 | citator = _APP.citator, 170 | ) 171 | output = '' + sub(r'\n+', '
\n', output) + '
' 172 | 173 | return format_page( 174 | PARSER_PAGE, 175 | given_text=given_text, 176 | output=( 177 | '