├── .gitignore
├── visualization
│   └── Step2_viz_area_chart
│       ├── index.html
│       ├── year_data.csv
│       ├── style.css
│       └── index.js
├── environment.yml
├── README.md
├── Topic_Modeling_Tutorial
│   ├── Step3_pipeline_Text-Cleaner.ipynb
│   ├── Step1_pipeline_supcourt_yearlist.ipynb
│   ├── Step4_pipeline_Model-testing.ipynb
│   └── Step2_pipeline_casegetter_forfinal.ipynb
└── pipeline.py

/.gitignore:
--------------------------------------------------------------------------------
1 | local/
2 | data/
3 | .DS_Store
4 | pipeline/.ipynb_checkpoints/
5 | supreme_court_data/
--------------------------------------------------------------------------------
/visualization/Step2_viz_area_chart/index.html:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |
15 | 16 | 18 |
19 | 20 |
21 | 22 | 23 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: supreme-court-topics 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | # Python version 7 | - python=3.9 8 | 9 | # Core data science stack 10 | - pandas>=1.3.0 11 | - numpy>=1.21.0 12 | - scikit-learn>=1.0.0 13 | 14 | # Web scraping 15 | - requests>=2.25.0 16 | - beautifulsoup4>=4.9.0 17 | - lxml>=4.6.0 18 | 19 | # Natural language processing 20 | - nltk>=3.6.0 21 | 22 | # Development and debugging 23 | - jupyter>=1.0.0 24 | - ipython>=7.0.0 25 | 26 | # Optional: Performance improvements 27 | - numba>=0.56.0 28 | - joblib>=1.1.0 29 | 30 | # Development tools 31 | - black>=22.0.0 32 | - flake8>=4.0.0 33 | - pytest>=6.0.0 34 | 35 | # Pip dependencies (not available in conda) 36 | - pip 37 | - pip: 38 | - textblob>=0.17.0 39 | - spacy>=3.4.0 40 | - gensim>=4.1.0 41 | - umap-learn>=0.5.0 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Supreme Court Topics Over Time 2 | 3 | This is a project by @emilyinamillion that utilizes topic modeling with Non Negative Matrix factorization (python) and has an accompanying D3.js visualization. 4 | 5 | If you came here for the topic modeling, [one of my ipython notebooks](https://github.com/emilyinamillion/supreme-court-topics-overtime/blob/master/pipeline/Step4_pipeline_Model-testing.ipynb) walks through the other algorithms I tried as well as my reasoning for why I settled on NMF. 6 | 7 | If you came here for the D3.js, the visualization folder is the entire contents of the live D3.js visualization currently available on Bl.ocks. The visualization lives [here](https://emilyinamillion.me/supreme-court-topics-visualization/index.html). 
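If you just want a feel for the core modeling step before opening the notebooks, the sketch below shows the general shape of NMF topic modeling with scikit-learn. It is illustrative only: the three documents are made-up placeholders, and the vectorizer and NMF settings are not the ones used in this project (the real pipeline does heavy legal-text cleaning first and uses many more topics).

```python
# Minimal, illustrative NMF topic-modeling sketch (not this project's exact code).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

docs = [  # placeholder documents; the real corpus is full Supreme Court opinions
    "the court held that the tax statute applied to the shipment of goods",
    "the defendant was convicted and the sentence was affirmed on appeal",
    "the patent claims were held invalid for lack of novelty",
]

tfidf = TfidfVectorizer(stop_words="english")   # TF-IDF pairs well with NMF
X = tfidf.fit_transform(docs)

nmf = NMF(n_components=3, random_state=1)
doc_topic = nmf.fit_transform(X)                # document-by-topic weights

terms = tfidf.get_feature_names_out()
for k, topic in enumerate(nmf.components_):     # topic-by-term weights
    top_terms = [terms[i] for i in topic.argsort()[::-1][:5]]
    print(f"Topic {k}: {', '.join(top_terms)}")
```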
8 | 9 | :v: 10 | 11 | 12 | Quickstart 13 | 14 | ``` 15 | # Create the conda environment from the file 16 | conda env create -f environment.yml 17 | 18 | # Activate the environment 19 | conda activate supreme-court-topics 20 | 21 | # Download required NLTK data and spaCy model 22 | python -c "import nltk; nltk.download('stopwords'); nltk.download('names')" 23 | python -m spacy download en_core_web_sm 24 | ``` -------------------------------------------------------------------------------- /visualization/Step2_viz_area_chart/year_data.csv: -------------------------------------------------------------------------------- 1 | date,value 2 | 1791,10 3 | 1792,24 4 | 1793,16 5 | 1794,12 6 | 1795,36 7 | 1796,93 8 | 1797,48 9 | 1798,36 10 | 1799,42 11 | 1800,49 12 | 1801,36 13 | 1803,114 14 | 1804,76 15 | 1805,138 16 | 1806,162 17 | 1807,259 18 | 1808,61 19 | 1809,322 20 | 1810,266 21 | 1812,280 22 | 1813,322 23 | 1814,329 24 | 1815,287 25 | 1816,294 26 | 1817,294 27 | 1818,266 28 | 1819,231 29 | 1820,189 30 | 1821,287 31 | 1822,224 32 | 1823,203 33 | 1824,287 34 | 1825,189 35 | 1826,198 36 | 1827,329 37 | 1828,385 38 | 1829,264 39 | 1830,399 40 | 1831,294 41 | 1832,392 42 | 1833,287 43 | 1834,446 44 | 1835,228 45 | 1836,255 46 | 1837,140 47 | 1838,378 48 | 1839,450 49 | 1840,387 50 | 1841,270 51 | 1842,396 52 | 1843,306 53 | 1844,312 54 | 1845,372 55 | 1846,376 56 | 1847,324 57 | 1848,324 58 | 1849,378 59 | 1850,1511 60 | 1851,882 61 | 1852,472 62 | 1853,729 63 | 1854,639 64 | 1855,837 65 | 1856,576 66 | 1857,647 67 | 1858,621 68 | 1859,1035 69 | 1860,544 70 | 1861,479 71 | 1862,378 72 | 1863,830 73 | 1864,590 74 | 1865,758 75 | 1866,1206 76 | 1867,928 77 | 1868,865 78 | 1869,1677 79 | 1870,1215 80 | 1871,1782 81 | 1872,1747 82 | 1873,1783 83 | 1874,1791 84 | 1875,1899 85 | 1876,2071 86 | 1877,2325 87 | 1878,2025 88 | 1879,2223 89 | 1880,2008 90 | 1881,2103 91 | 1882,2358 92 | 1883,2547 93 | 1884,2448 94 | 1885,2511 95 | 1886,2693 96 | 1887,2415 97 | 1888,2171 98 | 1889,2500 99 | 1890,2740 100 | 1891,2221 101 | 1892,2300 102 | 1893,2473 103 | 1894,2268 104 | 1895,2371 105 | 1896,2089 106 | 1897,1675 107 | 1898,1653 108 | 1899,1953 109 | 1900,1802 110 | 1901,1640 111 | 1902,1944 112 | 1903,1926 113 | 1904,1773 114 | 1905,1566 115 | 1906,1841 116 | 1907,1593 117 | 1908,1657 118 | 1909,1490 119 | 1910,1468 120 | 1911,2050 121 | 1912,2592 122 | 1913,2655 123 | 1914,2466 124 | 1915,2038 125 | 1916,1963 126 | 1917,1954 127 | 1918,2089 128 | 1919,1630 129 | 1920,1980 130 | 1921,1611 131 | 1922,2009 132 | 1923,1935 133 | 1924,2120 134 | 1925,1908 135 | 1926,1809 136 | 1927,1593 137 | 1928,1152 138 | 1929,1175 139 | 1930,1512 140 | 1931,1336 141 | 1932,1521 142 | 1933,1530 143 | 1934,1539 144 | 1935,1440 145 | 1936,1467 146 | 1937,1530 147 | 1938,1277 148 | 1939,1251 149 | 1940,1489 150 | 1941,1494 151 | 1942,1478 152 | 1943,1260 153 | 1944,1476 154 | 1945,1216 155 | 1946,2556 156 | 1947,2124 157 | 1948,2286 158 | 1949,1782 159 | 1950,1854 160 | 1951,1798 161 | 1952,2196 162 | 1953,1566 163 | 1954,1660 164 | 1955,1908 165 | 1956,2270 166 | 1957,2790 167 | 1958,2502 168 | 1959,2448 169 | 1960,2502 170 | 1961,2134 171 | 1962,2844 172 | 1963,3294 173 | 1964,2410 174 | 1965,2538 175 | 1966,2646 176 | 1967,3546 177 | 1968,2432 178 | 1969,2320 179 | 1970,2592 180 | 1971,3090 181 | 1972,3348 182 | 1973,3114 183 | 1974,2772 184 | 1975,3252 185 | 1976,3222 186 | 1977,2790 187 | 1978,2880 188 | 1979,2808 189 | 1980,2736 190 | 1981,3186 191 | 1982,2988 192 | 1983,3114 193 | 1984,2970 194 | 1985,2970 195 | 1986,2898 
196 | 1987,2700 197 | 1988,2736 198 | 1989,2520 199 | 1990,2320 200 | 1991,2156 201 | 1992,2140 202 | 1993,1728 203 | 1994,1656 204 | 1995,1638 205 | 1996,1728 206 | 1997,1800 207 | 1998,1656 208 | 1999,1546 209 | 2000,1564 210 | 2001,1512 211 | 2002,1512 212 | 2003,1412 213 | 2004,1440 214 | 2005,1566 215 | 2006,1346 216 | 2007,1314 217 | 2008,1494 218 | 2009,1656 219 | 2010,1530 220 | 2011,1386 221 | 2012,1422 222 | 2013,1350 223 | 2014,1278 -------------------------------------------------------------------------------- /visualization/Step2_viz_area_chart/style.css: -------------------------------------------------------------------------------- 1 | svg { 2 | font: 10px sans-serif; 3 | } 4 | 5 | body{ 6 | background-color: #F0F0F0; 7 | color:#000000; 8 | font-family: 'Oswald', sans-serif; 9 | } 10 | 11 | .color-legend text { 12 | font-family: 'Open Sans', sans-serif; 13 | font-size: 12pt; 14 | } 15 | 16 | .area { 17 | fill: #B4B4B4; 18 | opacity: .8; 19 | clip-path: url(#clip); 20 | } 21 | 22 | .layer { 23 | clip-path: url(#clip); 24 | } 25 | 26 | 27 | 28 | 29 | .axis path, 30 | .axis line { 31 | fill: none; 32 | stroke: #565656; 33 | stroke-width: 1px; 34 | shape-rendering: crispEdges; 35 | } 36 | 37 | .brush .extent { 38 | stroke: black; 39 | fill: white; 40 | fill-opacity: .6; 41 | shape-rendering: crispEdges; 42 | } 43 | 44 | 45 | .viz-container{ 46 | width:100%; 47 | height:800px; 48 | margin-bottom:10px; 49 | } 50 | .legend{ 51 | border-radius: 5px; 52 | width:270px; 53 | height:95%; 54 | float:left; 55 | background-color: #E7E8EA; 56 | overflow:auto; 57 | } 58 | .legend_fixed{ 59 | border-radius: 5px; 60 | width:270px; 61 | height:95%; 62 | float:left; 63 | background-color: #E7E8EA; 64 | overflow:hidden; 65 | box-shadow: 5px 5px 5px #C1C1C1; 66 | } 67 | 68 | .legend_scroll{ 69 | width:270px; 70 | height:100%; 71 | position: bottom; 72 | background-color: #E7E8EA; 73 | overflow:auto; 74 | } 75 | .viz{ 76 | height:650px; 77 | position: absolute; 78 | left:350px; 79 | } 80 | 81 | .button_clearall{ 82 | 83 | width:300px; 84 | height:40px; 85 | background-color:#484848; 86 | color:#F4F4F4; 87 | /*padding-top: 8px;*/ 88 | text-align:center; 89 | font-size: 22px; 90 | cursor: pointer; 91 | } 92 | 93 | 94 | .button{ 95 | margin-top:10px; 96 | width:240px; 97 | height:25px; 98 | background-color:#000000; 99 | color:#999999; 100 | text-align:center; 101 | } 102 | 103 | .button_selected{ 104 | margin-top:2px; 105 | width:280px; 106 | height:40px; 107 | background-color:#000000; 108 | color: #484848; 109 | text-align:center; 110 | cursor: pointer; 111 | } 112 | 113 | 114 | .spacer{ 115 | margin: 25px 50px 40px 100px; 116 | } 117 | 118 | .legend-topics{ 119 | border-radius: 5px; 120 | margin-left:8px; 121 | width:250px; 122 | float:left; 123 | overflow-y: scroll; 124 | } 125 | .topic-button{ 126 | border-radius: 2px; 127 | margin-top:2px; 128 | /*padding-left:5px;*/ 129 | width:240px; 130 | height:25px; 131 | opacity:.7; 132 | cursor:pointer; 133 | text-align:center; 134 | } 135 | .topic-button:hover{ 136 | opacity:1; 137 | } 138 | 139 | .node--hover circle { 140 | stroke: #000; 141 | stroke-width: 1.2px; 142 | border: 5px; 143 | } 144 | .tooltip { 145 | /*border-radius: 10px 15px 15px 15px;*/ 146 | background-color: #f7f7f7; 147 | padding: 3px 12px; 148 | max-width: 350px; 149 | width: 300px; 150 | height: 350px; 151 | overflow-y: auto; 152 | font-family: sans-serif; 153 | border: 1px solid #bbbbbb; 154 | box-shadow: 2px 2px 2px gray; 155 | word-wrap: break-word; 156 | } 157 | 
.tooltip_title { 158 | font-weight: bold; 159 | font-size: 20px; 160 | margin: 5px 0; 161 | max-width: 350px; 162 | word-wrap: normal; 163 | } 164 | .article-info{ 165 | color:#000000; 166 | overflow-y:auto; 167 | position:absolute; 168 | left:475px; 169 | float:left; 170 | width:470px; 171 | height:170px; 172 | margin-left:5px; 173 | } 174 | .tooltip_body { 175 | font-weight: normal; 176 | color:#000000; 177 | overflow-y:auto; 178 | /*overflow-x:visible;*/ 179 | position:absolute; 180 | /*width:250px;*/ 181 | width: 90%; 182 | /*max-width: 200px;*/ 183 | /*height:400px;*/ 184 | margin-left:2px; 185 | white-space: pre-wrap; 186 | word-wrap: break-word; 187 | font-weight: lighter; 188 | font-family: sans-serif; 189 | font-size: 14px; 190 | /*word-break: break-all;*/ 191 | 192 | } 193 | .tooltip_img { 194 | max-width: 240px; 195 | } 196 | 197 | .tbd { 198 | font-weight: normal; 199 | color:#000000; 200 | overflow-y:auto; 201 | position: relative ; 202 | width: 90%; 203 | margin-left:2px; 204 | white-space: pre-wrap; 205 | word-wrap: break-all; 206 | 207 | font-family: sans-serif; 208 | /*word-break: break-all;*/ 209 | 210 | } 211 | 212 | 213 | .tooltip_pps { 214 | width: 90%; 215 | /*word-wrap: break-word;*/ 216 | /*word-break: break-all;*/ 217 | } 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /Topic_Modeling_Tutorial/Step3_pipeline_Text-Cleaner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Notebook\n", 8 | "\n", 9 | "I originally had this notebook as part of the initial modeling notebook, but split it out to make everything a bit more clear. Additionally, I've learned a lot about text preprocessing since originally doing this project, so I made these functions a bit more efficient as well. \n", 10 | "\n", 11 | "As a note about stopwords if you've never done NLP before - there's no one size fits all stopwords subsitution list for knowledge of your domain and which words should be excluded (or kept). Creating a powerful stopwords list is an interative process and requires a lot of stopchecking of your actual data to do it right.\n", 12 | "\n", 13 | "## Text preprocessing process\n", 14 | "1. Removing state names\n", 15 | "2. Removing case names\n", 16 | "3. Removing common stopwords (for example, \"the\" isn't a useful word)\n", 17 | "4. Removing people's names (loading the baby name dataset from sklearn)\n", 18 | "5. Removing day of the week, month names - this throws off our model into thinking we care about period of time\n", 19 | "6. Stripping non-words (lots of numbers referencing other cases - another interesting project could be keeping ONLY nums)\n", 20 | "7. 
Lemmatizing (getting the root of a word - ie run out of running)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import re\n", 33 | "\n", 34 | "from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS\n", 35 | "from nltk.corpus import stopwords\n", 36 | "from spacy.en import English\n", 37 | "parser = English()\n", 38 | "import nltk" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "names = nltk.corpus.names\n", 50 | "male_names = names.words('male.txt')\n", 51 | "female_names = names.words('female.txt')\n", 52 | "male_names = [w.lower() for w in male_names]\n", 53 | "male_names_plur = [(w.lower() + \"s\") for w in male_names]\n", 54 | "female_names_plur = [(w.lower() + \"s\") for w in female_names]\n", 55 | "female_names = [w.lower() for w in female_names]\n", 56 | "casenames = list(pd.read_csv(\"casetitles.csv\",encoding = 'iso-8859-1'))\n", 57 | "statenames = list(pd.read_csv(\"statenames.csv\"))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "homespun_words = ['join', 'seek', 'ginnane', 'kestenbaum', 'hummel', 'loevinger', 'note', 'curiam', 'mosk', 'pd', \\\n", 69 | " 'paxton', 'rhino', 'buchsbaum', 'hirshowitz', 'misc', 'assistant', 'whereon', 'dismiss', 'sod', \\\n", 70 | " 'vote', 'present', 'entire', 'frankfurter', 'ante', 'leave', 'concur', 'entire', 'mootness', \\\n", 71 | " 'track', 'constitution', 'jj', 'blackmun', 'rehnquist', 'amici,sup', 'rep', 'stat', 'messes', \\\n", 72 | " 'like', 'rev', 'trans', 'bra', 'teller', 'vii', 'erisa', 'usca', 'annas', 'lead', 'cf', 'cca', \\\n", 73 | " 'fsupp', 'afdc', 'amicus', 'ante', 'orrick', 'kansa', 'pd', 'foth', 'stucky', 'aver',\"united\", \\\n", 74 | " \"may\", \"argued\", \"argue\", \"decide\", \"rptr\", \"nervine\", \"pp\",\"fd\" ,\"june\", \"july\", \\\n", 75 | " \"august\", \"september\", \"october\", \"november\", \"states\", \"ca\", \"joyce\", \"certiorari\", \"december\",\\\n", 76 | " \"january\", \"february\", \"march\", \"april\", \"writ\", \"supreme court\", \"court\", \"dissent\", \\\n", 77 | " \"opinion\", \"footnote\",\"brief\", \"decision\", \"member\", \"curiam\", \"dismiss\", \"note\", \"affirm\", \\\n", 78 | " \"question\", \"usc\", \"file\"]\n", 79 | "\n", 80 | "STOPLIST = set(stopwords.words('english') + list(homespun_words) + list(ENGLISH_STOP_WORDS) \\\n", 81 | " + list(statenames) + list(casenames) + list(female_names) + list(male_names) + \\ \n", 82 | " list(female_names_plur) + list(male_names_plur))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Text Cleaner - including stopwords" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "STOPLIST = set(list(stopwords.words('english')) + list(sub_list) + list(ENGLISH_STOP_WORDS))\n", 101 | "\n", 102 | "def tokenizeText(sample):\n", 103 | " separators = [\"\\xa0\\xa0\\xa0\\xa0\", \"\\r\", \"\\n\", \"\\t\", \"n't\", \"'m\", \"'ll\", '[^a-z ]']\n", 104 | " for i in separators:\n", 105 | " sample = re.sub(i, \" \", sample.lower())\n", 106 | " \n", 107 | " ## get the tokens using spaCy - this makes 
it possible to lemmatize the words\n", 108 | " tokens = parser(sample)\n", 109 | " tokens = [tok.lemma_.strip() for tok in tokens]\n", 110 | "\n", 111 | " ## apply our stoplist\n", 112 | " return [tok for tok in tokens if len(tok) != 1 and tok not in STOPLIST]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "doc_list[\"lem\"] = doc_list.case.apply(text_processing)\n", 124 | "doc_list.to_pickle(\"full_proj_lemmatized.pickle\") ## to be used in model selection" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.5.1" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 0 149 | } 150 | -------------------------------------------------------------------------------- /Topic_Modeling_Tutorial/Step1_pipeline_supcourt_yearlist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from bs4 import BeautifulSoup\n", 12 | "import requests\n", 13 | "import re\n", 14 | "\n", 15 | "import pandas as pd\n", 16 | "import numpy as np\n", 17 | "\n", 18 | "headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) \\\n", 19 | " Chrome/39.0.2171.95 Safari/537.36'}" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Creating a list of Supreme Court Opinions and their URLs\n", 27 | "I found that caselaw has a repository of Supreme Court ruling opinions. This notebook contains the code I wrote to compile the list of these URLs and titles to use to grab the opinions.\n", 28 | "\n", 29 | "If you want the data I used for this project, you can start by running these cells. 
" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "root_url = \"http://caselaw.findlaw.com/court/us-supreme-court/years/\"" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "years = [root_url + str(year) for year in range(1760,2018)]\n", 52 | "\n", 53 | "\n", 54 | "def Beautiful_soup_grabber(link):\n", 55 | " response = requests.get(link, headers = headers)\n", 56 | " return BeautifulSoup(response.text, \"lxml\")\n", 57 | "\n", 58 | "def year_getter(years):\n", 59 | " \n", 60 | " y = {}\n", 61 | " for year in years:\n", 62 | " soup = Beautiful_soup_grabber(year)\n", 63 | " souplist = soup.findAll(\"a\")\n", 64 | " for i in souplist:\n", 65 | " if re.search(\"us-supreme-court\", str(i)) and not re.search(\"years\", str(i)) and not re.search(\"/court/\", str(i)):\n", 66 | " b = i[\"href\"]\n", 67 | " y[b] = [re.sub(\"[^0-9]\", \"\", b.split(\"/\")[-1])]\n", 68 | " return pd.DataFrame(y).transpose().reset_index()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "df = year_getter(years)\n", 80 | "df.columns = [\"case_url\", \"docket\"]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | "
case_urldocket
0http://caselaw.findlaw.com/us-supreme-court/05...051101
1http://caselaw.findlaw.com/us-supreme-court/06...0611951
2http://caselaw.findlaw.com/us-supreme-court/06...06263
3http://caselaw.findlaw.com/us-supreme-court/06...065590
4http://caselaw.findlaw.com/us-supreme-court/07...071390
\n", 130 | "
" 131 | ], 132 | "text/plain": [ 133 | " case_url docket\n", 134 | "0 http://caselaw.findlaw.com/us-supreme-court/05... 051101\n", 135 | "1 http://caselaw.findlaw.com/us-supreme-court/06... 0611951\n", 136 | "2 http://caselaw.findlaw.com/us-supreme-court/06... 06263\n", 137 | "3 http://caselaw.findlaw.com/us-supreme-court/06... 065590\n", 138 | "4 http://caselaw.findlaw.com/us-supreme-court/07... 071390" 139 | ] 140 | }, 141 | "execution_count": 5, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "df.head(5)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 8, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "'http://caselaw.findlaw.com/us-supreme-court/05-1101.html'" 161 | ] 162 | }, 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "df.ix[0, \"case_url\"]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 10, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "df.to_pickle(\"supcourt_yearlist.pickle\")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 9, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "(23310, 2)" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "df.shape" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.5.2" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 0 236 | } 237 | -------------------------------------------------------------------------------- /visualization/Step2_viz_area_chart/index.js: -------------------------------------------------------------------------------- 1 | 2 | var format = d3.time.format("%Y"); 3 | 4 | var margin = {top: 100, right: 225, bottom: 100, left: 40}, 5 | margin2 = {top: 730, right: 225, bottom: 20, left: 40}, 6 | width = 1500 - margin.left - margin.right, 7 | height = 800 - margin.top - margin.bottom, 8 | height2 = 800 - margin2.top - margin2.bottom; 9 | 10 | var x = d3.time.scale().range([0, width]), 11 | x2 = d3.time.scale().range([0, width]), 12 | y = d3.scale.linear().range([height, 0]), 13 | y2 = d3.scale.linear().range([height2, 0]); 14 | 15 | var hoveredColorValue; 16 | var hoveredStrokeColor = "black"; 17 | 18 | var z = d3.scale.ordinal() 19 | .range(["#a50026", 20 | "#d73027", 21 | "#f46d43", 22 | "#fdae61", 23 | "#fee090", 24 | "#ffffbf", 25 | "#e0f3f8", 26 | "#abd9e9", 27 | "#74add1", 28 | "#4575b4", 29 | "#313695", 30 | "#d9ef8b", 31 | "#a6d96a", 32 | "#66bd63", 33 | "#1a9850", 34 | "#006837", 35 | "#8c510a", 36 | "#bf812d", 37 | "#dfc27d"]); 38 | 39 | var xAxis = d3.svg.axis().scale(x).orient("bottom"), 40 | xAxis2 = 
d3.svg.axis().scale(x2).orient("bottom"), 41 | yAxis = d3.svg.axis().scale(y).orient("left"); 42 | 43 | var brush = d3.svg.brush() 44 | .x(x2) 45 | .on("brush", brushed); 46 | 47 | var area2 = d3.svg.area() 48 | .interpolate("monotone") 49 | .x(function(d) { return x2(d.date); }) 50 | .y0(height2) 51 | .y1(function(d) { return y2(d.value); }); 52 | 53 | var stack = d3.layout.stack() 54 | .offset("zero") 55 | .values(function(d) { return d.values; }) 56 | .x(function(d) { return d.date; }) 57 | .y(function(d) { return d.value; }); 58 | 59 | var area = d3.svg.area() 60 | .interpolate("basis") 61 | .x(function(d) { return x(d.date); }) 62 | .y0(function(d) { return y(d.y0); }) 63 | .y1(function(d) { return y(d.y0 + d.y); }); 64 | 65 | var tooltip = d3.select("body") 66 | .append("div") 67 | .attr("class", "tooltip") 68 | .style("position", "absolute") 69 | .style("z-index", "10") 70 | .style("visibility", "hidden"); 71 | 72 | 73 | var svg = d3.select("body").append("svg") 74 | .attr("width", width + margin.left + margin.right) 75 | .attr("height", height + margin.top + margin.bottom); 76 | 77 | svg.append("defs").append("clipPath") 78 | .attr("id", "clip") 79 | .append("rect") 80 | .attr("width", width) 81 | .attr("height", height); 82 | 83 | var focus = svg.append("g") 84 | .attr("class", "focus") 85 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 86 | 87 | var context = svg.append("g") 88 | .attr("class", "context") 89 | .attr("transform", "translate(" + margin2.left + "," + margin2.top + ")"); 90 | 91 | d3_queue.queue() 92 | .defer(d3.csv, "year_data.csv") 93 | .defer(d3.csv, "year_topic_data2.csv") 94 | .awaitAll(draw); 95 | 96 | function draw(error, data) { 97 | if (error) throw error; 98 | yearData = data[0]; 99 | topicData = data[1]; 100 | 101 | yearData.forEach(function(d) { 102 | d.date = format.parse(d.date); 103 | d.value = +d.value; 104 | }); 105 | 106 | topicData.forEach(function(d) { 107 | d.date = format.parse(d.date); 108 | d.value = +d.value; 109 | d.key = d.key; 110 | d.title = d.title; 111 | d.leadpp = d.leadpp; 112 | d.words = d.topicwords; 113 | d.url = d.exampleURL; 114 | }); 115 | 116 | var nestStack = d3.nest() 117 | .key(function(d) { return d.key; }) 118 | .entries(topicData); 119 | 120 | var layers = stack(nestStack); 121 | 122 | x.domain(d3.extent(yearData.map(function(d) { return d.date; }))); 123 | x2.domain(x.domain()); 124 | y.domain([0, d3.max(topicData, function(d) { return d.y0 + d.y; })]); 125 | y2.domain([0, d3.max(yearData.map(function(d) { return d.value; }))]); 126 | 127 | context.append("path") 128 | .datum(yearData) 129 | .attr("class", "area") 130 | .attr("d", area2); 131 | 132 | context.append("g") 133 | .attr("class", "x axis") 134 | .attr("transform", "translate(0," + height2 + ")") 135 | .call(xAxis2); 136 | 137 | context.append("g") 138 | .attr("class", "x brush") 139 | .call(brush) 140 | .selectAll("rect") 141 | .attr("y", -6) 142 | .attr("height", height2 + 7); 143 | 144 | context.append("text") 145 | .attr("x", 80) 146 | .attr("y", 35) 147 | .style("fill", '#555555') 148 | .text("Drag cursor here to zoom") 149 | .style("font-size", 15) 150 | .style("text-anchor", 'middle'); 151 | 152 | 153 | var addTooltip = function(d) { 154 | tooltip.html(""); 155 | tooltip.append("h3").attr("class", "tooltip_title"); 156 | tooltip.append("pre").attr("class", "tooltip_body"); 157 | tooltip.select(".tooltip_title") 158 | .text(d.key) 159 | 160 | topicData = d.values[0]; 161 | html = "Example Case: " + "\n" + "" + topicData.title + 
""+ "\n"; 162 | html += topicData.leadpp + "..." + "" + "[READ MORE]"+ "" + "\n\n\n\n" 163 | html += "Topic Words: " + "\n" + topicData.topicwords + "\n"; 164 | 165 | 166 | tooltip.select(".tooltip_body").html(html); 167 | 168 | return tooltip.style("visibility", "visible"); 169 | } 170 | 171 | 172 | focus.selectAll(".layer") 173 | .data(layers) 174 | .enter().append("path") 175 | .attr("class", "layer") 176 | .attr("d", function(d) { return area(d.values); }) 177 | .style("fill", function(d, i) { return z(d.key); }) 178 | .on("click", function(d) { 179 | d3.selectAll(".layer").attr("opacity", 0.3); 180 | d3.select(this) 181 | .style("fill", "black") 182 | .attr("opacity", 1); 183 | addTooltip(d); 184 | tooltip.style("top", 15 + "px").style("left", 65 + "px"); 185 | d3.event.stopPropagation(); 186 | }) 187 | svg.on("click", function() { 188 | d3.selectAll(".layer").attr("opacity", 1) 189 | .style("fill", function(d, i) { return z(d.key); }); 190 | tooltip.style("visibility", "hidden"); 191 | }) 192 | 193 | focus.append("g") 194 | .attr("class", "x axis") 195 | .attr("transform", "translate(0," + height + ")") 196 | .call(xAxis); 197 | 198 | focus.append("g") 199 | .attr("class", "y axis") 200 | .call(yAxis); 201 | 202 | focus.append("text") 203 | .attr("x", width / 2) 204 | .attr("y", 20) 205 | .style("fill", '#555555') 206 | .text("Supreme Court Topics Over Time") 207 | .style("font-size", 32) 208 | .style("text-anchor", 'middle'); 209 | 210 | focus.append("text") 211 | .attr("dy", "1.14em") 212 | .attr("transform", "rotate(-90)") 213 | .style("fill", '#555555') 214 | .text("Number of Cases") 215 | .style("font-size", 10) 216 | .style("text-anchor", 'end'); 217 | 218 | focus.append("text") 219 | .attr("x", width / 2) 220 | .attr("y", 40) 221 | .style("fill", '#555555') 222 | .text("(click the graph for details)") 223 | .style("font-size", 15) 224 | .style("text-anchor", 'middle'); 225 | 226 | svg.append("g") 227 | .attr("class", "legendOrdinal") 228 | .attr("transform", "translate(" + [width + margin.left + 25, margin.top] + ")"); 229 | 230 | var legendOrdinal = d3.legend.color() 231 | .shape("path", d3.svg.symbol().type("square").size(150)()) 232 | .shapePadding(10) 233 | .scale(z); 234 | 235 | svg.select(".legendOrdinal") 236 | .call(legendOrdinal); 237 | }; 238 | 239 | function brushed() { 240 | x.domain(brush.empty() ? x2.domain() : brush.extent()); 241 | focus.selectAll(".layer").attr("d", function(d) { return area(d.values); }) 242 | focus.select(".x.axis").call(xAxis); 243 | } 244 | -------------------------------------------------------------------------------- /Topic_Modeling_Tutorial/Step4_pipeline_Model-testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Topic Modeling with various methods\n", 8 | "Topic modeling is a powerful tool for quickly sorting through a lot of text and documents without having to read every one. There are several methods available for this using python, as well as several libraries. Topic modeling is extremely challenging to get meaningful results. \"Garbage in, garbage out\" is a phrase that applies well to this - we have to do a significant amount of text preprocessing to extract the right information to feed into a model. 
On this sheet, I will be topic modeling supreme court cases with the following:\n", 9 | "\n", 10 | "__SKlearn__\n", 11 | "\n", 12 | "LDA (with TF)\n", 13 | "\n", 14 | "NMF (with TFIDF)\n", 15 | "\n", 16 | "LSA - AKA TruncatedSVD (with TF and TFIDF)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Process of the ENTIRE project\n", 24 | "Extracting text using beautiful soup --> processing the text --> fitting text to a model --> applying model to other text" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import pandas as pd\n", 36 | "import re\n", 37 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer\n", 38 | "from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD\n", 39 | "from textblob import TextBlob\n", 40 | "from sklearn.preprocessing import Normalizer" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 25, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "doc_list.read_pickle(\"full_proj_lemmatized3.pickle\") #always save your work!" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 26, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "(23268, 5)" 65 | ] 66 | }, 67 | "execution_count": 26, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "doc_list.shape #checking to make sure we have the info we expected to have" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## _____________________________________________________________________\n", 81 | "## Model testing section\n", 82 | "I'm trying LDA, NMF and LSA as well as adjusting # of features, # topics, and overlap. 
" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 103, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def print_top_words(model, feature_names, n_top_words):\n", 94 | " for topic_idx, topic in enumerate(model.components_):\n", 95 | " print(\"Topic #%d:\" % topic_idx)\n", 96 | " print(\" \".join([feature_names[i]\n", 97 | " for i in topic.argsort()[:-n_top_words - 1:-1]]))\n", 98 | " print()\n", 99 | " \n", 100 | " \n", 101 | "def modeler(corp, n_topics, n_top_words, clf, vect):\n", 102 | " df = .80\n", 103 | " str_vect = str(vect).split(\"(\")[0]\n", 104 | " str_clf = str(clf).split(\"(\")[0]\n", 105 | "\n", 106 | " print(\"Extracting {} features for {}...\".format(str_vect, str_clf))\n", 107 | " vect_trans = vect.fit_transform(corp)\n", 108 | "\n", 109 | "\n", 110 | " # Fit the model\n", 111 | " print(\"Fitting the {} model with {} features, \"\n", 112 | " \"n_topics= {}, n_topic_words= {}, n_features= {}...\"\n", 113 | " .format(str_clf, str_vect, n_topics, n_top_words, n_features))\n", 114 | "\n", 115 | " clf = clf.fit(vect_trans)\n", 116 | " if str_clf == \"TruncatedSVD\":\n", 117 | " print(\"\\nExplained variance ratio\", clf.explained_variance_ratio_)\n", 118 | " \n", 119 | " print(\"\\nTopics in {} model:\".format(str_clf))\n", 120 | " feature_names = vect.get_feature_names()\n", 121 | " return print_top_words(clf, feature_names, n_top_words) " 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### NMF model\n", 129 | "Find two non-negative matrices (W, H) whose product approximates the non- negative matrix X. This factorization can be used for example for dimensionality reduction, source separation or topic extraction." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false, 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "Extracting tf-idf features for NMF...\n", 145 | "Fitting the NMF model with tf-idf features, n_topics= 30, n_topic_words= 30, n_features= 2000...\n", 146 | "\n", 147 | "Topics in NMF model:\n", 148 | "Topic #0:\n", 149 | "jurisdiction suit admiralty citizenship controversy bring question exclusive arise removal diversity proceeding original exercise jurisdictional complaint want stat dismiss appellate confer final merit constitution entertain remove allege section judgment venue\n", 150 | "Topic #1:\n", 151 | "dismiss curiam want whereon substantial report appellee question misc appellant assistant pd appellees jurisdiction improvidently sod solicitor probable frankfurter app note consideration decision mosk moot paxton brief reverse rhyne dispense\n", 152 | "Topic #2:\n", 153 | "respondent brief reverse judgment file assistant affirm solicitor curia urge footnote divide join improvidently amicus amici rehnquist equally complaint reversal jj curiam deliver usc app blackmun decision affirmance award conclude\n", 154 | "Topic #3:\n", 155 | "vacate remand pauperis forma curiam judgment ante proceed consideration leave assistant solicitor dissent misc report reason reverse moot reconsideration proceeding examination decision suggestion concur ninth record app mootness disposition entire\n", 156 | "Topic #4:\n", 157 | "bankruptcy bankrupt creditor debtor lien debt referee proceeding priority discharge adjudication insolvent preference asset payment file filing sub adjudge assignee section 
provable possession assignment claim month firm attachment account proceeds\n", 158 | "Topic #5:\n", 159 | "stay applicant pending execution injunction habeas merit issue disposition request mandate vacate relief decision vote judgment irreparable seek present denial final proceeding enter schedule consideration bail ninth rehnquist filing file\n", 160 | "Topic #6:\n", 161 | "sup rep stat messrs statute question error constitution decision inters make defendant subject cite deliver sustain like rev proposition true lrans lra assume kansa principle consider settle teleg word objection\n", 162 | "Topic #7:\n", 163 | "arrest habeas confession evidence conviction murder custody prisoner hearing seizure convict probable seize statement record interrogation charge suppress issue make guilty person proceeding obtain agent robbery alien tell suspect narcotic\n", 164 | "Topic #8:\n", 165 | "sentence death sentencing penalty murder punishment circumstance impose conviction aggravating imprisonment eighth convict offense mitigate cruel guilty dissent juror unusual mitigating phase execution prosecutor aggravate imposition habeas guideline fourteenth commit\n", 166 | "Topic #9:\n", 167 | "lease lessee lessor mineral premise covenant year tenant depletion operate owner leasehold possession landlord operation acre purchase agree agreement acquire purpose terminate remove payment condition execute approval allottee ton restriction\n", 168 | "Topic #10:\n", 169 | "appellant affirm appellees appellee supp assistant curiam judgment ginnane solicitor probable note brief threejudge messrs footnote fourteenth hummel loevinger statute fortas equally clause divide kestenbaum challenge consideration set intrastate operation\n", 170 | "Topic #11:\n", 171 | "employee employer bargaining relation wage worker unfair collective practice discharge engage strike bargain hour employ pay member injury collectivebargaining activity vii picketing overtime erisa picket seniority membership workmens hire contractor\n", 172 | "Topic #12:\n", 173 | "indictment count offense charge conspiracy guilty commit conviction jeopardy prosecution plea person violation convict indict accuse false sentence defraud imprisonment statute defendant intent try return quash conspire felony plead usc\n", 174 | "Topic #13:\n", 175 | "sct usca alr supra section statute question cf suit decision footnote cca messrs respondent fsupp seq present compare bring decedent judgment intrastate usc appellant finding anncas clause ledd sustain relation\n", 176 | "Topic #14:\n", 177 | "decree injunction enter suit final proceeding make file report interlocutory adjudge enjoin record cost dismiss pray hearing hear set mandate deed issue messrs restrain answer entry relief supplemental finding divorce\n", 178 | "Topic #15:\n", 179 | "ordinance track construct pass pole operate constitution erect foot privilege inhabitant charge limit regulate fourteenth maintain hydrant permit purpose impair impose furnish year main passage regulation person injunction confiscatory operation\n", 180 | "Topic #16:\n", 181 | "brief statute file regulation usc clause require decision member join requirement violate apply dissent challenge seek curia issue footnote violation discrimination question relief complaint urge activity statutory program conduct provide\n", 182 | "Topic #17:\n", 183 | "rehearing messrs leave file seventh decision examination rehear submit footnote consideration eighth banc forma pauperis ninth reason sct oct denial issue proceed fsupp timely cl enter extend 
mandate entry entertain\n", 184 | "Topic #18:\n", 185 | "treaty reservation allotment acre survey stat cede allot article heir deed member cession convey possession make allottee conveyance severalty occupy affair military allottees acquire reside apart selection settler occupancy settlement\n", 186 | "Topic #19:\n", 187 | "witness testimony evidence testify contempt subpoena privilege answer prosecution question record statement immunity selfincrimination juror make refuse hearing prosecutor examination report crossexamination proceeding ask finding information refusal objection appear charge\n", 188 | "Topic #20:\n", 189 | "error judgment affirm defendant question constitution record statute decision verdict messrs render assignment validity sup reverse exception proceeding final cost appellate present bring recover enter favor contention raise dismiss sustain\n", 190 | "Topic #21:\n", 191 | "shall section provide person statute make stat constitution article thereof purpose duty subject authorize pay pass follow prescribe year word declare approve require construct privilege regulation mile usca condition necessary\n", 192 | "Topic #22:\n", 193 | "child death wife father parent husband decedent widow heir divorce daughter marriage illegitimate executor deed deceased age survive probate dependent gift custody testator afdc mrs sister paternity devise woman year\n", 194 | "Topic #23:\n", 195 | "receiver stockholder appoint creditor asset appointment receivership possession insolvent lien debt shareholder suit payment pay money note make foreclosure account currency proceeds insolvency pray ancillary file liabilities collect indebtedness enforce\n", 196 | "Topic #24:\n", 197 | "agreement arbitration arbitrator dispute collectivebargaining bargaining arbitrate collective agree award strike breach antitrust settlement clause bind compel arise enter negotiation adjustment controversy contractual enforceable member procedure write interpretation negotiate contractor\n", 198 | "Topic #25:\n", 199 | "value pay year payment taxpayer net deduction profit return taxable assess loss dividend make receive account cost money gain expense asset purchase stockholder premium deduct valuation total collect note debt\n", 200 | "Topic #26:\n", 201 | "claim claimant suit judgment file location pay payment limitation possession make damage award settlement allege refund allow present decision entitle date recover war assert lien stat mineral owner prior contractor\n", 202 | "Topic #27:\n", 203 | "certificate issue certify convenience necessity question operation route applicant authorize stamp record broker deed abandonment payment make member require par purchase chinese cancel operate receive hearing propose indebtedness issuance pledge\n", 204 | "Topic #28:\n", 205 | "defendant make deed allege complaint answer suit possession agent plea file aver set bring thereof demurrer evidence removal allegation appear notice convey premise enter judgment follow purchase pay aforesaid statement\n", 206 | "Topic #29:\n", 207 | "damage injury negligence track loss verdict injure recover death admiralty award evidence foot liable owner recovery employer shipment shipper result lade switch tort duty collision seaman engineer negligent servant libel\n", 208 | "\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "modeler(doc_list.lem, 30, 30, NMF(n_components=30, random_state=1, alpha=.1, l1_ratio=.5), \\ \n", 214 | " tfidf_vectorizer = TfidfVectorizer(\n", 215 | " max_df=0.95,\n", 216 | " min_df=min_df_val, # Reduced 
from 5\n", 217 | " stop_words='english',\n", 218 | " ngram_range=(1, 1),\n", 219 | " max_features=5000 # Limit features to avoid memory issues\n", 220 | " )) " 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "#### Notes about NMF performance \n", 228 | "Seeing these results makes me so happy - through several attempts of playing around with options for this model, this one has proved overwhelmingly good for the type of topic modeling I'm doing. I've done more reading about NMF and I think the methods behind it are what has lead to its awesome performance. Being able to use tf-idf I think is very important for this." 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### Truncated SVD (LSA) Model\n", 236 | "This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). It is very similar to PCA, but operates on sample vectors directly, instead of on a covariance matrix. This means it can work with scipy.sparse matrices efficiently.\n", 237 | "\n", 238 | "Notes: SVD suffers from a problem called “sign indeterminancy”, which means the sign of the components_ and the output from transform depend on the algorithm and random state. To work around this, fit instances of this class to data once, then keep the instance around to do transformations." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Extracting tf-idf features for LSA...\n", 253 | "Fitting the LSA model with tf-idf features,n_samples=2000 and n_features=1000...\n", 254 | "\n", 255 | "Topics in LSA model:\n", 256 | "\n", 257 | "Explained variance ratio [ 0.00454438 0.04702202]\n", 258 | "Topic #0:\n", 259 | "defendant judgment dismiss make error shall sup respondent statute appellant claim question opinion issue section proceeding file curiam evidence remand\n", 260 | "Topic #1:\n", 261 | "dismiss curiam want appellant substantial appellees assistant whereon affirm appellee ginnane solicitor app vacate question judgment jurisdiction report hummel pd\n", 262 | "\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "modeler(doc_list.lem, 100, 30, TruncatedSVD(2, algorithm = 'arpack'), TfidfVectorizer(max_df=.8, min_df=2,stop_words='english')) " 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "#### Notes about LSA performance\n", 275 | "A few attempts at tinkering with this algorithm did not improve its performance at all. The issues I'm finding with this are the same as the issues I found with LDA - it's good at pulling out the law themes, but that's not _really_ what I need. I really need the law terms to not play a role at all in modeling for these topics - we know that this entire corpus is about the law, but we need to know what KIND of law each case within the corpus is about. " 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Latent Dirchlet Allocation model \n", 283 | "In natural language processing, latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar." 
284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 36, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "Extracting tf features for LDA...\n", 298 | "Fitting LDA models with tf features, n_samples and n_features=1000...\n", 299 | "\n", 300 | "Topics in LDA model:\n", 301 | "Topic #0:\n", 302 | "activity engage authorize acquire relate closely provide principally affirm brief hearing respondent affiliate language prohibit approval section statute determination record\n", 303 | "Topic #1:\n", 304 | "respondent statute evidence make opinion defendant sup shipper purpose file judgment notice provide child violate reverse deliver present claim appellee\n", 305 | "Topic #2:\n", 306 | "claim shall provide respondent injury pay manner defendant follow constitution suit make purpose proceeding ordinance yes pass regulation person effect\n", 307 | "Topic #3:\n", 308 | "shall make defendant statute purchase file section issue pay creditor provide appellant asset evidence require opinion purpose respondent determine resident\n", 309 | "Topic #4:\n", 310 | "defendant respondent make present judgment error amend exempt lease comment assistant suit render allege obtain premium affirm lawfully year duty\n", 311 | "Topic #5:\n", 312 | "charge tariff shipper respondent arrangement child dwell circumstance section citizenship sentence deliberation reach shall request furnish death claim pay decision\n", 313 | "Topic #6:\n", 314 | "defendant judgment make question error opinion proceeding file issue present dismiss claim sup suit evidence deliver affirm follow allege appellant\n", 315 | "Topic #7:\n", 316 | "jurisdiction claim sanction dissent immunity decision opinion statement respondent subject statute exemption reason file requirement procedure ante belief question brief\n", 317 | "Topic #8:\n", 318 | "file schedule notice hearing judgment lessee expense remand make prior opinion suit reverse appellee statute follow deliver proceeding value seek\n", 319 | "Topic #9:\n", 320 | "claim make statute shall issue section year provide person purpose defendant restriction subject authorize constitution permit period regulation appellee stat\n", 321 | "\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "modeler(doc_list.lem, 30, 30, LatentDirichletAllocation(n_topics=30, max_iter=5, learning_method='online', \\\n", 327 | " learning_offset=50.,random_state=0), CountVectorizer(max_df=.80, min_df=2, \n", 328 | " stop_words='english'))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 37, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "Extracting tf features for LDA...\n", 343 | "Fitting LDA models with tf features, n_samples and n_features=2000...\n", 344 | "\n", 345 | "Topics in LDA model:\n", 346 | "Topic #0:\n", 347 | "defendant make judgment shall error evidence opinion issue purpose pay question proceeding respondent section statute provide suit present charge subject\n", 348 | "Topic #1:\n", 349 | "respondent death penalty tariff charge opinion shipper offender question defendant activity engage punishment evidence file execution arrangement comment authorize judgment\n", 350 | "Topic #2:\n", 351 | "opinion require make effect clause provide child affirm finding reimbursement religious violate voting number judgment result death deliver 
section program\n", 352 | "Topic #3:\n", 353 | "respondent statute file jurisdiction brief opinion appellee permit claim question authorize evidence deliver determine certificate issue counterclaim subject regulation judgment\n", 354 | "Topic #4:\n", 355 | "value shall damage cost make completion exceed date default erect pay respondent prior breach debt follow measure decision entitle lien\n", 356 | "Topic #5:\n", 357 | "abandonment statute statement immunity exemption provide make reason deed authorize belief opinion file decision abandon claim question section require evidence\n", 358 | "Topic #6:\n", 359 | "return relief child claim judgment proceeding question depletion file section suit seek statute make sup schedule cost continue entitle appellant\n", 360 | "Topic #7:\n", 361 | "claim make shall issue charge defendant question statute judgment follow creditor section appellant provide stat opinion person approve sup error\n", 362 | "Topic #8:\n", 363 | "dismiss entitlement appellant curiam question want allocation affirm purchase substantial assistant charge solicitor jurisdiction file brief appellees customer appellee confession\n", 364 | "Topic #9:\n", 365 | "account vacate make respondent remand agreement judgment pay dissent value deliver purchase dwell pauperis money half claim opinion error curiam\n", 366 | "\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "LDA_mod(doc_list.lem, .95, 2, 2000,10) #df is a way to extract 'meaningful text' in this case" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "#### Notes about LDA model performance\n", 379 | "LDA was the first modeling type I tried, because it was the most frequently used in conversations about topic modeling. Initially I assumed that I would not have any other reasonable options, but LDA has proven ineffective for this project. I've done more reading about the differences between LDA and NMF, and LDA seems to be not so good at picking up subtle differences in a corpus about the same subject (as in, if I wanted to find the difference between Apple products and apple the fruit, LDA would probably work, but not if I need to find the difference between cases where the majority of the text is about the law). My suspicion is that this is because LDA can only use a count vectorizer rather than a tfidf, so this bag of words is a serious limitation to finding how these documents _relate_." 380 | ] 381 | } 382 | ], 383 | "metadata": { 384 | "anaconda-cloud": {}, 385 | "kernelspec": { 386 | "display_name": "Python [default]", 387 | "language": "python", 388 | "name": "python3" 389 | }, 390 | "language_info": { 391 | "codemirror_mode": { 392 | "name": "ipython", 393 | "version": 3 394 | }, 395 | "file_extension": ".py", 396 | "mimetype": "text/x-python", 397 | "name": "python", 398 | "nbconvert_exporter": "python", 399 | "pygments_lexer": "ipython3", 400 | "version": "3.5.2" 401 | } 402 | }, 403 | "nbformat": 4, 404 | "nbformat_minor": 0 405 | } 406 | -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Supreme Court Topic Modeling Pipeline 3 | ===================================== 4 | A complete pipeline for scraping, processing, and analyzing Supreme Court cases 5 | to extract legal topics using NMF topic modeling. 
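
A minimal usage sketch (the class name, constructor arguments, and the step1/step2
methods are the ones defined below in this module; the narrower year range is only
to keep an example scrape small, and the remaining steps are outlined in the class
docstring):

    modeler = SupremeCourtTopicModeler(data_dir="data",
                                       start_year=1990, end_year=2000)
    urls_df = modeler.step1_get_case_urls()        # scrape case URLs and dockets
    cases_df = modeler.step2_extract_case_text()   # pull full opinion text per case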
6 | 7 | Based on the original notebook series by [Your Name] 8 | """ 9 | 10 | import os 11 | import pandas as pd 12 | import numpy as np 13 | import re 14 | import glob 15 | import json 16 | import requests 17 | from bs4 import BeautifulSoup 18 | import pickle 19 | import time 20 | from pathlib import Path 21 | import logging 22 | from typing import Dict, List, Tuple, Optional 23 | import operator 24 | 25 | # NLP and ML imports 26 | from sklearn.feature_extraction.text import TfidfVectorizer 27 | from sklearn.decomposition import NMF 28 | from textblob import TextBlob 29 | 30 | # Handle different sklearn versions for stop words 31 | try: 32 | from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS 33 | except ImportError: 34 | from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 35 | 36 | # Try to import NLTK components (install if needed) 37 | try: 38 | import nltk 39 | from nltk.corpus import stopwords, names 40 | # Download required NLTK data 41 | nltk.download('stopwords', quiet=True) 42 | nltk.download('names', quiet=True) 43 | except ImportError: 44 | print("NLTK not installed. Install with: pip install nltk") 45 | stopwords = None 46 | names = None 47 | 48 | # Try to import spaCy (install if needed) 49 | try: 50 | import spacy 51 | from spacy.en import English 52 | parser = English() 53 | except ImportError: 54 | print("spaCy not installed. Install with: pip install spacy") 55 | parser = None 56 | 57 | # Set up logging 58 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 59 | logger = logging.getLogger(__name__) 60 | 61 | 62 | class SupremeCourtTopicModeler: 63 | """ 64 | A complete pipeline for Supreme Court case topic modeling. 65 | 66 | Steps: 67 | 1. Scrape case URLs and metadata 68 | 2. Extract full case text 69 | 3. Clean and preprocess text 70 | 4. Apply topic modeling (NMF) 71 | 5. 
Generate visualization data 72 | """ 73 | 74 | def __init__(self, data_dir: str = "data", start_year: int = 1760, end_year: int = 2018): 75 | self.data_dir = Path(data_dir) 76 | self.data_dir.mkdir(exist_ok=True) 77 | self.start_year = start_year 78 | self.end_year = end_year 79 | self.str_data_dir = data_dir 80 | self.dir_contents = glob.glob(self.str_data_dir) 81 | 82 | # Web scraping headers to avoid being blocked 83 | self.headers = { 84 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) \ 85 | Chrome/39.0.2171.95 Safari/537.36", 86 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 87 | "Accept-Language": "en-US,en;q=0.5", 88 | "Connection": "keep-alive", 89 | "Upgrade-Insecure-Requests": "1", 90 | } 91 | 92 | # Initialize stopwords 93 | self._setup_stopwords() 94 | 95 | # Results storage 96 | self.case_urls_df = None 97 | self.full_cases_df = None 98 | self.processed_df = None 99 | self.final_results = None 100 | 101 | def _setup_stopwords(self): 102 | """Set up comprehensive stopwords list for legal text""" 103 | 104 | # Basic stopwords 105 | basic_stopwords = set(ENGLISH_STOP_WORDS) 106 | if stopwords: 107 | basic_stopwords.update(stopwords.words('english')) 108 | 109 | # Legal/court specific stopwords (reduced list - keep more legal terms) 110 | legal_stopwords = [ 111 | 'join', 'seek', 'note', 'pd', 'misc', 'assistant', 'whereon', 'dismiss', 'sod', 112 | 'vote', 'present', 'entire', 'ante', 'leave', 'concur', 'entire', 'mootness', 113 | 'jj', 'amici', 'sup', 'rep', 'stat', 'like', 'rev', 'trans', 'vii', 'erisa', 114 | 'usca', 'lead', 'cf', 'cca', 'fsupp', 'afdc', 'amicus', 'ante', 'pd', 'aver', 115 | 'may', 'argued', 'argue', 'decide', 'rptr', 'pp', 'fd', 'june', 'july', 116 | 'august', 'september', 'october', 'november', 'ca', 'certiorari', 117 | 'december', 'january', 'february', 'march', 'april', 'writ', 'footnote', 118 | 'member', 'curiam', 'usc', 'file' 119 | ] 120 | 121 | # Only include most common names to avoid being too aggressive 122 | if names: 123 | # Get only very common names to avoid removing too much 124 | common_male_names = ['john', 'james', 'robert', 'michael', 'william', 'david', 'richard', 'thomas'] 125 | common_female_names = ['mary', 'patricia', 'jennifer', 'linda', 'elizabeth', 'barbara', 'susan', 'jessica'] 126 | all_names = common_male_names + common_female_names 127 | else: 128 | all_names = [] 129 | 130 | # Reduced state names list (only abbreviations to avoid removing content) 131 | state_abbrevs = [ 132 | 'al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'id', 'il', 'in', 133 | 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 134 | 'nh', 'nj', 'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 135 | 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy' 136 | ] 137 | 138 | self.STOPLIST = basic_stopwords.union(set(legal_stopwords + all_names + state_abbrevs)) 139 | logger.info(f"Created stopwords list with {len(self.STOPLIST)} terms") 140 | 141 | def beautiful_soup_grabber(self, link: str, max_retries: int = 3) -> BeautifulSoup: 142 | """ 143 | Get BeautifulSoup object from URL with retry logic 144 | """ 145 | for attempt in range(max_retries): 146 | try: 147 | response = requests.get(link, headers=self.headers, timeout=10) 148 | response.raise_for_status() 149 | return BeautifulSoup(response.text, "lxml") 150 | except requests.exceptions.RequestException as e: 151 | logger.warning(f"Attempt 
{attempt + 1} failed for {link}: {e}") 152 | if attempt < max_retries - 1: 153 | time.sleep(2 ** attempt) # Exponential backoff 154 | else: 155 | logger.error(f"Failed to fetch {link} after {max_retries} attempts") 156 | return None 157 | 158 | def step1_get_case_urls(self) -> pd.DataFrame: 159 | """ 160 | Step 1: Scrape Supreme Court case URLs and metadata 161 | """ 162 | logger.info("Step 1: Collecting case URLs from Supreme Court archives...") 163 | 164 | root_url = "http://caselaw.findlaw.com/court/us-supreme-court/years/" 165 | years = [root_url + str(year) for year in range(self.start_year, self.end_year + 1)] 166 | 167 | case_data = {} 168 | 169 | for i, year_url in enumerate(years): 170 | if i % 10 == 0: 171 | logger.info(f"Processing year {self.start_year + i}/{self.end_year}") 172 | 173 | # Debug: show what URL we're trying to access 174 | logger.debug(f"Fetching: {year_url}") 175 | 176 | soup = self.beautiful_soup_grabber(year_url) 177 | if not soup: 178 | logger.warning(f"Failed to get soup for {year_url}") 179 | continue 180 | 181 | # New approach: look for the actual case links in the modern structure 182 | # Based on the screenshot, cases are in links that go to /us-supreme-court/ 183 | links = soup.findAll("a") 184 | logger.debug(f"Found {len(links)} total links on {year_url}") 185 | 186 | year_case_count = 0 187 | for link in links: 188 | href = link.get("href", "") 189 | 190 | # New pattern: look for links that contain /us-supreme-court/ and have case numbers 191 | # Example from screenshot: should match links to actual cases, not years pages 192 | if href and "/us-supreme-court/" in href and "/years/" not in href: 193 | # Additional check: make sure it's not a navigation link 194 | if not href.startswith("https://www.findlaw.com/") or "/us-supreme-court/" in href: 195 | # Extract case number from URL (last part before .html) 196 | url_parts = href.rstrip('/').split('/') 197 | if url_parts: 198 | last_part = url_parts[-1].replace('.html', '') 199 | # Extract numbers for docket 200 | docket = re.sub("[^0-9-]", "", last_part) 201 | 202 | if docket: # Only add if we found a docket number 203 | case_data[href] = docket 204 | year_case_count += 1 205 | 206 | # Debug: show first few matches 207 | if year_case_count <= 3: 208 | case_title = link.get_text(strip=True)[:50] 209 | logger.debug(f" Match {year_case_count}: {href} -> docket {docket} | {case_title}") 210 | 211 | logger.info(f" Year {self.start_year + i}: found {year_case_count} cases") 212 | 213 | # Be nice to the server 214 | time.sleep(0.5) 215 | 216 | logger.info(f"Total cases found across all years: {len(case_data)}") 217 | 218 | if len(case_data) == 0: 219 | logger.error("No cases found! Let's examine the page structure more closely...") 220 | # Let's look at the HTML structure more carefully 221 | test_year = 2000 222 | test_url = f"{root_url}{test_year}" 223 | logger.info(f"Detailed analysis of: {test_url}") 224 | 225 | soup = self.beautiful_soup_grabber(test_url) 226 | if soup: 227 | # Look for different patterns that might contain case links 228 | patterns_to_try = [ 229 | ("Links containing 'supreme'", lambda tag: tag.name == "a" and tag.get("href", "").find("supreme") != -1), 230 | ("Links containing case numbers", lambda tag: tag.name == "a" and re.search(r'\d+-\d+', tag.get("href", ""))), 231 | ("Links with 'U.S.' in text", lambda tag: tag.name == "a" and "U.S." 
in tag.get_text()), 232 | ] 233 | 234 | for pattern_name, pattern_func in patterns_to_try: 235 | matches = soup.find_all(pattern_func) 236 | logger.info(f"\n{pattern_name}: {len(matches)} matches") 237 | 238 | for j, match in enumerate(matches[:5]): # Show first 5 239 | href = match.get("href", "") 240 | text = match.get_text(strip=True)[:80] 241 | logger.info(f" {j+1}. {href} | {text}") 242 | 243 | # Also look at the overall page structure 244 | logger.info(f"\nPage title: {soup.title.string if soup.title else 'No title'}") 245 | 246 | # Look for any div or section that might contain cases 247 | case_containers = soup.find_all(['div', 'section'], class_=re.compile(r'case|decision|result')) 248 | logger.info(f"Found {len(case_containers)} potential case containers") 249 | 250 | df = pd.DataFrame(list(case_data.items()), columns=["case_url", "docket"]) 251 | 252 | # Save intermediate result and CSV for inspection 253 | output_file = self.data_dir / "supcourt_yearlist.pickle" 254 | csv_file = self.data_dir / "supcourt_yearlist.csv" 255 | 256 | df.to_pickle(output_file) 257 | df.to_csv(csv_file, index=False) 258 | 259 | logger.info(f"Step 1 complete. Found {len(df)} cases. Saved to {output_file} and {csv_file}") 260 | 261 | self.case_urls_df = df 262 | return df 263 | 264 | def step2_extract_case_text(self, batch_size: int = 5000) -> pd.DataFrame: 265 | """ 266 | Step 2: Extract full text from each case URL 267 | """ 268 | logger.info("Step 2: Extracting full case text...") 269 | 270 | if self.case_urls_df is None: 271 | # Try to load from file 272 | try: 273 | self.case_urls_df = pd.read_pickle(self.data_dir / "supcourt_yearlist.pickle") 274 | except FileNotFoundError: 275 | raise ValueError("No case URLs found. Run step1_get_case_urls() first.") 276 | 277 | df = self.case_urls_df.copy() 278 | 279 | def extract_case_content(url: str) -> str: 280 | """Extract case content from a single URL""" 281 | soup = self.beautiful_soup_grabber(url) 282 | if not soup: 283 | return "" 284 | 285 | # Try multiple selectors to find case content 286 | content_selectors = [ 287 | "div.caselawcontent.searchable-content", 288 | "div.caselawcontent", 289 | "div.searchable-content", 290 | "div[class*='content']", 291 | "div[class*='case']", 292 | "div[class*='opinion']", 293 | "div[class*='text']" 294 | ] 295 | 296 | all_text = [] 297 | 298 | for selector in content_selectors: 299 | content_divs = soup.select(selector) 300 | if content_divs: 301 | for div in content_divs: 302 | text = div.get_text(separator=' ', strip=True) 303 | if text and len(text) > 100: # Only include substantial text 304 | all_text.append(text) 305 | break # Use first successful selector 306 | 307 | # Fallback: get all paragraph text if specific selectors fail 308 | if not all_text: 309 | paragraphs = soup.find_all('p') 310 | for p in paragraphs: 311 | text = p.get_text(strip=True) 312 | if len(text) > 50: # Only substantial paragraphs 313 | all_text.append(text) 314 | 315 | # Final fallback: get all text but try to clean it 316 | if not all_text: 317 | body_text = soup.get_text(separator=' ', strip=True) 318 | if len(body_text) > 200: 319 | all_text.append(body_text) 320 | 321 | result = ' '.join(all_text) 322 | logger.debug(f"Extracted {len(result)} characters from {url}") 323 | return result 324 | 325 | # Process in batches to avoid overwhelming the server 326 | total_cases = len(df) 327 | df['case_text'] = "" 328 | 329 | for start_idx in range(0, total_cases, batch_size): 330 | end_idx = min(start_idx + batch_size, total_cases) 331 | 
logger.info(f"Processing cases {start_idx} to {end_idx} of {total_cases}") 332 | 333 | batch = df.iloc[start_idx:end_idx].copy() 334 | batch_results = [] 335 | 336 | for idx, row in batch.iterrows(): 337 | case_text = extract_case_content(row['case_url']) 338 | batch_results.append(case_text) 339 | 340 | # Progress indicator and rate limiting 341 | if (idx - start_idx) % 100 == 0: 342 | logger.info(f" Processed {idx - start_idx} cases in current batch") 343 | time.sleep(0.2) # Rate limiting 344 | 345 | # Update the main dataframe 346 | df.loc[start_idx:end_idx-1, 'case_text'] = batch_results 347 | 348 | # Save intermediate results 349 | temp_file = self.data_dir / f"temp_batch_{start_idx}_{end_idx}.pickle" 350 | df.iloc[start_idx:end_idx].to_pickle(temp_file) 351 | 352 | # Save final result 353 | output_file = self.data_dir / "full_proj_preproc.pickle" 354 | df.to_pickle(output_file) 355 | logger.info(f"Step 2 complete. Saved to {output_file}") 356 | 357 | self.full_cases_df = df 358 | return df 359 | 360 | def tokenize_text(self, text: str) -> List[str]: 361 | """ 362 | Tokenize and clean text using spaCy if available, otherwise basic processing 363 | """ 364 | if not text or not isinstance(text, str): 365 | return [] 366 | 367 | # Basic cleaning 368 | separators = ["\xa0\xa0\xa0\xa0", "\r", "\n", "\t", "n't", "'m", "'ll", '[^a-z ]'] 369 | clean_text = text.lower() 370 | for sep in separators: 371 | clean_text = re.sub(sep, " ", clean_text) 372 | 373 | if parser: 374 | # Use spaCy for better tokenization and lemmatization 375 | tokens = parser(clean_text) 376 | tokens = [tok.lemma_.strip() for tok in tokens if tok.lemma_.strip()] 377 | else: 378 | # Fallback to simple tokenization 379 | tokens = clean_text.split() 380 | 381 | # Apply stoplist and length filtering 382 | final_tokens = [tok for tok in tokens if len(tok) > 1 and tok not in self.STOPLIST] 383 | 384 | return final_tokens 385 | 386 | def step3_preprocess_text(self) -> pd.DataFrame: 387 | """ 388 | Step 3: Clean and preprocess case text 389 | """ 390 | logger.info("Step 3: Preprocessing text...") 391 | 392 | if self.full_cases_df is None: 393 | # Try to load from file 394 | try: 395 | self.full_cases_df = pd.read_pickle(self.data_dir / "full_proj_preproc.pickle") 396 | except FileNotFoundError: 397 | raise ValueError("No case text found. Run step2_extract_case_text() first.") 398 | 399 | df = self.full_cases_df.copy() 400 | 401 | # Debug: check text extraction quality 402 | logger.info("Analyzing extracted text quality...") 403 | df['text_length'] = df['case_text'].str.len() 404 | logger.info(f"Text length stats: mean={df['text_length'].mean():.0f}, " 405 | f"median={df['text_length'].median():.0f}, " 406 | f"min={df['text_length'].min()}, max={df['text_length'].max()}") 407 | 408 | # Remove very short documents 409 | min_length = 200 410 | initial_count = len(df) 411 | df = df[df['text_length'] >= min_length] 412 | logger.info(f"Removed {initial_count - len(df)} documents shorter than {min_length} characters") 413 | 414 | if len(df) == 0: 415 | raise ValueError("No documents remain after filtering short texts. 
Check text extraction.") 416 | 417 | # Sample some documents for debugging 418 | logger.info("Sample extracted text:") 419 | for i in range(min(3, len(df))): 420 | sample_text = df.iloc[i]['case_text'][:200] 421 | logger.info(f" Doc {i}: {sample_text}...") 422 | 423 | # Apply text preprocessing 424 | logger.info("Tokenizing and cleaning text...") 425 | df['processed_text'] = df['case_text'].apply(self.tokenize_text) 426 | 427 | # Debug: check tokenization results 428 | df['token_count'] = df['processed_text'].apply(len) 429 | logger.info(f"Token count stats: mean={df['token_count'].mean():.0f}, " 430 | f"median={df['token_count'].median():.0f}, " 431 | f"min={df['token_count'].min()}, max={df['token_count'].max()}") 432 | 433 | # Convert back to string for sklearn 434 | df['processed_text_str'] = df['processed_text'].apply(lambda x: ' '.join(x) if x else '') 435 | 436 | # Remove documents with too few tokens 437 | min_tokens = 10 438 | initial_count = len(df) 439 | df = df[df['token_count'] >= min_tokens] 440 | logger.info(f"Removed {initial_count - len(df)} documents with fewer than {min_tokens} tokens") 441 | 442 | if len(df) == 0: 443 | raise ValueError("No documents remain after tokenization. Check stop words list.") 444 | 445 | # Sample processed text for debugging 446 | logger.info("Sample processed text:") 447 | for i in range(min(3, len(df))): 448 | sample_tokens = df.iloc[i]['processed_text'][:20] 449 | logger.info(f" Doc {i}: {sample_tokens}") 450 | 451 | # Save result 452 | output_file = self.data_dir / "full_proj_lemmatized.pickle" 453 | df.to_pickle(output_file) 454 | logger.info(f"Step 3 complete. {len(df)} documents processed. Saved to {output_file}") 455 | 456 | self.processed_df = df 457 | return df 458 | 459 | def step4_topic_modeling(self, n_topics: int = 30, n_top_words: int = 40) -> Tuple[pd.DataFrame, Dict]: 460 | """ 461 | Step 4: Apply NMF topic modeling 462 | """ 463 | logger.info("Step 4: Applying topic modeling...") 464 | 465 | if self.processed_df is None: 466 | # Try to load from file 467 | try: 468 | self.processed_df = pd.read_pickle(self.data_dir / "full_proj_lemmatized.pickle") 469 | except FileNotFoundError: 470 | raise ValueError("No processed text found. Run step3_preprocess_text() first.") 471 | 472 | df = self.processed_df.copy() 473 | 474 | # Debug vocabulary before TF-IDF 475 | logger.info("Checking vocabulary before TF-IDF...") 476 | all_text = ' '.join(df['processed_text_str'].tolist()) 477 | unique_words = set(all_text.split()) 478 | logger.info(f"Total unique words in corpus: {len(unique_words)}") 479 | 480 | if len(unique_words) < 100: 481 | logger.warning(f"Very small vocabulary ({len(unique_words)} words). 
Reducing min_df.") 482 | min_df_val = 1 483 | else: 484 | min_df_val = 2 485 | 486 | # Set up TF-IDF vectorizer with more lenient parameters 487 | logger.info("Creating TF-IDF vectors...") 488 | tfidf_vectorizer = TfidfVectorizer( 489 | max_df=0.95, 490 | min_df=min_df_val, # Reduced from 5 491 | stop_words='english', 492 | ngram_range=(1, 1), 493 | max_features=5000 # Limit features to avoid memory issues 494 | ) 495 | 496 | try: 497 | tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text_str']) 498 | feature_names = tfidf_vectorizer.get_feature_names_out() 499 | logger.info(f"TF-IDF matrix shape: {tfidf_matrix.shape}") 500 | logger.info(f"Vocabulary size: {len(feature_names)}") 501 | except ValueError as e: 502 | logger.error(f"TF-IDF failed: {e}") 503 | logger.info("Trying with even more lenient parameters...") 504 | 505 | # Emergency fallback: very lenient parameters 506 | tfidf_vectorizer = TfidfVectorizer( 507 | max_df=0.99, 508 | min_df=1, 509 | stop_words=None, # Don't use sklearn's stop words 510 | ngram_range=(1, 1), 511 | max_features=1000 512 | ) 513 | tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text_str']) 514 | feature_names = tfidf_vectorizer.get_feature_names_out() 515 | logger.info(f"Fallback TF-IDF matrix shape: {tfidf_matrix.shape}") 516 | 517 | # Apply NMF 518 | logger.info(f"Fitting NMF model with {n_topics} topics...") 519 | nmf_model = NMF( 520 | n_components=n_topics, 521 | random_state=42, 522 | max_iter=1000 523 | ) 524 | 525 | nmf_matrix = nmf_model.fit_transform(tfidf_matrix) 526 | 527 | # Assign topics to documents 528 | topic_assignments = [] 529 | topic_strengths = [] 530 | 531 | for doc_topics in nmf_matrix: 532 | max_index, max_value = max(enumerate(doc_topics), key=operator.itemgetter(1)) 533 | topic_assignments.append(max_index) 534 | topic_strengths.append(max_value) 535 | 536 | df['topic_number'] = topic_assignments 537 | df['topic_strength'] = topic_strengths 538 | 539 | # Extract topic words 540 | topic_words = {} 541 | for topic_idx, topic in enumerate(nmf_model.components_): 542 | top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] 543 | topic_words[topic_idx] = ', '.join(top_words) 544 | 545 | # Add topic words to dataframe 546 | df['topic_words'] = df['topic_number'].map(topic_words) 547 | 548 | # Print topic summary 549 | logger.info("\nTopic Summary:") 550 | topic_counts = df['topic_number'].value_counts().sort_index() 551 | for topic_idx in range(n_topics): 552 | count = topic_counts.get(topic_idx, 0) 553 | words = topic_words[topic_idx][:100] + "..." if len(topic_words[topic_idx]) > 100 else topic_words[topic_idx] 554 | logger.info(f"Topic {topic_idx} ({count} cases): {words}") 555 | 556 | # Save results 557 | output_file = self.data_dir / "topic_modeled_cases.pickle" 558 | df.to_pickle(output_file) 559 | 560 | # Save topic words separately 561 | topic_file = self.data_dir / "topic_words.json" 562 | with open(topic_file, 'w') as f: 563 | json.dump(topic_words, f, indent=2) 564 | 565 | logger.info(f"Step 4 complete. 
Saved to {output_file}") 566 | 567 | self.processed_df = df 568 | return df, topic_words 569 | 570 | def step5_prepare_visualization_data(self) -> pd.DataFrame: 571 | """ 572 | Step 5: Prepare data for D3.js visualization 573 | """ 574 | logger.info("Step 5: Preparing visualization data...") 575 | 576 | if self.processed_df is None: 577 | # Try to load from file 578 | try: 579 | self.processed_df = pd.read_pickle(self.data_dir / "topic_modeled_cases.pickle") 580 | except FileNotFoundError: 581 | raise ValueError("No topic-modeled data found. Run step4_topic_modeling() first.") 582 | 583 | df = self.processed_df.copy() 584 | 585 | # Extract year from case URL (this might need adjustment based on URL format) 586 | def extract_year_from_url(url): 587 | # Try to extract year from URL pattern 588 | year_match = re.search(r'/(\d{4})/', url) 589 | if year_match: 590 | return int(year_match.group(1)) 591 | # Fallback: try to extract from any 4-digit number 592 | year_match = re.search(r'\b(1[7-9]\d{2}|20[0-2]\d)\b', url) 593 | if year_match: 594 | return int(year_match.group(1)) 595 | return None 596 | 597 | df['year'] = df['case_url'].apply(extract_year_from_url) 598 | 599 | # Remove cases without valid years 600 | df = df.dropna(subset=['year']) 601 | df['year'] = df['year'].astype(int) 602 | 603 | # Create year-topic counts 604 | year_topic_counts = df.groupby(['year', 'topic_number']).size().reset_index(name='count') 605 | 606 | # Fill missing year-topic combinations with 0 607 | all_years = range(df['year'].min(), df['year'].max() + 1) 608 | all_topics = range(df['topic_number'].max() + 1) 609 | 610 | # Create complete year-topic grid 611 | complete_grid = [] 612 | for year in all_years: 613 | for topic in all_topics: 614 | complete_grid.append({'year': year, 'topic_number': topic}) 615 | 616 | complete_df = pd.DataFrame(complete_grid) 617 | viz_data = complete_df.merge(year_topic_counts, on=['year', 'topic_number'], how='left') 618 | viz_data['count'] = viz_data['count'].fillna(0).astype(int) 619 | 620 | # Add topic metadata (you might want to manually create topic names) 621 | topic_names = {i: f"Topic {i}" for i in range(df['topic_number'].max() + 1)} 622 | viz_data['topic_name'] = viz_data['topic_number'].map(topic_names) 623 | 624 | # Save visualization data 625 | viz_file = self.data_dir / "visualization_data.csv" 626 | viz_data.to_csv(viz_file, index=False) 627 | 628 | # Also create yearly totals for brushing visualization 629 | yearly_totals = df.groupby('year').size().reset_index(name='total_cases') 630 | yearly_file = self.data_dir / "yearly_totals.csv" 631 | yearly_totals.to_csv(yearly_file, index=False) 632 | 633 | logger.info(f"Step 5 complete. Visualization data saved to {viz_file}") 634 | 635 | self.final_results = viz_data 636 | return viz_data 637 | 638 | def get_data(self) -> pd.DataFrame: 639 | """ 640 | Get the final processed data with topics and case text 641 | """ 642 | 643 | results = {} 644 | 645 | if os.path.exists(self.data_dir / "supcourt_yearlist.pickle"): 646 | results['case_urls'] = pd.read_pickle(self.data_dir / "supcourt_yearlist.pickle") 647 | else: 648 | logger.warning("No case URLs found. Running step1_get_case_urls()...") 649 | results['case_urls'] = self.step1_get_case_urls() 650 | if os.path.exists(self.data_dir / "full_proj_preproc.pickle"): 651 | results['full_cases'] = pd.read_pickle(self.data_dir / "full_proj_preproc.pickle") 652 | else: 653 | logger.warning("No full cases found. 
Running step2_extract_case_text()...") 654 | results['full_cases'] = self.step2_extract_case_text() 655 | 656 | return results 657 | 658 | def run_full_pipeline(self, n_topics: int = 30) -> Dict: 659 | """ 660 | Run the complete pipeline from start to finish 661 | """ 662 | logger.info("Starting complete Supreme Court topic modeling pipeline...") 663 | 664 | results = {} 665 | results = self.get_data() 666 | try: 667 | 668 | # Step 3: Preprocess text 669 | results['processed_cases'] = self.step3_preprocess_text() 670 | 671 | # Step 4: Topic modeling 672 | results['topic_modeled'], results['topic_words'] = self.step4_topic_modeling(n_topics=n_topics) 673 | 674 | # Step 5: Prepare visualization data 675 | results['visualization_data'] = self.step5_prepare_visualization_data() 676 | 677 | logger.info("Pipeline completed successfully!") 678 | # Print summary 679 | final_df = results['topic_modeled'] 680 | logger.info(f""" 681 | Pipeline Summary: 682 | - Total cases processed: {len(final_df)} 683 | - Number of topics: {n_topics} 684 | - Data saved to: {self.data_dir} 685 | """) 686 | 687 | return results 688 | 689 | except Exception as e: 690 | logger.error(f"Pipeline failed at step: {e}") 691 | raise 692 | 693 | 694 | def main(): 695 | """ 696 | Run the complete Supreme Court topic modeling pipeline 697 | """ 698 | # Initialize the pipeline with a reasonable range 699 | pipeline = SupremeCourtTopicModeler( 700 | data_dir="supreme_court_data", 701 | start_year=1950, # Good range for substantial analysis 702 | end_year=2020 703 | ) 704 | 705 | logger.info("Starting complete Supreme Court topic modeling pipeline...") 706 | logger.info(f"Date range: {1950}-{2020}") 707 | 708 | try: 709 | # Run the complete pipeline 710 | results = pipeline.run_full_pipeline(n_topics=20) 711 | 712 | logger.info("🎉 Pipeline completed successfully!") 713 | 714 | # Print final summary 715 | final_df = results['topic_modeled'] 716 | viz_data = results['visualization_data'] 717 | 718 | logger.info(f""" 719 | 📊 Final Results Summary: 720 | ========================================== 721 | Total cases processed: {len(final_df):,} 722 | Number of topics identified: {20} 723 | 724 | 📁 Output files created: 725 | - supreme_court_data/supcourt_yearlist.csv (case URLs) 726 | - supreme_court_data/topic_modeled_cases.pickle (full results) 727 | - supreme_court_data/topic_words.json (topic definitions) 728 | - supreme_court_data/visualization_data.csv (D3.js ready) 729 | - supreme_court_data/yearly_totals.csv (for brushing viz) 730 | 731 | 🏷️ Top 5 Most Common Topics: 732 | """) 733 | 734 | topic_counts = final_df['topic_number'].value_counts().head() 735 | topic_words_dict = results['topic_words'] 736 | 737 | for i, (topic_num, count) in enumerate(topic_counts.items()): 738 | topic_desc = topic_words_dict[topic_num][:80] + "..." 739 | logger.info(f" {i+1}. Topic {topic_num}: {count:,} cases") 740 | logger.info(f" Keywords: {topic_desc}") 741 | 742 | logger.info(f""" 743 | Next Steps: 744 | 1. Examine topic_words.json to understand the legal topics discovered 745 | 2. Use visualization_data.csv for D3.js time-series visualization 746 | 3. Explore topic_modeled_cases.pickle for detailed analysis 747 | """) 748 | 749 | return results 750 | 751 | except KeyboardInterrupt: 752 | logger.info("Pipeline interrupted by user. 
Partial results may be available in the data directory.") 753 | return None 754 | except Exception as e: 755 | logger.error(f"Pipeline failed: {e}") 756 | logger.info("Check the data directory for any partial results that were saved.") 757 | raise 758 | 759 | 760 | if __name__ == "__main__": 761 | main() -------------------------------------------------------------------------------- /Topic_Modeling_Tutorial/Step2_pipeline_casegetter_forfinal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting All Cases in the Court History\n", 8 | "Supreme Court cases are publicly available (obviously), but difficult to find in full. I eventually found a site where I could pick up every case. If you've ever used BeautifulSoup, you know that finding a single site with everything you need is great: you can scale up dramatically without any extra work, versus writing multiple request functions for disparate sources." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 63, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "from bs4 import BeautifulSoup\n", 20 | "import requests\n", 21 | "import re\n", 22 | "import pandas as pd\n", 23 | "#23268 = total cases" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Our year and case title df from previous notebook\n", 31 | "We will use an apply function on this (a really fantastic pandas feature that lets you apply a function to every row of a df with a single line of code).\n", 32 | "\n", 33 | "I had to split this up into three temporary dfs because caselaw eventually realized someone was scraping the crap out of their site (almost 24k requests within a few hours...). I added the bot-crawler fakeout header (which you should absolutely do if you are collecting as much data as I naively thought I could collect without one), which may have solved the problem. But nothing is worse than waiting two hours and coming back to an errored-out screen, which basically means starting completely over. So I decided to split the requests up." 
34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 64, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "supcourt = pd.read_pickle(\"supcourt_yearlist.pickle\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 79, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "test_df = supcourt.iloc[5000:15000]\n", 56 | "test3_df = supcourt.iloc[0:5000]\n", 57 | "test2_df = supcourt.iloc[15000:23268]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 67, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "def supcourtdescr(link):\n", 69 | " headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}\n", 70 | " allitems = []\n", 71 | " response = requests.get(link, headers = headers)\n", 72 | " page = response.text\n", 73 | " soup = BeautifulSoup(page, \"lxml\")\n", 74 | " \n", 75 | " pagesoup = soup.find_all(class_=\"caselawcontent searchable-content\") \n", 76 | " \n", 77 | " for item in pagesoup:\n", 78 | " txtt = item.get_text()\n", 79 | " allitems.append(txtt)\n", 80 | " return ' '.join(allitems)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "test_df[\"case\"] = test_df.caseurl.apply(supcourtdescr)\n", 92 | "test_df.to_pickle(\"temp2.pickle\")" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "test2_df[\"case\"] = test2_df.caseurl.apply(supcourtdescr)\n", 104 | "test2_df.to_pickle(\"temp1.pickle\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "test1_df[\"case\"] = test1_df.caseurl.apply(supcourtdescr)\n", 116 | "test1_df.to_pickle(\"temp3.pickle\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 77, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "full_project = pd.concat([test2_df, test3_df, test_df]) #putting it all together" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 86, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/html": [ 140 | "
\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 
414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | "
caseurlcasetitleyearscase
15000http://caselaw.findlaw.com/us-supreme-court/38...ALUMINUM CO. OF AMERICA v. UNITED STATES,  382...1965United States Supreme Court JOBE v. CITY OF ...
15001http://caselaw.findlaw.com/us-supreme-court/38...JONES & LAUGHLIN STEEL CORP. v. GRIDIRON STEEL...1965United States Supreme Court JONES & LAUGHLIN...
15002http://caselaw.findlaw.com/us-supreme-court/38...JORDAN v. SILVER,  381 U.S. 415 (1965)1965United States Supreme Court JORDAN v. SILVER...
15003http://caselaw.findlaw.com/us-supreme-court/38...KADANS v. DICKERSON,  382 U.S. 22 (1965)1965United States Supreme Court KADANS v. DICKER...
15004http://caselaw.findlaw.com/us-supreme-court/38...METROMEDIA, INC. v. AMERICAN SOCIETY OF COMPOS...1965United States Supreme Court KASHARIAN v. MET...
15005http://caselaw.findlaw.com/us-supreme-court/38...KASHARIAN v. SOUTH PLAINFIELD BAPTIST CHURCH, ...1965United States Supreme Court KASHARIAN v. SOU...
15006http://caselaw.findlaw.com/us-supreme-court/38...FLORIDA EAST COAST RAILWAY CO. v. UNITED STATE...1965United States Supreme Court KASHARIAN v. WIL...
15007http://caselaw.findlaw.com/us-supreme-court/38...KENNECOTT COPPER CORP. v. UNITED STATES,  381 ...1965United States Supreme Court KENNECOTT COPPER...
15008http://caselaw.findlaw.com/us-supreme-court/38...KILLGORE v. BLACKWELL,  381 U.S. 278 (1965)1965United States Supreme Court KILLGORE v. BLAC...
15009http://caselaw.findlaw.com/us-supreme-court/37...KITTY HAWK DEVELOPMENT CO. v. CITY OF COLORADO...1965United States Supreme Court KITTY HAWK DEVEL...
15010http://caselaw.findlaw.com/us-supreme-court/38...KNOWLES v. FLORIDA,  381 U.S. 763 (1965)1965United States Supreme Court KNOWLES v. FLORI...
15011http://caselaw.findlaw.com/us-supreme-court/38...LABOR BOARD v. BROWN,  380 U.S. 278 (1965)1965United States Supreme Court LABOR BOARD v. B...
15012http://caselaw.findlaw.com/us-supreme-court/38...LABOR BOARD v. METROPOLITAN INS. CO.,  380 U.S...1965United States Supreme Court LABOR BOARD v. M...
15013http://caselaw.findlaw.com/us-supreme-court/38...LAMONT v. POSTMASTER GENERAL,  381 U.S. 301 (1...1965United States Supreme Court LAMONT v. POSTMA...
15014http://caselaw.findlaw.com/us-supreme-court/38...LEH v. GENERAL PETROLEUM CORP.,  382 U.S. 54 (...1965United States Supreme Court LEH v. GENERAL P...
15015http://caselaw.findlaw.com/us-supreme-court/38...LINKLETTER v. WALKER,  381 U.S. 618 (1965)1965United States Supreme Court LINKLETTER v. WA...
15016http://caselaw.findlaw.com/us-supreme-court/37...LISBON SALESBOOK CO. v. OHIO,  379 U.S. 673 (1...1965United States Supreme Court LISBON SALESBOOK...
15017http://caselaw.findlaw.com/us-supreme-court/38...LOUISIANA v. UNITED STATES,  380 U.S. 145 (1965)1965United States Supreme Court LOUISIANA v. UNI...
15018http://caselaw.findlaw.com/us-supreme-court/38...MADDOX v. BIRZGALIS,  380 U.S. 126 (1965)1965United States Supreme Court MADDOX v. BIRZGA...
15019http://caselaw.findlaw.com/us-supreme-court/38...EL PASO ELECTRIC CO. v. CALVERT,  382 U.S. 18 ...1965United States Supreme Court MADDOX v. WILLIS...
15020http://caselaw.findlaw.com/us-supreme-court/38...MARCHEV v. TOWNSHIP OF LIVINGSTON,  382 U.S. 2...1965United States Supreme Court MARCHEV v. TOWNS...
15021http://caselaw.findlaw.com/us-supreme-court/38...MARTIN v. TEXAS,  382 U.S. 928 (1965)1965United States Supreme Court MARTIN v. TEXAS,...
15022http://caselaw.findlaw.com/us-supreme-court/38...MARVEL v. UNITED STATES,  380 U.S. 262 (1965)1965United States Supreme Court MARVEL v. UNITED...
15023http://caselaw.findlaw.com/us-supreme-court/38...MARYLAND v. UNITED STATES,  381 U.S. 41 (1965)1965United States Supreme Court MARYLAND v. UNIT...
15024http://caselaw.findlaw.com/us-supreme-court/38...O'CONNOR v. OHIO,  382 U.S. 286 (1965)1965United States Supreme Court MAYBERRY v. PENN...
15025http://caselaw.findlaw.com/us-supreme-court/38...MCCLELLAN v. CHESAPEAKE & OHIO RAILWAY CO.,  3...1965United States Supreme Court McCLELLAN v. CHE...
15026http://caselaw.findlaw.com/us-supreme-court/38...MCGEE v. CROUSE,  382 U.S. 67 (1965)1965United States Supreme Court McGEE v. CROUSE,...
15027http://caselaw.findlaw.com/us-supreme-court/38...MCKINNIE v. TENNESSEE,  380 U.S. 449 (1965)1965United States Supreme Court McKINNIE v. TENN...
15028http://caselaw.findlaw.com/us-supreme-court/38...MCLEOD v. OHIO,  381 U.S. 356 (1965)1965United States Supreme Court McLEOD v. OHIO, ...
15029http://caselaw.findlaw.com/us-supreme-court/38...MEAT CUTTERS v. JEWEL TEA,  381 U.S. 676 (1965)1965United States Supreme Court MEAT CUTTERS v. ...
...............
14970http://caselaw.findlaw.com/us-supreme-court/38...GRIFFIN v. CALIFORNIA,  380 U.S. 609 (1965)1965United States Supreme Court GRIFFIN v. CALIF...
14971http://caselaw.findlaw.com/us-supreme-court/38...GRIFFING v. BIANCHI,  382 U.S. 15 (1965)1965United States Supreme Court GRIFFING v. BIAN...
14972http://caselaw.findlaw.com/us-supreme-court/38...GRISWOLD v. CONNECTICUT,  381 U.S. 479 (1965)1965United States Supreme Court GRISWOLD v. CONN...
14973http://caselaw.findlaw.com/us-supreme-court/38...GUNTHER v. SAN DIEGO & A. E. R. CO.,  382 U.S....1965United States Supreme Court GUNTHER v. SAN D...
14974http://caselaw.findlaw.com/us-supreme-court/38...HAINSWORTH v. MARTIN,  382 U.S. 109 (1965)1965United States Supreme Court HAINSWORTH v. MA...
14975http://caselaw.findlaw.com/us-supreme-court/38...ARTHUR v. COLORADO,  380 U.S. 250 (1965)1965United States Supreme Court HALL v. ILLINOIS...
14976http://caselaw.findlaw.com/us-supreme-court/37...WINKLE v. BANNAN,  379 U.S. 645 (1965)1965United States Supreme Court HALPERT v. UDALL...
14977http://caselaw.findlaw.com/us-supreme-court/38...HANNA MINING v. MARINE ENGINEERS,  382 U.S. 18...1965United States Supreme Court HANNA MINING v. ...
14978http://caselaw.findlaw.com/us-supreme-court/38...HANNA v. PLUMER,  380 U.S. 460 (1965)1965United States Supreme Court HANNA v. PLUMER,...
14979http://caselaw.findlaw.com/us-supreme-court/38...HARMAN v. FORSSENIUS,  380 U.S. 528 (1965)1965United States Supreme Court HARMAN v. FORSSE...
14980http://caselaw.findlaw.com/us-supreme-court/38...HARPER v. VIRGINIA STATE BOARD OF ELECTIONS, 3...1965United States Supreme Court HARPER v. VIRGIN...
14981http://caselaw.findlaw.com/us-supreme-court/38...HARRIS v. UNITED STATES,  382 U.S. 162 (1965)1965United States Supreme Court HARRIS v. UNITED...
14982http://caselaw.findlaw.com/us-supreme-court/38...HARRISON v. MCNAMARA,  380 U.S. 261 (1965)1965United States Supreme Court HARRISON v. McNA...
14983http://caselaw.findlaw.com/us-supreme-court/38...HAZELTINE RESEARCH, INC. v. BRENNER,  382 U.S....1965United States Supreme Court HAZELTINE RESEAR...
14984http://caselaw.findlaw.com/us-supreme-court/38...HEARNE v. SMYLIE,  381 U.S. 420 (1965)1965United States Supreme Court HEARNE v. SMYLIE...
14985http://caselaw.findlaw.com/us-supreme-court/37...HEARNE v. SMYLIE,  379 U.S. 692 (1965)1965United States Supreme Court HEARNE v. SMYLIE...
14986http://caselaw.findlaw.com/us-supreme-court/38...HENRY v. COLLINS,  380 U.S. 356 (1965)1965United States Supreme Court HENRY v. COLLINS...
14987http://caselaw.findlaw.com/us-supreme-court/37...HENRY v. MISSISSIPPI,  379 U.S. 443 (1965)1965United States Supreme Court HENRY v. MISSISS...
14988http://caselaw.findlaw.com/us-supreme-court/38...BOWMAN v. LAKE COUNTY PUBLIC BUILDING COMMISSI...1965United States Supreme Court HERALD PUBLISHIN...
14989http://caselaw.findlaw.com/us-supreme-court/38...REYNOLDS METALS CO. v. WASHINGTON,  382 U.S. 1...1965United States Supreme Court HODGES v. BUCKEY...
14990http://caselaw.findlaw.com/us-supreme-court/38...HOLT v. VIRGINIA,  381 U.S. 131 (1965)1965United States Supreme Court HOLT v. VIRGINIA...
14991http://caselaw.findlaw.com/us-supreme-court/38...UNITED STATES v. NEW ORLEANS CHAPTER,  382 U.S...1965United States Supreme Court HOURIHAN v. MAHO...
14992http://caselaw.findlaw.com/us-supreme-court/38...HUGHES TOOL CO. v. TWA,  380 U.S. 248 (1965)1965United States Supreme Court HUGHES TOOL CO. ...
14993http://caselaw.findlaw.com/us-supreme-court/38...HUGHES TOOL CO. v. TWA,  380 U.S. 249 (1965)1965United States Supreme Court HUGHES TOOL CO. ...
14994http://caselaw.findlaw.com/us-supreme-court/37...HUGHES v. WMCA,  379 U.S. 694 (1965)1965United States Supreme Court HUGHES v. WMCA, ...
14995http://caselaw.findlaw.com/us-supreme-court/38...MASON v. MIDWESTERN GAS TRANSMISSION CO.,  380...1965United States Supreme Court IN RE RYAN, (196...
14996http://caselaw.findlaw.com/us-supreme-court/38...JABEN v. UNITED STATES,  381 U.S. 214 (1965)1965United States Supreme Court JABEN v. UNITED ...
14997http://caselaw.findlaw.com/us-supreme-court/38...JAMES v. LOUISIANA,  382 U.S. 36 (1965)1965United States Supreme Court JAMES v. LOUISIA...
14998http://caselaw.findlaw.com/us-supreme-court/37...JANKOVICH v. TOLL ROAD COMM'N,  379 U.S. 487 (...1965United States Supreme Court JANKOVICH v. TOL...
14999http://caselaw.findlaw.com/us-supreme-court/38...JENKINS v. UNITED STATES,  380 U.S. 445 (1965)1965United States Supreme Court JENKINS v. UNITE...
\n", 581 | "

23268 rows × 4 columns

\n", 582 | "
" 583 | ], 584 | "text/plain": [ 585 | " caseurl \\\n", 586 | "15000 http://caselaw.findlaw.com/us-supreme-court/38... \n", 587 | "15001 http://caselaw.findlaw.com/us-supreme-court/38... \n", 588 | "15002 http://caselaw.findlaw.com/us-supreme-court/38... \n", 589 | "15003 http://caselaw.findlaw.com/us-supreme-court/38... \n", 590 | "15004 http://caselaw.findlaw.com/us-supreme-court/38... \n", 591 | "15005 http://caselaw.findlaw.com/us-supreme-court/38... \n", 592 | "15006 http://caselaw.findlaw.com/us-supreme-court/38... \n", 593 | "15007 http://caselaw.findlaw.com/us-supreme-court/38... \n", 594 | "15008 http://caselaw.findlaw.com/us-supreme-court/38... \n", 595 | "15009 http://caselaw.findlaw.com/us-supreme-court/37... \n", 596 | "15010 http://caselaw.findlaw.com/us-supreme-court/38... \n", 597 | "15011 http://caselaw.findlaw.com/us-supreme-court/38... \n", 598 | "15012 http://caselaw.findlaw.com/us-supreme-court/38... \n", 599 | "15013 http://caselaw.findlaw.com/us-supreme-court/38... \n", 600 | "15014 http://caselaw.findlaw.com/us-supreme-court/38... \n", 601 | "15015 http://caselaw.findlaw.com/us-supreme-court/38... \n", 602 | "15016 http://caselaw.findlaw.com/us-supreme-court/37... \n", 603 | "15017 http://caselaw.findlaw.com/us-supreme-court/38... \n", 604 | "15018 http://caselaw.findlaw.com/us-supreme-court/38... \n", 605 | "15019 http://caselaw.findlaw.com/us-supreme-court/38... \n", 606 | "15020 http://caselaw.findlaw.com/us-supreme-court/38... \n", 607 | "15021 http://caselaw.findlaw.com/us-supreme-court/38... \n", 608 | "15022 http://caselaw.findlaw.com/us-supreme-court/38... \n", 609 | "15023 http://caselaw.findlaw.com/us-supreme-court/38... \n", 610 | "15024 http://caselaw.findlaw.com/us-supreme-court/38... \n", 611 | "15025 http://caselaw.findlaw.com/us-supreme-court/38... \n", 612 | "15026 http://caselaw.findlaw.com/us-supreme-court/38... \n", 613 | "15027 http://caselaw.findlaw.com/us-supreme-court/38... \n", 614 | "15028 http://caselaw.findlaw.com/us-supreme-court/38... \n", 615 | "15029 http://caselaw.findlaw.com/us-supreme-court/38... \n", 616 | "... ... \n", 617 | "14970 http://caselaw.findlaw.com/us-supreme-court/38... \n", 618 | "14971 http://caselaw.findlaw.com/us-supreme-court/38... \n", 619 | "14972 http://caselaw.findlaw.com/us-supreme-court/38... \n", 620 | "14973 http://caselaw.findlaw.com/us-supreme-court/38... \n", 621 | "14974 http://caselaw.findlaw.com/us-supreme-court/38... \n", 622 | "14975 http://caselaw.findlaw.com/us-supreme-court/38... \n", 623 | "14976 http://caselaw.findlaw.com/us-supreme-court/37... \n", 624 | "14977 http://caselaw.findlaw.com/us-supreme-court/38... \n", 625 | "14978 http://caselaw.findlaw.com/us-supreme-court/38... \n", 626 | "14979 http://caselaw.findlaw.com/us-supreme-court/38... \n", 627 | "14980 http://caselaw.findlaw.com/us-supreme-court/38... \n", 628 | "14981 http://caselaw.findlaw.com/us-supreme-court/38... \n", 629 | "14982 http://caselaw.findlaw.com/us-supreme-court/38... \n", 630 | "14983 http://caselaw.findlaw.com/us-supreme-court/38... \n", 631 | "14984 http://caselaw.findlaw.com/us-supreme-court/38... \n", 632 | "14985 http://caselaw.findlaw.com/us-supreme-court/37... \n", 633 | "14986 http://caselaw.findlaw.com/us-supreme-court/38... \n", 634 | "14987 http://caselaw.findlaw.com/us-supreme-court/37... \n", 635 | "14988 http://caselaw.findlaw.com/us-supreme-court/38... \n", 636 | "14989 http://caselaw.findlaw.com/us-supreme-court/38... \n", 637 | "14990 http://caselaw.findlaw.com/us-supreme-court/38... 
\n", 638 | "14991 http://caselaw.findlaw.com/us-supreme-court/38... \n", 639 | "14992 http://caselaw.findlaw.com/us-supreme-court/38... \n", 640 | "14993 http://caselaw.findlaw.com/us-supreme-court/38... \n", 641 | "14994 http://caselaw.findlaw.com/us-supreme-court/37... \n", 642 | "14995 http://caselaw.findlaw.com/us-supreme-court/38... \n", 643 | "14996 http://caselaw.findlaw.com/us-supreme-court/38... \n", 644 | "14997 http://caselaw.findlaw.com/us-supreme-court/38... \n", 645 | "14998 http://caselaw.findlaw.com/us-supreme-court/37... \n", 646 | "14999 http://caselaw.findlaw.com/us-supreme-court/38... \n", 647 | "\n", 648 | " casetitle years \\\n", 649 | "15000 ALUMINUM CO. OF AMERICA v. UNITED STATES,  382... 1965 \n", 650 | "15001 JONES & LAUGHLIN STEEL CORP. v. GRIDIRON STEEL... 1965 \n", 651 | "15002 JORDAN v. SILVER,  381 U.S. 415 (1965) 1965 \n", 652 | "15003 KADANS v. DICKERSON,  382 U.S. 22 (1965) 1965 \n", 653 | "15004 METROMEDIA, INC. v. AMERICAN SOCIETY OF COMPOS... 1965 \n", 654 | "15005 KASHARIAN v. SOUTH PLAINFIELD BAPTIST CHURCH, ... 1965 \n", 655 | "15006 FLORIDA EAST COAST RAILWAY CO. v. UNITED STATE... 1965 \n", 656 | "15007 KENNECOTT COPPER CORP. v. UNITED STATES,  381 ... 1965 \n", 657 | "15008 KILLGORE v. BLACKWELL,  381 U.S. 278 (1965) 1965 \n", 658 | "15009 KITTY HAWK DEVELOPMENT CO. v. CITY OF COLORADO... 1965 \n", 659 | "15010 KNOWLES v. FLORIDA,  381 U.S. 763 (1965) 1965 \n", 660 | "15011 LABOR BOARD v. BROWN,  380 U.S. 278 (1965) 1965 \n", 661 | "15012 LABOR BOARD v. METROPOLITAN INS. CO.,  380 U.S... 1965 \n", 662 | "15013 LAMONT v. POSTMASTER GENERAL,  381 U.S. 301 (1... 1965 \n", 663 | "15014 LEH v. GENERAL PETROLEUM CORP.,  382 U.S. 54 (... 1965 \n", 664 | "15015 LINKLETTER v. WALKER,  381 U.S. 618 (1965) 1965 \n", 665 | "15016 LISBON SALESBOOK CO. v. OHIO,  379 U.S. 673 (1... 1965 \n", 666 | "15017 LOUISIANA v. UNITED STATES,  380 U.S. 145 (1965) 1965 \n", 667 | "15018 MADDOX v. BIRZGALIS,  380 U.S. 126 (1965) 1965 \n", 668 | "15019 EL PASO ELECTRIC CO. v. CALVERT,  382 U.S. 18 ... 1965 \n", 669 | "15020 MARCHEV v. TOWNSHIP OF LIVINGSTON,  382 U.S. 2... 1965 \n", 670 | "15021 MARTIN v. TEXAS,  382 U.S. 928 (1965) 1965 \n", 671 | "15022 MARVEL v. UNITED STATES,  380 U.S. 262 (1965) 1965 \n", 672 | "15023 MARYLAND v. UNITED STATES,  381 U.S. 41 (1965) 1965 \n", 673 | "15024 O'CONNOR v. OHIO,  382 U.S. 286 (1965) 1965 \n", 674 | "15025 MCCLELLAN v. CHESAPEAKE & OHIO RAILWAY CO.,  3... 1965 \n", 675 | "15026 MCGEE v. CROUSE,  382 U.S. 67 (1965) 1965 \n", 676 | "15027 MCKINNIE v. TENNESSEE,  380 U.S. 449 (1965) 1965 \n", 677 | "15028 MCLEOD v. OHIO,  381 U.S. 356 (1965) 1965 \n", 678 | "15029 MEAT CUTTERS v. JEWEL TEA,  381 U.S. 676 (1965) 1965 \n", 679 | "... ... ... \n", 680 | "14970 GRIFFIN v. CALIFORNIA,  380 U.S. 609 (1965) 1965 \n", 681 | "14971 GRIFFING v. BIANCHI,  382 U.S. 15 (1965) 1965 \n", 682 | "14972 GRISWOLD v. CONNECTICUT,  381 U.S. 479 (1965) 1965 \n", 683 | "14973 GUNTHER v. SAN DIEGO & A. E. R. CO.,  382 U.S.... 1965 \n", 684 | "14974 HAINSWORTH v. MARTIN,  382 U.S. 109 (1965) 1965 \n", 685 | "14975 ARTHUR v. COLORADO,  380 U.S. 250 (1965) 1965 \n", 686 | "14976 WINKLE v. BANNAN,  379 U.S. 645 (1965) 1965 \n", 687 | "14977 HANNA MINING v. MARINE ENGINEERS,  382 U.S. 18... 1965 \n", 688 | "14978 HANNA v. PLUMER,  380 U.S. 460 (1965) 1965 \n", 689 | "14979 HARMAN v. FORSSENIUS,  380 U.S. 528 (1965) 1965 \n", 690 | "14980 HARPER v. VIRGINIA STATE BOARD OF ELECTIONS, 3... 1965 \n", 691 | "14981 HARRIS v. UNITED STATES,  382 U.S. 
162 (1965) 1965 \n", 692 | "14982 HARRISON v. MCNAMARA,  380 U.S. 261 (1965) 1965 \n", 693 | "14983 HAZELTINE RESEARCH, INC. v. BRENNER,  382 U.S.... 1965 \n", 694 | "14984 HEARNE v. SMYLIE,  381 U.S. 420 (1965) 1965 \n", 695 | "14985 HEARNE v. SMYLIE,  379 U.S. 692 (1965) 1965 \n", 696 | "14986 HENRY v. COLLINS,  380 U.S. 356 (1965) 1965 \n", 697 | "14987 HENRY v. MISSISSIPPI,  379 U.S. 443 (1965) 1965 \n", 698 | "14988 BOWMAN v. LAKE COUNTY PUBLIC BUILDING COMMISSI... 1965 \n", 699 | "14989 REYNOLDS METALS CO. v. WASHINGTON,  382 U.S. 1... 1965 \n", 700 | "14990 HOLT v. VIRGINIA,  381 U.S. 131 (1965) 1965 \n", 701 | "14991 UNITED STATES v. NEW ORLEANS CHAPTER,  382 U.S... 1965 \n", 702 | "14992 HUGHES TOOL CO. v. TWA,  380 U.S. 248 (1965) 1965 \n", 703 | "14993 HUGHES TOOL CO. v. TWA,  380 U.S. 249 (1965) 1965 \n", 704 | "14994 HUGHES v. WMCA,  379 U.S. 694 (1965) 1965 \n", 705 | "14995 MASON v. MIDWESTERN GAS TRANSMISSION CO.,  380... 1965 \n", 706 | "14996 JABEN v. UNITED STATES,  381 U.S. 214 (1965) 1965 \n", 707 | "14997 JAMES v. LOUISIANA,  382 U.S. 36 (1965) 1965 \n", 708 | "14998 JANKOVICH v. TOLL ROAD COMM'N,  379 U.S. 487 (... 1965 \n", 709 | "14999 JENKINS v. UNITED STATES,  380 U.S. 445 (1965) 1965 \n", 710 | "\n", 711 | " case \n", 712 | "15000 United States Supreme Court JOBE v. CITY OF ... \n", 713 | "15001 United States Supreme Court JONES & LAUGHLIN... \n", 714 | "15002 United States Supreme Court JORDAN v. SILVER... \n", 715 | "15003 United States Supreme Court KADANS v. DICKER... \n", 716 | "15004 United States Supreme Court KASHARIAN v. MET... \n", 717 | "15005 United States Supreme Court KASHARIAN v. SOU... \n", 718 | "15006 United States Supreme Court KASHARIAN v. WIL... \n", 719 | "15007 United States Supreme Court KENNECOTT COPPER... \n", 720 | "15008 United States Supreme Court KILLGORE v. BLAC... \n", 721 | "15009 United States Supreme Court KITTY HAWK DEVEL... \n", 722 | "15010 United States Supreme Court KNOWLES v. FLORI... \n", 723 | "15011 United States Supreme Court LABOR BOARD v. B... \n", 724 | "15012 United States Supreme Court LABOR BOARD v. M... \n", 725 | "15013 United States Supreme Court LAMONT v. POSTMA... \n", 726 | "15014 United States Supreme Court LEH v. GENERAL P... \n", 727 | "15015 United States Supreme Court LINKLETTER v. WA... \n", 728 | "15016 United States Supreme Court LISBON SALESBOOK... \n", 729 | "15017 United States Supreme Court LOUISIANA v. UNI... \n", 730 | "15018 United States Supreme Court MADDOX v. BIRZGA... \n", 731 | "15019 United States Supreme Court MADDOX v. WILLIS... \n", 732 | "15020 United States Supreme Court MARCHEV v. TOWNS... \n", 733 | "15021 United States Supreme Court MARTIN v. TEXAS,... \n", 734 | "15022 United States Supreme Court MARVEL v. UNITED... \n", 735 | "15023 United States Supreme Court MARYLAND v. UNIT... \n", 736 | "15024 United States Supreme Court MAYBERRY v. PENN... \n", 737 | "15025 United States Supreme Court McCLELLAN v. CHE... \n", 738 | "15026 United States Supreme Court McGEE v. CROUSE,... \n", 739 | "15027 United States Supreme Court McKINNIE v. TENN... \n", 740 | "15028 United States Supreme Court McLEOD v. OHIO, ... \n", 741 | "15029 United States Supreme Court MEAT CUTTERS v. ... \n", 742 | "... ... \n", 743 | "14970 United States Supreme Court GRIFFIN v. CALIF... \n", 744 | "14971 United States Supreme Court GRIFFING v. BIAN... \n", 745 | "14972 United States Supreme Court GRISWOLD v. CONN... \n", 746 | "14973 United States Supreme Court GUNTHER v. SAN D... 
\n", 747 | "14974 United States Supreme Court HAINSWORTH v. MA... \n", 748 | "14975 United States Supreme Court HALL v. ILLINOIS... \n", 749 | "14976 United States Supreme Court HALPERT v. UDALL... \n", 750 | "14977 United States Supreme Court HANNA MINING v. ... \n", 751 | "14978 United States Supreme Court HANNA v. PLUMER,... \n", 752 | "14979 United States Supreme Court HARMAN v. FORSSE... \n", 753 | "14980 United States Supreme Court HARPER v. VIRGIN... \n", 754 | "14981 United States Supreme Court HARRIS v. UNITED... \n", 755 | "14982 United States Supreme Court HARRISON v. McNA... \n", 756 | "14983 United States Supreme Court HAZELTINE RESEAR... \n", 757 | "14984 United States Supreme Court HEARNE v. SMYLIE... \n", 758 | "14985 United States Supreme Court HEARNE v. SMYLIE... \n", 759 | "14986 United States Supreme Court HENRY v. COLLINS... \n", 760 | "14987 United States Supreme Court HENRY v. MISSISS... \n", 761 | "14988 United States Supreme Court HERALD PUBLISHIN... \n", 762 | "14989 United States Supreme Court HODGES v. BUCKEY... \n", 763 | "14990 United States Supreme Court HOLT v. VIRGINIA... \n", 764 | "14991 United States Supreme Court HOURIHAN v. MAHO... \n", 765 | "14992 United States Supreme Court HUGHES TOOL CO. ... \n", 766 | "14993 United States Supreme Court HUGHES TOOL CO. ... \n", 767 | "14994 United States Supreme Court HUGHES v. WMCA, ... \n", 768 | "14995 United States Supreme Court IN RE RYAN, (196... \n", 769 | "14996 United States Supreme Court JABEN v. UNITED ... \n", 770 | "14997 United States Supreme Court JAMES v. LOUISIA... \n", 771 | "14998 United States Supreme Court JANKOVICH v. TOL... \n", 772 | "14999 United States Supreme Court JENKINS v. UNITE... \n", 773 | "\n", 774 | "[23268 rows x 4 columns]" 775 | ] 776 | }, 777 | "execution_count": 86, 778 | "metadata": {}, 779 | "output_type": "execute_result" 780 | } 781 | ], 782 | "source": [ 783 | "full_project" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "### Saving the body of our data\n", 791 | "This pickle file is about 600 MB. We don't want to run this again!" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 85, 797 | "metadata": { 798 | "collapsed": false 799 | }, 800 | "outputs": [], 801 | "source": [ 802 | "full_project.to_pickle(\"full_proj_preproc.pickle\")" 803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": "Python 3", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.5.1" 823 | } 824 | }, 825 | "nbformat": 4, 826 | "nbformat_minor": 0 827 | } 828 | --------------------------------------------------------------------------------