├── .python_version
├── src
│   ├── tester
│   │   ├── __init__.py
│   │   └── README.md
│   ├── __init__.py
│   ├── tools
│   │   ├── zip_folder.py
│   │   ├── combine_parsed.py
│   │   ├── print_stats.py
│   │   └── build_event_csv.py
│   ├── scraper
│   │   ├── hays.py
│   │   ├── README.md
│   │   ├── scrapcode_post2017.py
│   │   ├── helpers.py
│   │   └── __init__.py
│   ├── orchestrator
│   │   └── __init__.py
│   ├── cleaner
│   │   ├── Data Structure of Cleaned JSON.md
│   │   └── __init__.py
│   ├── parser
│   │   ├── README.md
│   │   ├── Data Structure of Parsed JSON.md
│   │   ├── __init__.py
│   │   └── hays.py
│   └── updater
│       └── __init__.py
├── .github
│   ├── CODEOWNERS
│   └── workflows
│       └── python-app.yml
├── resources
│   ├── test_files
│   │   ├── parser_testing
│   │   │   ├── test_1.html
│   │   │   ├── test_2.html
│   │   │   └── multiple_html_files
│   │   │       ├── test_1.html
│   │   │       └── test_2.html
│   │   ├── test_123456.html
│   │   ├── test_hidden_values.txt
│   │   ├── cleaned_test_json
│   │   │   └── test_123456.json
│   │   ├── field_validation_list.json
│   │   ├── hays_main_page.html
│   │   └── test_123456.json
│   └── texas_county_data.csv
├── requirements.txt
├── CONTRIBUTING.md
├── docs
│   ├── index.rst
│   └── conf.py
├── LICENSE
├── parser_log.txt
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/.python_version:
--------------------------------------------------------------------------------
3.12.2

--------------------------------------------------------------------------------
/src/tester/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
* @ids-core

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/test_1.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/test_2.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/multiple_html_files/test_1.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/multiple_html_files/test_2.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/test_123456.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-austin/indigent-defense-stats/HEAD/resources/test_files/test_123456.html

--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
from . import cleaner
from . import orchestrator
from . import parser
from . import scraper
from . import tools
from . import updater

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
azure-cosmos == 4.7.0
beautifulsoup4 == 4.12.3
boto3 == 1.35.5
python-dotenv == 1.0.1
requests == 2.32.3
retry == 0.9.2
statistics == 1.0.3.5
xxhash == 3.5.0
flake8 == 7.1.0
Sphinx == 8.0.2

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

## Development Ethos

- Write code simply, to make it more accessible for new developers to contribute over the long run.
- Make your code easier to read, even at the expense of being succinct.
- Use commonly used and well-maintained libraries and approaches.
- Document and annotate your code.

## How to Report an Issue

- If something breaks, open an [Issue](../../issues).
- Provide the date, judicial officer (JO), case number, and county that you were scraping (if known).
- Paste all error text.

--------------------------------------------------------------------------------
/resources/test_files/test_hidden_values.txt:
--------------------------------------------------------------------------------
{'__VIEWSTATE': '/wEPDwULLTEwOTk1NTcyNzAPZBYCZg9kFgICAQ8WAh4HVmlzaWJsZWgWAgIDDw9kFgIeB29ua2V5dXAFJnRoaXMudmFsdWUgPSB0aGlzLnZhbHVlLnRvTG93ZXJDYXNlKCk7ZGSnBpspJun0H8O1uyepgbYYqxCR2g==', '__VIEWSTATEGENERATOR': 'BBBC20B8', '__EVENTVALIDATION': '/wEWAgLohsKOBgKYxoa5CF1tgF3CUdvlNXx3DxVd7HpMX9tL', 'NodeID': '100,101,102,103,200,201,202,203,204,220,6112,400,401,402,403,404,405,406,407,6111,6116', 'NodeDesc': 'All Courts', 'SearchType': '', 'SearchMode': '', 'NameTypeKy': '', 'BaseConnKy': '', 'StatusType': '', 'ShowInactive': '', 'AllStatusTypes': '', 'CaseCategories': '', 'RequireFirstName': '', 'CaseTypeIDs': '', 'HearingTypeIDs': '', 'SearchParams': ''}

--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
.. indigent-defense-stats documentation master file, created by
   sphinx-quickstart on Sun Sep 15 15:44:02 2024.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

indigent-defense-stats documentation
====================================

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.

.. autosummary::
   :toctree: generated

   cleaner
   orchestrator
   parser
   scraper
   tools
   updater

--------------------------------------------------------------------------------
/src/tools/zip_folder.py:
--------------------------------------------------------------------------------
import zipfile
import os
import argparse
from io import BytesIO
import boto3

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Zip the scraped case HTML for the specified county and upload it to S3."
args = argparser.parse_args()

folderpath = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_html"
)
memory_file = BytesIO()
with zipfile.ZipFile(memory_file, "w") as zf:
    for root, dirs, files in os.walk(folderpath):
        for file in files:
            filepath = os.path.join(root, file)
            zf.write(filepath, arcname=file)
memory_file.seek(0)

cli = boto3.client("s3")
cli.put_object(
    Body=memory_file,
    Bucket="indigent-defense",
    Key="case_html.zip",
)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Derek Olson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/tools/combine_parsed.py:
--------------------------------------------------------------------------------
import os
import json
import argparse
import boto3

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Combine parsed case JSON for the specified county and upload it to S3."
args = argparser.parse_args()

case_json_path = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_json"
)

file_list = os.listdir(case_json_path)

# read case ids (first 1000 for now)
all_case_data = {}
for case_filename in file_list[:1000]:
    case_id = os.path.splitext(os.path.basename(case_filename))[0]
    with open(os.path.join(case_json_path, case_filename), "r") as f:
        case_data = json.load(f)
        all_case_data[case_id] = case_data

# export to s3 bucket
case_data_str = json.dumps(all_case_data)
cli = boto3.client("s3")
cli.put_object(
    Body=case_data_str,
    Bucket="indigent-defense",
    Key="case_id_example.json",
)

--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import sys
import os

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'indigent-defense-stats'
copyright = '2024, Open Austin'
author = 'Open Austin'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

sys.path.insert(0, os.path.abspath('../src'))

extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary']

templates_path = ['_templates']
exclude_patterns = ['src/tester']


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']

--------------------------------------------------------------------------------
/src/scraper/hays.py:
--------------------------------------------------------------------------------
import logging
from .helpers import *

class ScraperHays:

    def __init__(self):
        pass

    def scraper_hays(self, base_url, results_soup, case_html_path, logger, session, ms_wait):
        case_urls = [
            base_url + anchor["href"]
            for anchor in results_soup.select('a[href^="CaseDetail"]')
        ]
        logger.info(f"{len(case_urls)} cases found")
        for case_url in case_urls:
            case_id = case_url.split("=")[1]
            logger.info(f"{case_id} - scraping case")
            # make request for the case
            try:
                case_html = request_page_with_retry(
                    session=session,
                    url=case_url,
                    verification_text="Date Filed",
                    logger=logger,
                    ms_wait=ms_wait,
                )
            except Exception:
                logger.info(f"Issue with scraping this case: {case_id}. Moving to next one.")
                # skip to the next case so we never write an undefined case_html
                continue
            # write html case data
            logger.info(f"{len(case_html)} response string length")

            with open(
                os.path.join(case_html_path, f"{case_id}.html"), "w"
            ) as file_handle:
                file_handle.write(case_html)

--------------------------------------------------------------------------------
/src/scraper/README.md:
--------------------------------------------------------------------------------
```mermaid
graph TD

A[scrape] --> B[set_defaults: Initialize default values for parameters like county, wait time, dates, and case details]
B --> C[configure_logger: Set up logging for the scraping process]
C --> D[format_county: Normalize the county name to ensure consistent processing]
D --> E[create_session: Create a web session object for handling HTTP requests]
E --> F[make_directories: Create directories for storing scraped case data, if not already provided]
F --> G[get_ody_link: Retrieve base URL and Odyssey version information based on county]
G --> H[scrape_main_page: Fetch and parse the main page of the county's court site]
G <--> O[county_csv]
H --> I[scrape_search_page: Navigate to the search page and extract relevant content]
I --> J[get_hidden_values: Extract hidden form values required for subsequent searches]

J --> K{Is case_number provided?}
K -- Yes --> L[scrape_individual_case: Scrape data for a specific case number provided by the user]
L --> Q[county-specific scraper]
K -- No --> M[scrape_jo_list: Retrieve a list of judicial officers between the start and end dates]
M --> N[scrape_multiple_cases: Scrape data for multiple cases based on judicial officers and date range]
N -- loop through Judicial Officers per Day in Range --> R[county-specific scraper]
```

--------------------------------------------------------------------------------
/src/orchestrator/__init__.py:
--------------------------------------------------------------------------------
import os, csv

# Import all of the program's modules within the parent_dir
from .. import scraper
from .. import parser
from .. import cleaner
from .. import updater

class Orchestrator:
    def __init__(self):
        # Sets our base parameters
        self.counties = []
        self.start_date = '2024-07-01'  # Update start date here
        self.end_date = '2024-07-01'  # Update end date here

    def orchestrate(self, test: bool = False):
        # This opens the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
        with open(
            os.path.join(
                os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv"
            ),
            mode="r",
        ) as file_handle:
            csv_file = csv.DictReader(file_handle)
            for row in csv_file:
                # This only selects the counties from the CSV that should be scraped.
                if row["scrape"].lower() == "yes":
                    self.counties.append(row["county"])

        # This runs the different modules in order
        for c in self.counties:
            print(f"Starting to scrape, parse, clean, and update this county: {c}")
            scraper(test = test, county = c).scrape()  # src/scraper
            parser(c).parse()  # src/parser
            cleaner(c).clean()  # src/cleaner
            updater(c).update()  # src/updater
            print(f"Completed with scraping, parsing, cleaning, and updating of this county: {c}")

if __name__ == '__main__':
    Orchestrator().orchestrate()

--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Indigent Defense Stats

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read
  pages: write
  id-token: write

concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.12
      uses: actions/setup-python@v3
      with:
        python-version: "3.12"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with unittest
      run: |
        SKIP_SLOW=true python -m unittest -v
    - name: Build documentation
      run: |
        sphinx-build -b html docs build

    - uses: actions/upload-pages-artifact@v3.0.1
      with:
        path: build/

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    steps:
    - name: Deploy to GitHub Pages
      id: deployment
      uses: actions/deploy-pages@v4

--------------------------------------------------------------------------------
/resources/test_files/cleaned_test_json/test_123456.json:
--------------------------------------------------------------------------------
{
    "parsing_date": "2024-11-02",
    "html_hash": "8d4a80173c700b37",
    "Case Metadata": {
        "county": "hays"
    },
    "Defendant Information": {
        "appointed_or_retained": "Court Appointed",
        "defense_attorney": "9083bb693e33919c"
    },
    "Charge Information": [
        {
            "charge_id": 0,
            "charge_level": "Second Degree Felony",
            "orignal_charge": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
            "statute": "22.02(a)(2)",
            "is_primary_charge": true,
            "charge_date": "2015-10-25",
            "charge_name": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
            "uccs_code": "1200",
            "charge_desc": "Aggravated Assault",
            "offense_category_desc": "Aggravated assault",
            "offense_type_desc": "Violent"
        }
    ],
    "Case Details": {
        "earliest_charge_date": "2015-10-25",
        "has_evidence_of_representation": false
    },
    "Disposition_Information": [
        {
            "date": "12/06/2016",
            "event": "Disposition",
            "details": [
                {
                    "charge": "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
                    "outcome": "Deferred Adjudication"
                }
            ]
        },
        {
            "date": "11/04/2019",
            "event": "Amended Disposition",
            "details": [
                {
                    "charge": "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
                    "outcome": "Amend Probation"
                }
            ]
        }
    ],
    "Good Motions": [],
    "cause_number_redacted": "871239500b7fe2fd"
}

--------------------------------------------------------------------------------
/src/scraper/scrapcode_post2017.py:
--------------------------------------------------------------------------------
# Not currently in use. Should be moved to a county-specific module, class, and method when a post2017 county is included
"""def scrape_case_data_post2017(self, base_url, case_html_path, session, logger, ms_wait):
    # Need to POST this page to get a JSON of the search results after the initial POST
    case_list_json = request_page_with_retry(
        session=session,
        url=urllib.parse.urljoin(base_url, "Hearing/HearingResults/Read"),
        verification_text="AggregateResults",
        logger=logger,
    )
    case_list_json = json.loads(case_list_json)
    logger.info(f"{case_list_json['Total']} cases found")
    for case_json in case_list_json["Data"]:
        case_id = str(case_json["CaseId"])
        logger.info(f"{case_id} scraping case")
        # make request for the case
        case_html = request_page_with_retry(
            session=session,
            url=urllib.parse.urljoin(base_url, "Case/CaseDetail"),
            verification_text="Case Information",
            logger=logger,
            ms_wait=ms_wait,
            params={
                "eid": case_json["EncryptedCaseId"],
                "CaseNumber": case_json["CaseNumber"],
            },
        )
        # make request for financial info
        case_html += request_page_with_retry(
            session=session,
            url=urllib.parse.urljoin(
                base_url, "Case/CaseDetail/LoadFinancialInformation"
            ),
            verification_text="Financial",
            logger=logger,
            ms_wait=ms_wait,
            params={
                "caseId": case_json["CaseId"],
            },
        )
        # write case html data
        logger.info(f"{len(case_html)} response string length")
        with open(
            os.path.join(case_html_path, f"{case_id}.html"), "w"
        ) as file_handle:
            file_handle.write(case_html)"""

--------------------------------------------------------------------------------
/src/tools/print_stats.py:
--------------------------------------------------------------------------------
import os
import json
import argparse

from time import time
from statistics import mean, median, mode

N_LONGEST = 5
START_TIME = time()

case_data_list = []

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Print stats for the specified county."
args = argparser.parse_args()

case_json_path = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_json"
)
for case_file in os.scandir(case_json_path):
    with open(case_file.path, "r") as file_handle:
        case_data_list.append(json.loads(file_handle.read()))


def print_top_cases_by_lambda(sort_function, description):
    print("\n", description)
    cases_by_lambda = sorted(case_data_list, key=sort_function)[-N_LONGEST:]
    converted_data = list(sort_function(case) for case in case_data_list)
    print(
        "\n".join(
            f"{i}. {sort_function(case)}".ljust(20) + case["odyssey id"]
            for i, case in enumerate(reversed(cases_by_lambda), 1)
        ),
        "\nMean:",
        round(mean(converted_data), 2),
        " Median:",
        round(median(converted_data), 2),
        " Mode:",
        round(mode(converted_data), 2),
    )


disposition_len = (lambda case: len(case["dispositions"]), "dispositions length")
charges_len = (lambda case: len(case["charge information"]), "number of charges")
events_len = (
    lambda case: len(case["other events and hearings"]),
    "other events and hearings length",
)
case_cost = (
    lambda case: float(
        case["financial information"]["total financial assessment"].replace(",", "")
    )
    if "financial information" in case
    else 0.0,
    "highest cost",
)
for sort_function, description in (events_len, disposition_len, case_cost, charges_len):
    print_top_cases_by_lambda(
        sort_function,
        description,
    )
print("\nNumber of cases:", len(case_data_list))
print("Stats parsing runtime:", round(time() - START_TIME, 2), "seconds")

--------------------------------------------------------------------------------
/parser_log.txt:
--------------------------------------------------------------------------------
2024-10-13 13:51:33,631 - INFO - Logger configured
2024-10-13 13:51:33,631 - INFO - Starting parsing for hays county with case number 51652356
2024-10-13 13:51:33,632 - INFO - get_directories function called
base_dir: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats

2024-10-13 13:51:33,633 - INFO - Returning case_html_path: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats\data\hays\case_html
Returning case_json_path: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats\data\hays\case_json

2024-10-13 13:51:33,634 - INFO - Time started: 1728845493.6341271
2024-10-13 13:51:33,650 - INFO - get_list_of_html function called

2024-10-13 13:51:33,651 - INFO - parse_single_file is True

2024-10-13 13:51:33,652 - INFO - Starting for loop to parse 1 cases
2024-10-13 13:51:33,654 - INFO - test_51652356 - parsing
2024-10-13 13:51:33,852 - INFO - Module: hays
Class: ParserHays
Method: parser_hays

2024-10-13 13:51:33,925 - INFO - Module 'hays' imported successfully.
2024-10-13 13:51:33,928 - INFO - Class 'ParserHays' retrieved successfully.
2024-10-13 13:51:33,929 - INFO - Method 'parser_hays' retrieved successfully.
2024-10-13 13:51:33,946 - INFO - Getting case metadata for hays case test_51652356
2024-10-13 13:51:33,951 - INFO - Getting case details
2024-10-13 13:51:33,956 - INFO - Parsing defendant rows
2024-10-13 13:51:33,957 - INFO - Parsing state rows
2024-10-13 13:51:33,957 - INFO - Getting charge information
2024-10-13 13:51:33,959 - INFO - Formatting events and orders of the court
2024-10-13 13:51:33,972 - INFO - For Loop started
Getting disposition information
2024-10-13 13:51:33,976 - INFO - Row is not a disposition: ['11/07/2016', 'CANCELED', 'Punishment Hearing', '(9:00 AM) (Judicial Officer Boyer, Bruce)', "Defendant's Request"]
2024-10-13 13:51:33,978 - INFO - Row is not a disposition: ['03/23/2016', 'CANCELED', 'Arraignment', '(9:00 AM) (Judicial Officer Henry, William R)', 'Waived Arraignment']
2024-10-13 13:51:33,978 - INFO - Row is not a disposition: ['11/04/2019', 'Amended Deferred Adjudication', '(Judicial Officer: Boyer, Bruce) Reason: Community Supervision Extended', '1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON', 'CSCD', '7 Years']
2024-10-13 13:51:33,979 - INFO - For Loop ended

2024-10-13 13:51:33,995 - INFO - Writing JSON to: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats\data\hays\case_json
2024-10-13 13:51:34,000 - INFO - Parsing took 0.3662230968475342 seconds

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# data, caching, logging
data
logging
debug.*
aws
test_data

# venv stuff
bin/
include/
lib64/
Scripts/
pyenv.cfg

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# editor stuff
.vscode/

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
pyvenv.cfg

# ignore data folders
case_html/
case_json/
data/

# asdf
.tool-versions

# libreoffice makes this lock file when opening csvs
.~lock.*

.DS_Store

docs/generated

--------------------------------------------------------------------------------
/src/tester/README.md:
--------------------------------------------------------------------------------
# Unit Testing
This module provides unit tests, using the unittest module, for each module in the program.
## Setup

Once you've loaded the repository in Visual Studio Code, you can run the tests from the Testing panel in the VS Code interface. Make sure to update the settings.json file in the .vscode folder at the repository root so that VS Code can find the test files under /src/tester.
```

{
    "python.testing.unittestEnabled": true,
    "python.testing.unittestArgs": [
        "-v"
    ]
}
```
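
You can also run the full suite from the command line, which is what the CI workflow in this repository does (the `SKIP_SLOW` variable appears to gate the slower tests):

```bash
SKIP_SLOW=true python -m unittest -v
```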
## Tests

### A. Scraper Tests

#### Test #1: Did the scraper create a file called 12947592.html in the right location?

This will look to see if there is an HTML file of the expected name in the expected destination.

It will also check that the file has been updated since the test began (i.e. was updated rather than simply exists).

#### Test #2: Is the resulting HTML file longer than 1000 characters?

This will check that the HTML file's contents are longer than 1000 characters, to ensure it was a full page scrape.

#### Test #3: Does the resulting HTML file contain the cause number in the expected header location?

This will check a specific location in the HTML file for where a cause number is expected to be. If the cause number is present within the HTML at that location, this is a good indication that the scrape was successful.

### B. Parser Tests

#### Test #1: Check to see if there is a JSON file called 51652356.json created in the correct location and that it was updated since this test started running

This will look to see if there is a JSON file of the expected name in the expected destination.

It will also check that the file has been updated since the test began (i.e. was updated rather than simply exists).

#### Test #2: Check to see that the JSON parsed all of the necessary fields and did so properly.

This unit test uses a JSON database of expected fields and features of those fields (called "field_validation_list.json"), where each entry describes a field and the properties used to validate it, for example:
```
{
    "name": "location",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "party information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "charge information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "defendant",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "necessary"
}
```
Fields are addressed in order of importance:
- necessary: fields that are considered required for a successful parse
- high: fields that are important for data visualization and analysis
- medium: fields that have potential for use
- low: fields that have little or no use or importance

It does so by opening a JSON dictionary filled with expected fields and expected features of those fields, checking:
- whether the field exists in the JSON (is in): check_exists
- the expected type (string or array)
- the expected length (strings and arrays): check_length
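
For illustration, here is a rough sketch of those checks against the test fixtures in this repository (this is not the actual test code; `validation_list` and `parsed_case` are illustrative names, and the nested party/charge levels are elided):

```python
import json

with open("resources/test_files/field_validation_list.json") as f:
    validation_list = json.load(f)
with open("resources/test_files/test_123456.json") as f:
    parsed_case = json.load(f)

for field in validation_list:
    if field["logical_level"] != "top":
        continue  # party/charge fields are checked against their sub-sections
    # check_exists: the field must be present in the parsed JSON
    assert field["name"] in parsed_case, f"missing field: {field['name']}"
    value = parsed_case[field["name"]]
    # expected type: "string" maps to str, "array" to list
    assert isinstance(value, str if field["type"] == "string" else list)
    # check_length: strings and arrays must meet the estimated minimum length
    assert len(value) >= field["estimated_min_length"]
```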

### C. Cleaner Tests

In progress.

### D. Updater Tests

In progress.

### E. Orchestrator Tests

In progress.

--------------------------------------------------------------------------------
/src/cleaner/Data Structure of Cleaned JSON.md:
--------------------------------------------------------------------------------
## Data Structure of the Cleaned Cases JSON

```mermaid
graph TB
    subgraph CaseInformation[Case Information Summary]
        style CaseInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
        A1[County: Hays]
        A2[Cause Number Hash: dsqn91cn1odmo]
        A3[Odyssey ID: Redacted]
        A4[Date Filed: 01/01/2015]
        A5[Location: 22nd District Court]
        A6[Version: 1]
        A7[Parsing Date: 2024-01-01]
    end

    subgraph PartyInformation[Party Information]
        style PartyInformation fill:#d3a8e2,stroke:#333,stroke-width:2px

        subgraph DefendantInfoBox[Defendant Info]
            style DefendantInfoBox fill:#b0d4f1,stroke:#333,stroke-width:2px
            D8[Defendant Info: Redacted]
        end
        subgraph RepresentationInfo[Defense Attorney Info]
            style RepresentationInfo fill:#b0d4f1,stroke:#333,stroke-width:2px
            B1[Defense Attorney Hash: 9083bb693e33919c]
            B2[Appointed or Retained: Court Appointed]
        end
    end

    subgraph Events[Event Information]
        style Events fill:#d3a8e2,stroke:#333,stroke-width:2px
        subgraph EvidenceofRep[Representation Evidence]
            style EvidenceofRep fill:#b0d4f1,stroke:#333,stroke-width:2px
            B3[Has Evidence of Representation: No]
        end
    end

    subgraph ChargeInformation[Charge Information]
        style ChargeInformation fill:#d3a8e2,stroke:#333,stroke-width:2px

        subgraph Charge1[Aggravated Assault with a Deadly Weapon]
            style Charge1 fill:#b0d4f1,stroke:#333,stroke-width:2px
            C1[Statute: 22.02a2]
            C2[Level: Second Degree Felony]
            C3[Date: 10/25/2015]
            C4[Charge Name: Aggravated Assault with a Deadly Weapon]
            C5[Description: Aggravated Assault]
            C6[Category: Violent]
            C7[UCCS Code: 1200]
        end

        subgraph Charge2[Resisting Arrest]
            style Charge2 fill:#b0d4f1,stroke:#333,stroke-width:2px
            C8[Statute: 38.03]
            C9[Level: Class A Misdemeanor]
            C10[Date: 10/25/2015]
            C11[Charge Name: Resisting Arrest]
            C12[Description: Resisting Arrest]
        end

        E3[Charges Dismissed: 1]
    end

    subgraph TopCharge[Top Charge]
        style TopCharge fill:#b0d4f1,stroke:#333,stroke-width:2px
        E1[Charge Name: Aggravated Assault with a Deadly Weapon]
        E2[Charge Level: Second Degree Felony]
    end

    subgraph Dispositions[Dispositions]
        style Dispositions fill:#d3a8e2,stroke:#333,stroke-width:2px

        subgraph Disposition1[Disposition Details]
            style Disposition1 fill:#b0d4f1,stroke:#333,stroke-width:2px
            D1[Date: 12/06/2016]
            D2[Event: Disposition]
            D3[Outcome: Deferred Adjudication]
            D4[Sentence Length: 1 Year]
        end

        subgraph Disposition2[Resisting Arrest Disposition]
            style Disposition2 fill:#b0d4f1,stroke:#333,stroke-width:2px
            D5[Date: 12/06/2016]
            D6[Event: Disposition]
            D7[Outcome: Dismissed]
        end
    end

    CaseInformation --> PartyInformation
    CaseInformation --> ChargeInformation
    CaseInformation --> Dispositions
    CaseInformation --> Events
    ChargeInformation --> TopCharge
```

--------------------------------------------------------------------------------
/src/parser/README.md:
--------------------------------------------------------------------------------
```mermaid
graph TD
    subgraph Parsing
    A[Start Parsing] --> B([configure_logger])
    B --> C[Store county]
    C --> D([get_directories])
    D --> E[Start Timer]
    E --> F([get_list_of_html])
    F --> G{for case_html_file_path&#10;in case_html_list}
    G --> H[Store case_number]
    H --> I([get_class_and_method])
    I --> J{{if parser_instance and&#10;parser_function is not None}}
    J --> L([parser_function])
    I --> K{{else: THROW ERROR}}
    DD --> M([write_json_data])
    K --> M([write_json_data])
    L --> AA[Start Parsing&#10;Specific County]
    AA --> BB[Create root_tables]
    BB --> CC([get_case_metadata])
    CC --> DD{for table in root_tables}
    DD --> EE{{if Case Type and Date Filed}}
    EE -- True --> JJ([get_case_details])
    EE --> FF{{elif Related Case}}
    FF -- True --> KK[Store&#10;case_data#91;Related Cases#93;]
    FF --> GG{{elif Party Information}}
    GG -- True --> LL([parse_defendant_rows#40;&#10;extract_rows#40;#41;#41;])
    LL --> MM([parse_state_rows#40;&#10;extract_rows#40;#41;#41;])
    GG --> HH{{elif Charge Information}}
    HH -- True --> NN([get_charge_information])
    HH --> II{{elif Events & Orders of&#10;the Court}}
    II --> DD
    II -- True --> OO([format_events_and_&#10;orders_of_the_court])
    OO --> PP{for row&#10;in disposition_rows:}
    PP --> QQ([get_disposition_information])
    QQ --> PP
    PP --> RR{{if case_data#91;Disposition&#10;Information#93;}}
    RR -- True --> SS([get_top_charge])
    RR --> II
    SS --> TT([count_dismissed_charges])
    end
    M --> Y
    G --> Y[End Timer]
    Y --> Z[End Parsing]

    style A fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style B fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style C fill:#D99559,stroke:#333,stroke-width:2px,color:#FFF
    style D fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style E fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style F fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style G fill:#779ECB,stroke:#333,stroke-width:4px,color:#FFF
    style H fill:#D99559,stroke:#333,stroke-width:2px,color:#FFF
    style I fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style J fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style K fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style L fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style M fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style Y fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style Z fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF

    style AA fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style BB fill:#D99559,stroke:#333,stroke-width:4px,color:#FFF
    style CC fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style DD fill:#779ECB,stroke:#333,stroke-width:4px,color:#FFF
    style EE fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style FF fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style GG fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style HH fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style II fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style JJ fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style KK fill:#D99559,stroke:#333,stroke-width:4px,color:#FFF
    style LL fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style MM fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style NN fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style OO fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style PP fill:#779ECB,stroke:#333,stroke-width:4px,color:#FFF
    style QQ fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style RR fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style SS fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style TT fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
```

--------------------------------------------------------------------------------
/src/tools/build_event_csv.py:
--------------------------------------------------------------------------------
"""
Combine hearing & event records from multiple case files into a single csv.
"""
import csv
import argparse
import json
import os
from datetime import datetime

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Combine hearing and event records for the specified county into a single CSV."
args = argparser.parse_args()

FILE_DIR = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_json"
)


def parse_event_date(date_str):
    """Return a python `datetime` from e.g. '01/30/2021'"""
    month, day, year = date_str.split("/")
    return datetime(year=int(year), month=int(month), day=int(day))


def iso_event_date(dt):
    """Format a `datetime` instance as YYYY-MM-DD"""
    return dt.strftime("%Y-%m-%d")


def get_days_elapsed(start, end):
    """Return the number of days between two dates"""
    delta = end - start
    return delta.days


def main():
    files = [file for file in os.listdir(FILE_DIR) if file.endswith(".json")]
    events = []
    charges = []

    for f_count, f_name in enumerate(files):
        if f_count % 1000 == 0:
            print(f"Processing file {f_count} of {len(files)}")

        with open(f"{FILE_DIR}/{f_name}", "r") as fin:
            """
            Extract fields of interest. You can add any attributes of interest to the
            event_record dict and they will be included in the output CSV.
            Extracts events and charges from the case file, in separate files.
            """
            case = json.load(fin)

            # extract demographic info
            case_id = case["odyssey id"]
            case_number = case["code"]
            retained = case["party information"]["appointed or retained"]
            gender = case["party information"]["sex"]
            race = case["party information"]["race"]
            defense_attorney = case["party information"]["defense attorney"]

            # extract event data
            first_event_date = None
            for i, event in enumerate(case["other events and hearings"]):
                event_record = {}
                event_date = parse_event_date(event[0])

                if i == 0:
                    first_event_date = event_date

                days_elapsed = get_days_elapsed(first_event_date, event_date)
                event_record["event_id"] = i + 1
                event_record["event_date"] = iso_event_date(event_date)
                event_record["first_event_date"] = iso_event_date(first_event_date)
                event_record["days_elapsed"] = days_elapsed
                event_record["event_name"] = event[1]
                event_record["attorney"] = retained
                event_record["case_id"] = case_id
                event_record["case_number"] = case_number
                event_record["defense_attorney"] = defense_attorney
                event_record["race"] = race
                event_record["gender"] = gender
                events.append(event_record)

            # extract charge data
            for i, charge in enumerate(case["charge information"]):
                charge_record = {}
                charge_record["charge_id"] = i + 1
                charge_record["charge_name"] = charge.get("charges", "")
                charge_record["statute"] = charge.get("statute", "")
                charge_record["level"] = charge.get("level", "")

                charge_record["charge_date"] = charge.get("date", "")
                if charge_record["charge_date"]:
                    charge_record["charge_date"] = iso_event_date(
                        parse_event_date(charge_record["charge_date"])
                    )

                charge_record["case_id"] = case_id
                charge_record["case_number"] = case_number
                charges.append(charge_record)

    with open("events_combined.csv", "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=events[0].keys())
        writer.writeheader()
        writer.writerows(events)

    with open("charges_combined.csv", "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=charges[0].keys())
        writer.writeheader()
        writer.writerows(charges)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/scraper/helpers.py:
--------------------------------------------------------------------------------
import os, sys
import requests
from time import sleep
from datetime import date
from logging import Logger
from typing import Dict, Optional, Literal
from enum import Enum

# Write the failing page text to a debug file, log the error, and quit.
def write_debug_and_quit(
    page_text: str, logger: Logger, verification_text: Optional[str] = None
) -> None:
    logger.error(
        (
            f"{verification_text} could not be found in page."
            if verification_text
            else "Failed to load page."
        )
        + " Aborting. Writing /logging/debug.html with response. May not be HTML."
    )
    with open(os.path.join(os.path.dirname(__file__), "..", "..", "logging", "debug.html"), "w") as file_handle:
        file_handle.write(page_text)
    sys.exit(1)

# helper function to make form data
def create_search_form_data(
    date: str, JO_id: str, hidden_values: Dict[str, str], odyssey_version: int
) -> Dict[str, str]:
    form_data = {}
    form_data.update(hidden_values)
    if odyssey_version < 2017:
        form_data.update(
            {
                "SearchBy": "3",
                "cboJudOffc": JO_id,
                "DateSettingOnAfter": date,
                "DateSettingOnBefore": date,
                "SearchType": "JUDOFFC",  # Search by Judicial Officer
                "SearchMode": "JUDOFFC",
                "CaseCategories": "CR",  # "CR,CV,FAM,PR" criminal, civil, family, probate and mental health - these are the options
            }
        )
    else:
        form_data.update(
            {
                "SearchCriteria.SelectedHearingType": "Criminal Hearing Types",
                "SearchCriteria.SearchByType": "JudicialOfficer",
                "SearchCriteria.SelectedJudicialOfficer": JO_id,
                "SearchCriteria.DateFrom": date,
                "SearchCriteria.DateTo": date,
            }
        )
    return form_data

def create_single_case_search_form_data(hidden_values: Dict[str, str], case_number: str):
    form_data = {}
    form_data.update(hidden_values)
    os_specific_time_format = "%#m/%#d/%Y" if os.name == 'nt' else "%-m/%-d/%Y"
    form_data.update(
        {
            "__EVENTTARGET": "",
            "SearchBy": "0",
            "DateSettingOnAfter": "1/1/1970",
            "DateSettingOnBefore": date.today().strftime(os_specific_time_format),
            "SearchType": "CASE",  # Search by case id
            "SearchMode": "CASENUMBER",
            "CourtCaseSearchValue": case_number,
            "CaseCategories": "",
            "cboJudOffc": "38501",
        }
    )
    return form_data


class HTTPMethod(Enum):
    POST: int = 1
    GET: int = 2


def request_page_with_retry(
    session: requests.Session,
    url: str,
    logger: Logger,
    verification_text: Optional[str] = None,
    http_method: Literal[HTTPMethod.POST, HTTPMethod.GET] = HTTPMethod.POST,
    params: Dict[str, str] = {},
    data: Optional[Dict[str, str]] = None,
    max_retries: int = 5,
    ms_wait: int = 200,
) -> str:
    response = None
    for i in range(max_retries):
        sleep(ms_wait / 1000 * (i + 1))
        failed = False
        try:
            if http_method == HTTPMethod.POST:
                if not data:
                    response = session.post(url, params=params)
                else:
                    response = session.post(url, data=data, params=params)
            elif http_method == HTTPMethod.GET:
                if not data:
                    response = session.get(url, params=params)
                else:
                    response = session.get(url, data=data, params=params)
            response.raise_for_status()
            if verification_text:
                if verification_text not in response.text:
                    failed = True
                    logger.error(
                        f"Verification text {verification_text} not in response"
                    )
        except requests.RequestException:
            logger.exception(f"Failed to get url {url}, try {i}")
            failed = True
        if failed:
            if response is None:
                response_text = 'No response from Odyssey.'
            else:
                response_text = response.text
            if i == max_retries - 1:
                # out of retries: dump the last response for debugging and exit
                write_debug_and_quit(
                    verification_text=verification_text,
                    page_text=response_text,
                    logger=logger,
                )
        else:
            # success: stop retrying and return the page text
            return response.text

--------------------------------------------------------------------------------
/resources/test_files/field_validation_list.json:
--------------------------------------------------------------------------------
[{
    "name": "code",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 5,
    "importance": "necessary"
},
{
    "name": "odyssey id",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 5,
    "importance": "necessary"
},
{
    "name": "county",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "name",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "case type",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 4,
    "importance": "necessary"
},
{
    "name": "date filed",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 4,
    "importance": "necessary"
},
{
    "name": "location",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "party information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "charge information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "defendant",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "sex",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "medium"
},
{
    "name": "race",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "medium"
},
{
    "name": "date of birth",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 4,
    "importance": "low"
},
{
    "name": "height",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "low"
},
{
    "name": "weight",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "low"
},
{
    "name": "defense attorney",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "high"
},
{
    "name": "appointed or retained",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "high"
},
{
    "name": "defense attorney phone number",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "low"
},
{
    "name": "defendant address",
    "logical_level": "party",
"type": "string", 131 | "estimated_min_length": 1, 132 | "importance": "low" 133 | }, 134 | { 135 | "name": "SID", 136 | "logical_level": "party", 137 | "type": "string", 138 | "estimated_min_length": 5, 139 | "importance": "high" 140 | }, 141 | { 142 | "name": "prosecuting attorney", 143 | "logical_level": "party", 144 | "type": "string", 145 | "estimated_min_length": 1, 146 | "importance": "low" 147 | }, 148 | { 149 | "name": "prosecuting attorney phone number", 150 | "logical_level": "party", 151 | "type": "string", 152 | "estimated_min_length": 1, 153 | "importance": "low" 154 | }, 155 | { 156 | "name": "prosecuting attorney address", 157 | "logical_level": "party", 158 | "type": "string", 159 | "estimated_min_length": 1, 160 | "importance": "low" 161 | }, 162 | { 163 | "name": "bondsman", 164 | "logical_level": "party", 165 | "type": "string", 166 | "estimated_min_length": 1, 167 | "importance": "low" 168 | }, 169 | { 170 | "name": "bondsman address", 171 | "logical_level": "party", 172 | "type": "string", 173 | "estimated_min_length": 1, 174 | "importance": "low" 175 | }, 176 | { 177 | "name": "charges", 178 | "logical_level": "charge", 179 | "type": "string", 180 | "estimated_min_length": 5, 181 | "importance": "necessary" 182 | }, 183 | { 184 | "name": "level", 185 | "logical_level": "charge", 186 | "type": "string", 187 | "estimated_min_length": 5, 188 | "importance": "necessary" 189 | }, 190 | { 191 | "name": "date", 192 | "logical_level": "charge", 193 | "type": "string", 194 | "estimated_min_length": 4, 195 | "importance": "high" 196 | }] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tyler Technologies Odyssey scraper and parser 2 | 3 | This is a scraper to collect and process public case records from the Tyler Technologies Odyssey court records system. If you are a dev or want to file an Issue, please read [CONTRIBUTING](CONTRIBUTING.md). 4 | 5 | ## Local setup 6 | 7 | ### Install toolchain 8 | 9 | 1. Clone this repo and navigate to it. 10 | - `git clone https://github.com/open-austin/indigent-defense-stats` 11 | - `cd indigent-defense-stats` 12 | 2. Install Pyenv if not already installed ([linux, mac](https://github.com/pyenv/pyenv), or [windows](https://github.com/pyenv-win/pyenv-win)) 13 | 3. Run `pyenv install` to get the right Python version 14 | 15 | ### Setup `venv` 16 | 17 | First, you'll need to create a virtual environment, this differs depending on your OS. 18 | 19 | On linux/mac 20 | 21 | ```bash 22 | python -m venv .venv --prompt ids # (you can substitute `ids` for any name you want) 23 | ``` 24 | 25 | On Windows 26 | 27 | ```powershell 28 | c:\>Python35\python -m venv c:\path\to\repo\ids # (you can substitute `ids` for any name you want) 29 | ``` 30 | 31 | Next, you'll need to "activate" the venv. You'll need to run this command every time you work in the codebase and tell your IDE which Python environment to use. It will likely default to wherever `python` resolves to in your system path. The specific command you run will depend on both your OS and shell. 

On linux/mac

| platform | shell      | Command to activate virtual environment |
| :------- | :--------- | :-------------------------------------- |
| POSIX    | bash/zsh   | $ source <venv>/bin/activate            |
|          | fish       | $ source <venv>/bin/activate.fish       |
|          | csh/tcsh   | $ source <venv>/bin/activate.csh        |
|          | PowerShell | $ <venv>/bin/Activate.ps1               |
| Windows  | cmd.exe    | C:\> <venv>\Scripts\activate.bat        |
|          | PowerShell | PS C:\> <venv>\Scripts\Activate.ps1     |

Source: https://docs.python.org/3/library/venv.html#how-venvs-work

Note: Again, you'll need to activate venv _every time you want to work in the codebase_.

If the above doesn't work, try these instructions for creating and activating a virtual environment:

1. Navigate to your project directory: `cd [insert file path]`
2. Create a virtual environment: `python -m venv venv`
3. Activate the virtual environment: `.\venv\Scripts\activate.bat`

### Install python dependencies

Using `pip`, install the project dependencies.

```shell
pip install -r requirements.txt
```

### Running CLI

@TODO - this section needs to be updated.

1. Set parameters to the main command:
   - counties = The counties that are listed in the county CSV. Update the column "scrape" in the CSV to "yes" to include the county.
   - start_date = The first date you want to scrape for case data. Update in scraper.
   - end_date = The last date you want to scrape for case data. Update in scraper.
2. Run the orchestrator.
   - `python -m src.orchestrator`

## Structure of Code

- County Database: A CSV table contains the necessary Odyssey links and version for each county in Texas. One column ("scrape") indicates whether that county should be scraped. Currently, Hays is the default.
- Orchestrator (src/orchestrator): This reads the CSV for the counties to be scraped and runs the following processes for each county. You can also set the start and end date of the scrape here.

  - **Scraper** (`src/scraper`): This scrapes all of the judicial officers for each day within the period set in the orchestrator and saves all of the HTML to data/[county name]/case_html.
  - **Parser** (`src/parser`): This parses all of the HTML in the county-specific HTML folder to accompanying JSON files in data/[county name]/case_json.
  - **Cleaner** (`src/cleaner`): This cleans and redacts information in the county-specific JSON folder to a new folder of JSON files in data/[county name]/case_json_cleaned.
  - **Updater** (`src/updater`): This pushes the cleaned and redacted JSON in the county-specific cleaned JSON folder to a container in CosmosDB, where it can then be used for visualization.
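
For a quick sense of what the cleaned output looks like, this short sketch loads the cleaned test fixture shipped in this repository and prints a few of its fields (run it from the repository root; the keys come from `resources/test_files/cleaned_test_json/test_123456.json`):

```python
import json

# load the cleaned test fixture included in resources/test_files
with open("resources/test_files/cleaned_test_json/test_123456.json") as f:
    case = json.load(f)

print(case["Case Metadata"]["county"])                         # hays
print(case["Defendant Information"]["appointed_or_retained"])  # Court Appointed
for charge in case["Charge Information"]:
    print(charge["charge_level"], "-", charge["charge_name"])
```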
82 | ## Flowchart: Relationships Between Functions and Directories
83 |
84 | ```mermaid
85 | flowchart TD
86 | orchestrator{"src/orchestrator (class):<br>orchestrate (function)"} --> county_db[resources/texas_county_data.csv]
87 | county_db --> |return counties where 'scrape' = 'yes'| orchestrator
88 | orchestrator -->|loop through these counties and run these four functions| scraper(1. src/scraper: scrape)
89 | scraper --> parser(2. src/parser: parse)
90 | scraper --> |create 1 HTML per case| data_html[data/county/case_html/case_id.html]
91 | parser --> pre2017(src/parser/pre2017)
92 | parser --> post2017(src/parser/post2017)
93 | pre2017 --> cleaner[3. src/cleaner: clean]
94 | post2017 --> cleaner
95 | parser --> |create 1 JSON per case| data_json[data/county/case_json/case_id.json]
96 | cleaner --> |look for charge in db and normalize it to uccs| charge_db[resources/umich-uccs-database.json]
97 | charge_db --> cleaner
98 | cleaner --> updater(4. src/updater: update)
99 | cleaner --> |create 1 JSON per case| data_json_cleaned[data/county/case_json_cleaned/case_id.json]
100 | updater --> |send final cleaned JSON to CosmosDB container| CosmosDB_container[CosmosDB container]
101 | CosmosDB_container --> visualization{live visualization}
102 | ```
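In terms of files on disk, the pipeline above produces roughly this per-county layout (using Hays, the default, as an example):

```
data/
└── hays/
    ├── case_html/                      # scraper output, one HTML file per case
    ├── case_json/                      # parser output, one JSON file per case
    ├── case_json_cleaned/              # cleaner output (redacted, cleaned JSON)
    └── cases_with_parsing_error.txt    # case numbers the parser failed on
```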
--------------------------------------------------------------------------------
/src/parser/Data Structure of Parsed JSON.md:
--------------------------------------------------------------------------------
1 | ```mermaid
2 | graph TB
3 | subgraph CaseInformation[Case Information Summary]
4 | style CaseInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
5 | A1[Case Code: CR-15-1234-C]
6 | A2[Odyssey ID: 198372]
7 | A3[County: Hays]
8 | A4[Case Name: The State of Texas vs. Fake Name]
9 | A5[Case Type: Adult Felony]
10 | A6[Date Filed: 01/01/2015]
11 | A7[Location: 22nd District Court]
12 | end
13 |
14 | subgraph PartyInformation[Party Information]
15 | style PartyInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
16 |
17 | subgraph DefendantInfo[Defendant Information]
18 | style DefendantInfo fill:#b0d4f1,stroke:#333,stroke-width:2px
19 | B1[Defendant: Fake, Name]
20 | B2[Sex: Female]
21 | B3[Race: White]
22 | B4[Date of Birth: 01/01/1980]
23 | B5[Height: 5 foot 6 inches]
24 | B6[Weight: 200 lbs]
25 | B7[Address: 876 Main St, Natalia, TX 78059]
26 | B8[SID: TX01234567]
27 | end
28 |
29 | subgraph DefenseAttorney[Defense Attorney]
30 | style DefenseAttorney fill:#b0d4f1,stroke:#333,stroke-width:2px
31 | B9[Defense Attorney: Defense Attorney]
32 | B10[Appointed or Retained: Court Appointed]
33 | B11[Phone Number: 512-123-4567 W]
34 | end
35 |
36 | subgraph ProsecutingAttorney[Prosecuting Attorney]
37 | style ProsecutingAttorney fill:#b0d4f1,stroke:#333,stroke-width:2px
38 | B12[Prosecuting Attorney: Yuuuuu Haaaaa]
39 | B13[Prosecuting Attorney Phone Number: 512-321-8596 W]
40 | B14[Prosecuting Attorney Address: 712 S Stagecoach TRL, San Marcos, TX 78666]
41 | end
42 | end
43 |
44 | subgraph ChargeInformation[Charge Information]
45 | style ChargeInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
46 |
47 | subgraph Charge1[Aggravated Assault with a Deadly Weapon]
48 | style Charge1 fill:#b0d4f1,stroke:#333,stroke-width:2px
49 | C1[Statute: 22.02a2]
50 | C2[Level: Second Degree Felony]
51 | C3[Date: 10/25/2015]
52 | end
53 |
54 | subgraph Charge2[Resisting Arrest]
55 | style Charge2 fill:#b0d4f1,stroke:#333,stroke-width:2px
56 | C4[Statute: 38.03]
57 | C5[Level: Class A Misdemeanor]
58 | C6[Date: 10/25/2015]
59 | end
60 | end
61 |
62 | subgraph Dispositions[Dispositions]
63 | style Dispositions fill:#d3a8e2,stroke:#333,stroke-width:2px
64 |
65 | subgraph Disposition1[Aggravated Assault with a Deadly Weapon]
66 | style Disposition1 fill:#b0d4f1,stroke:#333,stroke-width:2px
67 | D1[Date: 12/06/2016]
68 | D2[Event: Disposition]
69 | D3[Judicial Officer: Fake, Judge]
70 | D4[Outcome: Deferred Adjudication]
71 | D5[Sentence Length: 1 Year]
72 | end
73 |
74 | subgraph Disposition2[Resisting Arrest]
75 | style Disposition2 fill:#b0d4f1,stroke:#333,stroke-width:2px
76 | D6[Date: 12/06/2016]
77 | D7[Event: Disposition]
78 | D8[Judicial Officer: Fake, Judge]
79 | D9[Outcome: Dismissed]
80 | end
81 |
82 | end
83 |
84 | subgraph TopCharge[Top Charge]
85 | style TopCharge fill:#d3a8e2,stroke:#333,stroke-width:2px
86 |
87 | E1[Charge Name: Aggravated Assault with a Deadly Weapon]
88 | E2[Charge Level: Second Degree Felony]
89 | end
90 |
91 | subgraph EventsHearings[Example Events & Hearings]
92 | style EventsHearings fill:#d3a8e2,stroke:#333,stroke-width:2px
93 |
94 | subgraph InitialHearings[Initial Hearings and Filings]
95 | style InitialHearings fill:#b0d4f1,stroke:#333,stroke-width:2px
96 | F1[01/05/2016: Indictment Open Case]
97 | F2[02/24/2016: Arraignment Reset]
98 | F3[03/15/2016: Waiver of Arraignment]
99 | F4[04/14/2016: Pre-Trial Motions Reset]
100 | end
101 |
102 | subgraph DiscoveryMotions[Discovery and Motions]
103 | style DiscoveryMotions fill:#b0d4f1,stroke:#333,stroke-width:2px
104 | G1[04/29/2016: Discovery Receipt from District Attorney]
105 | G2[05/05/2016: Acknowledgment of Receipt of Discovery]
106 | G3[06/15/2016: Pre-Trial Motions Reset]
107 | end
108 |
109 | subgraph PreTrial[Pre-Trial Motions and Hearings]
110 | style PreTrial fill:#b0d4f1,stroke:#333,stroke-width:2px
111 | H1[07/27/2016: Pre-Trial Motions Reset]
112 | H2[08/25/2016: Pre-Trial Motions Reset]
113 | H3[09/26/2016: Plea Bargain Agreement]
114 | end
115 |
116 | subgraph TrialAdjudication[Trial and Adjudication]
117 | style TrialAdjudication fill:#b0d4f1,stroke:#333,stroke-width:2px
118 | I1[12/06/2016: Punishment Hearing Deferred Adjudication]
119 | I2[12/06/2016: Conditions of Probation]
120 | end
121 |
122 | subgraph ProbationWarrants[Probation and Warrant Issuances]
123 | style ProbationWarrants fill:#b0d4f1,stroke:#333,stroke-width:2px
124 | J1[10/24/2017: Show Cause Hearing Failure to Appear]
125 | J2[11/01/2017: Motion to Revoke Probation/Adjudicate Guilt Reopen Case]
126 | J3[02/23/2022: Capias Issued]
127 | end
128 | end
129 |
130 | CaseInformation --> PartyInformation
131 | CaseInformation --> ChargeInformation
132 | ChargeInformation --> TopCharge
133 | CaseInformation --> Dispositions
134 | Dispositions --> D10[Charges Dismissed: 1]
135 | CaseInformation --> EventsHearings
136 | ```
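To poke at this structure on a concrete example, you can load the bundled fixture; a minimal sketch (run from the repo root):

```python
import json

# resources/test_files/test_123456.json is a parsed-case fixture shipped with this repo
with open("resources/test_files/test_123456.json") as f:
    case = json.load(f)

print(case["Case Metadata"]["county"])      # "hays"
print(case["Top Charge"]["charge level"])   # "Second Degree Felony"
print(len(case["Charge Information"]))      # 1 charge in this fixture
# Note: some keys are deliberately spelled as scraped, e.g. "Defendent Information".
```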
--------------------------------------------------------------------------------
/src/updater/__init__.py:
--------------------------------------------------------------------------------
1 | import json, argparse, os, xxhash
2 | from azure.cosmos import CosmosClient, exceptions
3 | from dotenv import load_dotenv
4 | from datetime import datetime as dt
5 | import logging
6 |
7 | class Updater():
8 |     def __init__(self, county = "hays"):
9 |         self.county = county.lower()
10 |         self.case_json_cleaned_folder_path = os.path.join(
11 |             os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned"
12 |         )
13 |         self.processed_path = os.path.join(self.case_json_cleaned_folder_path, "processed")  # assumed: a distinct subfolder, so the "move to processed" renames in update() aren't no-ops
14 |
15 |
16 |         # open or create an output directory for the log and successfully processed data
17 |         if os.path.exists(self.case_json_cleaned_folder_path) and \
18 |             not os.path.exists(self.processed_path):
19 |             os.makedirs(self.processed_path)
20 |         self.logger = self.configure_logger()
21 |         self.COSMOSDB_CONTAINER_CASES_CLEANED = self.get_database_container()
22 |
23 |     def configure_logger(self):
24 |         logger = logging.getLogger(name="pid: " + str(os.getpid()))
25 |         logger.setLevel(logging.DEBUG)
26 |
27 |         cleaner_log_path = os.path.join(
28 |             os.path.dirname(__file__), "..", "..", "resources"
29 |         )
30 |
31 |         file_handler = logging.FileHandler(os.path.join(cleaner_log_path, 'logger_log.txt'))
32 |         file_handler.setLevel(logging.DEBUG)
33 |         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
34 |         file_handler.setFormatter(formatter)
35 |         logger.addHandler(file_handler)
36 |
37 |         console_handler = logging.StreamHandler()
38 |         console_handler.setLevel(logging.WARNING)
39 |         console_handler.setFormatter(formatter)
40 |         logger.addHandler(console_handler)
41 |
42 |         return logger
43 |
44 |     def get_database_container(self):
45 |         # This loads the environment for interacting with CosmosDB. (Dan: Should this be moved to the .env file?)
46 |         load_dotenv()
47 |         URL = os.getenv("URL")
48 |         KEY = os.getenv("KEY")
49 |         DATA_BASE_NAME = os.getenv("DATA_BASE_NAME")
50 |         CONTAINER_NAME_CLEANED = os.getenv("CONTAINER_NAME_CLEANED")
51 |         try:
52 |             client = CosmosClient(URL, credential=KEY)
53 |         except Exception as e:
54 |             self.logger.error(f"Error instantiating CosmosClient: {e}")
55 |             return
56 |         try:
57 |             database = client.get_database_client(DATA_BASE_NAME)
58 |         except Exception as e:
59 |             self.logger.error(f"Error instantiating DatabaseClient: {e}")
60 |             return
61 |         try:
62 |             COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED)
63 |         except Exception as e:
64 |             self.logger.error(f"Error instantiating ContainerClient: {e}")
65 |             return
66 |
67 |         return COSMOSDB_CONTAINER_CASES_CLEANED
68 |
69 |     def update(self):
70 |         if not os.path.exists(self.case_json_cleaned_folder_path):
71 |             self.logger.error(f'The following path doesn\'t exist:\n{self.case_json_cleaned_folder_path}')
72 |             return
73 |
74 |         if not self.COSMOSDB_CONTAINER_CASES_CLEANED:
75 |             return
76 |
77 |         list_case_json_files = os.listdir(self.case_json_cleaned_folder_path)
78 |
79 |         for case_json in list_case_json_files:
80 |             print(f'case_json: {case_json}')
81 |             in_file = self.case_json_cleaned_folder_path + "/" + case_json
82 |             if os.path.isfile(in_file):
83 |                 dest_file = self.processed_path + "/" + case_json
84 |             else:
85 |                 continue
86 |
87 |             with open(in_file, "r") as f:
88 |                 input_dict = json.load(f)
89 |             self.logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]")
90 |
91 |             # Querying the case database to fetch all items that match the hash.
92 |             hash_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['html_hash'] = '{input_dict['html_hash']}'"
93 |             try:
94 |                 # Execute the query
95 |                 cases = list(self.COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True))
96 |             except Exception as e:
97 |                 self.logger.error(f"Error querying cases-cleaned database for an existing hash: {e}")
98 |                 continue
99 |
100 |             if len(cases) > 0:
101 |                 # There already exists one with the same hash, so skip this entirely.
102 |                 # Move the file to the processed folder.
103 |                 os.rename(in_file, dest_file)
104 |                 self.logger.info(f"The case's HTML hash already exists in the database: {case_json}. Not updating the database.")
105 |                 continue
106 |
107 |             # Querying the case database to fetch all items that match the cause number.
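            # If the cause number has been seen before, the insert below stores this
            # JSON as a new version (max existing version + 1) under a fresh id;
            # otherwise the case starts at version 1.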
108 |             case_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['case_number'] = '{input_dict['case_number']}'"
109 |             try:
110 |                 # Execute the query
111 |                 cases = list(self.COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True))
112 |             except Exception as e:
113 |                 self.logger.error(f"Error querying cases-cleaned database for an existing case: {e}")
114 |                 continue
115 |
116 |             # If no case matches the cause number, insert this JSON with version 1; otherwise insert it as a new version (max existing version + 1) under a fresh case ID.
117 |             today = dt.today()
118 |             input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + ":" + input_dict['html_hash']
119 |             input_dict['version'] = max(int(case['version']) for case in cases) + 1 if len(cases) > 0 else 1
120 |             try:
121 |                 self.COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict)
122 |             except Exception as e:
123 |                 self.logger.error(f"Error inserting this case to cases-cleaned database: {e}")
124 |                 continue
125 |
126 |             # This case is inserted successfully.
127 |             # Move the file to the processed folder.
128 |             os.rename(in_file, dest_file)
129 |             self.logger.info(f"Insertion successful with id: {input_dict['id']}, version: {input_dict['version']}")
130 |
131 | if __name__ == '__main__':
132 |     Updater().update()
133 |
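The connection settings read by `get_database_container` above come from a `.env` file loaded via `python-dotenv`. Based on the `os.getenv` calls in the code, it needs entries of this shape (all values below are placeholders):

```
URL=https://<your-cosmos-account>.documents.azure.com:443/
KEY=<your-cosmos-key>
DATA_BASE_NAME=<your-database-name>
CONTAINER_NAME_CLEANED=<your-cleaned-cases-container>
```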
--------------------------------------------------------------------------------
/resources/test_files/hays_main_page.html:
--------------------------------------------------------------------------------
[HTML fixture; markup stripped in this dump. It is the Odyssey Public Access landing page titled "Hays County Courts Records Inquiry" — visible text: "Skip to Main Content / Logout / My Account / Help", "Welcome to Odyssey Public Access", "Case Records" (Criminal Case Records; Civil, Family & Probate Case Records; Court Calendar), "Jail Records" (Jail Records; Jail Bond Records), "State of Texas | Hays County", "Copyright 2003 Tyler Technologies. All rights Reserved."]
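The parser module that follows can be exercised directly against the bundled test files; a sketch mirroring its `__main__` block, but pinned to the fixture shipped in this repo:

```python
from src.parser import Parser

# With parse_single_file=True, parse() reads resources/test_files/test_{case_number}.html
# and writes the resulting JSON back into resources/test_files/.
Parser().parse(county="hays", case_number="123456", parse_single_file=True)
```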

117 | 118 | -------------------------------------------------------------------------------- /src/parser/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | import json 5 | import traceback 6 | import xxhash 7 | from time import time 8 | import sys 9 | import importlib 10 | from bs4 import BeautifulSoup 11 | from typing import Tuple, List, Optional 12 | 13 | current_dir = os.path.dirname(os.path.abspath(__file__)) 14 | parent_dir = os.path.dirname(current_dir) 15 | project_root = os.path.dirname(parent_dir) 16 | 17 | 18 | class Parser: 19 | def __init__(self): 20 | pass 21 | 22 | def configure_logger(self): 23 | logger = logging.getLogger(name="pid: " + str(os.getpid())) 24 | logging.basicConfig( 25 | level=logging.INFO, 26 | format="%(asctime)s - %(levelname)s - %(message)s", 27 | handlers=[logging.FileHandler("parser_log.txt"), logging.StreamHandler()], 28 | ) 29 | logger.info("Logger configured") 30 | return logger 31 | 32 | def get_class_and_method( 33 | self, logger, county: str, test=False 34 | ) -> Tuple[Optional[object], Optional[callable]]: 35 | if test: 36 | logger.info(f"Test mode is on") 37 | # Construct the module, class, and method names 38 | module_name = county # ex: 'hays' 39 | class_name = f"Parser{county.capitalize()}" # ex: 'ParserHays' 40 | method_name = f"parser_{county}" # ex: 'parser_hays' 41 | 42 | logger.info( 43 | f"Module: {module_name}\nClass: {class_name}\nMethod: {method_name}\n" 44 | ) 45 | 46 | # Add the current directory to the system path 47 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 48 | 49 | try: 50 | # Dynamically import the module 51 | module = importlib.import_module(module_name) 52 | 53 | logger.info(f"Module '{module_name}' imported successfully.") 54 | 55 | # Retrieve the class from the module 56 | cls = getattr(module, class_name) 57 | 58 | logger.info(f"Class '{class_name}' retrieved successfully.") 59 | 60 | if cls is None: 61 | logger.info( 62 | f"Class '{class_name}' not found in module '{module_name}'." 63 | ) 64 | return None, None 65 | 66 | # Instantiate the class 67 | instance = cls() 68 | 69 | # Retrieve the method with the specified name 70 | method = getattr(instance, method_name, None) 71 | logger.info(f"Method '{method_name}' retrieved successfully.") 72 | 73 | if method is None: 74 | logger.info( 75 | f"Method '{method_name}' not found in class '{class_name}'." 
76 | ) 77 | return instance, None 78 | 79 | return instance, method 80 | except ModuleNotFoundError as e: 81 | logger.info(f"Module '{module_name}' not found: {e}") 82 | except AttributeError as e: 83 | logger.info(f"Error retrieving class or method: {e}") 84 | except Exception as e: 85 | logger.info(f"Unexpected error: {e}") 86 | return None, None 87 | 88 | def get_directories( 89 | self, county: str, logger, parse_single_file: bool = False 90 | ) -> Tuple[str, str]: 91 | # Determine the base directory of your project 92 | base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) 93 | logger.info(f"get_directories function called\nbase_dir: {base_dir}\n") 94 | try: 95 | if parse_single_file: 96 | case_html_path = os.path.join(base_dir, "resources", "test_files") 97 | case_json_path = os.path.join(base_dir, "resources", "test_files") 98 | else: 99 | case_html_path = os.path.join(base_dir, "data", county, "case_html") 100 | case_json_path = os.path.join(base_dir, "data", county, "case_json") 101 | if not os.path.exists(case_json_path): 102 | os.makedirs(case_json_path, exist_ok=True) 103 | logger.info( 104 | f"Returning case_html_path: {case_html_path}\nReturning case_json_path: {case_json_path}\n" 105 | ) 106 | return case_html_path, case_json_path 107 | except Exception as e: 108 | logger.info(f"Error in get_directories: {e}") 109 | raise 110 | 111 | def get_list_of_html( 112 | self, 113 | case_html_path: str, 114 | case_number: str, 115 | county: str, 116 | logger, 117 | parse_single_file: bool = False, 118 | ) -> List[str]: 119 | logger.info(f"get_list_of_html function called\n") 120 | try: 121 | if parse_single_file: 122 | logger.info(f"parse_single_file is True\n") 123 | relative_path = os.path.join(project_root, "resources", "test_files") 124 | return [os.path.join(relative_path, f"test_{case_number}.html")] 125 | # This will loop through the html in the folder they were scraped to. 
126 |             case_html_list = os.listdir(case_html_path)
127 |
128 |             # However, if an optional case number is passed to the function, then read in the case number html file from the data folder
129 |             # -Assumes that the requested parsed case number has been scraped to html
130 |             if case_number:
131 |                 case_html_list = [f"{case_number}.html"]
132 |             case_html_list = [
133 |                 os.path.join(case_html_path, file_name) for file_name in case_html_list
134 |             ]
135 |             logger.info(f"Returning case_html_list: {case_html_list}\n")
136 |             return case_html_list
137 |         except Exception as e:
138 |             logger.info(f"Error in get_list_of_html: {e}")
139 |             raise
140 |
141 |     def get_html_path(
142 |         self, case_html_path: str, case_html_file_name: str, case_number: str, logger
143 |     ) -> str:
144 |         logger.info(f"get_html_path function called\n")
145 |         try:
146 |             case_html_file_path = os.path.join(case_html_path, case_html_file_name)
147 |             logger.info(f"Constructed path: {case_html_file_path}")
148 |             return case_html_file_path
149 |         except Exception as e:
150 |             logger.info(f"Error in get_html_path: {e}")
151 |             raise
152 |
153 |     def write_json_data(
154 |         self, case_json_path: str, case_number: str, case_data: str, logger
155 |     ) -> None:
156 |         try:
157 |             indent_level = 4
158 |             logger.info(f"Writing JSON to: {case_json_path}")
159 |             with open(
160 |                 os.path.join(case_json_path, case_number + ".json"), "w"
161 |             ) as file_handle:
162 |                 file_handle.write(json.dumps(case_data, indent=indent_level))
163 |         except Exception as e:
164 |             logger.info(f"Error in write_json_data: {e}")
165 |             raise
166 |
167 |     def write_error_log(self, county: str, case_number: str) -> None:
168 |         try:
169 |             base_dir = os.path.abspath(
170 |                 os.path.join(os.path.dirname(__file__), "..", "..")
171 |             )
172 |             error_log_path = os.path.join(
173 |                 base_dir, "data", county, "cases_with_parsing_error.txt"
174 |             )
175 |             with open(
176 |                 error_log_path,
177 |                 "a",  # append, so every failing case number in a run is kept
178 |             ) as file_handle:
179 |                 file_handle.write(case_number + "\n")
180 |         except Exception as e:
181 |             print(f"Error in write_error_log: {e}")
182 |             raise
183 |
184 |     def parse(
185 |         self, county: str, case_number: str, parse_single_file: bool = False, test=False
186 |     ) -> None:
187 |         logger = self.configure_logger()
188 |
189 |         # For simple testing purposes
190 |         # Comment out for larger scale testing
191 |         # Case number is from /resources/test_files/test_{case_number}.html
192 |         if not case_number:
193 |             case_number = "51652356"
194 |
195 |         logger.info(
196 |             f"Starting parsing for {county} county with case number {case_number}"
197 |         )
198 |         county = county.lower()
199 |         try:
200 |             # get input and output directories and make json dir if not present
201 |             case_html_path, case_json_path = self.get_directories(county, logger, parse_single_file)  # parse_single_file (not test) selects the resources/test_files directories
202 |
203 |             # start
204 |             START_TIME_PARSER = time()
205 |             logger.info(f"Time started: {START_TIME_PARSER}")
206 |             # creating a list of json files already parsed (currently unused)
207 |             cached_case_json_list = [
208 |                 file_name.split(".")[0] for file_name in os.listdir(case_json_path)
209 |             ]
210 |
211 |             # Get a list of the HTML files that it needs to parse.
212 | case_html_list = self.get_list_of_html( 213 | case_html_path, case_number, county, logger, parse_single_file 214 | ) 215 | logger.info(f"Starting for loop to parse {len(case_html_list)} cases") 216 | for case_html_file_path in case_html_list: 217 | try: 218 | case_number = os.path.basename(case_html_file_path).split(".")[0] 219 | 220 | logger.info(f"{case_number} - parsing") 221 | 222 | with open( 223 | case_html_file_path, "r", encoding="utf-8", errors="ignore" 224 | ) as file: 225 | case_soup = BeautifulSoup(file, "html.parser") 226 | 227 | parser_instance, parser_function = self.get_class_and_method( 228 | county=county, logger=logger, test=test 229 | ) 230 | 231 | if parser_instance is not None and parser_function is not None: 232 | case_data = parser_function( 233 | county, case_number, logger, case_soup 234 | ) 235 | else: 236 | logger.info( 237 | "Error: Could not obtain parser instance or function." 238 | ) 239 | continue 240 | 241 | body = case_soup.find("body") 242 | tables = body.find_all("table") 243 | if tables: 244 | """ 245 | Why balance table is dropped before hashing: 246 | The balance table is excluded from the hashing because 247 | balance is updated as any costs are paid off. Otherwise, 248 | the hash would change frequently and multiple versions 249 | of the case would be captured that we don't want. 250 | """ 251 | balance_table = tables[-1] 252 | if "Balance Due" in balance_table.text: 253 | balance_table.decompose() 254 | case_data["html_hash"] = xxhash.xxh64(str(body)).hexdigest() 255 | 256 | self.write_json_data(case_json_path, case_number, case_data, logger) 257 | 258 | except Exception: 259 | print(traceback.format_exc()) 260 | self.write_error_log(county, case_number) 261 | 262 | RUN_TIME_PARSER = time() - START_TIME_PARSER 263 | logger.info(f"Parsing took {RUN_TIME_PARSER} seconds") 264 | except Exception as e: 265 | logger.info(f"Error in parse: {e}") 266 | raise 267 | 268 | if __name__ == "__main__": 269 | parser = Parser() 270 | parser.parse(county="hays", case_number=None, parse_single_file=True) 271 | -------------------------------------------------------------------------------- /src/cleaner/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import datetime as dt 4 | import xxhash 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig( 9 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 10 | ) 11 | 12 | # List of motions identified as evidentiary. 13 | # TODO: These should be moved to a separate JSON in resources 14 | GOOD_MOTIONS = [ 15 | "Motion To Suppress", 16 | "Motion to Reduce Bond", 17 | "Motion to Reduce Bond Hearing", 18 | "Motion for Production", 19 | "Motion For Speedy Trial", 20 | "Motion for Discovery", 21 | "Motion In Limine", 22 | ] 23 | 24 | 25 | class Cleaner: 26 | def __init__(self): 27 | pass 28 | 29 | def redact_cause_number(self, input_dict: dict) -> str: 30 | # This will hash and redact the cause number and then add it to the output file. 
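        # e.g. xxhash.xxh64("CR-17-5152-C").hexdigest() -> a 16-character hex digest
        # (cf. the 16-char "html_hash" values elsewhere in this repo); the raw
        # cause number itself never reaches the cleaned output.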
31 | cause_number_hash = xxhash.xxh64(str(input_dict["Case Metadata"]["code"])).hexdigest() 32 | return cause_number_hash 33 | 34 | def get_or_create_folder_path(self, county: str, folder_type: str) -> str: 35 | """Returns and ensures the existence of the folder path.""" 36 | folder_path = os.path.join( 37 | os.path.dirname(__file__), "..", "..", "data", county.lower(), folder_type 38 | ) 39 | try: 40 | if not os.path.exists(folder_path): 41 | os.makedirs(folder_path) 42 | logging.info(f"Folder '{folder_path}' created successfully.") 43 | else: 44 | logging.info(f"Folder '{folder_path}' already exists.") 45 | except OSError as e: 46 | logging.error(f"Error creating folder '{folder_path}': {e}") 47 | return folder_path 48 | 49 | def load_json_file(self, file_path: str) -> dict: 50 | """Loads a JSON file from a given file path and returns the data as an object""" 51 | try: 52 | with open(file_path, "r") as f: 53 | return json.load(f) 54 | except (FileNotFoundError, json.JSONDecodeError) as e: 55 | logging.error(f"Error loading file at {file_path}: {e}") 56 | return {} 57 | 58 | def remove_judicial_officer(self, data): 59 | # Check if data is a dictionary 60 | if isinstance(data, dict): 61 | # Remove 'judicial officer' if it exists in this dictionary 62 | if "judicial officer" in data: 63 | del data["judicial officer"] 64 | # Recursively check each value in the dictionary 65 | for key, value in data.items(): 66 | self.remove_judicial_officer(value) 67 | # Check if data is a list 68 | elif isinstance(data, list): 69 | for item in data: 70 | self.remove_judicial_officer(item) 71 | 72 | def load_and_map_charge_names(self, file_path: str) -> dict: 73 | """Loads a JSON file and maps charge names to their corresponding UMich data.""" 74 | charge_data = self.load_json_file(file_path) 75 | # Check if the file loaded successfully 76 | if not charge_data: 77 | logging.error(f"Failed to load charge data from {file_path}") 78 | raise FileNotFoundError(f"File not found or is empty: {file_path}") 79 | # Create dictionary mapping charge names 80 | try: 81 | return {item["charge_name"]: item for item in charge_data} 82 | except KeyError as e: 83 | logging.error(f"Error in mapping charge names: {e}") 84 | raise ValueError(f"Invalid data structure: {file_path}") 85 | 86 | def process_charges( 87 | self, charges: list[dict], charge_mapping: dict 88 | ) -> tuple[list[dict], str]: 89 | """ 90 | Processes a list of charges by formatting charge details, 91 | mapping charges to UMich data, and finding the earliest charge date. 92 | 93 | Args: 94 | charges: A list of charges where each charge is a dictionary containing charge details. 95 | charge_mapping: A dictionary mapping charge names to corresponding UMich data. 96 | 97 | Returns: 98 | tuple: A list of processed charges and the earliest charge date. 
99 | """ 100 | charge_dates = [] 101 | processed_charges = [] 102 | 103 | for i, charge in enumerate(charges): 104 | charge_dict = { 105 | "charge_id": i, 106 | "charge_level": charge["level"], 107 | "orignal_charge": charge["charges"], 108 | "statute": charge["statute"], 109 | "is_primary_charge": i == 0, 110 | } 111 | 112 | # Parse the charge date and append it to charge_dates 113 | try: 114 | charge_datetime = dt.datetime.strptime(charge["date"], "%m/%d/%Y") 115 | charge_dates.append(charge_datetime) 116 | charge_dict["charge_date"] = dt.datetime.strftime( 117 | charge_datetime, "%Y-%m-%d" 118 | ) 119 | except ValueError: 120 | logging.error(f"Error parsing date for charge: {charge}") 121 | continue 122 | 123 | # Try to map the charge to UMich data 124 | try: 125 | charge_dict.update(charge_mapping[charge["charges"]]) 126 | except KeyError: 127 | logging.warning(f"Couldn't find this charge: {charge['charges']}") 128 | continue 129 | 130 | processed_charges.append(charge_dict) 131 | 132 | # Find the earliest charge date 133 | if charge_dates: 134 | earliest_charge_date = dt.datetime.strftime(min(charge_dates), "%Y-%m-%d") 135 | else: 136 | logging.warning("No valid charge dates found.") 137 | earliest_charge_date = "" 138 | 139 | return processed_charges, earliest_charge_date 140 | 141 | def contains_good_motion(self, motion: str, event: list | str) -> bool: 142 | """Recursively check if a motion exists in an event list or sublist.""" 143 | if isinstance(event, list): 144 | return any(self.contains_good_motion(motion, item) for item in event) 145 | return motion.lower() in event.lower() 146 | 147 | def find_good_motions( 148 | self, events: list | str, good_motions: list[str] 149 | ) -> list[str]: 150 | """Finds motions in events based on list of good motions.""" 151 | return [ 152 | motion 153 | for motion in good_motions 154 | if self.contains_good_motion(motion, events) 155 | ] 156 | 157 | def hash_defense_attorney(self, input_dict: dict) -> str: 158 | """Hashes the defense attorney info to anonymize it.""" 159 | try: 160 | def_atty_unique_str = f'{input_dict["Defendent Information"]["defense attorney"]}:{input_dict["Defendent Information"]["defense attorney phone number"]}' 161 | return xxhash.xxh64(def_atty_unique_str).hexdigest() 162 | except KeyError as e: 163 | logging.error(f"Missing defense attorney data: {e}") 164 | return "" 165 | 166 | def write_json_output(self, file_path: str, data: dict) -> None: 167 | """Writes the given data to a JSON file at the specified file path.""" 168 | try: 169 | with open(file_path, "w") as f: 170 | json.dump(data, f, indent=4) 171 | logging.info(f"Successfully wrote cleaned data to {file_path}") 172 | except OSError as e: 173 | logging.error(f"Failed to write JSON output to {file_path}: {e}") 174 | 175 | def process_single_case( 176 | self, 177 | case_json_folder_path: str, 178 | case_json_filename: str, 179 | cleaned_folder_path: str, 180 | ) -> None: 181 | """Process a single case JSON file.""" 182 | input_json_path = os.path.join(case_json_folder_path, case_json_filename) 183 | input_dict = self.load_json_file(input_json_path) 184 | 185 | if not input_dict: 186 | logging.error(f"Failed to load case data from {input_json_path}") 187 | return 188 | 189 | # Initialize cleaned output data 190 | output_json_data = { 191 | "parsing_date": dt.datetime.today().strftime("%Y-%m-%d"), 192 | "html_hash": input_dict["html_hash"], 193 | "Case Metadata": { 194 | "county": input_dict["Case Metadata"]["county"] 195 | }, 196 | "Defendant Information": { 197 | 
"appointed_or_retained": input_dict["Defendent Information"]["appointed or retained"], 198 | "defense_attorney": self.hash_defense_attorney(input_dict), 199 | }, 200 | "Charge Information": [], 201 | "Case Details": { 202 | "earliest_charge_date": "", 203 | "has_evidence_of_representation": False, 204 | }, 205 | "Disposition_Information": input_dict["Disposition Information"] 206 | } 207 | 208 | # Removing judicial office name from data 209 | self.remove_judicial_officer(output_json_data["Disposition_Information"]) 210 | 211 | # Load charge mappings 212 | charge_name_to_umich_file = os.path.join( 213 | os.path.dirname(__file__), 214 | "..", 215 | "..", 216 | "resources", 217 | "umich-uccs-database.json", 218 | ) 219 | charges_mapped = self.load_and_map_charge_names(charge_name_to_umich_file) 220 | 221 | # Process charges and motions 222 | output_json_data["Charge Information"], output_json_data['Case Details']["earliest_charge_date"] = ( 223 | self.process_charges(input_dict["Charge Information"], charges_mapped) 224 | ) 225 | output_json_data['Good Motions'] = self.find_good_motions( 226 | input_dict["Other Events and Hearings"], GOOD_MOTIONS 227 | ) 228 | output_json_data['Case Details']["has_evidence_of_representation"] = ( 229 | len(output_json_data["Good Motions"]) > 0 230 | ) 231 | 232 | output_json_data["cause_number_redacted"] = self.redact_cause_number(input_dict) 233 | 234 | # Write output to file 235 | output_filepath = os.path.join(cleaned_folder_path, case_json_filename) 236 | self.write_json_output(output_filepath, output_json_data) 237 | 238 | def process_json_files(self, county: str, case_json_folder_path: str) -> None: 239 | """Processes all JSON files in the specified folder.""" 240 | try: 241 | list_case_json_files = os.listdir(case_json_folder_path) 242 | except (FileNotFoundError, Exception) as e: 243 | logging.error(f"Error reading directory {case_json_folder_path}: {e}") 244 | return 245 | 246 | # Ensure the case_json_cleaned folder exists 247 | cleaned_folder_path = self.get_or_create_folder_path( 248 | county, "case_json_cleaned" 249 | ) 250 | 251 | for case_json_filename in list_case_json_files: 252 | try: 253 | self.process_single_case( 254 | case_json_folder_path, case_json_filename, cleaned_folder_path 255 | ) 256 | except Exception as e: 257 | logging.error(f"Error processing file {case_json_filename}. Error: {e}") 258 | 259 | def clean(self, county: str) -> None: 260 | """ 261 | Cleans and processes case data for a given county. 262 | This method performs the following steps: 263 | 1. Loads raw JSON case data from the 'case_json' folder for the specified county. 264 | 2. Processes and maps charges using an external UMich data source. 265 | 3. Identifies relevant motions from a predefined list of good motions. 266 | 4. Hashes defense attorney information to anonymize but uniquely identify the attorney. 267 | 5. Adds metadata, such as parsing date and case number, to the cleaned data. 268 | 6. Writes the cleaned data to the 'case_json_cleaned' folder for the specified county. 269 | """ 270 | try: 271 | case_json_folder_path = self.get_or_create_folder_path(county, "case_json") 272 | logging.info(f"Processing data for county: {county}") 273 | self.process_json_files(county, case_json_folder_path) 274 | logging.info(f"Completed processing for county: {county}") 275 | except Exception as e: 276 | logging.error( 277 | f"Error during cleaning process for county: {county}. 
Error: {e}" 278 | ) 279 | -------------------------------------------------------------------------------- /resources/test_files/test_123456.json: -------------------------------------------------------------------------------- 1 | { 2 | "Case Metadata": { 3 | "code": "CR-17-5152-C", 4 | "odyssey id": "test_123456", 5 | "county": "hays" 6 | }, 7 | "Case Details": { 8 | "name": "The State of Texas vs. Zzzzzz Xxxxxx", 9 | "case type": "Adult Felony", 10 | "date filed": "01/05/2016", 11 | "location": "22nd District Court" 12 | }, 13 | "Defendent Information": { 14 | "defendant": "Xxxxxx, Zzzzzz", 15 | "sex": "Female", 16 | "race": "White", 17 | "date of birth": "DOB: 02/15/1997", 18 | "height": "5'6\",", 19 | "weight": "200", 20 | "defense attorney": "Richard Jones", 21 | "appointed or retained": "Court Appointed", 22 | "defense attorney phone number": "512-632-2433(W)", 23 | "defendant address": "876 Main St Natalia, TX 78059", 24 | "SID": "TX03816410" 25 | }, 26 | "State Information": { 27 | "prosecuting attorney": "Yuuuuu Haaaaa", 28 | "prosectuing attorney phone number": "512-362-7711(W)" 29 | }, 30 | "Charge Information": [ 31 | { 32 | "charges": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 33 | "statute": "22.02(a)(2)", 34 | "level": "Second Degree Felony", 35 | "date": "10/25/2015" 36 | } 37 | ], 38 | "Disposition Information": [ 39 | { 40 | "date": "12/06/2016", 41 | "event": "Disposition", 42 | "judicial officer": "Boyer, Bruce", 43 | "details": [ 44 | { 45 | "charge": "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 46 | "outcome": "Deferred Adjudication" 47 | } 48 | ] 49 | }, 50 | { 51 | "date": "11/04/2019", 52 | "event": "Amended Disposition", 53 | "judicial officer": "Boyer, Bruce) Reason: Community Supervision Extende", 54 | "details": [ 55 | { 56 | "charge": "1. 
AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 57 | "outcome": "Amend Probation" 58 | } 59 | ] 60 | } 61 | ], 62 | "Top Charge": { 63 | "charge name": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 64 | "charge level": "Second Degree Felony" 65 | }, 66 | "Dismissed Charges Count": 0, 67 | "Other Events and Hearings": [ 68 | [ 69 | "08/12/2024", 70 | "Motion to Adjudicate", 71 | "(9:00 AM) (Judicial Officer Boyer, Bruce)" 72 | ], 73 | [ 74 | "07/01/2024", 75 | "Motion to Adjudicate", 76 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 77 | "Result: Reset" 78 | ], 79 | [ 80 | "06/06/2024", 81 | "Motion to Adjudicate", 82 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 83 | "Result: Reset" 84 | ], 85 | [ 86 | "05/07/2024", 87 | "Application For Court Appointed Attorney/Order", 88 | "Richard Jones" 89 | ], 90 | [ 91 | "05/01/2024", 92 | "Acknowledgement of Receipt of Discovery", 93 | "Discovery Receipt - Email CR-18-32131-A" 94 | ], 95 | [ 96 | "04/25/2024", 97 | "Motion to Adjudicate", 98 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 99 | "Result: Reset" 100 | ], 101 | [ 102 | "03/08/2024", 103 | "Bond (Cash/Surety) After Release from Jail", 104 | "See Bond Tab" 105 | ], 106 | [ 107 | "03/04/2024", 108 | "Capias Executed", 109 | "See Warrant Tab" 110 | ], 111 | [ 112 | "02/23/2022", 113 | "Capias Issued", 114 | "See Warrant Tab" 115 | ], 116 | [ 117 | "02/15/2022", 118 | "Judge's Fiat", 119 | "(Judicial Officer: Boyer, Bruce )" 120 | ], 121 | [ 122 | "02/09/2022", 123 | "Motion to Revoke Probation/Adjudicate Guilt (Reopen Case)", 124 | "(Judicial Officer: Boyer, Bruce )" 125 | ], 126 | [ 127 | "05/05/2020", 128 | "Motion To Waive Court Ordered Debts", 129 | "(Judicial Officer: Boyer, Bruce )", 130 | "Supervision Fees" 131 | ], 132 | [ 133 | "12/03/2019", 134 | "Court Cost (Bill of Cost)" 135 | ], 136 | [ 137 | "11/20/2019", 138 | "Motion/Order for Payment of Itemized Time/Services", 139 | "(Judicial Officer: Boyer, Bruce )" 140 | ], 141 | [ 142 | "11/04/2019", 143 | "Stipulation of Evidence" 144 | ], 145 | [ 146 | "11/04/2019", 147 | "Trial Court 's Certification of Defendant's Right of Appeal", 148 | "(Judicial Officer: Boyer, Bruce )" 149 | ], 150 | [ 151 | "11/04/2019", 152 | "Court Writ" 153 | ], 154 | [ 155 | "11/04/2019", 156 | "Motion to Adjudicate", 157 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 158 | "Result: Prob Modified" 159 | ], 160 | [ 161 | "10/10/2019", 162 | "Motion to Adjudicate", 163 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 164 | "Result: Reset" 165 | ], 166 | [ 167 | "09/16/2019", 168 | "Discovery Receipt Email from District Attorney" 169 | ], 170 | [ 171 | "09/08/2019", 172 | "Application For Court Appointed Attorney/Order", 173 | "(Judicial Officer: Junkin, David )", 174 | "Denied" 175 | ], 176 | [ 177 | "09/06/2019", 178 | "Magistration Documents" 179 | ], 180 | [ 181 | "09/06/2019", 182 | "Magistrate Warning" 183 | ], 184 | [ 185 | "09/06/2019", 186 | "Bench Warrant (See Warrant Tab)" 187 | ], 188 | [ 189 | "09/05/2019", 190 | "Capias Executed", 191 | "See Warrant Tab" 192 | ], 193 | [ 194 | "09/05/2019", 195 | "Capias Executed", 196 | "See Warrant Tab" 197 | ], 198 | [ 199 | "09/03/2019", 200 | "Order", 201 | "(Judicial Officer: Junkin, David )", 202 | "Appointing Attorney" 203 | ], 204 | [ 205 | "11/08/2017", 206 | "Capias Issued", 207 | "See Warrant Tab" 208 | ], 209 | [ 210 | "11/06/2017", 211 | "Judge's Fiat", 212 | "(Judicial Officer: Boyer, Bruce )" 213 | ], 214 | [ 215 | "11/01/2017", 216 | "Motion to Revoke Probation/Adjudicate Guilt (Reopen Case)", 217 | 
"(Judicial Officer: Boyer, Bruce )" 218 | ], 219 | [ 220 | "10/25/2017", 221 | "Capias Issued", 222 | "See Warrant Tab" 223 | ], 224 | [ 225 | "10/24/2017", 226 | "Bailiffs Certificate", 227 | "(Judicial Officer: Boyer, Bruce )" 228 | ], 229 | [ 230 | "10/24/2017", 231 | "Show Cause Hearing", 232 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 233 | "Result: Failure To Appear" 234 | ], 235 | [ 236 | "03/30/2017", 237 | "Amended Conditions of Probation", 238 | "First Amended-Deferred Adjudication" 239 | ], 240 | [ 241 | "12/09/2016", 242 | "Motion/Order for Payment of Itemized Time/Services", 243 | "(Judicial Officer: Boyer, Bruce )" 244 | ], 245 | [ 246 | "12/06/2016", 247 | "Court Cost (Bill of Cost)" 248 | ], 249 | [ 250 | "12/06/2016", 251 | "Conditions of Probation", 252 | "Deferred Adjudication" 253 | ], 254 | [ 255 | "12/06/2016", 256 | "Trial Court 's Certification of Defendant's Right of Appeal", 257 | "(Judicial Officer: Boyer, Bruce )" 258 | ], 259 | [ 260 | "12/06/2016", 261 | "Punishment Hearing", 262 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 263 | "Result: Def. Adjudication" 264 | ], 265 | [ 266 | "11/07/2016", 267 | "CANCELED", 268 | "Punishment Hearing", 269 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 270 | "Defendant's Request" 271 | ], 272 | [ 273 | "09/26/2016", 274 | "Plea Bargain Agreement" 275 | ], 276 | [ 277 | "09/26/2016", 278 | "Pre Trial Motions (Non-Evidentiary)", 279 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 280 | "Result: Reset" 281 | ], 282 | [ 283 | "08/25/2016", 284 | "Pre Trial Motions (Non-Evidentiary)", 285 | "(9:00 AM) (Judicial Officer Henry, William R)", 286 | "Result: Reset" 287 | ], 288 | [ 289 | "07/29/2016", 290 | "Capias Recalled" 291 | ], 292 | [ 293 | "07/29/2016", 294 | "Capias Issued", 295 | "See Warrant Tab" 296 | ], 297 | [ 298 | "07/27/2016", 299 | "Bailiffs Certificate", 300 | "(Judicial Officer: Henry, William R )" 301 | ], 302 | [ 303 | "07/27/2016", 304 | "Pre Trial Motions (Non-Evidentiary)", 305 | "(9:00 AM) (Judicial Officer Henry, William R)", 306 | "Result: Reset" 307 | ], 308 | [ 309 | "06/15/2016", 310 | "Pre Trial Motions (Non-Evidentiary)", 311 | "(9:00 AM) (Judicial Officer Henry, William R)", 312 | "Result: Reset" 313 | ], 314 | [ 315 | "05/12/2016", 316 | "Pre Trial Motions (Non-Evidentiary)", 317 | "(9:00 AM) (Judicial Officer Steel, Gary L.)", 318 | "Result: Reset" 319 | ], 320 | [ 321 | "05/05/2016", 322 | "Acknowledgement of Receipt of Discovery" 323 | ], 324 | [ 325 | "04/29/2016", 326 | "Discovery Receipt Email from District Attorney" 327 | ], 328 | [ 329 | "04/29/2016", 330 | "Discovery Receipt Email from District Attorney" 331 | ], 332 | [ 333 | "04/14/2016", 334 | "Pre Trial Motions (Non-Evidentiary)", 335 | "(9:00 AM) (Judicial Officer Robison, Jack)", 336 | "Result: Reset" 337 | ], 338 | [ 339 | "03/23/2016", 340 | "CANCELED", 341 | "Arraignment", 342 | "(9:00 AM) (Judicial Officer Henry, William R)", 343 | "Waived Arraignment" 344 | ], 345 | [ 346 | "03/15/2016", 347 | "Waiver of Arraignment", 348 | "Unsigned" 349 | ], 350 | [ 351 | "03/15/2016", 352 | "Waiver of Arraignment" 353 | ], 354 | [ 355 | "02/24/2016", 356 | "Application For Court Appointed Attorney/Order", 357 | "(Judicial Officer: Ramsay, Charles )", 358 | "MARTIN CLAUDER" 359 | ], 360 | [ 361 | "02/24/2016", 362 | "Arraignment", 363 | "(9:00 AM) (Judicial Officer Henry, William R)", 364 | "Result: Reset" 365 | ], 366 | [ 367 | "02/09/2016", 368 | "Returned To Sender", 369 | "NOTICE OF ARRAIGNMENT" 370 | ], 371 | [ 372 | "01/05/2016", 373 
| "Court's Docket Sheet" 374 | ], 375 | [ 376 | "01/05/2016", 377 | "Indictment (Open Case)" 378 | ], 379 | [ 380 | "10/29/2015", 381 | "Bond (Cash/Surety) After Release from Jail", 382 | "See Bond Tab" 383 | ], 384 | [ 385 | "11/04/2019", 386 | "Amended Deferred Adjudication", 387 | "(Judicial Officer: Boyer, Bruce) Reason: Community Supervision Extended", 388 | "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 389 | "CSCD", 390 | "7 Years" 391 | ], 392 | [ 393 | "12/06/2016", 394 | "Deferred Adjudication", 395 | "(Judicial Officer: Boyer, Bruce)", 396 | "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 397 | "CSCD", 398 | "5 Years" 399 | ], 400 | [ 401 | "12/06/2016", 402 | "Plea", 403 | "(Judicial Officer: Boyer, Bruce)", 404 | "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 405 | "Guilty" 406 | ] 407 | ], 408 | "html_hash": "8d4a80173c700b37" 409 | } -------------------------------------------------------------------------------- /src/parser/hays.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from bs4 import BeautifulSoup 3 | 4 | CHARGE_SEVERITY = { 5 | "First Degree Felony": 1, 6 | "Second Degree Felony": 2, 7 | "Third Degree Felony": 3, 8 | "State Jail Felony": 4, 9 | "Misdemeanor A": 5, 10 | "Misdemeanor B": 6, 11 | } 12 | 13 | class ParserHays: 14 | 15 | def __init__(self): 16 | pass 17 | 18 | def extract_rows(self, table: BeautifulSoup, logger) -> List[List[str]]: 19 | try: 20 | rows = [ 21 | [ 22 | tag.strip().replace("\xa0", "").replace("Â", "") 23 | for tag in tr.find_all(text=True) 24 | if tag.strip() 25 | ] 26 | for tr in table.select("tr") 27 | ] 28 | return [row for row in rows if row] 29 | except Exception as e: 30 | logger.info(f"Error extracting rows: {e}") 31 | return [] 32 | 33 | def get_charge_severity(self, charge: str, logger) -> int: 34 | try: 35 | for charge_name, severity in CHARGE_SEVERITY.items(): 36 | if charge_name in charge: 37 | return severity 38 | return float('inf') 39 | except Exception as e: 40 | logger.info(f"Error getting charge severity: {e}") 41 | return float('inf') 42 | 43 | def count_dismissed_charges(self, dispositions: List[Dict], logger) -> int: 44 | try: 45 | return sum( 46 | 1 for disposition in dispositions 47 | for detail in disposition.get("details", []) 48 | if detail.get("outcome", "").lower() == 'dismissed' 49 | ) 50 | except Exception as e: 51 | logger.info(f"Error counting dismissed charges: {e}") 52 | return "Unknown" 53 | 54 | def get_top_charge(self, dispositions: List[Dict], charge_information: List[Dict], logger) -> Dict: 55 | try: 56 | top_charge = None 57 | min_severity = float('inf') 58 | 59 | charge_map = {info['charges']: info['level'] for info in charge_information} 60 | 61 | for disposition in dispositions: 62 | if isinstance(disposition, dict): 63 | for detail in disposition.get("details", []): 64 | if isinstance(detail, dict): 65 | charge_text = detail.get("charge", "").strip() 66 | charge_name = charge_text.split(" >=")[0].strip().lstrip("0123456789. 
").strip() 67 | charge_level = charge_map.get(charge_name, "Unknown") 68 | 69 | severity = self.get_charge_severity(charge_level, logger) 70 | if severity < min_severity: 71 | min_severity = severity 72 | top_charge = { 73 | "charge name": charge_name, 74 | "charge level": charge_level 75 | } 76 | else: 77 | logger.info(f"Unexpected type for disposition: {type(disposition)}") 78 | 79 | return top_charge 80 | except Exception as e: 81 | logger.info(f"Error getting top charge: {e}") 82 | return { 83 | "charge name": "Unknown", 84 | "charge level": "Unknown" 85 | } 86 | 87 | def get_case_metadata(self, county: str, case_number: str, case_soup: BeautifulSoup, logger) -> Dict[str, str]: 88 | try: 89 | logger.info(f"Getting case metadata for {county} case {case_number}") 90 | return { 91 | "code": case_soup.select('div[class="ssCaseDetailCaseNbr"] > span')[0].text, 92 | "odyssey id": case_number, 93 | "county": county 94 | } 95 | except Exception as e: 96 | logger.info(f"Error getting case metadata: {e}") 97 | return { 98 | "code": "Unknown", 99 | "odyssey id": case_number, 100 | "county": county 101 | } 102 | 103 | def get_case_details(self, table: BeautifulSoup, logger) -> Dict[str, str]: 104 | try: 105 | table_values = table.select("b") 106 | logger.info(f"Getting case details") 107 | return { 108 | "name": table_values[0].text, 109 | "case type": table_values[1].text, 110 | "date filed": table_values[2].text, 111 | "location": table_values[3].text 112 | } 113 | except Exception as e: 114 | logger.info(f"Error getting case details: {e}") 115 | return { 116 | "name": "Unknown", 117 | "case type": "Unknown", 118 | "date filed": "Unknown", 119 | "location": "Unknown" 120 | } 121 | 122 | def parse_defendant_rows(self, defendant_rows: List[List[str]], logger) -> Dict[str, str]: 123 | try: 124 | logger.info(f"Parsing defendant rows") 125 | return { 126 | "defendant": defendant_rows[1][1], 127 | "sex": defendant_rows[1][2].split(" ")[0], 128 | "race": defendant_rows[1][2].split(" ")[1], 129 | "date of birth": defendant_rows[1][3], 130 | "height": defendant_rows[1][4].split(" ")[0], 131 | "weight": defendant_rows[1][4].split(" ")[1], 132 | "defense attorney": defendant_rows[1][5], 133 | "appointed or retained": defendant_rows[1][6], 134 | "defense attorney phone number": defendant_rows[1][7], 135 | "defendant address": defendant_rows[2][0] + " " + defendant_rows[2][1], 136 | "SID": defendant_rows[2][3], 137 | } 138 | except Exception as e: 139 | logger.info(f"Error parsing defendant rows: {e}") 140 | return { 141 | "defendant": "Unknown", 142 | "sex": "Unknown", 143 | "race": "Unknown", 144 | "date of birth": "Unknown", 145 | "height": "Unknown", 146 | "weight": "Unknown", 147 | "defense attorney": "Unknown", 148 | "appointed or retained": "Unknown", 149 | "defense attorney phone number": "Unknown", 150 | "defendant address": "Unknown", 151 | "SID": "Unknown", 152 | } 153 | 154 | def parse_state_rows(self, state_rows: List[List[str]], logger) -> Dict[str, str]: 155 | try: 156 | logger.info(f"Parsing state rows") 157 | return { 158 | "prosecuting attorney": state_rows[3][2], 159 | "prosectuing attorney phone number": state_rows[3][3], 160 | } 161 | except Exception as e: 162 | logger.info(f"Error parsing state rows: {e}") 163 | return { 164 | "prosecuting attorney": "Unknown", 165 | "prosectuing attorney phone number": "Unknown", 166 | } 167 | 168 | def get_charge_information(self, table: BeautifulSoup, logger) -> List[Dict]: 169 | try: 170 | logger.info(f"Getting charge information") 171 | table_rows 
= [ 172 | tag.strip().replace("\xa0", " ") 173 | for tag in table.find_all(text=True) 174 | if tag.strip() 175 | ] 176 | 177 | charge_information = [] 178 | for i in range(5, len(table_rows), 5): 179 | charge_information.append( 180 | { 181 | k: v 182 | for k, v in zip( 183 | ["charges", "statute", "level", "date"], 184 | table_rows[i + 1 : i + 5], 185 | ) 186 | } 187 | ) 188 | return charge_information 189 | except Exception as e: 190 | logger.info(f"Error getting charge information: {e}") 191 | return [] 192 | 193 | def format_events_and_orders_of_the_court(self, table: BeautifulSoup, case_soup: BeautifulSoup, logger) -> List: 194 | try: 195 | logger.info(f"Formatting events and orders of the court") 196 | table_rows = [ 197 | [ 198 | tag.strip().replace("\xa0", " ") 199 | for tag in tr.find_all(text=True) 200 | if tag.strip() 201 | ] 202 | for tr in table.select("tr") 203 | if tr.select("th") 204 | ] 205 | table_rows = [ 206 | [" ".join(word.strip() for word in text.split()) for text in sublist] 207 | for sublist in table_rows 208 | if sublist 209 | ] 210 | 211 | disposition_rows = [] 212 | other_event_rows = [] 213 | 214 | for row in table_rows: 215 | print(f'printing row: {row}') 216 | if len(row) >= 2: 217 | if row[1] in ["Disposition", "Disposition:", "Amended Disposition"]: 218 | print(f'YES A DISPOSITION: {row}') 219 | disposition_rows.append(row) 220 | else: 221 | print(f'YES AN EVENT: {row}') 222 | other_event_rows.append(row) 223 | 224 | # Reverse the order of the rows 225 | other_event_rows = other_event_rows[::-1] 226 | disposition_rows = disposition_rows[::-1] 227 | 228 | print(other_event_rows) 229 | 230 | return (disposition_rows, other_event_rows) 231 | except Exception as e: 232 | logger.info(f"Error formatting events and orders of the court: {e}") 233 | return ([], []) 234 | 235 | def get_disposition_information(self, row, dispositions, case_data, table, county, case_soup, logger) -> List[Dict]: 236 | try: 237 | if not row: 238 | logger.info(f"No dispositions to process.") 239 | return dispositions 240 | 241 | if len(row) >= 5: 242 | # Extract judicial officer if present 243 | judicial_officer = "" 244 | if len(row[2]) > 18 and row[2].startswith("(Judicial Officer:"): 245 | judicial_officer = row[2][18:-1].strip() 246 | 247 | # Create a disposition entry 248 | disposition = { 249 | "date": row[0], 250 | "event": row[1], 251 | "judicial officer": judicial_officer, 252 | "details": [] 253 | } 254 | 255 | # Check if this row is a disposition 256 | if row[1].lower() in ["disposition", "amended disposition", "deferred adjudication", "punishment hearing"]: 257 | details = { 258 | "charge": row[3], 259 | "outcome": row[4] 260 | } 261 | if len(row) > 5: 262 | details["additional_info"] = row[5:] 263 | disposition["details"].append(details) 264 | dispositions.append(disposition) 265 | dispositions.reverse() 266 | else: 267 | logger.info("Row is not a disposition: %s", row) 268 | 269 | return dispositions 270 | except Exception as e: 271 | logger.info(f"Error getting disposition information: {e}") 272 | return dispositions 273 | 274 | def parser_hays(self, county: str, case_number: str, logger, case_soup: BeautifulSoup) -> Dict[str, Dict]: 275 | try: 276 | root_tables = case_soup.select("body>table") 277 | 278 | case_data = { 279 | "Case Metadata": self.get_case_metadata(county, case_number, case_soup, logger) 280 | } 281 | 282 | for table in root_tables: 283 | 284 | if "Case Type:" in table.text and "Date Filed:" in table.text: 285 | case_data["Case Details"] = 
self.get_case_details(table, logger) 286 | 287 | elif "Related Case Information" in table.text: 288 | case_data["Related Cases"] = [ 289 | case.text.strip().replace("\xa0", " ") for case in table.select("td")] 290 | 291 | elif "Party Information" in table.text: 292 | case_data["Defendent Information"] = self.parse_defendant_rows(self.extract_rows(table, logger), logger) 293 | case_data["State Information"] = self.parse_state_rows(self.extract_rows(table, logger), logger) 294 | 295 | elif "Charge Information" in table.text: 296 | case_data["Charge Information"] = self.get_charge_information(table, logger) 297 | 298 | elif "Events & Orders of the Court" in table.text: 299 | disposition_rows, other_event_rows = self.format_events_and_orders_of_the_court(table, case_soup, logger) 300 | 301 | dispositions = [] 302 | logger.info(f"For Loop started\nGetting disposition information") 303 | for row in disposition_rows: 304 | case_data["Disposition Information"] = self.get_disposition_information(row, dispositions, case_data, table, county, case_soup, logger) 305 | logger.info(f"For Loop ended\n") 306 | if case_data["Disposition Information"]: 307 | case_data["Top Charge"] = self.get_top_charge(dispositions, case_data.get("Charge Information", []), logger) 308 | case_data["Dismissed Charges Count"] = self.count_dismissed_charges(case_data["Disposition Information"], logger) 309 | case_data['Other Events and Hearings'] = other_event_rows 310 | 311 | return case_data 312 | except Exception as e: 313 | logger.info(f"Error parsing Hays case: {e}") 314 | return {} 315 | -------------------------------------------------------------------------------- /resources/texas_county_data.csv: -------------------------------------------------------------------------------- 1 | county,population,website,portal,type,version,search_disabled,site_down,captcha,must_pay,must_register,notes,scrape 2 | Harris,4731145,http://www.harriscountytx.gov/,https://jpodysseyportal.harriscountytx.gov/OdysseyPortalJP/,odyssey,2017.1.46.2,,,,,,,no 3 | Dallas,2613539,http://www.dallascounty.org/,https://courtsportal.dallascounty.org/DALLASPROD/,odyssey,2017.1.46.2,,,,,,,no 4 | Tarrant,2110640,http://www.tarrantcounty.com/,https://odyssey.tarrantcounty.com/PublicAccess/,odyssey,2003,,,,,,,no 5 | Bexar,2009324,http://www.bexar.org/,https://portal-txbexar.tylertech.cloud/Portal/,odyssey,2017.1.35.6,,yes – maintenance,,,,,no 6 | Travis,1290188,https://www.traviscountytx.gov/,https://odysseypa.traviscountytx.gov/JPPublicAccess/,odyssey,2011,,,,,,,no 7 | Collin,1064465,http://www.collincountytx.gov/,https://cijspub.co.collin.tx.us/,odyssey,2003,,,,,,,no 8 | Denton,906422,https://dentoncounty.gov/,https://justice1.dentoncounty.gov/PublicAccess/,odyssey,2003,,,,,,,no 9 | Hidalgo,870781,https://tx-hidalgocounty.civicplus.com/,https://pa.co.hidalgo.tx.us/,odyssey,2003,,,,,,,no 10 | El Paso,865657,http://www.epcounty.com/,https://casesearch.epcounty.com/PublicAccess/,odyssey,2003,yes,,,yes,,,no 11 | Fort Bend,822779,http://www.fortbendcountytx.gov/,https://tylerpaw.fortbendcountytx.gov/PublicAccess/,odyssey,2003,,,,,,,no 12 | Montgomery,620443,http://www.mctx.org/,http://odyssey.mctx.org/Unsecured/,odyssey,2011,,,,,,,no 13 | Williamson,609017,http://www.wilco.org/,https://judicialrecords.wilco.org/PublicAccess/,odyssey,2003,,,,,,,no 14 | Cameron,421017,http://www.co.cameron.tx.us/,https://portal.co.cameron.tx.us/portalprod/,odyssey,2017.1.46.2,,,yes,,,,no 15 | 
Brazoria,372031,http://brazoriacountytx.gov/,https://pubweb.brazoriacountytx.gov/PublicAccess/,odyssey,2011,,,,,,,no 16 | Bell,370647,http://www.bellcountytx.com/,https://justice.bellcounty.texas.gov/PublicPortal/,odyssey,2017.1.46.2,,,,,,,no 17 | Nueces,353178,http://www.co.nueces.tx.us/,https://portal-txnueces.tylertech.cloud/Portal/,odyssey,2024,,,,,,,no 18 | Galveston,350682,http://www.galvestoncountytx.gov/,https://portal.galvestoncountytx.gov/portal/,odyssey,2017.1.46.2,,,yes,,,,no 19 | Lubbock,310639,http://www.co.lubbock.tx.us/,https://publicrecords.lubbockcounty.gov/Portal/,odyssey,2017.1.40.0,,,,,,,no 20 | Webb,267114,http://www.webbcountytx.gov/,https://publicaccess.webbcountytx.gov/PublicAccess/,odyssey,2011,,,,,,,no 21 | McLennan,260579,http://www.co.mclennan.tx.us/,https://mclennan.edoctec.com/McLennanDCWeb/,edoctec,2022,,,,,,scrapable,no 22 | Jefferson,256526,http://www.co.jefferson.tx.us/,https://jeffersontxclerk.manatron.com/Court/SearchEntry.aspx?cabinet=COURT_CRIMINAL,Aumentum recorder,3,,,,,,scrapable,no 23 | Hays,241067,http://www.co.hays.tx.us/,http://public.co.hays.tx.us/,odyssey,2003,,,,,,,yes 24 | Brazos,233849,http://www.brazoscountytx.gov/,https://brazoscountytx.gov/237/Public-Records,,,,,,,,does records requests through e-mail and fax as far as I can tell,no 25 | Smith,233479,http://www.smith-county.com/,https://judicial.smith-county.com/PublicAccess/,odyssey,2011,,,,,,,no 26 | Ellis,192455,http://www.co.ellis.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 27 | Johnson,179927,http://www.johnsoncountytx.org/,https://pa.johnsoncountytx.org/PublicAccess/,odyssey,2011,,,,,,,no 28 | Guadalupe,172706,http://www.co.guadalupe.tx.us/,https://portal-txguadalupe.tylertech.cloud/PublicAccess/,odyssey,2015,,,,,,,no 29 | Midland,169983,http://www.co.midland.tx.us/,http://jp.co.midland.tx.us/countyclerk/,netdata,2008,,,,,yes,must register. 
registration through County Clerk's office,no 30 | Ector,165171,http://www.co.ector.tx.us/,https://portal-txector.tylertech.cloud/PortalProd/,odyssey,2017.1.46.2,,,,,,,no 31 | Comal,161501,http://www.co.comal.tx.us/,http://public.co.comal.tx.us/default.aspx,odyssey,2003,yes,,,,,needs default.aspx in url or sends you to IIS default index,no 32 | Parker,148222,http://www.parkercountytx.com/,https://txparkerodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 33 | Kaufman,145310,http://www.kaufmancounty.net/,http://txkaufmanodyprod.tylerhost.net/PublicAccess/,odyssey,2003,yes,,,,,,no 34 | Taylor,143208,http://www.taylorcountytexas.org/,http://publicaccess.taylorcountytexas.org/PublicAccess/,odyssey,2003,,,,,,,no 35 | Randall,140753,http://randallcounty.com/,https://odysseypa.tylerhost.net/Randall/,odyssey,2011,,,,,,,no 36 | Grayson,135543,http://www.co.grayson.tx.us/,https://judicialsearch.co.grayson.tx.us:8443/,odyssey,2011,,,,,,,no 37 | Wichita,129350,http://www.co.wichita.tx.us/,https://portal-txwichita.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 38 | Gregg,124239,http://www.co.gregg.tx.us/,http://beta.co.gregg.tx.us/OdysseyPA/,odyssey,2003,,,,,,,no 39 | Tom Green,120003,http://www.co.tom-green.tx.us/,http://odysseypa.co.tom-green.tx.us/,odyssey,2003,,,,,,,no 40 | Potter,118525,http://www.co.potter.tx.us/,https://portal-txpotter.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 41 | Rockwall,107819,https://www.rockwallcountytexas.com/,https://portal-txrockwall.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 42 | Hunt,99956,http://www.huntcounty.net/,https://portal-txhunt.tylertech.cloud/Portal/,odyssey,2017.1.46.2,yes,,,yes,,,no 43 | Bastrop,97216,http://www.co.bastrop.tx.us/,http://records.co.bastrop.tx.us/PublicAccess/,odyssey,2003,,,,,,,no 44 | Bowie,92893,http://www.co.bowie.tx.us/,https://portal-txbowie.tylertech.cloud/PublicAccess/,odyssey,2013,,,,,,,no 45 | Liberty,91628,http://www.co.liberty.tx.us/,https://portal-txliberty.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 46 | Victoria,91319,http://www.victoriacountytx.org/,http://odyssey.vctx.org/,odyssey,2003,,,,,,,no 47 | Angelina,86395,http://www.angelinacounty.net/,http://public.angelinacounty.net/,odyssey,2003,,,yes,,,,no 48 | Orange,84808,http://www.co.orange.tx.us/,https://www.co.orange.tx.us/departments/CountyClerk/OnlineRecordsSearch,myClerkbooks.com,2022,,,,,,scrapable,no 49 | Coryell,83093,https://www.coryellcounty.org/,https://www.coryellcounty.org/page/coryell.County.Clerk,,,,,,,,it appears that you need to pay. 
it’s unclear though,no 50 | Henderson,82150,http://www.henderson-county.com/,http://txhendersonodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 51 | Walker,76400,http://www.co.walker.tx.us/,https://odysseypa.tylerhost.net/Walker/,odyssey,2003,,,,,,,no 52 | Harrison,68839,http://harrisoncountytexas.org/,http://portal-txharrison.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 53 | San Patricio,68755,http://www.co.san-patricio.tx.us/,https://www.co.san-patricio.tx.us/page/sanpatricio.County.Clerk,,,,,,,,I believe you need to call or email for records,no 54 | Wise,68632,http://www.co.wise.tx.us/,http://jail.co.wise.tx.us:81/,odyssey,2003,,,,,,,no 55 | Starr,65920,http://www.co.starr.tx.us/,https://www.co.starr.tx.us/page/starr.County.Clerk,,,,,,yes,,criminal case request form,no 56 | Nacogdoches,64653,http://www.co.nacogdoches.tx.us/,https://www.co.nacogdoches.tx.us/OpenRecords/Index.asp,,,,,,,,it is unclear how to get criminal records,no 57 | Hood,61598,http://www.co.hood.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=24,netdata,2013,,,yes,,yes,must register + captcha,no 58 | Van Zandt,59541,http://www.vanzandtcounty.org/,https://www.vanzandtcounty.org/page/vanzandt.County.Clerk,,,,,,,yes,uses countygovernmentrecords.com but it's not clear if this includes criminal records. Must register,no 59 | Anderson,57922,http://www.co.anderson.tx.us/,http://ac5.co.anderson.tx.us/PublicAccess/,odyssey,2003,,,,,,,no 60 | Maverick,57887,http://www.co.maverick.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 61 | Waller,56794,http://www.co.waller.tx.us/,https://odysseypa.tylerhost.net/Waller/,odyssey,2003,,,,,,,no 62 | Hardin,56231,http://www.co.hardin.tx.us/,http://www.hardincourts.com/recordSearch.php,"Henschen & Associates, Inc.",2022,,,yes,,,scrapable outside of captcha,no 63 | Navarro,52624,http://www.co.navarro.tx.us/,https://portal-txnavarro.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 64 | Kerr,52598,http://www.co.kerr.tx.us/,http://courts.co.kerr.tx.us/PublicAccess/,odyssey,2014,,yes - 403 Forbidden,,,,,no 65 | Rusk,52214,http://www.co.rusk.tx.us/,https://www.co.rusk.tx.us/page/rusk.County.Clerk,,,,,,,,it is unclear how to get criminal records. There is a civil records request sheet,no 66 | Medina,50748,http://www.medinacountytexas.org/,https://odysseypa.tylerhost.net/Medina/,odyssey,2003,,,,,,,no 67 | Cherokee,50412,http://www.co.cherokee.tx.us/,https://cherokeeclerkofcourt.com/mainpage.aspx,ICON,5.1.1.1,,,,,,scrapable,no 68 | Polk,50123,http://www.co.polk.tx.us/,https://www.co.polk.tx.us/page/polk.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 69 | Lamar,50088,http://www.co.lamar.tx.us/,https://txlamarodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 70 | Wilson,49753,http://www.co.wilson.tx.us/,https://www.co.wilson.tx.us/page/wilson.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 71 | Burnet,49130,http://www.burnetcountytexas.org/,https://portal-txburnet.tylertech.cloud/ProdPortal,odyssey,2003,,,,,,PUBLICLOGIN#visitor/visitor# do not edit - used as data in scraper,no 72 | Atascosa,48981,http://www.atascosacounty.texas.gov/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,looks scrapable but this is for district rather than county clerk. 
Could not find county records,no 73 | Val Verde,47586,http://valverdecounty.texas.gov/,https://www.valverdecounty.texas.gov/153/County-Clerk,,,,,,,,it is unclear how to get criminal records,no 74 | Chambers,46571,http://www.co.chambers.tx.us/,https://portal-txchambers.tylertech.cloud/Portal/,odyssey,2017.1.46.2,yes,,,yes,,,no 75 | Caldwell,45883,http://www.co.caldwell.tx.us/,https://www.co.caldwell.tx.us/page/caldwell.County.Clerk,iDocket,,,,,,yes,must register to iDocket.,no 76 | Wood,44843,http://www.mywoodcounty.com/,https://portal-txwood.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 77 | Kendall,44279,http://www.co.kendall.tx.us/,https://www.co.kendall.tx.us/page/County.Clerk,,,,,,,,it is unclear how to get criminal records,no 78 | Erath,42545,http://co.erath.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=21,netdata,2022,,,yes,,yes,must register + captcha,no 79 | Cooke,41668,http://www.co.cooke.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=101,netdata,2022,,,yes,,yes,must register + captcha,no 80 | Wharton,41570,http://www.co.wharton.tx.us/,https://www.co.wharton.tx.us/page/wharton.County.Clerk,iDocket,,,,,,yes,must register to iDocket.,no 81 | Upshur,40892,http://www.countyofupshur.com/,https://www.texasonlinerecords.com/clerk/?office_id=43,netdata,2022,,,yes,,yes,must register + captcha,no 82 | Jim Wells,38891,http://www.co.jim-wells.tx.us/,https://courtportal.co.jim-wells.tx.us/eservices/home.page.23,courtview,1.32.01,,,,,,scrapable,no 83 | Brown,38095,http://www.browncountytx.org/,https://www.browncountytx.org/page/brown.County.Clerk,iDocket,,,,,,yes,must register to iDocket.,no 84 | Hopkins,36787,http://www.hopkinscountytx.org/,https://www.texasonlinerecords.com/clerk/?office_id=1,netdata,2022,,,yes,,yes,must register + captcha,no 85 | Matagorda,36255,http://www.co.matagorda.tx.us/,https://portal-txmatagorda.tylertech.cloud/Matagorda/,odyssey,2011,,,,,,,no 86 | Hill,35874,http://www.co.hill.tx.us/,https://www.co.hill.tx.us/page/hill.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 87 | Washington,35805,http://www.co.washington.tx.us/,https://www.co.washington.tx.us/page/washington.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 88 | Fannin,35662,http://www.co.fannin.tx.us/,https://portal-txfannin.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 89 | Howard,34860,http://www.co.howard.tx.us/,https://txhowardodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 90 | Jasper,32980,http://www.co.jasper.tx.us/,https://www.co.jasper.tx.us/page/jasper.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 91 | Hale,32522,http://www.halecounty.org/,https://portal-txhale.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 92 | Titus,31247,http://www.co.titus.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=2,netdata,2022,,,yes,,yes,must register + captcha. Looks like this may not include criminal data as well,no 93 | Bee,31047,http://www.co.bee.tx.us/,https://www.co.bee.tx.us/page/bee.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 94 | Kleberg,31040,http://www.co.kleberg.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=81,netdata,2022,,,yes,,yes,must register + captcha. 
Looks like this may not include criminal data as well,no 95 | Austin,30167,http://www.austincounty.com/,http://public.austincounty.com/,odyssey,2003,,,,,,,no 96 | Grimes,29268,http://www.co.grimes.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 97 | Cass,28454,http://www.co.cass.tx.us/,https://cc.co.cass.tx.us/Court/SearchEntry.aspx?cabinet=COURT_CRIMINAL,Aumentum recorder,2020.2.0,,,,,,scrapable,no 98 | Palo Pinto,28409,http://www.co.palo-pinto.tx.us/,https://www.co.palo-pinto.tx.us/page/palopinto.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 99 | San Jacinto,27402,http://www.co.san-jacinto.tx.us/,https://txsanjacintoodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 100 | Gillespie,26725,http://www.gillespiecounty.org/,https://portal-txgillespie.tylertech.cloud/PublicAccess/,odyssey,2006,,,,,,,no 101 | Milam,24754,http://www.milamcounty.net/,https://www.milamcounty.net/page/milam.countyclerk,iDocket,,,,,,,must register to iDocket.,no 102 | Uvalde,24564,http://www.uvaldecounty.com/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 103 | Fayette,24435,http://www.co.fayette.tx.us/,https://www.co.fayette.tx.us/page/fayette.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 104 | Shelby,24022,http://www.co.shelby.tx.us/,http://cc.co.shelby.tx.us/localization/notavailable.aspx,Aumentum Recorder,2021.1.0,yes,,,,,search disabled. Must get records in person.,no 105 | Aransas,23830,http://www.aransascountytx.gov/main/,https://www.aransascountytx.gov/clerk/,,,,,,,,it is unclear how to get criminal records,no 106 | Panola,22491,http://www.co.panola.tx.us/,https://portal-txpanola.tylertech.cloud/PublicAccess/,odyssey,2006,,,,,,,no 107 | Limestone,22146,http://www.co.limestone.tx.us/,https://www.co.limestone.tx.us/page/limestone.County.Clerk,iDocket,,,,,,,must register to iDocket.,no 108 | Houston,22066,http://www.co.houston.tx.us/,https://www.co.houston.tx.us/page/houston.County.Clerk,iDocket,,,,,,,must register to iDocket.,no 109 | Lampasas,21627,http://www.co.lampasas.tx.us/,https://www.co.lampasas.tx.us/page/lampasas.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 110 | Gaines,21598,http://www.co.gaines.tx.us/,https://www.co.gaines.tx.us/page/gaines.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 111 | Hockley,21537,http://www.co.hockley.tx.us/,https://www.co.hockley.tx.us/page/hockley.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 112 | Moore,21358,http://www.co.moore.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 113 | Llano,21243,http://www.co.llano.tx.us/,https://www.co.llano.tx.us/page/llano.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 114 | Gray,21227,http://www.co.gray.tx.us/,http://www.lgs-hosted.com/rmtgraycck.html,Local Government Solutions,2017,,,,,yes,must register. Seems broken orguest/orguest doesn’t work which Is default on other LGS sites. Nor does GRAYCCKINQ / 20!DiMe14,no 115 | Bandera,20851,http://www.banderacounty.org/,https://www.banderacounty.org/departments/RecordSearch.htm,iDocket,,,,,,,must register to iDocket.,no 116 | Hutchinson,20617,http://www.co.hutchinson.tx.us/,https://portal-txhutchinson.tylertech.cloud/OdysseyPA/Login.aspx,odyssey,2011,,,,,yes,requires login. 
Not clear how to get one.,no 117 | Colorado,20557,http://www.co.colorado.tx.us/,https://www.co.colorado.tx.us/page/colorado.County.Clerk,iDocket,,,,,,,must register to iDocket.,no 118 | Lavaca,20337,http://www.co.lavaca.tx.us/,https://www.co.lavaca.tx.us/page/lavaca.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 119 | Willacy,20164,http://www.co.willacy.tx.us/,https://www.co.willacy.tx.us/page/willacy.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 120 | Calhoun,20106,http://www.calhouncotx.org/,https://txcalhounportal.tylerhost.net/Portal/,odyssey,2017.1.46.2,,,,,,,no 121 | Montague,19965,http://www.co.montague.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 122 | DeWitt,19824,http://www.co.dewitt.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 123 | Tyler,19798,http://www.co.tyler.tx.us/,https://www.co.tyler.tx.us/page/tyler.CriminalRecordsRequestInstructions,,,,,,yes,,must make request and pay as far as I can tell,no 124 | Jones,19663,http://www.co.jones.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 125 | Gonzales,19653,http://www.co.gonzales.tx.us/,https://www.co.gonzales.tx.us/page/gonzales.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 126 | Freestone,19435,http://www.co.freestone.tx.us/,https://www.co.freestone.tx.us/page/freestone.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 127 | Andrews,18610,http://www.co.andrews.tx.us/,https://www.co.andrews.tx.us/181/County-Clerk,,,,,,,,it is unclear how to get criminal records,no 128 | Deaf Smith,18583,http://www.co.deaf-smith.tx.us/,,,,,,,,,,no 129 | Frio,18385,http://www.co.frio.tx.us/,,,,,,,,,,no 130 | Bosque,18235,http://www.bosquecounty.us/,,,,,,,,,,no 131 | Young,17867,http://www.co.young.tx.us/,,,,,,,,,,no 132 | Eastland,17725,http://www.eastlandcountytexas.com/,,,,,,,,,,no 133 | Burleson,17642,http://www.co.burleson.tx.us/,,,,,,,,,,no 134 | Lee,17478,http://www.co.lee.tx.us/,http://www.lgs-hosted.com/rmtleecck.html,Local Government Solutions,2014,,,,,,LEECCKINQ / 20!DiMe14 default works. 
Scrapable,no 135 | Falls,16968,http://co.falls.tx.us/,,,,,,,,,,no 136 | Scurry,16932,http://www.co.scurry.tx.us/,,,,,,,,,,no 137 | Robertson,16757,http://www.co.robertson.tx.us/,,,,,,,,,,no 138 | Leon,15719,http://www.co.leon.tx.us/,,,,,,,,,,no 139 | Pecos,15193,http://www.co.pecos.tx.us/,,,,,,,,,,no 140 | Jackson,14988,http://www.co.jackson.tx.us/,,,,,,,,,,no 141 | Reeves,14748,http://www.reevescountytexas.net/,,,,,,,,,,no 142 | Nolan,14738,http://www.co.nolan.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 143 | Karnes,14710,http://www.co.karnes.tx.us/,,,,,,,,,,no 144 | Zapata,13889,http://www.co.zapata.tx.us/,,,,,,,,,,no 145 | Callahan,13708,http://www.co.callahan.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 146 | Trinity,13602,http://www.co.trinity.tx.us/,,,,,,,,,,no 147 | Comanche,13594,http://www.co.comanche.tx.us/,,,,,,,,,,no 148 | Madison,13455,http://www.co.madison.tx.us/,,,,,,,,,,no 149 | Lamb,13045,http://www.co.lamb.tx.us/,,,,,,,,,,no 150 | Wilbarger,12887,http://www.co.wilbarger.tx.us/,,,,,,,,,,no 151 | Camp,12464,http://www.co.camp.tx.us/,,,,,,,,,,no 152 | Dawson,12456,http://www.co.dawson.tx.us/,,,,,,,,,,no 153 | Newton,12217,http://www.co.newton.tx.us/,,,,,,,,,,no 154 | Rains,12164,http://www.co.rains.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 155 | Morris,11973,http://www.co.morris.tx.us/,http://txmorrisodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,PUBLICLOGIN#Public/Public# do not edit - used as data in scraper,no 156 | Terry,11831,http://co.terry.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 157 | Ward,11644,http://www.co.ward.tx.us/,,,,,,,,,,no 158 | Red River,11587,http://www.co.red-river.tx.us/,,,,,,,,,,no 159 | Blanco,11374,http://www.co.blanco.tx.us/,,,,,,,,,,no 160 | Live Oak,11335,http://www.co.live-oak.tx.us/,,,,,,,,,,no 161 | Franklin,10359,http://co.franklin.tx.us/,,,,,,,,,,no 162 | Clay,10218,http://www.co.clay.tx.us/,,,,,,,,,,no 163 | Ochiltree,10015,http://www.co.ochiltree.tx.us/,,,,,,,,,,no 164 | Runnels,9900,http://www.co.runnels.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 165 | Sabine,9894,http://www.co.sabine.tx.us/,,,,,,,,,,no 166 | Parmer,9869,http://parmercounty.org/,,,,,,,,,,no 167 | Duval,9831,http://www.co.duval.tx.us/,,,,,,,,,,no 168 | Marion,9725,http://www.co.marion.tx.us/,,,,,,,,,,no 169 | Zavala,9670,http://www.co.zavala.tx.us/,,,,,,,,,,no 170 | Brewster,9546,http://www.brewstercountytx.com/,,,,,,,,,,no 171 | Somervell,9205,http://www.somervell.co/,,,,,,,,,,no 172 | Stephens,9101,http://www.co.stephens.tx.us/,,,,,,,,,,no 173 | Mitchell,8990,http://www.co.mitchell.tx.us/,,,,,,,,,,no 174 | Dimmit,8615,http://www.dimmitcounty.org/,,,,,,,,,,no 175 | Archer,8560,http://www.co.archer.tx.us/,,,,,,,,,,no 176 | Jack,8472,http://www.jackcounty.org/,,,,,,,,,,no 177 | Hamilton,8222,http://www.co.hamilton.tx.us/,,,,,,,,,,no 178 | San Augustine,7918,http://www.co.san-augustine.tx.us/,,,,,,,,,,no 179 | Winkler,7791,http://www.co.winkler.tx.us/,,,,,,,,,,no 180 | Yoakum,7694,http://www.co.yoakum.tx.us/,,,,,,,,,,no 181 | Coleman,7684,http://www.co.coleman.tx.us/,,,,,,,,,,no 182 | McCulloch,7630,http://www.co.mcculloch.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 183 | 
Castro,7371,http://www.co.castro.tx.us/,,,,,,,,,,no 184 | Dallam,7115,http://www.dallam.org/county/,,,,,,,,,,no 185 | Brooks,7076,http://www.co.brooks.tx.us/,,,,,,,,,,no 186 | Goliad,7012,http://www.co.goliad.tx.us/,,,,,,,,,,no 187 | Swisher,6971,http://www.co.swisher.tx.us/,,,,,,,,,,no 188 | Bailey,6904,http://www.co.bailey.tx.us/,,,,,,,,,,no 189 | Refugio,6741,http://www.co.refugio.tx.us/,,,,,,,,,,no 190 | Childress,6664,http://www.childresscountytexas.us/,,,,,,,,,,no 191 | La Salle,6664,http://www.co.la-salle.tx.us/,,,,,,,,,,no 192 | Presidio,6131,http://www.co.presidio.tx.us/,,,,,,,,,,no 193 | Garza,5816,http://www.garzacounty.net/,,,,,,,,,,no 194 | Carson,5807,http://www.co.carson.tx.us/,,,,,,,,,,no 195 | San Saba,5730,http://www.co.san-saba.tx.us/,,,,,,,,,,no 196 | Lynn,5596,http://www.co.lynn.tx.us/,,,,,,,,,,no 197 | Haskell,5416,http://www.co.haskell.tx.us/,,,,,,,,,,no 198 | Floyd,5402,http://co.floyd.tx.us/,,,,,,,,,,no 199 | Hartley,5382,http://www.co.hartley.tx.us/,,,,,,,,,,no 200 | Hansford,5285,http://www.co.hansford.tx.us/,,,,,,,,,,no 201 | Martin,5237,http://www.co.martin.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 202 | Delta,5230,http://www.deltacountytx.com/,,,,,,,,,,no 203 | Crosby,5133,http://www.co.crosby.tx.us/,,,,,,,,,,no 204 | Wheeler,4990,http://www.co.wheeler.tx.us/,,,,,,,,,,no 205 | Jim Hogg,4838,http://co.jim-hogg.tx.us/,,,,,,,,,,no 206 | Crane,4675,http://www.co.crane.tx.us/,,,,,,,,,,no 207 | Mills,4456,http://www.co.mills.tx.us/,,,,,,,,,,no 208 | Kimble,4286,http://www.co.kimble.tx.us/,,,,,,,,,,no 209 | Mason,3953,http://www.co.mason.tx.us/,,,,,,,,,,no 210 | Fisher,3672,http://www.co.fisher.tx.us/,,,,,,,,,,no 211 | Hardeman,3549,http://www.co.hardeman.tx.us/,,,,,,,,,,no 212 | Baylor,3465,http://www.co.baylor.tx.us/,,,,,,,,,,no 213 | Reagan,3385,http://www.co.reagan.tx.us/,,,,,,,,,,no 214 | Hemphill,3382,http://www.co.hemphill.tx.us/,,,,,,,,,,no 215 | Sutton,3372,http://www.co.sutton.tx.us/,,,,,,,,,,no 216 | Knox,3353,http://www.knoxcountytexas.org/,,,,,,,,,,no 217 | Upton,3308,http://www.co.upton.tx.us/,,,,,,,,,,no 218 | Concho,3303,http://www.co.concho.tx.us/,,,,,,,,,,no 219 | Coke,3285,http://www.co.coke.tx.us/,,,,,,,,,,no 220 | Donley,3258,http://www.co.donley.tx.us/,,,,,,,,,,no 221 | Hudspeth,3202,http://www.co.hudspeth.tx.us/,,,,,,,,,,no 222 | Kinney,3129,http://www.co.kinney.tx.us/,,,,,,,,,,no 223 | Shackelford,3105,http://www.co.shackelford.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 224 | Crockett,3098,http://www.co.crockett.tx.us/,,,,,,,,,,no 225 | Lipscomb,3059,http://www.co.lipscomb.tx.us/,,,,,,,,,,no 226 | Hall,2825,https://www.co.hall.tx.us/,,,,,,,,,,no 227 | Sherman,2782,http://www.co.sherman.tx.us/,,,,,,,,,,no 228 | Real,2758,http://www.co.real.tx.us/,,,,,,,,,,no 229 | Collingsworth,2652,http://www.co.collingsworth.tx.us/,,,,,,,,,,no 230 | Cochran,2547,http://www.co.cochran.tx.us/,,,,,,,,,,no 231 | Schleicher,2451,http://www.co.schleicher.tx.us/,,,,,,,,,,no 232 | Culberson,2188,http://www.co.culberson.tx.us/,,,,,,,,,,no 233 | Jeff Davis,1996,http://www.co.jeff-davis.tx.us/,,,,,,,,,,no 234 | Menard,1962,http://co.menard.tx.us/,,,,,,,,,,no 235 | Armstrong,1848,http://www.co.armstrong.tx.us/,,,,,,,,,,no 236 | Dickens,1770,http://www.co.dickens.tx.us/,,,,,,,,,,no 237 | Oldham,1758,http://www.co.oldham.tx.us/,,,,,,,,,,no 238 | Irion,1513,http://www.co.irion.tx.us/,,,,,,,,,,no 239 | 
Throckmorton,1440,http://www.throckmortoncounty.org/,,,,,,,,,,no 240 | Briscoe,1435,http://www.co.briscoe.tx.us/,,,,,,,,,,no 241 | Edwards,1422,http://www.co.edwards.tx.us/,,,,,,,,,,no 242 | Cottle,1380,http://www.co.cottle.tx.us/,,,,,,,,,,no 243 | Sterling,1372,http://www.co.sterling.tx.us/,,,,,,,,,,no 244 | Stonewall,1245,http://www.co.stonewall.tx.us/,,,,,,,,,,no 245 | Glasscock,1116,http://www.co.glasscock.tx.us/,,,,,,,,,,no 246 | Foard,1095,http://www.foardcounty.texas.gov/,,,,,,,,,,no 247 | Motley,1063,http://www.co.motley.tx.us/,,,,,,,,,,no 248 | Roberts,827,http://www.co.roberts.tx.us/,,,,,,,,,,no 249 | Terrell,760,http://www.co.terrell.tx.us/,,,,,,,,,,no 250 | Kent,753,http://www.kentcountytexas.us/,,,,,,,,,,no 251 | Borden,631,http://www.co.borden.tx.us/,,,,,,,,,,no 252 | McMullen,600,http://www.mcmullencountytexas.us/,,,,,,,,,,no 253 | Kenedy,350,http://www.co.kenedy.tx.us/,,,,,,,,,,no 254 | King,265,http://www.co.king.tx.us/,,,,,,,,,,no 255 | Loving,64,http://www.co.loving.tx.us/,,,,,,,,,,no -------------------------------------------------------------------------------- /src/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | import urllib.parse 5 | import sys 6 | from datetime import datetime, timedelta 7 | from time import time 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from .helpers import * 11 | import importlib 12 | from typing import Dict, Optional, Tuple, Callable, Type, List 13 | import importlib.util 14 | import re 15 | 16 | class Scraper: 17 | """Scrape Odyssey html files into an output folder""" 18 | def __init__(self): 19 | pass 20 | 21 | def set_defaults( 22 | self, 23 | ms_wait: int | None = None, 24 | start_date: str | None = None, 25 | end_date: str | None = None, 26 | court_calendar_link_text: str | None = None, 27 | case_number: str | None = None, 28 | ssl: bool | None = None, 29 | county: str | None = None, 30 | case_html_path: str | None = None, 31 | ) -> Tuple[int, str, str, str, Optional[str], bool, str, str]: 32 | """ 33 | Sets default values for the provided optional parameters. 34 | 35 | Defaults: 36 | - `ms_wait`: 200 milliseconds if not provided. 37 | - `start_date`: '2024-07-01' if not provided. 38 | - `end_date`: '2024-07-01' if not provided. 39 | - `court_calendar_link_text`: 'Court Calendar' if not provided. 40 | - `case_number`: None if not provided. - `ssl`: True (verification enabled) if not provided. - `county`: 'hays' if not provided. - `case_html_path`: data/<county>/case_html under the repository root if not provided. 41 | 42 | :param ms_wait: Milliseconds to wait. 43 | :param start_date: Start date in YYYY-MM-DD format. 44 | :param end_date: End date in YYYY-MM-DD format. 45 | :param court_calendar_link_text: Text for the court calendar link. 46 | :param case_number: Case number, or None. :param ssl: Whether to verify SSL certificates. :param county: Name of the county to scrape. :param case_html_path: Directory in which to save case HTML files. 47 | 48 | :returns: A tuple containing: 49 | - ms_wait (int): Milliseconds to wait. 50 | - start_date (str): Start date. 51 | - end_date (str): End date. 52 | - court_calendar_link_text (str): Text for court calendar link. 53 | - case_number (Optional[str]): Case number or None.
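- ssl (bool): Whether SSL verification is enabled for the session. - county (str): Normalized county name. - case_html_path (str): Directory where case HTML files are saved.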
54 | """ 55 | 56 | # Assign default values if parameters are not provided 57 | ms_wait = ms_wait if ms_wait is not None else 200 58 | start_date = start_date if start_date is not None else '2024-07-01' 59 | end_date = end_date if end_date is not None else '2024-07-01' 60 | court_calendar_link_text = court_calendar_link_text if court_calendar_link_text is not None else "Court Calendar" 61 | # case_number defaults to None if not provided 62 | case_number = case_number 63 | ssl = ssl if ssl is not None else True 64 | county = county if county is not None else 'hays' 65 | case_html_path = case_html_path if case_html_path is not None else os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html") 66 | return ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path 67 | 68 | def configure_logger(self) -> logging.Logger: 69 | """ 70 | Configures and returns a logger instance for the scraper class. 71 | 72 | This method sets up the logger with a unique name based on the process ID 73 | and configures the logging level to INFO. 74 | 75 | :returns: Configured logger instance. 76 | """ 77 | # Configure the logger 78 | logger = logging.getLogger(name=f"pid: {os.getpid()}") 79 | 80 | # Set up basic configuration for the logging system 81 | logging.basicConfig(level=logging.INFO) 82 | 83 | return logger 84 | 85 | def format_county(self, county: str) -> str: 86 | """ 87 | Normalizes the county name by lowercasing it and stripping non-alphanumeric characters. 88 | 89 | :param county: The name of the county to be formatted. 90 | :returns: The normalized county name. 91 | :raises AttributeError: If the provided county name is not a string. 92 | """ 93 | 94 | return re.sub(r'[^\w]+', '', county.lower()) 95 | 96 | def create_session(self, logger: logging.Logger, ssl: bool) -> requests.sessions.Session: 97 | """ 98 | Sets up a `requests.Session` with or without SSL verification and suppresses 99 | related warnings when verification is disabled. 100 | 101 | SSL verification is enabled by default. 102 | 103 | :param logger: Logger instance for logging errors. :param ssl: Whether to verify SSL certificates. 104 | :returns: Configured session object. 105 | """ 106 | # Create and configure the session 107 | session = requests.Session() 108 | 109 | # Toggle SSL certificate verification; defaults to True unless False is passed 110 | session.verify = ssl 111 | if not session.verify: requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) 112 | 113 | return session 114 | 115 | def make_directories(self, county: str, logger: logging.Logger, case_html_path: str) -> str: 116 | """ 117 | Creates necessary directories for storing case HTML files. 118 | 119 | This method ensures that every directory in the provided case HTML path 120 | exists, creating them as needed. If the directories already 121 | exist, no action is taken. 122 | 123 | :param county: The name of the county, kept for interface consistency. :param case_html_path: The directory path to create. 124 | :param logger: Logger instance for logging errors. 125 | :returns: The path to the created directories. 126 | :raises OSError: If there is an error creating the directories. 127 | """ 128 | 129 | # Create the directories if they do not exist 130 | os.makedirs(case_html_path, exist_ok=True) 131 | 132 | return case_html_path 133 | 134 | # get county portal URL, Odyssey version, and notes from csv file 135 | def get_ody_link(self, 136 | county: str, 137 | logger: logging.Logger 138 | ) -> Tuple[str, int, str]: 139 | """ 140 | Retrieves Odyssey-related information for a given county from a CSV file.
141 | 142 | This function reads county-specific data from a CSV file located in the `resources` directory. 143 | It searches for the county name in the CSV file, extracts the corresponding base URL, Odyssey 144 | version, and any additional notes. The base URL is formatted with a trailing slash if necessary. 145 | 146 | :param county: The name of the county for which to retrieve Odyssey information. 147 | :param logger: Logger instance for logging errors and information. 148 | :returns: A tuple containing: 149 | - base_url (str): The base URL for the county’s portal. 150 | - odyssey_version (int): The major version of Odyssey associated with the county. 151 | - notes (str): Additional notes related to the county. 152 | :raises Exception: If the county is not found in the CSV file or if required data is missing. 153 | """ 154 | 155 | try: 156 | base_url = odyssey_version = notes = None 157 | # CSV is located in 'resources' folder 158 | with open( 159 | os.path.join(os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv"), 160 | mode="r", 161 | ) as file_handle: 162 | csv_file = csv.DictReader(file_handle) 163 | for row in csv_file: 164 | if row["county"].lower() == county.lower(): 165 | base_url = row["portal"] 166 | # add trailing slash if not present, otherwise urljoin breaks 167 | if base_url[-1] != "/": 168 | base_url += "/" 169 | logger.info(f"{base_url} - scraping this url") 170 | odyssey_version = int(row["version"].split(".")[0]) 171 | notes = row["notes"] 172 | break 173 | if not base_url or not odyssey_version: 174 | raise Exception("The required data to scrape this county is not in /resources/texas_county_data.csv") 175 | except Exception: 176 | logger.exception("Error getting county-specific information from csv.") 177 | raise 178 | return base_url, odyssey_version, notes 179 | 180 | def get_class_and_method( 181 | self, 182 | county: str, 183 | logger: logging.Logger 184 | ) -> Tuple[Type[object], Callable]: 185 | 186 | """ 187 | Dynamically imports a module, retrieves a class, and gets a method from it based on the county name. 188 | 189 | :param county: The name of the county, used to construct module, class, and method names. 190 | :param logger: Logger instance for logging errors. 191 | :returns: A tuple containing the instance of the class and the method callable. 192 | :raises ImportError: If the module cannot be imported. 193 | :raises AttributeError: If the class or method cannot be found.
194 | """ 195 | 196 | module_name = county 197 | class_name = f"Scraper{county.capitalize()}" 198 | method_name = f"scraper_{county}" 199 | 200 | # Add the current directory to the system path 201 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 202 | 203 | try: 204 | # Dynamically import the module 205 | module = importlib.import_module(module_name) 206 | 207 | # Retrieve the class from the module 208 | cls = getattr(module, class_name, None) 209 | if cls is None: 210 | raise AttributeError(f"Class '{class_name}' not found in module '{module_name}'") 211 | 212 | # Instantiate the class 213 | instance = cls() 214 | 215 | # Retrieve the method with the specified name 216 | method = getattr(instance, method_name, None) 217 | if method is None: 218 | raise AttributeError(f"Method '{method_name}' not found in class '{class_name}'") 219 | 220 | return instance, method 221 | 222 | except (FileNotFoundError, ImportError, AttributeError): 223 | logger.exception("Error dynamically loading module or retrieving class/method.") 224 | raise 225 | 226 | def scrape_main_page(self, 227 | base_url: str, 228 | odyssey_version: int, 229 | session: requests.sessions.Session, 230 | notes: str, 231 | logger: logging.Logger, 232 | ms_wait: int 233 | ) -> Tuple[str, BeautifulSoup]: 234 | """ 235 | Scrapes the main page of the Odyssey site, handling login if required, and returns the page's HTML and parsed content. 236 | 237 | This function handles a special case where some sites may require a public guest login. If the `notes` parameter 238 | contains a "PUBLICLOGIN#" identifier, it will extract the username and password from the `notes`, perform the login, 239 | and then proceed to scrape the main page. 240 | 241 | :param base_url: The base URL of the main page to scrape. 242 | :param odyssey_version: The version of Odyssey; currently not used in this function. 243 | :param session: The `requests` session object used for making HTTP requests. 244 | :param notes: A string containing notes that may include login credentials in the format "PUBLICLOGIN#username/password". 245 | :param logger: Logger instance for logging errors and debug information. 246 | :param ms_wait: The number of milliseconds to wait between retry attempts. 247 | :returns: A tuple containing: 248 | - main_page_html (str): The raw HTML content of the main page. 249 | - main_soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML content. 250 | :raises Exception: If any error occurs during the HTTP requests or HTML parsing.
251 | """ 252 | 253 | try: 254 | # some sites have a public guest login that must be used 255 | if "PUBLICLOGIN#" in notes: 256 | userpass = notes.split("#")[1].split("/") 257 | data = { 258 | "UserName": userpass[0], 259 | "Password": userpass[1], 260 | "ValidateUser": "1", 261 | "dbKeyAuth": "Justice", 262 | "SignOn": "Sign On", 263 | } 264 | 265 | request_page_with_retry( 266 | session=session, 267 | url=urllib.parse.urljoin(base_url, "login.aspx"), 268 | logger=logger, 269 | http_method=HTTPMethod.GET, 270 | ms_wait=ms_wait, 271 | data=data, 272 | ) 273 | 274 | main_page_html = request_page_with_retry( 275 | session=session, 276 | url=base_url, 277 | verification_text="ssSearchHyperlink", 278 | logger=logger, 279 | http_method=HTTPMethod.GET, 280 | ms_wait=ms_wait, 281 | ) 282 | main_soup = BeautifulSoup(main_page_html, "html.parser") 283 | except Exception: 284 | logger.exception("Error scraping main page HTML.") 285 | raise 286 | return main_page_html, main_soup 287 | 288 | def scrape_search_page( 289 | self, 290 | base_url: str, 291 | odyssey_version: int, 292 | main_page_html: str, 293 | main_soup: BeautifulSoup, 294 | session: requests.sessions.Session, 295 | logger: logging.Logger, 296 | ms_wait: int, 297 | court_calendar_link_text: str 298 | ) -> Tuple[str, str, BeautifulSoup]: 299 | """ 300 | Scrapes the search page URL and data based on the main page content. 301 | 302 | This method extracts the search page ID from the court calendar link, constructs the URL for the search page, 303 | and retrieves the search page HTML. Depending on the Odyssey version, it either uses the extracted URL or a 304 | default URL. It then parses the search page HTML into a BeautifulSoup object. 305 | 306 | :param base_url: The base URL for constructing full URLs. 307 | :param odyssey_version: The version of Odyssey, used to determine the correct URL and verification text. 308 | :param main_page_html: The HTML content of the main page. 309 | :param main_soup: Parsed BeautifulSoup object of the main page HTML. 310 | :param session: The session object for making HTTP requests. 311 | :param logger: Logger instance for logging errors and information. 312 | :param ms_wait: Milliseconds to wait before making requests. 313 | :param court_calendar_link_text: Text to search for in the court calendar link. 314 | :returns: A tuple containing the search page URL, search page HTML, and the BeautifulSoup object of the search page. 315 | :raises ValueError: If the court calendar link is not found on the main page.
316 | """ 317 | 318 | # Extract the search page ID from the court calendar link 319 | search_page_id = None 320 | for link in main_soup.select("a.ssSearchHyperlink"): 321 | if court_calendar_link_text in link.text: 322 | search_page_id = link["href"].split("?ID=")[1].split("'")[0] 323 | break # Exit loop once the link is found 324 | 325 | if not search_page_id: 326 | write_debug_and_quit( 327 | verification_text="Court Calendar link", 328 | page_text=main_page_html, 329 | logger=logger, 330 | ) 331 | raise ValueError("Court Calendar link not found on the main page.") 332 | 333 | # Build the URL for the search page 334 | search_url = f"{base_url}Search.aspx?ID={search_page_id}" 335 | 336 | # Determine the correct URL and verification text based on Odyssey version 337 | if odyssey_version < 2017: 338 | # Pre-2017 portals use the Search.aspx URL built above as-is 339 | verification_text = "Court Calendar" 340 | else: 341 | search_url = urllib.parse.urljoin(base_url, "Home/Dashboard/26") 342 | verification_text = "SearchCriteria.SelectedCourt" 343 | 344 | # Hit the search page to gather initial data 345 | search_page_html = request_page_with_retry( 346 | session=session, 347 | url=search_url, 348 | verification_text=verification_text, 349 | http_method=HTTPMethod.GET, 350 | logger=logger, 351 | ms_wait=ms_wait, 352 | ) 353 | search_soup = BeautifulSoup(search_page_html, "html.parser") 354 | 355 | return search_url, search_page_html, search_soup 356 | 357 | def get_hidden_values( 358 | self, 359 | odyssey_version: int, 360 | main_soup: BeautifulSoup, 361 | search_soup: BeautifulSoup, 362 | logger: logging.Logger 363 | ) -> Dict[str, str]: 364 | """ 365 | Extracts hidden input values and additional data from the search page. 366 | 367 | :param odyssey_version: The version of Odyssey to determine logic. 368 | :param main_soup: Parsed BeautifulSoup object of the main page HTML. 369 | :param search_soup: Parsed BeautifulSoup object of the search page HTML. 370 | :param logger: Logger instance for logging information. 371 | :returns: Dictionary of hidden input names and their values. 372 | """ 373 | 374 | # Extract hidden input values 375 | hidden_values = { 376 | hidden["name"]: hidden["value"] 377 | for hidden in search_soup.select('input[type="hidden"]') 378 | if hidden.has_attr("name") 379 | } 380 | 381 | # Get NodeDesc and NodeID information based on Odyssey version 382 | if odyssey_version < 2017: 383 | location_option = main_soup.find_all("option")[0] 384 | logger.info(f"Location: {location_option.text}") 385 | hidden_values.update({ 386 | "NodeDesc": location_option.text, 387 | "NodeID": location_option["value"] 388 | }) 389 | else: 390 | hidden_values["SearchCriteria.SelectedCourt"] = hidden_values.get("Settings.DefaultLocation", "") 391 | 392 | return hidden_values 393 | 394 | def get_search_results( 395 | self, 396 | session: requests.sessions.Session, 397 | search_url: str, 398 | logger: logging.Logger, 399 | ms_wait: int, 400 | hidden_values: Dict[str, str], 401 | case_number: Optional[str] 402 | ) -> BeautifulSoup: 403 | """ 404 | Retrieves search results from the search page. 405 | 406 | :param session: The session object for making HTTP requests. 407 | :param search_url: The URL to request search results from. 408 | :param logger: Logger instance for logging information. 409 | :param ms_wait: Milliseconds to wait before making requests. 410 | :param hidden_values: Dictionary of hidden input values. 411 | :param case_number: Case number for searching.
412 | :returns: Parsed BeautifulSoup object of the search results page HTML. 413 | """ 414 | 415 | results_page_html = request_page_with_retry( 416 | session=session, 417 | url=search_url, 418 | verification_text="Record Count", 419 | logger=logger, 420 | data=create_single_case_search_form_data(hidden_values, case_number), 421 | ms_wait=ms_wait, 422 | ) 423 | return BeautifulSoup(results_page_html, "html.parser") 424 | 425 | def scrape_individual_case( 426 | self, 427 | base_url: str, 428 | search_url: str, 429 | hidden_values: Dict[str, str], 430 | case_number: Optional[str], 431 | case_html_path: str, 432 | session: requests.sessions.Session, 433 | logger: logging.Logger, 434 | ms_wait: int 435 | ) -> None: 436 | """Searches for a single case number and saves the first matching case detail page as HTML in case_html_path.""" 437 | results_soup = self.get_search_results(session, search_url, logger, ms_wait, hidden_values, case_number) 438 | case_urls = [ 439 | base_url + anchor["href"] 440 | for anchor in results_soup.select('a[href^="CaseDetail"]') 441 | ] 442 | 443 | logger.info(f"{len(case_urls)} entries found") 444 | 445 | if case_urls: 446 | case_id = case_urls[0].split("=")[1] 447 | logger.info(f"{case_id} - scraping case") 448 | 449 | case_html = request_page_with_retry( 450 | session=session, 451 | url=case_urls[0], 452 | verification_text="Date Filed", 453 | logger=logger, 454 | ms_wait=ms_wait, 455 | ) 456 | 457 | logger.info(f"{len(case_html)} response string length") 458 | 459 | with open( 460 | os.path.join(case_html_path, f"{case_id}.html"), "w" 461 | ) as file_handle: 462 | file_handle.write(case_html) 463 | else: 464 | logger.warning("No case URLs found.") 465 | 466 | def scrape_jo_list( 467 | self, 468 | odyssey_version: int, 469 | search_soup: BeautifulSoup, 470 | judicial_officers: Optional[List[str]], 471 | logger: logging.Logger 472 | ) -> Tuple[List[str], Dict[str, str]]: 473 | """ 474 | Scrapes a list of judicial officers and their IDs from the search page. 475 | 476 | Optionally receives a list of judicial officers to scrape. 477 | 478 | :param odyssey_version: The version of Odyssey to determine the selector. 479 | :param search_soup: Parsed BeautifulSoup object of the search page HTML. 480 | :param judicial_officers: List of specific judicial officers to use. 481 | :param logger: Logger instance for logging information. 482 | :returns: Tuple containing a list of judicial officers to use and a dictionary of judicial officers and their IDs. 483 | """ 484 | 485 | selector = 'select[labelname="Judicial Officer:"] > option' if odyssey_version < 2017 else 'select[id="selHSJudicialOfficer"] > option' 486 | judicial_officer_to_ID = { 487 | option.text: option["value"] 488 | for option in search_soup.select(selector) 489 | if option.text 490 | } 491 | 492 | if not judicial_officers: 493 | judicial_officers = list(judicial_officer_to_ID.keys()) 494 | logger.info(f"No judicial officers specified, so scraping all of them: {len(judicial_officers)}") 495 | else: 496 | logger.info(f"Judicial officers were specified, so only scraping these: {judicial_officers}") 497 | 498 | return judicial_officers, judicial_officer_to_ID 499 | 500 | def scrape_results_page( 501 | self, 502 | odyssey_version: int, 503 | base_url: str, 504 | search_url: str, 505 | hidden_values: Dict[str, str], 506 | jo_id: str, 507 | date_string: str, 508 | session: requests.sessions.Session, 509 | logger: logging.Logger, 510 | ms_wait: int 511 | ) -> Tuple[str, BeautifulSoup]: 512 | """ 513 | Scrapes the results page based on Odyssey version and search criteria.
514 | 515 | :param odyssey_version: The version of Odyssey to determine the URL and verification text. 516 | :param base_url: The base URL for constructing full URLs. 517 | :param search_url: The URL to request search results from. 518 | :param hidden_values: Dictionary of hidden input values. 519 | :param jo_id: Judicial officer ID for searching. 520 | :param date_string: Date string for searching. 521 | :param session: The session object for making HTTP requests. 522 | :param logger: Logger instance for logging information. 523 | :param ms_wait: Milliseconds to wait before making requests. 524 | :returns: A tuple containing the HTML of the results page and the parsed BeautifulSoup object. 525 | """ 526 | 527 | search_url = ( 528 | search_url 529 | if odyssey_version < 2017 530 | else urllib.parse.urljoin(base_url, "Hearing/SearchHearings/HearingSearch") 531 | ) 532 | 533 | verification_text = ( 534 | "Record Count" 535 | if odyssey_version < 2017 536 | else "Search Results" 537 | ) 538 | 539 | results_page_html = request_page_with_retry( 540 | session=session, 541 | url=search_url, 542 | verification_text=verification_text, 543 | logger=logger, 544 | data=create_search_form_data(date_string, jo_id, hidden_values, odyssey_version), 545 | ms_wait=ms_wait, 546 | ) 547 | 548 | results_soup = BeautifulSoup(results_page_html, "html.parser") 549 | 550 | return results_page_html, results_soup 551 | 552 | def scrape_multiple_cases( 553 | self, 554 | county: str, 555 | odyssey_version: int, 556 | base_url: str, 557 | search_url: str, 558 | hidden_values: Dict[str, str], 559 | judicial_officers: List[str], 560 | judicial_officer_to_ID: Dict[str, str], 561 | case_html_path: Optional[str], 562 | logger: logging.Logger, 563 | session: requests.Session, 564 | ms_wait: int, 565 | start_date: str, 566 | end_date: str 567 | ) -> None: 568 | start_date = datetime.strptime(start_date, '%Y-%m-%d').date() 569 | end_date = datetime.strptime(end_date, '%Y-%m-%d').date() 570 | 571 | for date in (start_date + timedelta(n) for n in range((end_date - start_date).days + 1)): 572 | date_string = date.strftime("%m/%d/%Y") 573 | 574 | for JO_name in judicial_officers: 575 | if JO_name not in judicial_officer_to_ID: 576 | logger.error(f"Judicial officer {JO_name} not found on search page. Continuing.") 577 | continue 578 | 579 | jo_id = judicial_officer_to_ID[JO_name] 580 | logger.info(f"Searching cases on {date_string} for {JO_name}") 581 | 582 | results_page_html, results_soup = self.scrape_results_page( 583 | odyssey_version, base_url, search_url, hidden_values, jo_id, date_string, session, logger, ms_wait 584 | ) 585 | 586 | scraper_instance, scraper_function = self.get_class_and_method(county, logger) 587 | logger.debug(f"Resolved county scraper method: {scraper_function}") 588 | scraper_function(base_url, results_soup, case_html_path, logger, session, ms_wait) 589 | 590 | def scrape( 591 | self, 592 | county: str, 593 | judicial_officers: List[str], 594 | ms_wait: int, 595 | start_date: str, 596 | end_date: str, 597 | court_calendar_link_text: Optional[str], 598 | case_number: Optional[str], ssl: Optional[bool], 599 | case_html_path: Optional[str] 600 | ) -> None: 601 | ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = self.set_defaults( 602 | ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path 603 | ) 604 | 605 | logger = self.configure_logger() 606 | county = self.format_county(county) 607 | session = self.create_session(logger, ssl) 608 | 609 | # set_defaults always returns a case_html_path, so just ensure the directory exists 610 | self.make_directories(county, logger, case_html_path) 611 | 612 | base_url, odyssey_version, notes = self.get_ody_link(county, logger) 613 | main_page_html, main_soup = self.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) 614 | search_url, search_page_html, search_soup = self.scrape_search_page( 615 | base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text 616 | ) 617 | 618 | hidden_values = self.get_hidden_values(odyssey_version, main_soup, search_soup, logger) 619 | 620 | if case_number: 621 | self.scrape_individual_case( 622 | base_url, search_url, hidden_values, case_number, case_html_path, session, logger, ms_wait 623 | ) 624 | else: 625 | judicial_officers, judicial_officer_to_ID = self.scrape_jo_list( 626 | odyssey_version, search_soup, judicial_officers, logger 627 | ) 628 | scraper_start_time = time() 629 | self.scrape_multiple_cases( 630 | county, odyssey_version, base_url, search_url, hidden_values, judicial_officers, judicial_officer_to_ID, 631 | case_html_path, logger, session, ms_wait, start_date, end_date 632 | ) 633 | logger.info(f"\nTime to run script: {round(time() - scraper_start_time, 2)} seconds") 634 | --------------------------------------------------------------------------------
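A minimal usage sketch for the Scraper class (illustrative only, not a file in this repository; it assumes the scrape signature above, where ssl is an explicit parameter and None arguments fall back to the defaults in set_defaults):

from src.scraper import Scraper

scraper = Scraper()
scraper.scrape(
    county="hays",                  # the only county marked scrape=yes in resources/texas_county_data.csv
    judicial_officers=[],           # an empty list makes scrape_jo_list fall back to every officer on the search page
    ms_wait=None,                   # -> 200 ms between requests
    start_date="2024-07-01",
    end_date="2024-07-03",
    court_calendar_link_text=None,  # -> "Court Calendar"
    case_number=None,               # None runs the calendar search; a case number would trigger scrape_individual_case
    ssl=None,                       # -> SSL verification enabled
    case_html_path=None,            # -> data/hays/case_html under the repository root
)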