├── .python_version
├── src
│   ├── tester
│   │   ├── __init__.py
│   │   └── README.md
│   ├── __init__.py
│   ├── tools
│   │   ├── zip_folder.py
│   │   ├── combine_parsed.py
│   │   ├── print_stats.py
│   │   └── build_event_csv.py
│   ├── scraper
│   │   ├── hays.py
│   │   ├── README.md
│   │   ├── scrapcode_post2017.py
│   │   ├── helpers.py
│   │   └── __init__.py
│   ├── orchestrator
│   │   └── __init__.py
│   ├── cleaner
│   │   ├── Data Structure of Cleaned JSON.md
│   │   └── __init__.py
│   ├── parser
│   │   ├── README.md
│   │   ├── Data Structure of Parsed JSON.md
│   │   ├── __init__.py
│   │   └── hays.py
│   └── updater
│       └── __init__.py
├── .github
│   ├── CODEOWNERS
│   └── workflows
│       └── python-app.yml
├── resources
│   ├── test_files
│   │   ├── parser_testing
│   │   │   ├── test_1.html
│   │   │   ├── test_2.html
│   │   │   └── multiple_html_files
│   │   │       ├── test_1.html
│   │   │       └── test_2.html
│   │   ├── test_123456.html
│   │   ├── test_hidden_values.txt
│   │   ├── cleaned_test_json
│   │   │   └── test_123456.json
│   │   ├── field_validation_list.json
│   │   ├── hays_main_page.html
│   │   └── test_123456.json
│   └── texas_county_data.csv
├── requirements.txt
├── CONTRIBUTING.md
├── docs
│   ├── index.rst
│   └── conf.py
├── LICENSE
├── parser_log.txt
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/.python_version:
--------------------------------------------------------------------------------
3.12.2

--------------------------------------------------------------------------------
/src/tester/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
* @ids-core

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/test_1.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/test_2.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/multiple_html_files/test_1.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/parser_testing/multiple_html_files/test_2.html:
--------------------------------------------------------------------------------
test

--------------------------------------------------------------------------------
/resources/test_files/test_123456.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-austin/indigent-defense-stats/HEAD/resources/test_files/test_123456.html

--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
from . import cleaner
from . import orchestrator
from . import parser
from . import scraper
from . import tools
from . import updater

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
azure-cosmos == 4.7.0
beautifulsoup4 == 4.12.3
boto3 == 1.35.5
python-dotenv == 1.0.1
requests == 2.32.3
retry == 0.9.2
statistics == 1.0.3.5
xxhash == 3.5.0
flake8 == 7.1.0
Sphinx == 8.0.2

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

## Development Ethos

- Write code simply, to make it more accessible for new developers to contribute over the long run.
- Make your code easier to read, even at the expense of being succinct.
- Use commonly used and well-maintained libraries and approaches.
- Document and annotate your code.

## How to Report an Issue

- If something breaks, open an [Issue](../../issues).
- Provide the date, judicial officer (JO), case number, and county that you were scraping (if known).
- Paste all error text.

--------------------------------------------------------------------------------
/resources/test_files/test_hidden_values.txt:
--------------------------------------------------------------------------------
{'__VIEWSTATE': '/wEPDwULLTEwOTk1NTcyNzAPZBYCZg9kFgICAQ8WAh4HVmlzaWJsZWgWAgIDDw9kFgIeB29ua2V5dXAFJnRoaXMudmFsdWUgPSB0aGlzLnZhbHVlLnRvTG93ZXJDYXNlKCk7ZGSnBpspJun0H8O1uyepgbYYqxCR2g==', '__VIEWSTATEGENERATOR': 'BBBC20B8', '__EVENTVALIDATION': '/wEWAgLohsKOBgKYxoa5CF1tgF3CUdvlNXx3DxVd7HpMX9tL', 'NodeID': '100,101,102,103,200,201,202,203,204,220,6112,400,401,402,403,404,405,406,407,6111,6116', 'NodeDesc': 'All Courts', 'SearchType': '', 'SearchMode': '', 'NameTypeKy': '', 'BaseConnKy': '', 'StatusType': '', 'ShowInactive': '', 'AllStatusTypes': '', 'CaseCategories': '', 'RequireFirstName': '', 'CaseTypeIDs': '', 'HearingTypeIDs': '', 'SearchParams': ''}

--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
.. indigent-defense-stats documentation master file, created by
   sphinx-quickstart on Sun Sep 15 15:44:02 2024.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

indigent-defense-stats documentation
====================================

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.

.. autosummary::
   :toctree: generated

   cleaner
   orchestrator
   parser
   scraper
   tools
   updater

--------------------------------------------------------------------------------
/src/tools/zip_folder.py:
--------------------------------------------------------------------------------
import zipfile
import os
import argparse
from io import BytesIO
import boto3

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Zip the scraped case HTML for the specified county and upload it to S3."
args = argparser.parse_args()

folderpath = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_html"
)
memory_file = BytesIO()
with zipfile.ZipFile(memory_file, "w") as zf:
    for root, dirs, files in os.walk(folderpath):
        for file in files:
            filepath = os.path.join(root, file)
            zf.write(filepath, arcname=file)
memory_file.seek(0)

cli = boto3.client("s3")
cli.put_object(
    Body=memory_file,
    Bucket="indigent-defense",
    Key="case_html.zip",
)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Derek Olson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/tools/combine_parsed.py:
--------------------------------------------------------------------------------
import os
import json
import argparse
import boto3

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Combine parsed case JSON for the specified county and upload it to S3."
args = argparser.parse_args()

case_json_path = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_json"
)

file_list = os.listdir(case_json_path)

# read case ids (first 1000 for now)
all_case_data = {}
for case_filename in file_list[:1000]:
    case_id = os.path.splitext(os.path.basename(case_filename))[0]
    with open(os.path.join(case_json_path, case_filename), "r") as f:
        case_data = json.load(f)
        all_case_data[case_id] = case_data

# export to s3 bucket
case_data_str = json.dumps(all_case_data)
cli = boto3.client("s3")
cli.put_object(
    Body=case_data_str,
    Bucket="indigent-defense",
    Key="case_id_example.json",
)

--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import sys
import os

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'indigent-defense-stats'
copyright = '2024, Open Austin'
author = 'Open Austin'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

sys.path.insert(0, os.path.abspath('../src'))

extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary']

templates_path = ['_templates']
exclude_patterns = ['src/tester']


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']

--------------------------------------------------------------------------------
/src/scraper/hays.py:
--------------------------------------------------------------------------------
import logging
from .helpers import *

class ScraperHays:

    def __init__(self):
        pass

    def scraper_hays(self, base_url, results_soup, case_html_path, logger, session, ms_wait):
        case_urls = [
            base_url + anchor["href"]
            for anchor in results_soup.select('a[href^="CaseDetail"]')
        ]
        logger.info(f"{len(case_urls)} cases found")
        for case_url in case_urls:
            case_id = case_url.split("=")[1]
            logger.info(f"{case_id} - scraping case")
            # make request for the case
            try:
                case_html = request_page_with_retry(
                    session=session,
                    url=case_url,
                    verification_text="Date Filed",
                    logger=logger,
                    ms_wait=ms_wait,
                )
            except Exception:
                logger.info(f"Issue with scraping this case: {case_id}. Moving to next one.")
                # skip to the next case so we never write an undefined case_html
                continue
            # write html case data
            logger.info(f"{len(case_html)} response string length")

            with open(
                os.path.join(case_html_path, f"{case_id}.html"), "w"
            ) as file_handle:
                file_handle.write(case_html)

--------------------------------------------------------------------------------
/src/scraper/README.md:
--------------------------------------------------------------------------------
```mermaid
graph TD

A[scrape] --> B[set_defaults: Initialize default values for parameters like county, wait time, dates, and case details]
B --> C[configure_logger: Set up logging for the scraping process]
C --> D[format_county: Normalize the county name to ensure consistent processing]
D --> E[create_session: Create a web session object for handling HTTP requests]
E --> F[make_directories: Create directories for storing scraped case data, if not already provided]
F --> G[get_ody_link: Retrieve base URL and Odyssey version information based on county]
G --> H[scrape_main_page: Fetch and parse the main page of the county's court site]
G <--> O[county_csv]
H --> I[scrape_search_page: Navigate to the search page and extract relevant content]
I --> J[get_hidden_values: Extract hidden form values required for subsequent searches]

J --> K{Is case_number provided?}
K -- Yes --> L[scrape_individual_case: Scrape data for a specific case number provided by the user]
L --> Q[county-specific scraper]
K -- No --> M[scrape_jo_list: Retrieve a list of judicial officers between the start and end dates]
M --> N[scrape_multiple_cases: Scrape data for multiple cases based on judicial officers and date range]
N -- loop through Judicial Officers per Day in Range --> R[county-specific scraper]
```

--------------------------------------------------------------------------------
/src/orchestrator/__init__.py:
--------------------------------------------------------------------------------
import os, csv

# Import all of the program's modules within the parent_dir
from .. import scraper
from .. import parser
from .. import cleaner
from .. import updater

class Orchestrator:
    def __init__(self):
        # Sets our base parameters
        self.counties = []
        self.start_date = '2024-07-01'  # Update start date here
        self.end_date = '2024-07-01'  # Update end date here

    def orchestrate(self, test: bool = False):
        # This opens the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
        with open(
            os.path.join(
                os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv"
            ),
            mode="r",
        ) as file_handle:
            csv_file = csv.DictReader(file_handle)
            for row in csv_file:
                # This only selects the counties from the CSV that should be scraped.
                if row["scrape"].lower() == "yes":
                    self.counties.append(row["county"])

        # This runs the different modules in order
        for c in self.counties:
            print(f"Starting to scrape, parse, clean, and update this county: {c}")
            scraper(test = test, county = c).scrape()  # src/scraper
            parser(c).parse()  # src/parser
            cleaner(c).clean()  # src/cleaner
            updater(c).update()  # src/updater
            print(f"Completed with scraping, parsing, cleaning, and updating of this county: {c}")

if __name__ == '__main__':
    Orchestrator().orchestrate()

--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Indigent Defense Stats

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read
  pages: write
  id-token: write

concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.12
      uses: actions/setup-python@v3
      with:
        python-version: "3.12"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with unittest
      run: |
        SKIP_SLOW=true python -m unittest -v
    - name: Build documentation
      run: |
        sphinx-build -b html docs build

    - uses: actions/upload-pages-artifact@v3.0.1
      with:
        path: build/

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    steps:
    - name: Deploy to GitHub Pages
      id: deployment
      uses: actions/deploy-pages@v4

--------------------------------------------------------------------------------
/resources/test_files/cleaned_test_json/test_123456.json:
--------------------------------------------------------------------------------
{
    "parsing_date": "2024-11-02",
    "html_hash": "8d4a80173c700b37",
    "Case Metadata": {
        "county": "hays"
    },
    "Defendant Information": {
        "appointed_or_retained": "Court Appointed",
        "defense_attorney": "9083bb693e33919c"
    },
    "Charge Information": [
        {
            "charge_id": 0,
            "charge_level": "Second Degree Felony",
            "orignal_charge": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
            "statute": "22.02(a)(2)",
            "is_primary_charge": true,
            "charge_date": "2015-10-25",
            "charge_name": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
            "uccs_code": "1200",
            "charge_desc": "Aggravated Assault",
            "offense_category_desc": "Aggravated assault",
            "offense_type_desc": "Violent"
        }
    ],
    "Case Details": {
        "earliest_charge_date": "2015-10-25",
        "has_evidence_of_representation": false
    },
    "Disposition_Information": [
        {
            "date": "12/06/2016",
            "event": "Disposition",
            "details": [
                {
                    "charge": "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
                    "outcome": "Deferred Adjudication"
                }
            ]
        },
        {
            "date": "11/04/2019",
            "event": "Amended Disposition",
            "details": [
                {
                    "charge": "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON",
                    "outcome": "Amend Probation"
                }
            ]
        }
    ],
    "Good Motions": [],
    "cause_number_redacted": "871239500b7fe2fd"
}

--------------------------------------------------------------------------------
/src/scraper/scrapcode_post2017.py:
--------------------------------------------------------------------------------
# Not currently in use. Should be moved to a county-specific module, class, and method when a post2017 county is included
"""def scrape_case_data_post2017(self, base_url, case_html_path, session, logger, ms_wait):
    # Need to POST this page to get a JSON of the search results after the initial POST
    case_list_json = request_page_with_retry(
        session=session,
        url=urllib.parse.urljoin(base_url, "Hearing/HearingResults/Read"),
        verification_text="AggregateResults",
        logger=logger,
    )
    case_list_json = json.loads(case_list_json)
    logger.info(f"{case_list_json['Total']} cases found")
    for case_json in case_list_json["Data"]:
        case_id = str(case_json["CaseId"])
        logger.info(f"{case_id} scraping case")
        # make request for the case
        case_html = request_page_with_retry(
            session=session,
            url=urllib.parse.urljoin(base_url, "Case/CaseDetail"),
            verification_text="Case Information",
            logger=logger,
            ms_wait=ms_wait,
            params={
                "eid": case_json["EncryptedCaseId"],
                "CaseNumber": case_json["CaseNumber"],
            },
        )
        # make request for financial info
        case_html += request_page_with_retry(
            session=session,
            url=urllib.parse.urljoin(
                base_url, "Case/CaseDetail/LoadFinancialInformation"
            ),
            verification_text="Financial",
            logger=logger,
            ms_wait=ms_wait,
            params={
                "caseId": case_json["CaseId"],
            },
        )
        # write case html data
        logger.info(f"{len(case_html)} response string length")
        with open(
            os.path.join(case_html_path, f"{case_id}.html"), "w"
        ) as file_handle:
            file_handle.write(case_html)"""

--------------------------------------------------------------------------------
/src/tools/print_stats.py:
--------------------------------------------------------------------------------
import os
import json
import argparse

from time import time
from statistics import mean, median, mode

N_LONGEST = 5
START_TIME = time()

case_data_list = []

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Print stats for the specified county."
args = argparser.parse_args()

case_json_path = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_json"
)
for case_file in os.scandir(case_json_path):
    with open(case_file.path, "r") as file_handle:
        case_data_list.append(json.loads(file_handle.read()))


def print_top_cases_by_lambda(sort_function, description):
    print("\n", description)
    cases_by_lambda = sorted(case_data_list, key=sort_function)[-N_LONGEST:]
    converted_data = list(sort_function(case) for case in case_data_list)
    print(
        "\n".join(
            f"{i}. {sort_function(case)}".ljust(20) + case["odyssey id"]
            for i, case in enumerate(reversed(cases_by_lambda), 1)
        ),
        "\nMean:",
        round(mean(converted_data), 2),
        " Median:",
        round(median(converted_data), 2),
        " Mode:",
        round(mode(converted_data), 2),
    )


disposition_len = (lambda case: len(case["dispositions"]), "dispositions length")
charges_len = (lambda case: len(case["charge information"]), "number of charges")
events_len = (
    lambda case: len(case["other events and hearings"]),
    "other events and hearings length",
)
case_cost = (
    lambda case: float(
        case["financial information"]["total financial assessment"].replace(",", "")
    )
    if "financial information" in case
    else 0.0,
    "highest cost",
)
for sort_function, description in (events_len, disposition_len, case_cost, charges_len):
    print_top_cases_by_lambda(
        sort_function,
        description,
    )
print("\nNumber of cases:", len(case_data_list))
print("Stats parsing runtime:", round(time() - START_TIME, 2), "seconds")

--------------------------------------------------------------------------------
/parser_log.txt:
--------------------------------------------------------------------------------
2024-10-13 13:51:33,631 - INFO - Logger configured
2024-10-13 13:51:33,631 - INFO - Starting parsing for hays county with case number 51652356
2024-10-13 13:51:33,632 - INFO - get_directories function called
base_dir: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats

2024-10-13 13:51:33,633 - INFO - Returning case_html_path: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats\data\hays\case_html
Returning case_json_path: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats\data\hays\case_json

2024-10-13 13:51:33,634 - INFO - Time started: 1728845493.6341271
2024-10-13 13:51:33,650 - INFO - get_list_of_html function called

2024-10-13 13:51:33,651 - INFO - parse_single_file is True

2024-10-13 13:51:33,652 - INFO - Starting for loop to parse 1 cases
2024-10-13 13:51:33,654 - INFO - test_51652356 - parsing
2024-10-13 13:51:33,852 - INFO - Module: hays
Class: ParserHays
Method: parser_hays

2024-10-13 13:51:33,925 - INFO - Module 'hays' imported successfully.
2024-10-13 13:51:33,928 - INFO - Class 'ParserHays' retrieved successfully.
2024-10-13 13:51:33,929 - INFO - Method 'parser_hays' retrieved successfully.
2024-10-13 13:51:33,946 - INFO - Getting case metadata for hays case test_51652356
2024-10-13 13:51:33,951 - INFO - Getting case details
2024-10-13 13:51:33,956 - INFO - Parsing defendant rows
2024-10-13 13:51:33,957 - INFO - Parsing state rows
2024-10-13 13:51:33,957 - INFO - Getting charge information
2024-10-13 13:51:33,959 - INFO - Formatting events and orders of the court
2024-10-13 13:51:33,972 - INFO - For Loop started
Getting disposition information
2024-10-13 13:51:33,976 - INFO - Row is not a disposition: ['11/07/2016', 'CANCELED', 'Punishment Hearing', '(9:00 AM) (Judicial Officer Boyer, Bruce)', "Defendant's Request"]
2024-10-13 13:51:33,978 - INFO - Row is not a disposition: ['03/23/2016', 'CANCELED', 'Arraignment', '(9:00 AM) (Judicial Officer Henry, William R)', 'Waived Arraignment']
2024-10-13 13:51:33,978 - INFO - Row is not a disposition: ['11/04/2019', 'Amended Deferred Adjudication', '(Judicial Officer: Boyer, Bruce) Reason: Community Supervision Extended', '1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON', 'CSCD', '7 Years']
2024-10-13 13:51:33,979 - INFO - For Loop ended

2024-10-13 13:51:33,995 - INFO - Writing JSON to: c:\Users\nicol\Documents\Open Austin\ids\indigent-defense-stats\data\hays\case_json
2024-10-13 13:51:34,000 - INFO - Parsing took 0.3662230968475342 seconds

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# data, caching, logging
data
logging
debug.*
aws
test_data

# venv stuff
bin/
include/
lib64/
Scripts/
pyenv.cfg

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# editor stuff
.vscode/

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
pyvenv.cfg

# ignore data folders
case_html/
case_json/
data/

# asdf
.tool-versions

# libreoffice makes this lock file when opening csvs
.~lock.*

.DS_Store

docs/generated

--------------------------------------------------------------------------------
/src/tester/README.md:
--------------------------------------------------------------------------------
# Unit Testing
This module provides unit tests, using the unittest module, for each module in the program.
## Setup

Once you've loaded the repository in Visual Studio Code, you can run the tests from the Testing panel in the VS Code interface. Make sure to update the settings.json file in the .vscode folder at the repository root so that VS Code can find the test files under /src/tester.
```

{
    "python.testing.unittestEnabled": true,
    "python.testing.unittestArgs": [
        "-v"
    ]
}
```
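
You can also run the full suite from the command line, which is what the CI workflow in this repository does (the `SKIP_SLOW` variable appears to gate the slower tests):

```bash
SKIP_SLOW=true python -m unittest -v
```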
## Tests

### A. Scraper Tests

#### Test #1: Did the scraper create a file called 12947592.html in the right location?

This will look to see if there is an HTML file of the expected name in the expected destination.

It will also check that the file has been updated since the test began (i.e. was updated rather than simply exists).

#### Test #2: Is the resulting HTML file longer than 1000 characters?

This will check that the HTML file's contents are longer than 1000 characters, to ensure it was a full page scrape.

#### Test #3: Does the resulting HTML file contain the cause number in the expected header location?

This will check a specific location in the HTML file for where a cause number is expected to be. If the cause number is present within the HTML at that location, this is a good indication that the scrape was successful.

### B. Parser Tests

#### Test #1: Check to see if there is a JSON file called 51652356.json created in the correct location and that it was updated since this test started running

This will look to see if there is a JSON file of the expected name in the expected destination.

It will also check that the file has been updated since the test began (i.e. was updated rather than simply exists).

#### Test #2: Check to see that the JSON parsed all of the necessary fields and did so properly.

This unit test uses a JSON database of expected fields and features of those fields (called "field_validation_list.json"), where each entry describes a field and the properties used to validate it, for example:
```
{
    "name": "location",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "party information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "charge information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "defendant",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "necessary"
}
```
Fields are addressed in order of importance:
- necessary: fields that are considered required for a successful parse
- high: fields that are important for data visualization and analysis
- medium: fields that have potential for use
- low: fields that have little or no use or importance

It does so by opening a JSON dictionary filled with expected fields and expected features of those fields, checking:
- whether the field exists in the JSON (is in): check_exists
- the expected type (string or array)
- the expected length (strings and arrays): check_length
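
For illustration, here is a rough sketch of those checks against the test fixtures in this repository (this is not the actual test code; `validation_list` and `parsed_case` are illustrative names, and the nested party/charge levels are elided):

```python
import json

with open("resources/test_files/field_validation_list.json") as f:
    validation_list = json.load(f)
with open("resources/test_files/test_123456.json") as f:
    parsed_case = json.load(f)

for field in validation_list:
    if field["logical_level"] != "top":
        continue  # party/charge fields are checked against their sub-sections
    # check_exists: the field must be present in the parsed JSON
    assert field["name"] in parsed_case, f"missing field: {field['name']}"
    value = parsed_case[field["name"]]
    # expected type: "string" maps to str, "array" to list
    assert isinstance(value, str if field["type"] == "string" else list)
    # check_length: strings and arrays must meet the estimated minimum length
    assert len(value) >= field["estimated_min_length"]
```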

### C. Cleaner Tests

In progress.

### D. Updater Tests

In progress.

### E. Orchestrator Tests

In progress.

--------------------------------------------------------------------------------
/src/cleaner/Data Structure of Cleaned JSON.md:
--------------------------------------------------------------------------------
## Data Structure of the Cleaned Cases JSON

```mermaid
graph TB
    subgraph CaseInformation[Case Information Summary]
        style CaseInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
        A1[County: Hays]
        A2[Cause Number Hash: dsqn91cn1odmo]
        A3[Odyssey ID: Redacted]
        A4[Date Filed: 01/01/2015]
        A5[Location: 22nd District Court]
        A6[Version: 1]
        A7[Parsing Date: 2024-01-01]
    end

    subgraph PartyInformation[Party Information]
        style PartyInformation fill:#d3a8e2,stroke:#333,stroke-width:2px

        subgraph DefendantInfoBox[Defendant Info]
            style DefendantInfoBox fill:#b0d4f1,stroke:#333,stroke-width:2px
            D8[Defendant Info: Redacted]
        end
        subgraph RepresentationInfo[Defense Attorney Info]
            style RepresentationInfo fill:#b0d4f1,stroke:#333,stroke-width:2px
            B1[Defense Attorney Hash: 9083bb693e33919c]
            B2[Appointed or Retained: Court Appointed]
        end
    end

    subgraph Events[Event Information]
        style Events fill:#d3a8e2,stroke:#333,stroke-width:2px
        subgraph EvidenceofRep[Representation Evidence]
            style EvidenceofRep fill:#b0d4f1,stroke:#333,stroke-width:2px
            B3[Has Evidence of Representation: No]
        end
    end

    subgraph ChargeInformation[Charge Information]
        style ChargeInformation fill:#d3a8e2,stroke:#333,stroke-width:2px

        subgraph Charge1[Aggravated Assault with a Deadly Weapon]
            style Charge1 fill:#b0d4f1,stroke:#333,stroke-width:2px
            C1[Statute: 22.02a2]
            C2[Level: Second Degree Felony]
            C3[Date: 10/25/2015]
            C4[Charge Name: Aggravated Assault with a Deadly Weapon]
            C5[Description: Aggravated Assault]
            C6[Category: Violent]
            C7[UCCS Code: 1200]
        end

        subgraph Charge2[Resisting Arrest]
            style Charge2 fill:#b0d4f1,stroke:#333,stroke-width:2px
            C8[Statute: 38.03]
            C9[Level: Class A Misdemeanor]
            C10[Date: 10/25/2015]
            C11[Charge Name: Resisting Arrest]
            C12[Description: Resisting Arrest]
        end

        E3[Charges Dismissed: 1]
    end

    subgraph TopCharge[Top Charge]
        style TopCharge fill:#b0d4f1,stroke:#333,stroke-width:2px
        E1[Charge Name: Aggravated Assault with a Deadly Weapon]
        E2[Charge Level: Second Degree Felony]
    end

    subgraph Dispositions[Dispositions]
        style Dispositions fill:#d3a8e2,stroke:#333,stroke-width:2px

        subgraph Disposition1[Disposition Details]
            style Disposition1 fill:#b0d4f1,stroke:#333,stroke-width:2px
            D1[Date: 12/06/2016]
            D2[Event: Disposition]
            D3[Outcome: Deferred Adjudication]
            D4[Sentence Length: 1 Year]
        end

        subgraph Disposition2[Resisting Arrest Disposition]
            style Disposition2 fill:#b0d4f1,stroke:#333,stroke-width:2px
            D5[Date: 12/06/2016]
            D6[Event: Disposition]
            D7[Outcome: Dismissed]
        end
    end

    CaseInformation --> PartyInformation
    CaseInformation --> ChargeInformation
    CaseInformation --> Dispositions
    CaseInformation --> Events
    ChargeInformation --> TopCharge
```

--------------------------------------------------------------------------------
/src/parser/README.md:
--------------------------------------------------------------------------------
```mermaid
graph TD
    subgraph Parsing
    A[Start Parsing] --> B([configure_logger])
    B --> C[Store county]
    C --> D([get_directories])
    D --> E[Start Timer]
    E --> F([get_list_of_html])
    F --> G{for case_html_file_path&#10;in case_html_list}
    G --> H[Store case_number]
    H --> I([get_class_and_method])
    I --> J{{if parser_instance and&#10;parser_function is not None}}
    J --> L([parser_function])
    I --> K{{else: THROW ERROR}}
    DD --> M([write_json_data])
    K --> M([write_json_data])
    L --> AA[Start Parsing&#10;Specific County]
    AA --> BB[Create root_tables]
    BB --> CC([get_case_metadata])
    CC --> DD{for table in root_tables}
    DD --> EE{{if Case Type and Date Filed}}
    EE -- True --> JJ([get_case_details])
    EE --> FF{{elif Related Case}}
    FF -- True --> KK[Store&#10;case_data#91;Related Cases#93;]
    FF --> GG{{elif Party Information}}
    GG -- True --> LL([parse_defendant_rows#40;&#10;extract_rows#40;#41;#41;])
    LL --> MM([parse_state_rows#40;&#10;extract_rows#40;#41;#41;])
    GG --> HH{{elif Charge Information}}
    HH -- True --> NN([get_charge_information])
    HH --> II{{elif Events & Orders of&#10;the Court}}
    II --> DD
    II -- True --> OO([format_events_and_&#10;orders_of_the_court])
    OO --> PP{for row&#10;in disposition_rows:}
    PP --> QQ([get_disposition_information])
    QQ --> PP
    PP --> RR{{if case_data#91;Disposition&#10;Information#93;}}
    RR -- True --> SS([get_top_charge])
    RR --> II
    SS --> TT([count_dismissed_charges])
    end
    M --> Y
    G --> Y[End Timer]
    Y --> Z[End Parsing]

    style A fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style B fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style C fill:#D99559,stroke:#333,stroke-width:2px,color:#FFF
    style D fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style E fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style F fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style G fill:#779ECB,stroke:#333,stroke-width:4px,color:#FFF
    style H fill:#D99559,stroke:#333,stroke-width:2px,color:#FFF
    style I fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style J fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style K fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style L fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style M fill:#9A7FAE,stroke:#333,stroke-width:2px,color:#FFF
    style Y fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style Z fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF

    style AA fill:#66A182,stroke:#333,stroke-width:4px,color:#FFF
    style BB fill:#D99559,stroke:#333,stroke-width:4px,color:#FFF
    style CC fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style DD fill:#779ECB,stroke:#333,stroke-width:4px,color:#FFF
    style EE fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style FF fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style GG fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style HH fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style II fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style JJ fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style KK fill:#D99559,stroke:#333,stroke-width:4px,color:#FFF
    style LL fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style MM fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style NN fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style OO fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style PP fill:#779ECB,stroke:#333,stroke-width:4px,color:#FFF
    style QQ fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style RR fill:#D06A6A,stroke:#333,stroke-width:4px,color:#FFF
    style SS fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
    style TT fill:#9A7FAE,stroke:#333,stroke-width:4px,color:#FFF
```

--------------------------------------------------------------------------------
/src/tools/build_event_csv.py:
--------------------------------------------------------------------------------
"""
Combine hearing & event records from multiple case files into a single csv.
"""
import csv
import argparse
import json
import os
from datetime import datetime

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-county",
    "-c",
    type=str,
    default="hays",
    help="The name of the county.",
)
argparser.description = "Combine hearing and event records for the specified county into a single CSV."
args = argparser.parse_args()

FILE_DIR = os.path.join(
    os.path.dirname(__file__), "..", "..", "data", args.county, "case_json"
)


def parse_event_date(date_str):
    """Return a python `datetime` from e.g. '01/30/2021'"""
    month, day, year = date_str.split("/")
    return datetime(year=int(year), month=int(month), day=int(day))


def iso_event_date(dt):
    """Format a `datetime` instance as YYYY-MM-DD"""
    return dt.strftime("%Y-%m-%d")


def get_days_elapsed(start, end):
    """Return the number of days between two dates"""
    delta = end - start
    return delta.days


def main():
    files = [file for file in os.listdir(FILE_DIR) if file.endswith(".json")]
    events = []
    charges = []

    for f_count, f_name in enumerate(files):
        if f_count % 1000 == 0:
            print(f"Processing file {f_count} of {len(files)}")

        with open(f"{FILE_DIR}/{f_name}", "r") as fin:
            """
            Extract fields of interest. You can add any attributes of interest to the
            event_record dict and they will be included in the output CSV.
            Extracts events and charges from the case file, in separate files.
            """
            case = json.load(fin)

            # extract demographic info
            case_id = case["odyssey id"]
            case_number = case["code"]
            retained = case["party information"]["appointed or retained"]
            gender = case["party information"]["sex"]
            race = case["party information"]["race"]
            defense_attorney = case["party information"]["defense attorney"]

            # extract event data
            first_event_date = None
            for i, event in enumerate(case["other events and hearings"]):
                event_record = {}
                event_date = parse_event_date(event[0])

                if i == 0:
                    first_event_date = event_date

                days_elapsed = get_days_elapsed(first_event_date, event_date)
                event_record["event_id"] = i + 1
                event_record["event_date"] = iso_event_date(event_date)
                event_record["first_event_date"] = iso_event_date(first_event_date)
                event_record["days_elapsed"] = days_elapsed
                event_record["event_name"] = event[1]
                event_record["attorney"] = retained
                event_record["case_id"] = case_id
                event_record["case_number"] = case_number
                event_record["defense_attorney"] = defense_attorney
                event_record["race"] = race
                event_record["gender"] = gender
                events.append(event_record)

            # extract charge data
            for i, charge in enumerate(case["charge information"]):
                charge_record = {}
                charge_record["charge_id"] = i + 1
                charge_record["charge_name"] = charge.get("charges", "")
                charge_record["statute"] = charge.get("statute", "")
                charge_record["level"] = charge.get("level", "")

                charge_record["charge_date"] = charge.get("date", "")
                if charge_record["charge_date"]:
                    charge_record["charge_date"] = iso_event_date(
                        parse_event_date(charge_record["charge_date"])
                    )

                charge_record["case_id"] = case_id
                charge_record["case_number"] = case_number
                charges.append(charge_record)

    with open("events_combined.csv", "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=events[0].keys())
        writer.writeheader()
        writer.writerows(events)

    with open("charges_combined.csv", "w", newline="") as fout:
        writer = csv.DictWriter(fout, fieldnames=charges[0].keys())
        writer.writeheader()
        writer.writerows(charges)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/scraper/helpers.py:
--------------------------------------------------------------------------------
import os, sys
import requests
from time import sleep
from datetime import date
from logging import Logger
from typing import Dict, Optional, Literal
from enum import Enum

# Write the failing page text to a debug file, log the error, and quit.
def write_debug_and_quit(
    page_text: str, logger: Logger, verification_text: Optional[str] = None
) -> None:
    logger.error(
        (
            f"{verification_text} could not be found in page."
            if verification_text
            else "Failed to load page."
        )
        + " Aborting. Writing /logging/debug.html with response. May not be HTML."
    )
    with open(os.path.join(os.path.dirname(__file__), "..", "..", "logging", "debug.html"), "w") as file_handle:
        file_handle.write(page_text)
    sys.exit(1)

# helper function to make form data
def create_search_form_data(
    date: str, JO_id: str, hidden_values: Dict[str, str], odyssey_version: int
) -> Dict[str, str]:
    form_data = {}
    form_data.update(hidden_values)
    if odyssey_version < 2017:
        form_data.update(
            {
                "SearchBy": "3",
                "cboJudOffc": JO_id,
                "DateSettingOnAfter": date,
                "DateSettingOnBefore": date,
                "SearchType": "JUDOFFC",  # Search by Judicial Officer
                "SearchMode": "JUDOFFC",
                "CaseCategories": "CR",  # "CR,CV,FAM,PR" criminal, civil, family, probate and mental health - these are the options
            }
        )
    else:
        form_data.update(
            {
                "SearchCriteria.SelectedHearingType": "Criminal Hearing Types",
                "SearchCriteria.SearchByType": "JudicialOfficer",
                "SearchCriteria.SelectedJudicialOfficer": JO_id,
                "SearchCriteria.DateFrom": date,
                "SearchCriteria.DateTo": date,
            }
        )
    return form_data

def create_single_case_search_form_data(hidden_values: Dict[str, str], case_number: str):
    form_data = {}
    form_data.update(hidden_values)
    os_specific_time_format = "%#m/%#d/%Y" if os.name == 'nt' else "%-m/%-d/%Y"
    form_data.update(
        {
            "__EVENTTARGET": "",
            "SearchBy": "0",
            "DateSettingOnAfter": "1/1/1970",
            "DateSettingOnBefore": date.today().strftime(os_specific_time_format),
            "SearchType": "CASE",  # Search by case id
            "SearchMode": "CASENUMBER",
            "CourtCaseSearchValue": case_number,
            "CaseCategories": "",
            "cboJudOffc": "38501",
        }
    )
    return form_data


class HTTPMethod(Enum):
    POST: int = 1
    GET: int = 2


def request_page_with_retry(
    session: requests.Session,
    url: str,
    logger: Logger,
    verification_text: Optional[str] = None,
    http_method: Literal[HTTPMethod.POST, HTTPMethod.GET] = HTTPMethod.POST,
    params: Dict[str, str] = {},
    data: Optional[Dict[str, str]] = None,
    max_retries: int = 5,
    ms_wait: int = 200,
) -> str:
    response = None
    for i in range(max_retries):
        sleep(ms_wait / 1000 * (i + 1))
        failed = False
        try:
            if http_method == HTTPMethod.POST:
                if not data:
                    response = session.post(url, params=params)
                else:
                    response = session.post(url, data=data, params=params)
            elif http_method == HTTPMethod.GET:
                if not data:
                    response = session.get(url, params=params)
                else:
                    response = session.get(url, data=data, params=params)
            response.raise_for_status()
            if verification_text:
                if verification_text not in response.text:
                    failed = True
                    logger.error(
                        f"Verification text {verification_text} not in response"
                    )
        except requests.RequestException:
            logger.exception(f"Failed to get url {url}, try {i}")
            failed = True
        if failed:
            if response is None:
                response_text = 'No response from Odyssey.'
            else:
                response_text = response.text
            if i == max_retries - 1:
                # out of retries: dump the last response for debugging and exit
                write_debug_and_quit(
                    verification_text=verification_text,
                    page_text=response_text,
                    logger=logger,
                )
        else:
            # success: stop retrying and return the page text
            return response.text

--------------------------------------------------------------------------------
/resources/test_files/field_validation_list.json:
--------------------------------------------------------------------------------
[{
    "name": "code",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 5,
    "importance": "necessary"
},
{
    "name": "odyssey id",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 5,
    "importance": "necessary"
},
{
    "name": "county",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "name",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "case type",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 4,
    "importance": "necessary"
},
{
    "name": "date filed",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 4,
    "importance": "necessary"
},
{
    "name": "location",
    "logical_level": "top",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "necessary"
},
{
    "name": "party information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "charge information",
    "logical_level": "top",
    "type": "array",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "defendant",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "necessary"
},
{
    "name": "sex",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 3,
    "importance": "medium"
},
{
    "name": "race",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "medium"
},
{
    "name": "date of birth",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 4,
    "importance": "low"
},
{
    "name": "height",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "low"
},
{
    "name": "weight",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "low"
},
{
    "name": "defense attorney",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "high"
},
{
    "name": "appointed or retained",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "high"
},
{
    "name": "defense attorney phone number",
    "logical_level": "party",
    "type": "string",
    "estimated_min_length": 1,
    "importance": "low"
},
{
    "name": "defendant address",
    "logical_level": "party",
"type": "string", 131 | "estimated_min_length": 1, 132 | "importance": "low" 133 | }, 134 | { 135 | "name": "SID", 136 | "logical_level": "party", 137 | "type": "string", 138 | "estimated_min_length": 5, 139 | "importance": "high" 140 | }, 141 | { 142 | "name": "prosecuting attorney", 143 | "logical_level": "party", 144 | "type": "string", 145 | "estimated_min_length": 1, 146 | "importance": "low" 147 | }, 148 | { 149 | "name": "prosecuting attorney phone number", 150 | "logical_level": "party", 151 | "type": "string", 152 | "estimated_min_length": 1, 153 | "importance": "low" 154 | }, 155 | { 156 | "name": "prosecuting attorney address", 157 | "logical_level": "party", 158 | "type": "string", 159 | "estimated_min_length": 1, 160 | "importance": "low" 161 | }, 162 | { 163 | "name": "bondsman", 164 | "logical_level": "party", 165 | "type": "string", 166 | "estimated_min_length": 1, 167 | "importance": "low" 168 | }, 169 | { 170 | "name": "bondsman address", 171 | "logical_level": "party", 172 | "type": "string", 173 | "estimated_min_length": 1, 174 | "importance": "low" 175 | }, 176 | { 177 | "name": "charges", 178 | "logical_level": "charge", 179 | "type": "string", 180 | "estimated_min_length": 5, 181 | "importance": "necessary" 182 | }, 183 | { 184 | "name": "level", 185 | "logical_level": "charge", 186 | "type": "string", 187 | "estimated_min_length": 5, 188 | "importance": "necessary" 189 | }, 190 | { 191 | "name": "date", 192 | "logical_level": "charge", 193 | "type": "string", 194 | "estimated_min_length": 4, 195 | "importance": "high" 196 | }] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tyler Technologies Odyssey scraper and parser 2 | 3 | This is a scraper to collect and process public case records from the Tyler Technologies Odyssey court records system. If you are a dev or want to file an Issue, please read [CONTRIBUTING](CONTRIBUTING.md). 4 | 5 | ## Local setup 6 | 7 | ### Install toolchain 8 | 9 | 1. Clone this repo and navigate to it. 10 | - `git clone https://github.com/open-austin/indigent-defense-stats` 11 | - `cd indigent-defense-stats` 12 | 2. Install Pyenv if not already installed ([linux, mac](https://github.com/pyenv/pyenv), or [windows](https://github.com/pyenv-win/pyenv-win)) 13 | 3. Run `pyenv install` to get the right Python version 14 | 15 | ### Setup `venv` 16 | 17 | First, you'll need to create a virtual environment, this differs depending on your OS. 18 | 19 | On linux/mac 20 | 21 | ```bash 22 | python -m venv .venv --prompt ids # (you can substitute `ids` for any name you want) 23 | ``` 24 | 25 | On Windows 26 | 27 | ```powershell 28 | c:\>Python35\python -m venv c:\path\to\repo\ids # (you can substitute `ids` for any name you want) 29 | ``` 30 | 31 | Next, you'll need to "activate" the venv. You'll need to run this command every time you work in the codebase and tell your IDE which Python environment to use. It will likely default to wherever `python` resolves to in your system path. The specific command you run will depend on both your OS and shell. 

On linux/mac

| platform | shell      | Command to activate virtual environment |
| :------- | :--------- | :-------------------------------------- |
| POSIX    | bash/zsh   | $ source <venv>/bin/activate            |
|          | fish       | $ source <venv>/bin/activate.fish       |
|          | csh/tcsh   | $ source <venv>/bin/activate.csh        |
|          | PowerShell | $ <venv>/bin/Activate.ps1               |
| Windows  | cmd.exe    | C:\> <venv>\Scripts\activate.bat        |
|          | PowerShell | PS C:\> <venv>\Scripts\Activate.ps1     |

Source: https://docs.python.org/3/library/venv.html#how-venvs-work

Note: Again, you'll need to activate venv _every time you want to work in the codebase_.

If the above doesn't work, try these instructions for creating and activating a virtual environment:

1. Navigate to your project directory: `cd [insert file path]`
2. Create a virtual environment: `python -m venv venv`
3. Activate the virtual environment: `.\venv\Scripts\activate.bat`

### Install python dependencies

Using `pip`, install the project dependencies.

```shell
pip install -r requirements.txt
```

### Running CLI

@TODO - this section needs to be updated.

1. Set parameters to the main command:
   - counties = The counties that are listed in the county CSV. Update the column "scrape" in the CSV to "yes" to include the county.
   - start_date = The first date you want to scrape for case data. Update in scraper.
   - end_date = The last date you want to scrape for case data. Update in scraper.
2. Run the orchestrator.
   - `python -m src.orchestrator`

## Structure of Code

- County Database: A CSV table contains the necessary Odyssey links and version for each county in Texas. One column ("scrape") indicates whether that county should be scraped. Currently, Hays is the default.
- Orchestrator (src/orchestrator): This reads the CSV for the counties to be scraped and runs the following processes for each county. You can also set the start and end date of the scrape here.

  - **Scraper** (`src/scraper`): This scrapes all of the judicial officers for each day within the period set in the orchestrator and saves all of the HTML to data/[county name]/case_html.
  - **Parser** (`src/parser`): This parses all of the HTML in the county-specific HTML folder to accompanying JSON files in data/[county name]/case_json.
  - **Cleaner** (`src/cleaner`): This cleans and redacts information in the county-specific JSON folder to a new folder of JSON files in data/[county name]/case_json_cleaned.
  - **Updater** (`src/updater`): This pushes the cleaned and redacted JSON in the county-specific cleaned JSON folder to a container in CosmosDB, where it can then be used for visualization.
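
For a quick sense of what the cleaned output looks like, this short sketch loads the cleaned test fixture shipped in this repository and prints a few of its fields (run it from the repository root; the keys come from `resources/test_files/cleaned_test_json/test_123456.json`):

```python
import json

# load the cleaned test fixture included in resources/test_files
with open("resources/test_files/cleaned_test_json/test_123456.json") as f:
    case = json.load(f)

print(case["Case Metadata"]["county"])                         # hays
print(case["Defendant Information"]["appointed_or_retained"])  # Court Appointed
for charge in case["Charge Information"]:
    print(charge["charge_level"], "-", charge["charge_name"])
```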
82 | ## Flowchart: Relationships Between Functions and Directories
83 |
84 | ```mermaid
85 | flowchart TD
86 | orchestrator{"src/orchestrator (class):<br>orchestrate (function)"} --> county_db[resources/texas_county_data.csv]
87 | county_db --> |return counties where 'scrape' = 'yes'| orchestrator
88 | orchestrator -->|loop through these counties and run these four functions| scraper(1. src/scraper: scrape)
89 | scraper --> parser(2. src/parser: parse)
90 | scraper --> |create 1 HTML per case| data_html[data/county/case_html/case_id.html]
91 | parser --> pre2017(src/parser/pre2017)
92 | parser --> post2017(src/parser/post2017)
93 | pre2017 --> cleaner[3. src/cleaner: clean]
94 | post2017 --> cleaner
95 | parser --> |create 1 JSON per case| data_json[data/county/case_json/case_id.json]
96 | cleaner --> |look for charge in db and normalize it to uccs| charge_db[resources/umich-uccs-database.json]
97 | charge_db --> cleaner
98 | cleaner --> updater(4. src/updater: update)
99 | cleaner --> |create 1 JSON per case| data_json_cleaned[data/county/case_json_cleaned/case_id.json]
100 | updater --> |send final cleaned JSON to CosmosDB container| CosmosDB_container[CosmosDB container]
101 | CosmosDB_container --> visualization{live visualization}
102 | ```
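In terms of files on disk, the pipeline above produces roughly this per-county layout (using Hays, the default, as an example):

```
data/
└── hays/
    ├── case_html/                      # scraper output, one HTML file per case
    ├── case_json/                      # parser output, one JSON file per case
    ├── case_json_cleaned/              # cleaner output (redacted, cleaned JSON)
    └── cases_with_parsing_error.txt    # case numbers the parser failed on
```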
--------------------------------------------------------------------------------
/src/parser/Data Structure of Parsed JSON.md:
--------------------------------------------------------------------------------
1 | ```mermaid
2 | graph TB
3 | subgraph CaseInformation[Case Information Summary]
4 | style CaseInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
5 | A1[Case Code: CR-15-1234-C]
6 | A2[Odyssey ID: 198372]
7 | A3[County: Hays]
8 | A4[Case Name: The State of Texas vs. Fake Name]
9 | A5[Case Type: Adult Felony]
10 | A6[Date Filed: 01/01/2015]
11 | A7[Location: 22nd District Court]
12 | end
13 |
14 | subgraph PartyInformation[Party Information]
15 | style PartyInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
16 |
17 | subgraph DefendantInfo[Defendant Information]
18 | style DefendantInfo fill:#b0d4f1,stroke:#333,stroke-width:2px
19 | B1[Defendant: Fake, Name]
20 | B2[Sex: Female]
21 | B3[Race: White]
22 | B4[Date of Birth: 01/01/1980]
23 | B5[Height: 5 foot 6 inches]
24 | B6[Weight: 200 lbs]
25 | B7[Address: 876 Main St, Natalia, TX 78059]
26 | B8[SID: TX01234567]
27 | end
28 |
29 | subgraph DefenseAttorney[Defense Attorney]
30 | style DefenseAttorney fill:#b0d4f1,stroke:#333,stroke-width:2px
31 | B9[Defense Attorney: Defense Attorney]
32 | B10[Appointed or Retained: Court Appointed]
33 | B11[Phone Number: 512-123-4567 W]
34 | end
35 |
36 | subgraph ProsecutingAttorney[Prosecuting Attorney]
37 | style ProsecutingAttorney fill:#b0d4f1,stroke:#333,stroke-width:2px
38 | B12[Prosecuting Attorney: Yuuuuu Haaaaa]
39 | B13[Prosecuting Attorney Phone Number: 512-321-8596 W]
40 | B14[Prosecuting Attorney Address: 712 S Stagecoach TRL, San Marcos, TX 78666]
41 | end
42 | end
43 |
44 | subgraph ChargeInformation[Charge Information]
45 | style ChargeInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
46 |
47 | subgraph Charge1[Aggravated Assault with a Deadly Weapon]
48 | style Charge1 fill:#b0d4f1,stroke:#333,stroke-width:2px
49 | C1[Statute: 22.02a2]
50 | C2[Level: Second Degree Felony]
51 | C3[Date: 10/25/2015]
52 | end
53 |
54 | subgraph Charge2[Resisting Arrest]
55 | style Charge2 fill:#b0d4f1,stroke:#333,stroke-width:2px
56 | C4[Statute: 38.03]
57 | C5[Level: Class A Misdemeanor]
58 | C6[Date: 10/25/2015]
59 | end
60 | end
61 |
62 | subgraph Dispositions[Dispositions]
63 | style Dispositions fill:#d3a8e2,stroke:#333,stroke-width:2px
64 |
65 | subgraph Disposition1[Aggravated Assault with a Deadly Weapon]
66 | style Disposition1 fill:#b0d4f1,stroke:#333,stroke-width:2px
67 | D1[Date: 12/06/2016]
68 | D2[Event: Disposition]
69 | D3[Judicial Officer: Fake, Judge]
70 | D4[Outcome: Deferred Adjudication]
71 | D5[Sentence Length: 1 Year]
72 | end
73 |
74 | subgraph Disposition2[Resisting Arrest]
75 | style Disposition2 fill:#b0d4f1,stroke:#333,stroke-width:2px
76 | D6[Date: 12/06/2016]
77 | D7[Event: Disposition]
78 | D8[Judicial Officer: Fake, Judge]
79 | D9[Outcome: Dismissed]
80 | end
81 |
82 | end
83 |
84 | subgraph TopCharge[Top Charge]
85 | style TopCharge fill:#d3a8e2,stroke:#333,stroke-width:2px
86 |
87 | E1[Charge Name: Aggravated Assault with a Deadly Weapon]
88 | E2[Charge Level: Second Degree Felony]
89 | end
90 |
91 | subgraph EventsHearings[Example Events & Hearings]
92 | style EventsHearings fill:#d3a8e2,stroke:#333,stroke-width:2px
93 |
94 | subgraph InitialHearings[Initial Hearings and Filings]
95 | style InitialHearings fill:#b0d4f1,stroke:#333,stroke-width:2px
96 | F1[01/05/2016: Indictment Open Case]
97 | F2[02/24/2016: Arraignment Reset]
98 | F3[03/15/2016: Waiver of Arraignment]
99 | F4[04/14/2016: Pre-Trial Motions Reset]
100 | end
101 |
102 | subgraph DiscoveryMotions[Discovery and Motions]
103 | style DiscoveryMotions fill:#b0d4f1,stroke:#333,stroke-width:2px
104 | G1[04/29/2016: Discovery Receipt from District Attorney]
105 | G2[05/05/2016: Acknowledgment of Receipt of Discovery]
106 | G3[06/15/2016: Pre-Trial Motions Reset]
107 | end
108 |
109 | subgraph PreTrial[Pre-Trial Motions and Hearings]
110 | style PreTrial fill:#b0d4f1,stroke:#333,stroke-width:2px
111 | H1[07/27/2016: Pre-Trial Motions Reset]
112 | H2[08/25/2016: Pre-Trial Motions Reset]
113 | H3[09/26/2016: Plea Bargain Agreement]
114 | end
115 |
116 | subgraph TrialAdjudication[Trial and Adjudication]
117 | style TrialAdjudication fill:#b0d4f1,stroke:#333,stroke-width:2px
118 | I1[12/06/2016: Punishment Hearing Deferred Adjudication]
119 | I2[12/06/2016: Conditions of Probation]
120 | end
121 |
122 | subgraph ProbationWarrants[Probation and Warrant Issuances]
123 | style ProbationWarrants fill:#b0d4f1,stroke:#333,stroke-width:2px
124 | J1[10/24/2017: Show Cause Hearing Failure to Appear]
125 | J2[11/01/2017: Motion to Revoke Probation/Adjudicate Guilt Reopen Case]
126 | J3[02/23/2022: Capias Issued]
127 | end
128 | end
129 |
130 | CaseInformation --> PartyInformation
131 | CaseInformation --> ChargeInformation
132 | ChargeInformation --> TopCharge
133 | CaseInformation --> Dispositions
134 | Dispositions --> D10[Charges Dismissed: 1]
135 | CaseInformation --> EventsHearings
136 | ```
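To poke at this structure on a concrete example, you can load the bundled fixture; a minimal sketch (run from the repo root):

```python
import json

# resources/test_files/test_123456.json is a parsed-case fixture shipped with this repo
with open("resources/test_files/test_123456.json") as f:
    case = json.load(f)

print(case["Case Metadata"]["county"])      # "hays"
print(case["Top Charge"]["charge level"])   # "Second Degree Felony"
print(len(case["Charge Information"]))      # 1 charge in this fixture
# Note: some keys are deliberately spelled as scraped, e.g. "Defendent Information".
```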
--------------------------------------------------------------------------------
/src/updater/__init__.py:
--------------------------------------------------------------------------------
1 | import json, argparse, os, xxhash
2 | from azure.cosmos import CosmosClient, exceptions
3 | from dotenv import load_dotenv
4 | from datetime import datetime as dt
5 | import logging
6 |
7 | class Updater():
8 |     def __init__(self, county = "hays"):
9 |         self.county = county.lower()
10 |         self.case_json_cleaned_folder_path = os.path.join(
11 |             os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned"
12 |         )
13 |         self.processed_path = os.path.join(self.case_json_cleaned_folder_path, "processed")  # assumed: a distinct subfolder, so the "move to processed" renames in update() aren't no-ops
14 |
15 |
16 |         # open or create an output directory for the log and successfully processed data
17 |         if os.path.exists(self.case_json_cleaned_folder_path) and \
18 |             not os.path.exists(self.processed_path):
19 |             os.makedirs(self.processed_path)
20 |         self.logger = self.configure_logger()
21 |         self.COSMOSDB_CONTAINER_CASES_CLEANED = self.get_database_container()
22 |
23 |     def configure_logger(self):
24 |         logger = logging.getLogger(name="pid: " + str(os.getpid()))
25 |         logger.setLevel(logging.DEBUG)
26 |
27 |         cleaner_log_path = os.path.join(
28 |             os.path.dirname(__file__), "..", "..", "resources"
29 |         )
30 |
31 |         file_handler = logging.FileHandler(os.path.join(cleaner_log_path, 'logger_log.txt'))
32 |         file_handler.setLevel(logging.DEBUG)
33 |         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
34 |         file_handler.setFormatter(formatter)
35 |         logger.addHandler(file_handler)
36 |
37 |         console_handler = logging.StreamHandler()
38 |         console_handler.setLevel(logging.WARNING)
39 |         console_handler.setFormatter(formatter)
40 |         logger.addHandler(console_handler)
41 |
42 |         return logger
43 |
44 |     def get_database_container(self):
45 |         # This loads the environment for interacting with CosmosDB. (Dan: Should this be moved to the .env file?)
46 |         load_dotenv()
47 |         URL = os.getenv("URL")
48 |         KEY = os.getenv("KEY")
49 |         DATA_BASE_NAME = os.getenv("DATA_BASE_NAME")
50 |         CONTAINER_NAME_CLEANED = os.getenv("CONTAINER_NAME_CLEANED")
51 |         try:
52 |             client = CosmosClient(URL, credential=KEY)
53 |         except Exception as e:
54 |             self.logger.error(f"Error instantiating CosmosClient: {e}")
55 |             return
56 |         try:
57 |             database = client.get_database_client(DATA_BASE_NAME)
58 |         except Exception as e:
59 |             self.logger.error(f"Error instantiating DatabaseClient: {e}")
60 |             return
61 |         try:
62 |             COSMOSDB_CONTAINER_CASES_CLEANED = database.get_container_client(CONTAINER_NAME_CLEANED)
63 |         except Exception as e:
64 |             self.logger.error(f"Error instantiating ContainerClient: {e}")
65 |             return
66 |
67 |         return COSMOSDB_CONTAINER_CASES_CLEANED
68 |
69 |     def update(self):
70 |         if not os.path.exists(self.case_json_cleaned_folder_path):
71 |             self.logger.error(f'The following path doesn\'t exist:\n{self.case_json_cleaned_folder_path}')
72 |             return
73 |
74 |         if not self.COSMOSDB_CONTAINER_CASES_CLEANED:
75 |             return
76 |
77 |         list_case_json_files = os.listdir(self.case_json_cleaned_folder_path)
78 |
79 |         for case_json in list_case_json_files:
80 |             print(f'case_json: {case_json}')
81 |             in_file = self.case_json_cleaned_folder_path + "/" + case_json
82 |             if os.path.isfile(in_file):
83 |                 dest_file = self.processed_path + "/" + case_json
84 |             else:
85 |                 continue
86 |
87 |             with open(in_file, "r") as f:
88 |                 input_dict = json.load(f)
89 |             self.logger.info(f"[Case Filename: {case_json}, Case Number: {input_dict.get('case_number', None)}, HTML Hash: {input_dict.get('html_hash', None)}]")
90 |
91 |             # Querying the case database to fetch all items that match the hash.
92 |             hash_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['html_hash'] = '{input_dict['html_hash']}'"
93 |             try:
94 |                 # Execute the query
95 |                 cases = list(self.COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=hash_query,enable_cross_partition_query=True))
96 |             except Exception as e:
97 |                 self.logger.error(f"Error querying cases-cleaned database for an existing hash: {e}")
98 |                 continue
99 |
100 |             if len(cases) > 0:
101 |                 # There already exists one with the same hash, so skip this entirely.
102 |                 # Move the file to the processed folder.
103 |                 os.rename(in_file, dest_file)
104 |                 self.logger.info(f"The case's HTML hash already exists in the database: {case_json}. Not updating the database.")
105 |                 continue
106 |
107 |             # Querying the case database to fetch all items that match the cause number.
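            # If the cause number has been seen before, the insert below stores this
            # JSON as a new version (max existing version + 1) under a fresh id;
            # otherwise the case starts at version 1.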
108 |             case_query = f"SELECT * FROM COSMOSDB_CONTAINER_CASES_CLEANED WHERE COSMOSDB_CONTAINER_CASES_CLEANED['case_number'] = '{input_dict['case_number']}'"
109 |             try:
110 |                 # Execute the query
111 |                 cases = list(self.COSMOSDB_CONTAINER_CASES_CLEANED.query_items(query=case_query,enable_cross_partition_query=True))
112 |             except Exception as e:
113 |                 self.logger.error(f"Error querying cases-cleaned database for an existing case: {e}")
114 |                 continue
115 |
116 |             # If no case matches the cause number, insert this JSON with version 1; otherwise insert it as a new version (max existing version + 1) under a fresh case ID.
117 |             today = dt.today()
118 |             input_dict['id'] = input_dict['case_number'] + ":" + input_dict['county'] + ":" + today.strftime('%m-%d-%Y') + ":" + input_dict['html_hash']
119 |             input_dict['version'] = max(int(case['version']) for case in cases) + 1 if len(cases) > 0 else 1
120 |             try:
121 |                 self.COSMOSDB_CONTAINER_CASES_CLEANED.create_item(body=input_dict)
122 |             except Exception as e:
123 |                 self.logger.error(f"Error inserting this case to cases-cleaned database: {e}")
124 |                 continue
125 |
126 |             # This case is inserted successfully.
127 |             # Move the file to the processed folder.
128 |             os.rename(in_file, dest_file)
129 |             self.logger.info(f"Insertion successful with id: {input_dict['id']}, version: {input_dict['version']}")
130 |
131 | if __name__ == '__main__':
132 |     Updater().update()
133 |
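The connection settings read by `get_database_container` above come from a `.env` file loaded via `python-dotenv`. Based on the `os.getenv` calls in the code, it needs entries of this shape (all values below are placeholders):

```
URL=https://<your-cosmos-account>.documents.azure.com:443/
KEY=<your-cosmos-key>
DATA_BASE_NAME=<your-database-name>
CONTAINER_NAME_CLEANED=<your-cleaned-cases-container>
```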
--------------------------------------------------------------------------------
/resources/test_files/hays_main_page.html:
--------------------------------------------------------------------------------
[HTML fixture; markup stripped in this dump. It is the Odyssey Public Access landing page titled "Hays County Courts Records Inquiry" — visible text: "Skip to Main Content / Logout / My Account / Help", "Welcome to Odyssey Public Access", "Case Records" (Criminal Case Records; Civil, Family & Probate Case Records; Court Calendar), "Jail Records" (Jail Records; Jail Bond Records), "State of Texas | Hays County", "Copyright 2003 Tyler Technologies. All rights Reserved."]
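The parser module that follows can be exercised directly against the bundled test files; a sketch mirroring its `__main__` block, but pinned to the fixture shipped in this repo:

```python
from src.parser import Parser

# With parse_single_file=True, parse() reads resources/test_files/test_{case_number}.html
# and writes the resulting JSON back into resources/test_files/.
Parser().parse(county="hays", case_number="123456", parse_single_file=True)
```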

117 | 118 | -------------------------------------------------------------------------------- /src/parser/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | import json 5 | import traceback 6 | import xxhash 7 | from time import time 8 | import sys 9 | import importlib 10 | from bs4 import BeautifulSoup 11 | from typing import Tuple, List, Optional 12 | 13 | current_dir = os.path.dirname(os.path.abspath(__file__)) 14 | parent_dir = os.path.dirname(current_dir) 15 | project_root = os.path.dirname(parent_dir) 16 | 17 | 18 | class Parser: 19 | def __init__(self): 20 | pass 21 | 22 | def configure_logger(self): 23 | logger = logging.getLogger(name="pid: " + str(os.getpid())) 24 | logging.basicConfig( 25 | level=logging.INFO, 26 | format="%(asctime)s - %(levelname)s - %(message)s", 27 | handlers=[logging.FileHandler("parser_log.txt"), logging.StreamHandler()], 28 | ) 29 | logger.info("Logger configured") 30 | return logger 31 | 32 | def get_class_and_method( 33 | self, logger, county: str, test=False 34 | ) -> Tuple[Optional[object], Optional[callable]]: 35 | if test: 36 | logger.info(f"Test mode is on") 37 | # Construct the module, class, and method names 38 | module_name = county # ex: 'hays' 39 | class_name = f"Parser{county.capitalize()}" # ex: 'ParserHays' 40 | method_name = f"parser_{county}" # ex: 'parser_hays' 41 | 42 | logger.info( 43 | f"Module: {module_name}\nClass: {class_name}\nMethod: {method_name}\n" 44 | ) 45 | 46 | # Add the current directory to the system path 47 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 48 | 49 | try: 50 | # Dynamically import the module 51 | module = importlib.import_module(module_name) 52 | 53 | logger.info(f"Module '{module_name}' imported successfully.") 54 | 55 | # Retrieve the class from the module 56 | cls = getattr(module, class_name) 57 | 58 | logger.info(f"Class '{class_name}' retrieved successfully.") 59 | 60 | if cls is None: 61 | logger.info( 62 | f"Class '{class_name}' not found in module '{module_name}'." 63 | ) 64 | return None, None 65 | 66 | # Instantiate the class 67 | instance = cls() 68 | 69 | # Retrieve the method with the specified name 70 | method = getattr(instance, method_name, None) 71 | logger.info(f"Method '{method_name}' retrieved successfully.") 72 | 73 | if method is None: 74 | logger.info( 75 | f"Method '{method_name}' not found in class '{class_name}'." 
76 | ) 77 | return instance, None 78 | 79 | return instance, method 80 | except ModuleNotFoundError as e: 81 | logger.info(f"Module '{module_name}' not found: {e}") 82 | except AttributeError as e: 83 | logger.info(f"Error retrieving class or method: {e}") 84 | except Exception as e: 85 | logger.info(f"Unexpected error: {e}") 86 | return None, None 87 | 88 | def get_directories( 89 | self, county: str, logger, parse_single_file: bool = False 90 | ) -> Tuple[str, str]: 91 | # Determine the base directory of your project 92 | base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) 93 | logger.info(f"get_directories function called\nbase_dir: {base_dir}\n") 94 | try: 95 | if parse_single_file: 96 | case_html_path = os.path.join(base_dir, "resources", "test_files") 97 | case_json_path = os.path.join(base_dir, "resources", "test_files") 98 | else: 99 | case_html_path = os.path.join(base_dir, "data", county, "case_html") 100 | case_json_path = os.path.join(base_dir, "data", county, "case_json") 101 | if not os.path.exists(case_json_path): 102 | os.makedirs(case_json_path, exist_ok=True) 103 | logger.info( 104 | f"Returning case_html_path: {case_html_path}\nReturning case_json_path: {case_json_path}\n" 105 | ) 106 | return case_html_path, case_json_path 107 | except Exception as e: 108 | logger.info(f"Error in get_directories: {e}") 109 | raise 110 | 111 | def get_list_of_html( 112 | self, 113 | case_html_path: str, 114 | case_number: str, 115 | county: str, 116 | logger, 117 | parse_single_file: bool = False, 118 | ) -> List[str]: 119 | logger.info(f"get_list_of_html function called\n") 120 | try: 121 | if parse_single_file: 122 | logger.info(f"parse_single_file is True\n") 123 | relative_path = os.path.join(project_root, "resources", "test_files") 124 | return [os.path.join(relative_path, f"test_{case_number}.html")] 125 | # This will loop through the html in the folder they were scraped to. 
126 |             case_html_list = os.listdir(case_html_path)
127 |
128 |             # However, if an optional case number is passed to the function, then read in the case number html file from the data folder
129 |             # -Assumes that the requested parsed case number has been scraped to html
130 |             if case_number:
131 |                 case_html_list = [f"{case_number}.html"]
132 |             case_html_list = [
133 |                 os.path.join(case_html_path, file_name) for file_name in case_html_list
134 |             ]
135 |             logger.info(f"Returning case_html_list: {case_html_list}\n")
136 |             return case_html_list
137 |         except Exception as e:
138 |             logger.info(f"Error in get_list_of_html: {e}")
139 |             raise
140 |
141 |     def get_html_path(
142 |         self, case_html_path: str, case_html_file_name: str, case_number: str, logger
143 |     ) -> str:
144 |         logger.info(f"get_html_path function called\n")
145 |         try:
146 |             case_html_file_path = os.path.join(case_html_path, case_html_file_name)
147 |             logger.info(f"Constructed path: {case_html_file_path}")
148 |             return case_html_file_path
149 |         except Exception as e:
150 |             logger.info(f"Error in get_html_path: {e}")
151 |             raise
152 |
153 |     def write_json_data(
154 |         self, case_json_path: str, case_number: str, case_data: str, logger
155 |     ) -> None:
156 |         try:
157 |             indent_level = 4
158 |             logger.info(f"Writing JSON to: {case_json_path}")
159 |             with open(
160 |                 os.path.join(case_json_path, case_number + ".json"), "w"
161 |             ) as file_handle:
162 |                 file_handle.write(json.dumps(case_data, indent=indent_level))
163 |         except Exception as e:
164 |             logger.info(f"Error in write_json_data: {e}")
165 |             raise
166 |
167 |     def write_error_log(self, county: str, case_number: str) -> None:
168 |         try:
169 |             base_dir = os.path.abspath(
170 |                 os.path.join(os.path.dirname(__file__), "..", "..")
171 |             )
172 |             error_log_path = os.path.join(
173 |                 base_dir, "data", county, "cases_with_parsing_error.txt"
174 |             )
175 |             with open(
176 |                 error_log_path,
177 |                 "a",  # append, so every failing case number in a run is kept
178 |             ) as file_handle:
179 |                 file_handle.write(case_number + "\n")
180 |         except Exception as e:
181 |             print(f"Error in write_error_log: {e}")
182 |             raise
183 |
184 |     def parse(
185 |         self, county: str, case_number: str, parse_single_file: bool = False, test=False
186 |     ) -> None:
187 |         logger = self.configure_logger()
188 |
189 |         # For simple testing purposes
190 |         # Comment out for larger scale testing
191 |         # Case number is from /resources/test_files/test_{case_number}.html
192 |         if not case_number:
193 |             case_number = "51652356"
194 |
195 |         logger.info(
196 |             f"Starting parsing for {county} county with case number {case_number}"
197 |         )
198 |         county = county.lower()
199 |         try:
200 |             # get input and output directories and make json dir if not present
201 |             case_html_path, case_json_path = self.get_directories(county, logger, parse_single_file)  # parse_single_file (not test) selects the resources/test_files directories
202 |
203 |             # start
204 |             START_TIME_PARSER = time()
205 |             logger.info(f"Time started: {START_TIME_PARSER}")
206 |             # creating a list of json files already parsed (currently unused)
207 |             cached_case_json_list = [
208 |                 file_name.split(".")[0] for file_name in os.listdir(case_json_path)
209 |             ]
210 |
211 |             # Get a list of the HTML files that it needs to parse.
212 | case_html_list = self.get_list_of_html( 213 | case_html_path, case_number, county, logger, parse_single_file 214 | ) 215 | logger.info(f"Starting for loop to parse {len(case_html_list)} cases") 216 | for case_html_file_path in case_html_list: 217 | try: 218 | case_number = os.path.basename(case_html_file_path).split(".")[0] 219 | 220 | logger.info(f"{case_number} - parsing") 221 | 222 | with open( 223 | case_html_file_path, "r", encoding="utf-8", errors="ignore" 224 | ) as file: 225 | case_soup = BeautifulSoup(file, "html.parser") 226 | 227 | parser_instance, parser_function = self.get_class_and_method( 228 | county=county, logger=logger, test=test 229 | ) 230 | 231 | if parser_instance is not None and parser_function is not None: 232 | case_data = parser_function( 233 | county, case_number, logger, case_soup 234 | ) 235 | else: 236 | logger.info( 237 | "Error: Could not obtain parser instance or function." 238 | ) 239 | continue 240 | 241 | body = case_soup.find("body") 242 | tables = body.find_all("table") 243 | if tables: 244 | """ 245 | Why balance table is dropped before hashing: 246 | The balance table is excluded from the hashing because 247 | balance is updated as any costs are paid off. Otherwise, 248 | the hash would change frequently and multiple versions 249 | of the case would be captured that we don't want. 250 | """ 251 | balance_table = tables[-1] 252 | if "Balance Due" in balance_table.text: 253 | balance_table.decompose() 254 | case_data["html_hash"] = xxhash.xxh64(str(body)).hexdigest() 255 | 256 | self.write_json_data(case_json_path, case_number, case_data, logger) 257 | 258 | except Exception: 259 | print(traceback.format_exc()) 260 | self.write_error_log(county, case_number) 261 | 262 | RUN_TIME_PARSER = time() - START_TIME_PARSER 263 | logger.info(f"Parsing took {RUN_TIME_PARSER} seconds") 264 | except Exception as e: 265 | logger.info(f"Error in parse: {e}") 266 | raise 267 | 268 | if __name__ == "__main__": 269 | parser = Parser() 270 | parser.parse(county="hays", case_number=None, parse_single_file=True) 271 | -------------------------------------------------------------------------------- /src/cleaner/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import datetime as dt 4 | import xxhash 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig( 9 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 10 | ) 11 | 12 | # List of motions identified as evidentiary. 13 | # TODO: These should be moved to a separate JSON in resources 14 | GOOD_MOTIONS = [ 15 | "Motion To Suppress", 16 | "Motion to Reduce Bond", 17 | "Motion to Reduce Bond Hearing", 18 | "Motion for Production", 19 | "Motion For Speedy Trial", 20 | "Motion for Discovery", 21 | "Motion In Limine", 22 | ] 23 | 24 | 25 | class Cleaner: 26 | def __init__(self): 27 | pass 28 | 29 | def redact_cause_number(self, input_dict: dict) -> str: 30 | # This will hash and redact the cause number and then add it to the output file. 
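        # e.g. xxhash.xxh64("CR-17-5152-C").hexdigest() -> a 16-character hex digest
        # (cf. the 16-char "html_hash" values elsewhere in this repo); the raw
        # cause number itself never reaches the cleaned output.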
31 | cause_number_hash = xxhash.xxh64(str(input_dict["Case Metadata"]["code"])).hexdigest() 32 | return cause_number_hash 33 | 34 | def get_or_create_folder_path(self, county: str, folder_type: str) -> str: 35 | """Returns and ensures the existence of the folder path.""" 36 | folder_path = os.path.join( 37 | os.path.dirname(__file__), "..", "..", "data", county.lower(), folder_type 38 | ) 39 | try: 40 | if not os.path.exists(folder_path): 41 | os.makedirs(folder_path) 42 | logging.info(f"Folder '{folder_path}' created successfully.") 43 | else: 44 | logging.info(f"Folder '{folder_path}' already exists.") 45 | except OSError as e: 46 | logging.error(f"Error creating folder '{folder_path}': {e}") 47 | return folder_path 48 | 49 | def load_json_file(self, file_path: str) -> dict: 50 | """Loads a JSON file from a given file path and returns the data as an object""" 51 | try: 52 | with open(file_path, "r") as f: 53 | return json.load(f) 54 | except (FileNotFoundError, json.JSONDecodeError) as e: 55 | logging.error(f"Error loading file at {file_path}: {e}") 56 | return {} 57 | 58 | def remove_judicial_officer(self, data): 59 | # Check if data is a dictionary 60 | if isinstance(data, dict): 61 | # Remove 'judicial officer' if it exists in this dictionary 62 | if "judicial officer" in data: 63 | del data["judicial officer"] 64 | # Recursively check each value in the dictionary 65 | for key, value in data.items(): 66 | self.remove_judicial_officer(value) 67 | # Check if data is a list 68 | elif isinstance(data, list): 69 | for item in data: 70 | self.remove_judicial_officer(item) 71 | 72 | def load_and_map_charge_names(self, file_path: str) -> dict: 73 | """Loads a JSON file and maps charge names to their corresponding UMich data.""" 74 | charge_data = self.load_json_file(file_path) 75 | # Check if the file loaded successfully 76 | if not charge_data: 77 | logging.error(f"Failed to load charge data from {file_path}") 78 | raise FileNotFoundError(f"File not found or is empty: {file_path}") 79 | # Create dictionary mapping charge names 80 | try: 81 | return {item["charge_name"]: item for item in charge_data} 82 | except KeyError as e: 83 | logging.error(f"Error in mapping charge names: {e}") 84 | raise ValueError(f"Invalid data structure: {file_path}") 85 | 86 | def process_charges( 87 | self, charges: list[dict], charge_mapping: dict 88 | ) -> tuple[list[dict], str]: 89 | """ 90 | Processes a list of charges by formatting charge details, 91 | mapping charges to UMich data, and finding the earliest charge date. 92 | 93 | Args: 94 | charges: A list of charges where each charge is a dictionary containing charge details. 95 | charge_mapping: A dictionary mapping charge names to corresponding UMich data. 96 | 97 | Returns: 98 | tuple: A list of processed charges and the earliest charge date. 
99 | """ 100 | charge_dates = [] 101 | processed_charges = [] 102 | 103 | for i, charge in enumerate(charges): 104 | charge_dict = { 105 | "charge_id": i, 106 | "charge_level": charge["level"], 107 | "orignal_charge": charge["charges"], 108 | "statute": charge["statute"], 109 | "is_primary_charge": i == 0, 110 | } 111 | 112 | # Parse the charge date and append it to charge_dates 113 | try: 114 | charge_datetime = dt.datetime.strptime(charge["date"], "%m/%d/%Y") 115 | charge_dates.append(charge_datetime) 116 | charge_dict["charge_date"] = dt.datetime.strftime( 117 | charge_datetime, "%Y-%m-%d" 118 | ) 119 | except ValueError: 120 | logging.error(f"Error parsing date for charge: {charge}") 121 | continue 122 | 123 | # Try to map the charge to UMich data 124 | try: 125 | charge_dict.update(charge_mapping[charge["charges"]]) 126 | except KeyError: 127 | logging.warning(f"Couldn't find this charge: {charge['charges']}") 128 | continue 129 | 130 | processed_charges.append(charge_dict) 131 | 132 | # Find the earliest charge date 133 | if charge_dates: 134 | earliest_charge_date = dt.datetime.strftime(min(charge_dates), "%Y-%m-%d") 135 | else: 136 | logging.warning("No valid charge dates found.") 137 | earliest_charge_date = "" 138 | 139 | return processed_charges, earliest_charge_date 140 | 141 | def contains_good_motion(self, motion: str, event: list | str) -> bool: 142 | """Recursively check if a motion exists in an event list or sublist.""" 143 | if isinstance(event, list): 144 | return any(self.contains_good_motion(motion, item) for item in event) 145 | return motion.lower() in event.lower() 146 | 147 | def find_good_motions( 148 | self, events: list | str, good_motions: list[str] 149 | ) -> list[str]: 150 | """Finds motions in events based on list of good motions.""" 151 | return [ 152 | motion 153 | for motion in good_motions 154 | if self.contains_good_motion(motion, events) 155 | ] 156 | 157 | def hash_defense_attorney(self, input_dict: dict) -> str: 158 | """Hashes the defense attorney info to anonymize it.""" 159 | try: 160 | def_atty_unique_str = f'{input_dict["Defendent Information"]["defense attorney"]}:{input_dict["Defendent Information"]["defense attorney phone number"]}' 161 | return xxhash.xxh64(def_atty_unique_str).hexdigest() 162 | except KeyError as e: 163 | logging.error(f"Missing defense attorney data: {e}") 164 | return "" 165 | 166 | def write_json_output(self, file_path: str, data: dict) -> None: 167 | """Writes the given data to a JSON file at the specified file path.""" 168 | try: 169 | with open(file_path, "w") as f: 170 | json.dump(data, f, indent=4) 171 | logging.info(f"Successfully wrote cleaned data to {file_path}") 172 | except OSError as e: 173 | logging.error(f"Failed to write JSON output to {file_path}: {e}") 174 | 175 | def process_single_case( 176 | self, 177 | case_json_folder_path: str, 178 | case_json_filename: str, 179 | cleaned_folder_path: str, 180 | ) -> None: 181 | """Process a single case JSON file.""" 182 | input_json_path = os.path.join(case_json_folder_path, case_json_filename) 183 | input_dict = self.load_json_file(input_json_path) 184 | 185 | if not input_dict: 186 | logging.error(f"Failed to load case data from {input_json_path}") 187 | return 188 | 189 | # Initialize cleaned output data 190 | output_json_data = { 191 | "parsing_date": dt.datetime.today().strftime("%Y-%m-%d"), 192 | "html_hash": input_dict["html_hash"], 193 | "Case Metadata": { 194 | "county": input_dict["Case Metadata"]["county"] 195 | }, 196 | "Defendant Information": { 197 | 
"appointed_or_retained": input_dict["Defendent Information"]["appointed or retained"], 198 | "defense_attorney": self.hash_defense_attorney(input_dict), 199 | }, 200 | "Charge Information": [], 201 | "Case Details": { 202 | "earliest_charge_date": "", 203 | "has_evidence_of_representation": False, 204 | }, 205 | "Disposition_Information": input_dict["Disposition Information"] 206 | } 207 | 208 | # Removing judicial office name from data 209 | self.remove_judicial_officer(output_json_data["Disposition_Information"]) 210 | 211 | # Load charge mappings 212 | charge_name_to_umich_file = os.path.join( 213 | os.path.dirname(__file__), 214 | "..", 215 | "..", 216 | "resources", 217 | "umich-uccs-database.json", 218 | ) 219 | charges_mapped = self.load_and_map_charge_names(charge_name_to_umich_file) 220 | 221 | # Process charges and motions 222 | output_json_data["Charge Information"], output_json_data['Case Details']["earliest_charge_date"] = ( 223 | self.process_charges(input_dict["Charge Information"], charges_mapped) 224 | ) 225 | output_json_data['Good Motions'] = self.find_good_motions( 226 | input_dict["Other Events and Hearings"], GOOD_MOTIONS 227 | ) 228 | output_json_data['Case Details']["has_evidence_of_representation"] = ( 229 | len(output_json_data["Good Motions"]) > 0 230 | ) 231 | 232 | output_json_data["cause_number_redacted"] = self.redact_cause_number(input_dict) 233 | 234 | # Write output to file 235 | output_filepath = os.path.join(cleaned_folder_path, case_json_filename) 236 | self.write_json_output(output_filepath, output_json_data) 237 | 238 | def process_json_files(self, county: str, case_json_folder_path: str) -> None: 239 | """Processes all JSON files in the specified folder.""" 240 | try: 241 | list_case_json_files = os.listdir(case_json_folder_path) 242 | except (FileNotFoundError, Exception) as e: 243 | logging.error(f"Error reading directory {case_json_folder_path}: {e}") 244 | return 245 | 246 | # Ensure the case_json_cleaned folder exists 247 | cleaned_folder_path = self.get_or_create_folder_path( 248 | county, "case_json_cleaned" 249 | ) 250 | 251 | for case_json_filename in list_case_json_files: 252 | try: 253 | self.process_single_case( 254 | case_json_folder_path, case_json_filename, cleaned_folder_path 255 | ) 256 | except Exception as e: 257 | logging.error(f"Error processing file {case_json_filename}. Error: {e}") 258 | 259 | def clean(self, county: str) -> None: 260 | """ 261 | Cleans and processes case data for a given county. 262 | This method performs the following steps: 263 | 1. Loads raw JSON case data from the 'case_json' folder for the specified county. 264 | 2. Processes and maps charges using an external UMich data source. 265 | 3. Identifies relevant motions from a predefined list of good motions. 266 | 4. Hashes defense attorney information to anonymize but uniquely identify the attorney. 267 | 5. Adds metadata, such as parsing date and case number, to the cleaned data. 268 | 6. Writes the cleaned data to the 'case_json_cleaned' folder for the specified county. 269 | """ 270 | try: 271 | case_json_folder_path = self.get_or_create_folder_path(county, "case_json") 272 | logging.info(f"Processing data for county: {county}") 273 | self.process_json_files(county, case_json_folder_path) 274 | logging.info(f"Completed processing for county: {county}") 275 | except Exception as e: 276 | logging.error( 277 | f"Error during cleaning process for county: {county}. 
Error: {e}" 278 | ) 279 | -------------------------------------------------------------------------------- /resources/test_files/test_123456.json: -------------------------------------------------------------------------------- 1 | { 2 | "Case Metadata": { 3 | "code": "CR-17-5152-C", 4 | "odyssey id": "test_123456", 5 | "county": "hays" 6 | }, 7 | "Case Details": { 8 | "name": "The State of Texas vs. Zzzzzz Xxxxxx", 9 | "case type": "Adult Felony", 10 | "date filed": "01/05/2016", 11 | "location": "22nd District Court" 12 | }, 13 | "Defendent Information": { 14 | "defendant": "Xxxxxx, Zzzzzz", 15 | "sex": "Female", 16 | "race": "White", 17 | "date of birth": "DOB: 02/15/1997", 18 | "height": "5'6\",", 19 | "weight": "200", 20 | "defense attorney": "Richard Jones", 21 | "appointed or retained": "Court Appointed", 22 | "defense attorney phone number": "512-632-2433(W)", 23 | "defendant address": "876 Main St Natalia, TX 78059", 24 | "SID": "TX03816410" 25 | }, 26 | "State Information": { 27 | "prosecuting attorney": "Yuuuuu Haaaaa", 28 | "prosectuing attorney phone number": "512-362-7711(W)" 29 | }, 30 | "Charge Information": [ 31 | { 32 | "charges": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 33 | "statute": "22.02(a)(2)", 34 | "level": "Second Degree Felony", 35 | "date": "10/25/2015" 36 | } 37 | ], 38 | "Disposition Information": [ 39 | { 40 | "date": "12/06/2016", 41 | "event": "Disposition", 42 | "judicial officer": "Boyer, Bruce", 43 | "details": [ 44 | { 45 | "charge": "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 46 | "outcome": "Deferred Adjudication" 47 | } 48 | ] 49 | }, 50 | { 51 | "date": "11/04/2019", 52 | "event": "Amended Disposition", 53 | "judicial officer": "Boyer, Bruce) Reason: Community Supervision Extende", 54 | "details": [ 55 | { 56 | "charge": "1. 
AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 57 | "outcome": "Amend Probation" 58 | } 59 | ] 60 | } 61 | ], 62 | "Top Charge": { 63 | "charge name": "AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 64 | "charge level": "Second Degree Felony" 65 | }, 66 | "Dismissed Charges Count": 0, 67 | "Other Events and Hearings": [ 68 | [ 69 | "08/12/2024", 70 | "Motion to Adjudicate", 71 | "(9:00 AM) (Judicial Officer Boyer, Bruce)" 72 | ], 73 | [ 74 | "07/01/2024", 75 | "Motion to Adjudicate", 76 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 77 | "Result: Reset" 78 | ], 79 | [ 80 | "06/06/2024", 81 | "Motion to Adjudicate", 82 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 83 | "Result: Reset" 84 | ], 85 | [ 86 | "05/07/2024", 87 | "Application For Court Appointed Attorney/Order", 88 | "Richard Jones" 89 | ], 90 | [ 91 | "05/01/2024", 92 | "Acknowledgement of Receipt of Discovery", 93 | "Discovery Receipt - Email CR-18-32131-A" 94 | ], 95 | [ 96 | "04/25/2024", 97 | "Motion to Adjudicate", 98 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 99 | "Result: Reset" 100 | ], 101 | [ 102 | "03/08/2024", 103 | "Bond (Cash/Surety) After Release from Jail", 104 | "See Bond Tab" 105 | ], 106 | [ 107 | "03/04/2024", 108 | "Capias Executed", 109 | "See Warrant Tab" 110 | ], 111 | [ 112 | "02/23/2022", 113 | "Capias Issued", 114 | "See Warrant Tab" 115 | ], 116 | [ 117 | "02/15/2022", 118 | "Judge's Fiat", 119 | "(Judicial Officer: Boyer, Bruce )" 120 | ], 121 | [ 122 | "02/09/2022", 123 | "Motion to Revoke Probation/Adjudicate Guilt (Reopen Case)", 124 | "(Judicial Officer: Boyer, Bruce )" 125 | ], 126 | [ 127 | "05/05/2020", 128 | "Motion To Waive Court Ordered Debts", 129 | "(Judicial Officer: Boyer, Bruce )", 130 | "Supervision Fees" 131 | ], 132 | [ 133 | "12/03/2019", 134 | "Court Cost (Bill of Cost)" 135 | ], 136 | [ 137 | "11/20/2019", 138 | "Motion/Order for Payment of Itemized Time/Services", 139 | "(Judicial Officer: Boyer, Bruce )" 140 | ], 141 | [ 142 | "11/04/2019", 143 | "Stipulation of Evidence" 144 | ], 145 | [ 146 | "11/04/2019", 147 | "Trial Court 's Certification of Defendant's Right of Appeal", 148 | "(Judicial Officer: Boyer, Bruce )" 149 | ], 150 | [ 151 | "11/04/2019", 152 | "Court Writ" 153 | ], 154 | [ 155 | "11/04/2019", 156 | "Motion to Adjudicate", 157 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 158 | "Result: Prob Modified" 159 | ], 160 | [ 161 | "10/10/2019", 162 | "Motion to Adjudicate", 163 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 164 | "Result: Reset" 165 | ], 166 | [ 167 | "09/16/2019", 168 | "Discovery Receipt Email from District Attorney" 169 | ], 170 | [ 171 | "09/08/2019", 172 | "Application For Court Appointed Attorney/Order", 173 | "(Judicial Officer: Junkin, David )", 174 | "Denied" 175 | ], 176 | [ 177 | "09/06/2019", 178 | "Magistration Documents" 179 | ], 180 | [ 181 | "09/06/2019", 182 | "Magistrate Warning" 183 | ], 184 | [ 185 | "09/06/2019", 186 | "Bench Warrant (See Warrant Tab)" 187 | ], 188 | [ 189 | "09/05/2019", 190 | "Capias Executed", 191 | "See Warrant Tab" 192 | ], 193 | [ 194 | "09/05/2019", 195 | "Capias Executed", 196 | "See Warrant Tab" 197 | ], 198 | [ 199 | "09/03/2019", 200 | "Order", 201 | "(Judicial Officer: Junkin, David )", 202 | "Appointing Attorney" 203 | ], 204 | [ 205 | "11/08/2017", 206 | "Capias Issued", 207 | "See Warrant Tab" 208 | ], 209 | [ 210 | "11/06/2017", 211 | "Judge's Fiat", 212 | "(Judicial Officer: Boyer, Bruce )" 213 | ], 214 | [ 215 | "11/01/2017", 216 | "Motion to Revoke Probation/Adjudicate Guilt (Reopen Case)", 217 | 
"(Judicial Officer: Boyer, Bruce )" 218 | ], 219 | [ 220 | "10/25/2017", 221 | "Capias Issued", 222 | "See Warrant Tab" 223 | ], 224 | [ 225 | "10/24/2017", 226 | "Bailiffs Certificate", 227 | "(Judicial Officer: Boyer, Bruce )" 228 | ], 229 | [ 230 | "10/24/2017", 231 | "Show Cause Hearing", 232 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 233 | "Result: Failure To Appear" 234 | ], 235 | [ 236 | "03/30/2017", 237 | "Amended Conditions of Probation", 238 | "First Amended-Deferred Adjudication" 239 | ], 240 | [ 241 | "12/09/2016", 242 | "Motion/Order for Payment of Itemized Time/Services", 243 | "(Judicial Officer: Boyer, Bruce )" 244 | ], 245 | [ 246 | "12/06/2016", 247 | "Court Cost (Bill of Cost)" 248 | ], 249 | [ 250 | "12/06/2016", 251 | "Conditions of Probation", 252 | "Deferred Adjudication" 253 | ], 254 | [ 255 | "12/06/2016", 256 | "Trial Court 's Certification of Defendant's Right of Appeal", 257 | "(Judicial Officer: Boyer, Bruce )" 258 | ], 259 | [ 260 | "12/06/2016", 261 | "Punishment Hearing", 262 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 263 | "Result: Def. Adjudication" 264 | ], 265 | [ 266 | "11/07/2016", 267 | "CANCELED", 268 | "Punishment Hearing", 269 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 270 | "Defendant's Request" 271 | ], 272 | [ 273 | "09/26/2016", 274 | "Plea Bargain Agreement" 275 | ], 276 | [ 277 | "09/26/2016", 278 | "Pre Trial Motions (Non-Evidentiary)", 279 | "(9:00 AM) (Judicial Officer Boyer, Bruce)", 280 | "Result: Reset" 281 | ], 282 | [ 283 | "08/25/2016", 284 | "Pre Trial Motions (Non-Evidentiary)", 285 | "(9:00 AM) (Judicial Officer Henry, William R)", 286 | "Result: Reset" 287 | ], 288 | [ 289 | "07/29/2016", 290 | "Capias Recalled" 291 | ], 292 | [ 293 | "07/29/2016", 294 | "Capias Issued", 295 | "See Warrant Tab" 296 | ], 297 | [ 298 | "07/27/2016", 299 | "Bailiffs Certificate", 300 | "(Judicial Officer: Henry, William R )" 301 | ], 302 | [ 303 | "07/27/2016", 304 | "Pre Trial Motions (Non-Evidentiary)", 305 | "(9:00 AM) (Judicial Officer Henry, William R)", 306 | "Result: Reset" 307 | ], 308 | [ 309 | "06/15/2016", 310 | "Pre Trial Motions (Non-Evidentiary)", 311 | "(9:00 AM) (Judicial Officer Henry, William R)", 312 | "Result: Reset" 313 | ], 314 | [ 315 | "05/12/2016", 316 | "Pre Trial Motions (Non-Evidentiary)", 317 | "(9:00 AM) (Judicial Officer Steel, Gary L.)", 318 | "Result: Reset" 319 | ], 320 | [ 321 | "05/05/2016", 322 | "Acknowledgement of Receipt of Discovery" 323 | ], 324 | [ 325 | "04/29/2016", 326 | "Discovery Receipt Email from District Attorney" 327 | ], 328 | [ 329 | "04/29/2016", 330 | "Discovery Receipt Email from District Attorney" 331 | ], 332 | [ 333 | "04/14/2016", 334 | "Pre Trial Motions (Non-Evidentiary)", 335 | "(9:00 AM) (Judicial Officer Robison, Jack)", 336 | "Result: Reset" 337 | ], 338 | [ 339 | "03/23/2016", 340 | "CANCELED", 341 | "Arraignment", 342 | "(9:00 AM) (Judicial Officer Henry, William R)", 343 | "Waived Arraignment" 344 | ], 345 | [ 346 | "03/15/2016", 347 | "Waiver of Arraignment", 348 | "Unsigned" 349 | ], 350 | [ 351 | "03/15/2016", 352 | "Waiver of Arraignment" 353 | ], 354 | [ 355 | "02/24/2016", 356 | "Application For Court Appointed Attorney/Order", 357 | "(Judicial Officer: Ramsay, Charles )", 358 | "MARTIN CLAUDER" 359 | ], 360 | [ 361 | "02/24/2016", 362 | "Arraignment", 363 | "(9:00 AM) (Judicial Officer Henry, William R)", 364 | "Result: Reset" 365 | ], 366 | [ 367 | "02/09/2016", 368 | "Returned To Sender", 369 | "NOTICE OF ARRAIGNMENT" 370 | ], 371 | [ 372 | "01/05/2016", 373 
| "Court's Docket Sheet" 374 | ], 375 | [ 376 | "01/05/2016", 377 | "Indictment (Open Case)" 378 | ], 379 | [ 380 | "10/29/2015", 381 | "Bond (Cash/Surety) After Release from Jail", 382 | "See Bond Tab" 383 | ], 384 | [ 385 | "11/04/2019", 386 | "Amended Deferred Adjudication", 387 | "(Judicial Officer: Boyer, Bruce) Reason: Community Supervision Extended", 388 | "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 389 | "CSCD", 390 | "7 Years" 391 | ], 392 | [ 393 | "12/06/2016", 394 | "Deferred Adjudication", 395 | "(Judicial Officer: Boyer, Bruce)", 396 | "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 397 | "CSCD", 398 | "5 Years" 399 | ], 400 | [ 401 | "12/06/2016", 402 | "Plea", 403 | "(Judicial Officer: Boyer, Bruce)", 404 | "1. AGGRAVATED ASSAULT WITH A DEADLY WEAPON", 405 | "Guilty" 406 | ] 407 | ], 408 | "html_hash": "8d4a80173c700b37" 409 | } -------------------------------------------------------------------------------- /src/parser/hays.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from bs4 import BeautifulSoup 3 | 4 | CHARGE_SEVERITY = { 5 | "First Degree Felony": 1, 6 | "Second Degree Felony": 2, 7 | "Third Degree Felony": 3, 8 | "State Jail Felony": 4, 9 | "Misdemeanor A": 5, 10 | "Misdemeanor B": 6, 11 | } 12 | 13 | class ParserHays: 14 | 15 | def __init__(self): 16 | pass 17 | 18 | def extract_rows(self, table: BeautifulSoup, logger) -> List[List[str]]: 19 | try: 20 | rows = [ 21 | [ 22 | tag.strip().replace("\xa0", "").replace("Â", "") 23 | for tag in tr.find_all(text=True) 24 | if tag.strip() 25 | ] 26 | for tr in table.select("tr") 27 | ] 28 | return [row for row in rows if row] 29 | except Exception as e: 30 | logger.info(f"Error extracting rows: {e}") 31 | return [] 32 | 33 | def get_charge_severity(self, charge: str, logger) -> int: 34 | try: 35 | for charge_name, severity in CHARGE_SEVERITY.items(): 36 | if charge_name in charge: 37 | return severity 38 | return float('inf') 39 | except Exception as e: 40 | logger.info(f"Error getting charge severity: {e}") 41 | return float('inf') 42 | 43 | def count_dismissed_charges(self, dispositions: List[Dict], logger) -> int: 44 | try: 45 | return sum( 46 | 1 for disposition in dispositions 47 | for detail in disposition.get("details", []) 48 | if detail.get("outcome", "").lower() == 'dismissed' 49 | ) 50 | except Exception as e: 51 | logger.info(f"Error counting dismissed charges: {e}") 52 | return "Unknown" 53 | 54 | def get_top_charge(self, dispositions: List[Dict], charge_information: List[Dict], logger) -> Dict: 55 | try: 56 | top_charge = None 57 | min_severity = float('inf') 58 | 59 | charge_map = {info['charges']: info['level'] for info in charge_information} 60 | 61 | for disposition in dispositions: 62 | if isinstance(disposition, dict): 63 | for detail in disposition.get("details", []): 64 | if isinstance(detail, dict): 65 | charge_text = detail.get("charge", "").strip() 66 | charge_name = charge_text.split(" >=")[0].strip().lstrip("0123456789. 
").strip() 67 | charge_level = charge_map.get(charge_name, "Unknown") 68 | 69 | severity = self.get_charge_severity(charge_level, logger) 70 | if severity < min_severity: 71 | min_severity = severity 72 | top_charge = { 73 | "charge name": charge_name, 74 | "charge level": charge_level 75 | } 76 | else: 77 | logger.info(f"Unexpected type for disposition: {type(disposition)}") 78 | 79 | return top_charge 80 | except Exception as e: 81 | logger.info(f"Error getting top charge: {e}") 82 | return { 83 | "charge name": "Unknown", 84 | "charge level": "Unknown" 85 | } 86 | 87 | def get_case_metadata(self, county: str, case_number: str, case_soup: BeautifulSoup, logger) -> Dict[str, str]: 88 | try: 89 | logger.info(f"Getting case metadata for {county} case {case_number}") 90 | return { 91 | "code": case_soup.select('div[class="ssCaseDetailCaseNbr"] > span')[0].text, 92 | "odyssey id": case_number, 93 | "county": county 94 | } 95 | except Exception as e: 96 | logger.info(f"Error getting case metadata: {e}") 97 | return { 98 | "code": "Unknown", 99 | "odyssey id": case_number, 100 | "county": county 101 | } 102 | 103 | def get_case_details(self, table: BeautifulSoup, logger) -> Dict[str, str]: 104 | try: 105 | table_values = table.select("b") 106 | logger.info(f"Getting case details") 107 | return { 108 | "name": table_values[0].text, 109 | "case type": table_values[1].text, 110 | "date filed": table_values[2].text, 111 | "location": table_values[3].text 112 | } 113 | except Exception as e: 114 | logger.info(f"Error getting case details: {e}") 115 | return { 116 | "name": "Unknown", 117 | "case type": "Unknown", 118 | "date filed": "Unknown", 119 | "location": "Unknown" 120 | } 121 | 122 | def parse_defendant_rows(self, defendant_rows: List[List[str]], logger) -> Dict[str, str]: 123 | try: 124 | logger.info(f"Parsing defendant rows") 125 | return { 126 | "defendant": defendant_rows[1][1], 127 | "sex": defendant_rows[1][2].split(" ")[0], 128 | "race": defendant_rows[1][2].split(" ")[1], 129 | "date of birth": defendant_rows[1][3], 130 | "height": defendant_rows[1][4].split(" ")[0], 131 | "weight": defendant_rows[1][4].split(" ")[1], 132 | "defense attorney": defendant_rows[1][5], 133 | "appointed or retained": defendant_rows[1][6], 134 | "defense attorney phone number": defendant_rows[1][7], 135 | "defendant address": defendant_rows[2][0] + " " + defendant_rows[2][1], 136 | "SID": defendant_rows[2][3], 137 | } 138 | except Exception as e: 139 | logger.info(f"Error parsing defendant rows: {e}") 140 | return { 141 | "defendant": "Unknown", 142 | "sex": "Unknown", 143 | "race": "Unknown", 144 | "date of birth": "Unknown", 145 | "height": "Unknown", 146 | "weight": "Unknown", 147 | "defense attorney": "Unknown", 148 | "appointed or retained": "Unknown", 149 | "defense attorney phone number": "Unknown", 150 | "defendant address": "Unknown", 151 | "SID": "Unknown", 152 | } 153 | 154 | def parse_state_rows(self, state_rows: List[List[str]], logger) -> Dict[str, str]: 155 | try: 156 | logger.info(f"Parsing state rows") 157 | return { 158 | "prosecuting attorney": state_rows[3][2], 159 | "prosectuing attorney phone number": state_rows[3][3], 160 | } 161 | except Exception as e: 162 | logger.info(f"Error parsing state rows: {e}") 163 | return { 164 | "prosecuting attorney": "Unknown", 165 | "prosectuing attorney phone number": "Unknown", 166 | } 167 | 168 | def get_charge_information(self, table: BeautifulSoup, logger) -> List[Dict]: 169 | try: 170 | logger.info(f"Getting charge information") 171 | table_rows 
= [ 172 | tag.strip().replace("\xa0", " ") 173 | for tag in table.find_all(text=True) 174 | if tag.strip() 175 | ] 176 | 177 | charge_information = [] 178 | for i in range(5, len(table_rows), 5): 179 | charge_information.append( 180 | { 181 | k: v 182 | for k, v in zip( 183 | ["charges", "statute", "level", "date"], 184 | table_rows[i + 1 : i + 5], 185 | ) 186 | } 187 | ) 188 | return charge_information 189 | except Exception as e: 190 | logger.info(f"Error getting charge information: {e}") 191 | return [] 192 | 193 | def format_events_and_orders_of_the_court(self, table: BeautifulSoup, case_soup: BeautifulSoup, logger) -> List: 194 | try: 195 | logger.info(f"Formatting events and orders of the court") 196 | table_rows = [ 197 | [ 198 | tag.strip().replace("\xa0", " ") 199 | for tag in tr.find_all(text=True) 200 | if tag.strip() 201 | ] 202 | for tr in table.select("tr") 203 | if tr.select("th") 204 | ] 205 | table_rows = [ 206 | [" ".join(word.strip() for word in text.split()) for text in sublist] 207 | for sublist in table_rows 208 | if sublist 209 | ] 210 | 211 | disposition_rows = [] 212 | other_event_rows = [] 213 | 214 | for row in table_rows: 215 | print(f'printing row: {row}') 216 | if len(row) >= 2: 217 | if row[1] in ["Disposition", "Disposition:", "Amended Disposition"]: 218 | print(f'YES A DISPOSITION: {row}') 219 | disposition_rows.append(row) 220 | else: 221 | print(f'YES AN EVENT: {row}') 222 | other_event_rows.append(row) 223 | 224 | # Reverse the order of the rows 225 | other_event_rows = other_event_rows[::-1] 226 | disposition_rows = disposition_rows[::-1] 227 | 228 | print(other_event_rows) 229 | 230 | return (disposition_rows, other_event_rows) 231 | except Exception as e: 232 | logger.info(f"Error formatting events and orders of the court: {e}") 233 | return ([], []) 234 | 235 | def get_disposition_information(self, row, dispositions, case_data, table, county, case_soup, logger) -> List[Dict]: 236 | try: 237 | if not row: 238 | logger.info(f"No dispositions to process.") 239 | return dispositions 240 | 241 | if len(row) >= 5: 242 | # Extract judicial officer if present 243 | judicial_officer = "" 244 | if len(row[2]) > 18 and row[2].startswith("(Judicial Officer:"): 245 | judicial_officer = row[2][18:-1].strip() 246 | 247 | # Create a disposition entry 248 | disposition = { 249 | "date": row[0], 250 | "event": row[1], 251 | "judicial officer": judicial_officer, 252 | "details": [] 253 | } 254 | 255 | # Check if this row is a disposition 256 | if row[1].lower() in ["disposition", "amended disposition", "deferred adjudication", "punishment hearing"]: 257 | details = { 258 | "charge": row[3], 259 | "outcome": row[4] 260 | } 261 | if len(row) > 5: 262 | details["additional_info"] = row[5:] 263 | disposition["details"].append(details) 264 | dispositions.append(disposition) 265 | dispositions.reverse() 266 | else: 267 | logger.info("Row is not a disposition: %s", row) 268 | 269 | return dispositions 270 | except Exception as e: 271 | logger.info(f"Error getting disposition information: {e}") 272 | return dispositions 273 | 274 | def parser_hays(self, county: str, case_number: str, logger, case_soup: BeautifulSoup) -> Dict[str, Dict]: 275 | try: 276 | root_tables = case_soup.select("body>table") 277 | 278 | case_data = { 279 | "Case Metadata": self.get_case_metadata(county, case_number, case_soup, logger) 280 | } 281 | 282 | for table in root_tables: 283 | 284 | if "Case Type:" in table.text and "Date Filed:" in table.text: 285 | case_data["Case Details"] = 
self.get_case_details(table, logger) 286 | 287 | elif "Related Case Information" in table.text: 288 | case_data["Related Cases"] = [ 289 | case.text.strip().replace("\xa0", " ") for case in table.select("td")] 290 | 291 | elif "Party Information" in table.text: 292 | case_data["Defendent Information"] = self.parse_defendant_rows(self.extract_rows(table, logger), logger) 293 | case_data["State Information"] = self.parse_state_rows(self.extract_rows(table, logger), logger) 294 | 295 | elif "Charge Information" in table.text: 296 | case_data["Charge Information"] = self.get_charge_information(table, logger) 297 | 298 | elif "Events & Orders of the Court" in table.text: 299 | disposition_rows, other_event_rows = self.format_events_and_orders_of_the_court(table, case_soup, logger) 300 | 301 | dispositions = [] 302 | logger.info(f"For Loop started\nGetting disposition information") 303 | for row in disposition_rows: 304 | case_data["Disposition Information"] = self.get_disposition_information(row, dispositions, case_data, table, county, case_soup, logger) 305 | logger.info(f"For Loop ended\n") 306 | if case_data["Disposition Information"]: 307 | case_data["Top Charge"] = self.get_top_charge(dispositions, case_data.get("Charge Information", []), logger) 308 | case_data["Dismissed Charges Count"] = self.count_dismissed_charges(case_data["Disposition Information"], logger) 309 | case_data['Other Events and Hearings'] = other_event_rows 310 | 311 | return case_data 312 | except Exception as e: 313 | logger.info(f"Error parsing Hays case: {e}") 314 | return {} 315 | -------------------------------------------------------------------------------- /resources/texas_county_data.csv: -------------------------------------------------------------------------------- 1 | county,population,website,portal,type,version,search_disabled,site_down,captcha,must_pay,must_register,notes,scrape 2 | Harris,4731145,http://www.harriscountytx.gov/,https://jpodysseyportal.harriscountytx.gov/OdysseyPortalJP/,odyssey,2017.1.46.2,,,,,,,no 3 | Dallas,2613539,http://www.dallascounty.org/,https://courtsportal.dallascounty.org/DALLASPROD/,odyssey,2017.1.46.2,,,,,,,no 4 | Tarrant,2110640,http://www.tarrantcounty.com/,https://odyssey.tarrantcounty.com/PublicAccess/,odyssey,2003,,,,,,,no 5 | Bexar,2009324,http://www.bexar.org/,https://portal-txbexar.tylertech.cloud/Portal/,odyssey,2017.1.35.6,,yes – maintenance,,,,,no 6 | Travis,1290188,https://www.traviscountytx.gov/,https://odysseypa.traviscountytx.gov/JPPublicAccess/,odyssey,2011,,,,,,,no 7 | Collin,1064465,http://www.collincountytx.gov/,https://cijspub.co.collin.tx.us/,odyssey,2003,,,,,,,no 8 | Denton,906422,https://dentoncounty.gov/,https://justice1.dentoncounty.gov/PublicAccess/,odyssey,2003,,,,,,,no 9 | Hidalgo,870781,https://tx-hidalgocounty.civicplus.com/,https://pa.co.hidalgo.tx.us/,odyssey,2003,,,,,,,no 10 | El Paso,865657,http://www.epcounty.com/,https://casesearch.epcounty.com/PublicAccess/,odyssey,2003,yes,,,yes,,,no 11 | Fort Bend,822779,http://www.fortbendcountytx.gov/,https://tylerpaw.fortbendcountytx.gov/PublicAccess/,odyssey,2003,,,,,,,no 12 | Montgomery,620443,http://www.mctx.org/,http://odyssey.mctx.org/Unsecured/,odyssey,2011,,,,,,,no 13 | Williamson,609017,http://www.wilco.org/,https://judicialrecords.wilco.org/PublicAccess/,odyssey,2003,,,,,,,no 14 | Cameron,421017,http://www.co.cameron.tx.us/,https://portal.co.cameron.tx.us/portalprod/,odyssey,2017.1.46.2,,,yes,,,,no 15 | 
Brazoria,372031,http://brazoriacountytx.gov/,https://pubweb.brazoriacountytx.gov/PublicAccess/,odyssey,2011,,,,,,,no 16 | Bell,370647,http://www.bellcountytx.com/,https://justice.bellcounty.texas.gov/PublicPortal/,odyssey,2017.1.46.2,,,,,,,no 17 | Nueces,353178,http://www.co.nueces.tx.us/,https://portal-txnueces.tylertech.cloud/Portal/,odyssey,2024,,,,,,,no 18 | Galveston,350682,http://www.galvestoncountytx.gov/,https://portal.galvestoncountytx.gov/portal/,odyssey,2017.1.46.2,,,yes,,,,no 19 | Lubbock,310639,http://www.co.lubbock.tx.us/,https://publicrecords.lubbockcounty.gov/Portal/,odyssey,2017.1.40.0,,,,,,,no 20 | Webb,267114,http://www.webbcountytx.gov/,https://publicaccess.webbcountytx.gov/PublicAccess/,odyssey,2011,,,,,,,no 21 | McLennan,260579,http://www.co.mclennan.tx.us/,https://mclennan.edoctec.com/McLennanDCWeb/,edoctec,2022,,,,,,scrapable,no 22 | Jefferson,256526,http://www.co.jefferson.tx.us/,https://jeffersontxclerk.manatron.com/Court/SearchEntry.aspx?cabinet=COURT_CRIMINAL,Aumentum recorder,3,,,,,,scrapable,no 23 | Hays,241067,http://www.co.hays.tx.us/,http://public.co.hays.tx.us/,odyssey,2003,,,,,,,yes 24 | Brazos,233849,http://www.brazoscountytx.gov/,https://brazoscountytx.gov/237/Public-Records,,,,,,,,does records requests through e-mail and fax as far as I can tell,no 25 | Smith,233479,http://www.smith-county.com/,https://judicial.smith-county.com/PublicAccess/,odyssey,2011,,,,,,,no 26 | Ellis,192455,http://www.co.ellis.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 27 | Johnson,179927,http://www.johnsoncountytx.org/,https://pa.johnsoncountytx.org/PublicAccess/,odyssey,2011,,,,,,,no 28 | Guadalupe,172706,http://www.co.guadalupe.tx.us/,https://portal-txguadalupe.tylertech.cloud/PublicAccess/,odyssey,2015,,,,,,,no 29 | Midland,169983,http://www.co.midland.tx.us/,http://jp.co.midland.tx.us/countyclerk/,netdata,2008,,,,,yes,must register. 
registration through County Clerk's office,no 30 | Ector,165171,http://www.co.ector.tx.us/,https://portal-txector.tylertech.cloud/PortalProd/,odyssey,2017.1.46.2,,,,,,,no 31 | Comal,161501,http://www.co.comal.tx.us/,http://public.co.comal.tx.us/default.aspx,odyssey,2003,yes,,,,,needs default.aspx in url or sends you to IIS default index,no 32 | Parker,148222,http://www.parkercountytx.com/,https://txparkerodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 33 | Kaufman,145310,http://www.kaufmancounty.net/,http://txkaufmanodyprod.tylerhost.net/PublicAccess/,odyssey,2003,yes,,,,,,no 34 | Taylor,143208,http://www.taylorcountytexas.org/,http://publicaccess.taylorcountytexas.org/PublicAccess/,odyssey,2003,,,,,,,no 35 | Randall,140753,http://randallcounty.com/,https://odysseypa.tylerhost.net/Randall/,odyssey,2011,,,,,,,no 36 | Grayson,135543,http://www.co.grayson.tx.us/,https://judicialsearch.co.grayson.tx.us:8443/,odyssey,2011,,,,,,,no 37 | Wichita,129350,http://www.co.wichita.tx.us/,https://portal-txwichita.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 38 | Gregg,124239,http://www.co.gregg.tx.us/,http://beta.co.gregg.tx.us/OdysseyPA/,odyssey,2003,,,,,,,no 39 | Tom Green,120003,http://www.co.tom-green.tx.us/,http://odysseypa.co.tom-green.tx.us/,odyssey,2003,,,,,,,no 40 | Potter,118525,http://www.co.potter.tx.us/,https://portal-txpotter.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 41 | Rockwall,107819,https://www.rockwallcountytexas.com/,https://portal-txrockwall.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 42 | Hunt,99956,http://www.huntcounty.net/,https://portal-txhunt.tylertech.cloud/Portal/,odyssey,2017.1.46.2,yes,,,yes,,,no 43 | Bastrop,97216,http://www.co.bastrop.tx.us/,http://records.co.bastrop.tx.us/PublicAccess/,odyssey,2003,,,,,,,no 44 | Bowie,92893,http://www.co.bowie.tx.us/,https://portal-txbowie.tylertech.cloud/PublicAccess/,odyssey,2013,,,,,,,no 45 | Liberty,91628,http://www.co.liberty.tx.us/,https://portal-txliberty.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 46 | Victoria,91319,http://www.victoriacountytx.org/,http://odyssey.vctx.org/,odyssey,2003,,,,,,,no 47 | Angelina,86395,http://www.angelinacounty.net/,http://public.angelinacounty.net/,odyssey,2003,,,yes,,,,no 48 | Orange,84808,http://www.co.orange.tx.us/,https://www.co.orange.tx.us/departments/CountyClerk/OnlineRecordsSearch,myClerkbooks.com,2022,,,,,,scrapable,no 49 | Coryell,83093,https://www.coryellcounty.org/,https://www.coryellcounty.org/page/coryell.County.Clerk,,,,,,,,it appears that you need to pay. 
it’s unclear though,no 50 | Henderson,82150,http://www.henderson-county.com/,http://txhendersonodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 51 | Walker,76400,http://www.co.walker.tx.us/,https://odysseypa.tylerhost.net/Walker/,odyssey,2003,,,,,,,no 52 | Harrison,68839,http://harrisoncountytexas.org/,http://portal-txharrison.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 53 | San Patricio,68755,http://www.co.san-patricio.tx.us/,https://www.co.san-patricio.tx.us/page/sanpatricio.County.Clerk,,,,,,,,I believe you need to call or email for records,no 54 | Wise,68632,http://www.co.wise.tx.us/,http://jail.co.wise.tx.us:81/,odyssey,2003,,,,,,,no 55 | Starr,65920,http://www.co.starr.tx.us/,https://www.co.starr.tx.us/page/starr.County.Clerk,,,,,,yes,,criminal case request form,no 56 | Nacogdoches,64653,http://www.co.nacogdoches.tx.us/,https://www.co.nacogdoches.tx.us/OpenRecords/Index.asp,,,,,,,,it is unclear how to get criminal records,no 57 | Hood,61598,http://www.co.hood.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=24,netdata,2013,,,yes,,yes,must register + captcha,no 58 | Van Zandt,59541,http://www.vanzandtcounty.org/,https://www.vanzandtcounty.org/page/vanzandt.County.Clerk,,,,,,,yes,uses countygovernmentrecords.com but it's not clear if this includes criminal records. Must register,no 59 | Anderson,57922,http://www.co.anderson.tx.us/,http://ac5.co.anderson.tx.us/PublicAccess/,odyssey,2003,,,,,,,no 60 | Maverick,57887,http://www.co.maverick.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 61 | Waller,56794,http://www.co.waller.tx.us/,https://odysseypa.tylerhost.net/Waller/,odyssey,2003,,,,,,,no 62 | Hardin,56231,http://www.co.hardin.tx.us/,http://www.hardincourts.com/recordSearch.php,"Henschen & Associates, Inc.",2022,,,yes,,,scrapable outside of captcha,no 63 | Navarro,52624,http://www.co.navarro.tx.us/,https://portal-txnavarro.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 64 | Kerr,52598,http://www.co.kerr.tx.us/,http://courts.co.kerr.tx.us/PublicAccess/,odyssey,2014,,yes - 403 Forbidden,,,,,no 65 | Rusk,52214,http://www.co.rusk.tx.us/,https://www.co.rusk.tx.us/page/rusk.County.Clerk,,,,,,,,it is unclear how to get criminal records. There is a civil records request sheet,no 66 | Medina,50748,http://www.medinacountytexas.org/,https://odysseypa.tylerhost.net/Medina/,odyssey,2003,,,,,,,no 67 | Cherokee,50412,http://www.co.cherokee.tx.us/,https://cherokeeclerkofcourt.com/mainpage.aspx,ICON,5.1.1.1,,,,,,scrapable,no 68 | Polk,50123,http://www.co.polk.tx.us/,https://www.co.polk.tx.us/page/polk.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 69 | Lamar,50088,http://www.co.lamar.tx.us/,https://txlamarodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 70 | Wilson,49753,http://www.co.wilson.tx.us/,https://www.co.wilson.tx.us/page/wilson.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 71 | Burnet,49130,http://www.burnetcountytexas.org/,https://portal-txburnet.tylertech.cloud/ProdPortal,odyssey,2003,,,,,,PUBLICLOGIN#visitor/visitor# do not edit - used as data in scraper,no 72 | Atascosa,48981,http://www.atascosacounty.texas.gov/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,looks scrapable but this is for district rather than county clerk. 
Could not find county records,no 73 | Val Verde,47586,http://valverdecounty.texas.gov/,https://www.valverdecounty.texas.gov/153/County-Clerk,,,,,,,,it is unclear how to get criminal records,no 74 | Chambers,46571,http://www.co.chambers.tx.us/,https://portal-txchambers.tylertech.cloud/Portal/,odyssey,2017.1.46.2,yes,,,yes,,,no 75 | Caldwell,45883,http://www.co.caldwell.tx.us/,https://www.co.caldwell.tx.us/page/caldwell.County.Clerk,iDocket,,,,,,yes,must register to iDocket.,no 76 | Wood,44843,http://www.mywoodcounty.com/,https://portal-txwood.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 77 | Kendall,44279,http://www.co.kendall.tx.us/,https://www.co.kendall.tx.us/page/County.Clerk,,,,,,,,it is unclear how to get criminal records,no 78 | Erath,42545,http://co.erath.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=21,netdata,2022,,,yes,,yes,must register + captcha,no 79 | Cooke,41668,http://www.co.cooke.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=101,netdata,2022,,,yes,,yes,must register + captcha,no 80 | Wharton,41570,http://www.co.wharton.tx.us/,https://www.co.wharton.tx.us/page/wharton.County.Clerk,iDocket,,,,,,yes,must register to iDocket.,no 81 | Upshur,40892,http://www.countyofupshur.com/,https://www.texasonlinerecords.com/clerk/?office_id=43,netdata,2022,,,yes,,yes,must register + captcha,no 82 | Jim Wells,38891,http://www.co.jim-wells.tx.us/,https://courtportal.co.jim-wells.tx.us/eservices/home.page.23,courtview,1.32.01,,,,,,scrapable,no 83 | Brown,38095,http://www.browncountytx.org/,https://www.browncountytx.org/page/brown.County.Clerk,iDocket,,,,,,yes,must register to iDocket.,no 84 | Hopkins,36787,http://www.hopkinscountytx.org/,https://www.texasonlinerecords.com/clerk/?office_id=1,netdata,2022,,,yes,,yes,must register + captcha,no 85 | Matagorda,36255,http://www.co.matagorda.tx.us/,https://portal-txmatagorda.tylertech.cloud/Matagorda/,odyssey,2011,,,,,,,no 86 | Hill,35874,http://www.co.hill.tx.us/,https://www.co.hill.tx.us/page/hill.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 87 | Washington,35805,http://www.co.washington.tx.us/,https://www.co.washington.tx.us/page/washington.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 88 | Fannin,35662,http://www.co.fannin.tx.us/,https://portal-txfannin.tylertech.cloud/Portal/,odyssey,2017.1.46.2,,,,,,,no 89 | Howard,34860,http://www.co.howard.tx.us/,https://txhowardodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 90 | Jasper,32980,http://www.co.jasper.tx.us/,https://www.co.jasper.tx.us/page/jasper.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 91 | Hale,32522,http://www.halecounty.org/,https://portal-txhale.tylertech.cloud/PublicAccess/,odyssey,2011,,,,,,,no 92 | Titus,31247,http://www.co.titus.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=2,netdata,2022,,,yes,,yes,must register + captcha. Looks like this may not include criminal data as well,no 93 | Bee,31047,http://www.co.bee.tx.us/,https://www.co.bee.tx.us/page/bee.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 94 | Kleberg,31040,http://www.co.kleberg.tx.us/,https://www.texasonlinerecords.com/clerk/?office_id=81,netdata,2022,,,yes,,yes,must register + captcha. 
Looks like this may not include criminal data as well,no 95 | Austin,30167,http://www.austincounty.com/,http://public.austincounty.com/,odyssey,2003,,,,,,,no 96 | Grimes,29268,http://www.co.grimes.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 97 | Cass,28454,http://www.co.cass.tx.us/,https://cc.co.cass.tx.us/Court/SearchEntry.aspx?cabinet=COURT_CRIMINAL,Aumentum recorder,2020.2.0,,,,,,scrapable,no 98 | Palo Pinto,28409,http://www.co.palo-pinto.tx.us/,https://www.co.palo-pinto.tx.us/page/palopinto.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 99 | San Jacinto,27402,http://www.co.san-jacinto.tx.us/,https://txsanjacintoodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,,no 100 | Gillespie,26725,http://www.gillespiecounty.org/,https://portal-txgillespie.tylertech.cloud/PublicAccess/,odyssey,2006,,,,,,,no 101 | Milam,24754,http://www.milamcounty.net/,https://www.milamcounty.net/page/milam.countyclerk,iDocket,,,,,,,must register to iDocket.,no 102 | Uvalde,24564,http://www.uvaldecounty.com/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 103 | Fayette,24435,http://www.co.fayette.tx.us/,https://www.co.fayette.tx.us/page/fayette.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 104 | Shelby,24022,http://www.co.shelby.tx.us/,http://cc.co.shelby.tx.us/localization/notavailable.aspx,Aumentum Recorder,2021.1.0,yes,,,,,search disabled. Must get records in person.,no 105 | Aransas,23830,http://www.aransascountytx.gov/main/,https://www.aransascountytx.gov/clerk/,,,,,,,,it is unclear how to get criminal records,no 106 | Panola,22491,http://www.co.panola.tx.us/,https://portal-txpanola.tylertech.cloud/PublicAccess/,odyssey,2006,,,,,,,no 107 | Limestone,22146,http://www.co.limestone.tx.us/,https://www.co.limestone.tx.us/page/limestone.County.Clerk,iDocket,,,,,,,must register to iDocket.,no 108 | Houston,22066,http://www.co.houston.tx.us/,https://www.co.houston.tx.us/page/houston.County.Clerk,iDocket,,,,,,,must register to iDocket.,no 109 | Lampasas,21627,http://www.co.lampasas.tx.us/,https://www.co.lampasas.tx.us/page/lampasas.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 110 | Gaines,21598,http://www.co.gaines.tx.us/,https://www.co.gaines.tx.us/page/gaines.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 111 | Hockley,21537,http://www.co.hockley.tx.us/,https://www.co.hockley.tx.us/page/hockley.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 112 | Moore,21358,http://www.co.moore.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 113 | Llano,21243,http://www.co.llano.tx.us/,https://www.co.llano.tx.us/page/llano.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 114 | Gray,21227,http://www.co.gray.tx.us/,http://www.lgs-hosted.com/rmtgraycck.html,Local Government Solutions,2017,,,,,yes,must register. Seems broken orguest/orguest doesn’t work which Is default on other LGS sites. Nor does GRAYCCKINQ / 20!DiMe14,no 115 | Bandera,20851,http://www.banderacounty.org/,https://www.banderacounty.org/departments/RecordSearch.htm,iDocket,,,,,,,must register to iDocket.,no 116 | Hutchinson,20617,http://www.co.hutchinson.tx.us/,https://portal-txhutchinson.tylertech.cloud/OdysseyPA/Login.aspx,odyssey,2011,,,,,yes,requires login. 
Not clear how to get one.,no 117 | Colorado,20557,http://www.co.colorado.tx.us/,https://www.co.colorado.tx.us/page/colorado.County.Clerk,iDocket,,,,,,,must register to iDocket.,no 118 | Lavaca,20337,http://www.co.lavaca.tx.us/,https://www.co.lavaca.tx.us/page/lavaca.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 119 | Willacy,20164,http://www.co.willacy.tx.us/,https://www.co.willacy.tx.us/page/willacy.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 120 | Calhoun,20106,http://www.calhouncotx.org/,https://txcalhounportal.tylerhost.net/Portal/,odyssey,2017.1.46.2,,,,,,,no 121 | Montague,19965,http://www.co.montague.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 122 | DeWitt,19824,http://www.co.dewitt.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 123 | Tyler,19798,http://www.co.tyler.tx.us/,https://www.co.tyler.tx.us/page/tyler.CriminalRecordsRequestInstructions,,,,,,yes,,must make request and pay as far as I can tell,no 124 | Jones,19663,http://www.co.jones.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 125 | Gonzales,19653,http://www.co.gonzales.tx.us/,https://www.co.gonzales.tx.us/page/gonzales.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 126 | Freestone,19435,http://www.co.freestone.tx.us/,https://www.co.freestone.tx.us/page/freestone.County.Clerk,,,,,,,,it is unclear how to get criminal records,no 127 | Andrews,18610,http://www.co.andrews.tx.us/,https://www.co.andrews.tx.us/181/County-Clerk,,,,,,,,it is unclear how to get criminal records,no 128 | Deaf Smith,18583,http://www.co.deaf-smith.tx.us/,,,,,,,,,,no 129 | Frio,18385,http://www.co.frio.tx.us/,,,,,,,,,,no 130 | Bosque,18235,http://www.bosquecounty.us/,,,,,,,,,,no 131 | Young,17867,http://www.co.young.tx.us/,,,,,,,,,,no 132 | Eastland,17725,http://www.eastlandcountytexas.com/,,,,,,,,,,no 133 | Burleson,17642,http://www.co.burleson.tx.us/,,,,,,,,,,no 134 | Lee,17478,http://www.co.lee.tx.us/,http://www.lgs-hosted.com/rmtleecck.html,Local Government Solutions,2014,,,,,,LEECCKINQ / 20!DiMe14 default works. 
Scrapable,no 135 | Falls,16968,http://co.falls.tx.us/,,,,,,,,,,no 136 | Scurry,16932,http://www.co.scurry.tx.us/,,,,,,,,,,no 137 | Robertson,16757,http://www.co.robertson.tx.us/,,,,,,,,,,no 138 | Leon,15719,http://www.co.leon.tx.us/,,,,,,,,,,no 139 | Pecos,15193,http://www.co.pecos.tx.us/,,,,,,,,,,no 140 | Jackson,14988,http://www.co.jackson.tx.us/,,,,,,,,,,no 141 | Reeves,14748,http://www.reevescountytexas.net/,,,,,,,,,,no 142 | Nolan,14738,http://www.co.nolan.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 143 | Karnes,14710,http://www.co.karnes.tx.us/,,,,,,,,,,no 144 | Zapata,13889,http://www.co.zapata.tx.us/,,,,,,,,,,no 145 | Callahan,13708,http://www.co.callahan.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 146 | Trinity,13602,http://www.co.trinity.tx.us/,,,,,,,,,,no 147 | Comanche,13594,http://www.co.comanche.tx.us/,,,,,,,,,,no 148 | Madison,13455,http://www.co.madison.tx.us/,,,,,,,,,,no 149 | Lamb,13045,http://www.co.lamb.tx.us/,,,,,,,,,,no 150 | Wilbarger,12887,http://www.co.wilbarger.tx.us/,,,,,,,,,,no 151 | Camp,12464,http://www.co.camp.tx.us/,,,,,,,,,,no 152 | Dawson,12456,http://www.co.dawson.tx.us/,,,,,,,,,,no 153 | Newton,12217,http://www.co.newton.tx.us/,,,,,,,,,,no 154 | Rains,12164,http://www.co.rains.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 155 | Morris,11973,http://www.co.morris.tx.us/,http://txmorrisodyprod.tylerhost.net/PublicAccess/,odyssey,2011,,,,,,PUBLICLOGIN#Public/Public# do not edit - used as data in scraper,no 156 | Terry,11831,http://co.terry.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,2022,,,,,,scrapable,no 157 | Ward,11644,http://www.co.ward.tx.us/,,,,,,,,,,no 158 | Red River,11587,http://www.co.red-river.tx.us/,,,,,,,,,,no 159 | Blanco,11374,http://www.co.blanco.tx.us/,,,,,,,,,,no 160 | Live Oak,11335,http://www.co.live-oak.tx.us/,,,,,,,,,,no 161 | Franklin,10359,http://co.franklin.tx.us/,,,,,,,,,,no 162 | Clay,10218,http://www.co.clay.tx.us/,,,,,,,,,,no 163 | Ochiltree,10015,http://www.co.ochiltree.tx.us/,,,,,,,,,,no 164 | Runnels,9900,http://www.co.runnels.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 165 | Sabine,9894,http://www.co.sabine.tx.us/,,,,,,,,,,no 166 | Parmer,9869,http://parmercounty.org/,,,,,,,,,,no 167 | Duval,9831,http://www.co.duval.tx.us/,,,,,,,,,,no 168 | Marion,9725,http://www.co.marion.tx.us/,,,,,,,,,,no 169 | Zavala,9670,http://www.co.zavala.tx.us/,,,,,,,,,,no 170 | Brewster,9546,http://www.brewstercountytx.com/,,,,,,,,,,no 171 | Somervell,9205,http://www.somervell.co/,,,,,,,,,,no 172 | Stephens,9101,http://www.co.stephens.tx.us/,,,,,,,,,,no 173 | Mitchell,8990,http://www.co.mitchell.tx.us/,,,,,,,,,,no 174 | Dimmit,8615,http://www.dimmitcounty.org/,,,,,,,,,,no 175 | Archer,8560,http://www.co.archer.tx.us/,,,,,,,,,,no 176 | Jack,8472,http://www.jackcounty.org/,,,,,,,,,,no 177 | Hamilton,8222,http://www.co.hamilton.tx.us/,,,,,,,,,,no 178 | San Augustine,7918,http://www.co.san-augustine.tx.us/,,,,,,,,,,no 179 | Winkler,7791,http://www.co.winkler.tx.us/,,,,,,,,,,no 180 | Yoakum,7694,http://www.co.yoakum.tx.us/,,,,,,,,,,no 181 | Coleman,7684,http://www.co.coleman.tx.us/,,,,,,,,,,no 182 | McCulloch,7630,http://www.co.mcculloch.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 183 | 
Castro,7371,http://www.co.castro.tx.us/,,,,,,,,,,no 184 | Dallam,7115,http://www.dallam.org/county/,,,,,,,,,,no 185 | Brooks,7076,http://www.co.brooks.tx.us/,,,,,,,,,,no 186 | Goliad,7012,http://www.co.goliad.tx.us/,,,,,,,,,,no 187 | Swisher,6971,http://www.co.swisher.tx.us/,,,,,,,,,,no 188 | Bailey,6904,http://www.co.bailey.tx.us/,,,,,,,,,,no 189 | Refugio,6741,http://www.co.refugio.tx.us/,,,,,,,,,,no 190 | Childress,6664,http://www.childresscountytexas.us/,,,,,,,,,,no 191 | La Salle,6664,http://www.co.la-salle.tx.us/,,,,,,,,,,no 192 | Presidio,6131,http://www.co.presidio.tx.us/,,,,,,,,,,no 193 | Garza,5816,http://www.garzacounty.net/,,,,,,,,,,no 194 | Carson,5807,http://www.co.carson.tx.us/,,,,,,,,,,no 195 | San Saba,5730,http://www.co.san-saba.tx.us/,,,,,,,,,,no 196 | Lynn,5596,http://www.co.lynn.tx.us/,,,,,,,,,,no 197 | Haskell,5416,http://www.co.haskell.tx.us/,,,,,,,,,,no 198 | Floyd,5402,http://co.floyd.tx.us/,,,,,,,,,,no 199 | Hartley,5382,http://www.co.hartley.tx.us/,,,,,,,,,,no 200 | Hansford,5285,http://www.co.hansford.tx.us/,,,,,,,,,,no 201 | Martin,5237,http://www.co.martin.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 202 | Delta,5230,http://www.deltacountytx.com/,,,,,,,,,,no 203 | Crosby,5133,http://www.co.crosby.tx.us/,,,,,,,,,,no 204 | Wheeler,4990,http://www.co.wheeler.tx.us/,,,,,,,,,,no 205 | Jim Hogg,4838,http://co.jim-hogg.tx.us/,,,,,,,,,,no 206 | Crane,4675,http://www.co.crane.tx.us/,,,,,,,,,,no 207 | Mills,4456,http://www.co.mills.tx.us/,,,,,,,,,,no 208 | Kimble,4286,http://www.co.kimble.tx.us/,,,,,,,,,,no 209 | Mason,3953,http://www.co.mason.tx.us/,,,,,,,,,,no 210 | Fisher,3672,http://www.co.fisher.tx.us/,,,,,,,,,,no 211 | Hardeman,3549,http://www.co.hardeman.tx.us/,,,,,,,,,,no 212 | Baylor,3465,http://www.co.baylor.tx.us/,,,,,,,,,,no 213 | Reagan,3385,http://www.co.reagan.tx.us/,,,,,,,,,,no 214 | Hemphill,3382,http://www.co.hemphill.tx.us/,,,,,,,,,,no 215 | Sutton,3372,http://www.co.sutton.tx.us/,,,,,,,,,,no 216 | Knox,3353,http://www.knoxcountytexas.org/,,,,,,,,,,no 217 | Upton,3308,http://www.co.upton.tx.us/,,,,,,,,,,no 218 | Concho,3303,http://www.co.concho.tx.us/,,,,,,,,,,no 219 | Coke,3285,http://www.co.coke.tx.us/,,,,,,,,,,no 220 | Donley,3258,http://www.co.donley.tx.us/,,,,,,,,,,no 221 | Hudspeth,3202,http://www.co.hudspeth.tx.us/,,,,,,,,,,no 222 | Kinney,3129,http://www.co.kinney.tx.us/,,,,,,,,,,no 223 | Shackelford,3105,http://www.co.shackelford.tx.us/,https://public.lgsonlinesolutions.com/ors.html,Local Government Solutions,,,,,,,scrapable,no 224 | Crockett,3098,http://www.co.crockett.tx.us/,,,,,,,,,,no 225 | Lipscomb,3059,http://www.co.lipscomb.tx.us/,,,,,,,,,,no 226 | Hall,2825,https://www.co.hall.tx.us/,,,,,,,,,,no 227 | Sherman,2782,http://www.co.sherman.tx.us/,,,,,,,,,,no 228 | Real,2758,http://www.co.real.tx.us/,,,,,,,,,,no 229 | Collingsworth,2652,http://www.co.collingsworth.tx.us/,,,,,,,,,,no 230 | Cochran,2547,http://www.co.cochran.tx.us/,,,,,,,,,,no 231 | Schleicher,2451,http://www.co.schleicher.tx.us/,,,,,,,,,,no 232 | Culberson,2188,http://www.co.culberson.tx.us/,,,,,,,,,,no 233 | Jeff Davis,1996,http://www.co.jeff-davis.tx.us/,,,,,,,,,,no 234 | Menard,1962,http://co.menard.tx.us/,,,,,,,,,,no 235 | Armstrong,1848,http://www.co.armstrong.tx.us/,,,,,,,,,,no 236 | Dickens,1770,http://www.co.dickens.tx.us/,,,,,,,,,,no 237 | Oldham,1758,http://www.co.oldham.tx.us/,,,,,,,,,,no 238 | Irion,1513,http://www.co.irion.tx.us/,,,,,,,,,,no 239 | 
Throckmorton,1440,http://www.throckmortoncounty.org/,,,,,,,,,,no 240 | Briscoe,1435,http://www.co.briscoe.tx.us/,,,,,,,,,,no 241 | Edwards,1422,http://www.co.edwards.tx.us/,,,,,,,,,,no 242 | Cottle,1380,http://www.co.cottle.tx.us/,,,,,,,,,,no 243 | Sterling,1372,http://www.co.sterling.tx.us/,,,,,,,,,,no 244 | Stonewall,1245,http://www.co.stonewall.tx.us/,,,,,,,,,,no 245 | Glasscock,1116,http://www.co.glasscock.tx.us/,,,,,,,,,,no 246 | Foard,1095,http://www.foardcounty.texas.gov/,,,,,,,,,,no 247 | Motley,1063,http://www.co.motley.tx.us/,,,,,,,,,,no 248 | Roberts,827,http://www.co.roberts.tx.us/,,,,,,,,,,no 249 | Terrell,760,http://www.co.terrell.tx.us/,,,,,,,,,,no 250 | Kent,753,http://www.kentcountytexas.us/,,,,,,,,,,no 251 | Borden,631,http://www.co.borden.tx.us/,,,,,,,,,,no 252 | McMullen,600,http://www.mcmullencountytexas.us/,,,,,,,,,,no 253 | Kenedy,350,http://www.co.kenedy.tx.us/,,,,,,,,,,no 254 | King,265,http://www.co.king.tx.us/,,,,,,,,,,no 255 | Loving,64,http://www.co.loving.tx.us/,,,,,,,,,,no -------------------------------------------------------------------------------- /src/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | import urllib.parse 5 | import sys 6 | from datetime import datetime, timedelta 7 | from time import time 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from .helpers import * 11 | import importlib 12 | from typing import Dict, Optional, Tuple, Callable, Type, List 13 | import importlib.util 14 | import re 15 | 16 | class Scraper: 17 | """Scrape Odyssey html files into an output folder""" 18 | def __init__(self): 19 | pass 20 | 21 | def set_defaults( 22 | self, 23 | ms_wait: int | None = None, 24 | start_date: str | None = None, 25 | end_date: str | None = None, 26 | court_calendar_link_text: str | None = None, 27 | case_number: str | None = None, 28 | ssl: bool | None = None, 29 | county: str | None = None, 30 | case_html_path: str | None = None, 31 | ) -> Tuple[int, str, str, str, Optional[str], bool, str, str]: 32 | """ 33 | Sets default values for the provided optional parameters. 34 | 35 | Defaults: 36 | - `ms_wait`: 200 milliseconds if not provided. 37 | - `start_date`: '2024-07-01' if not provided. 38 | - `end_date`: '2024-07-01' if not provided. 39 | - `court_calendar_link_text`: 'Court Calendar' if not provided. 40 | - `case_number`: None if not provided. - `ssl`: True (verification enabled) if not provided. - `county`: 'hays' if not provided. - `case_html_path`: data/<county>/case_html under the repository root if not provided. 41 | 42 | :param ms_wait: Milliseconds to wait. 43 | :param start_date: Start date in YYYY-MM-DD format. 44 | :param end_date: End date in YYYY-MM-DD format. 45 | :param court_calendar_link_text: Text for the court calendar link. 46 | :param case_number: Case number, or None. :param ssl: Whether to verify SSL certificates. :param county: Name of the county to scrape. :param case_html_path: Directory in which to save case HTML files. 47 | 48 | :returns: A tuple containing: 49 | - ms_wait (int): Milliseconds to wait. 50 | - start_date (str): Start date. 51 | - end_date (str): End date. 52 | - court_calendar_link_text (str): Text for court calendar link. 53 | - case_number (Optional[str]): Case number or None.
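- ssl (bool): Whether SSL verification is enabled for the session. - county (str): Normalized county name. - case_html_path (str): Directory where case HTML files are saved.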
54 | """ 55 | 56 | # Assign default values if parameters are not provided 57 | ms_wait = ms_wait if ms_wait is not None else 200 58 | start_date = start_date if start_date is not None else '2024-07-01' 59 | end_date = end_date if end_date is not None else '2024-07-01' 60 | court_calendar_link_text = court_calendar_link_text if court_calendar_link_text is not None else "Court Calendar" 61 | # case_number defaults to None if not provided 62 | case_number = case_number 63 | ssl = ssl if ssl is not None else True 64 | county = county if county is not None else 'hays' 65 | case_html_path = case_html_path if case_html_path is not None else os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html") 66 | return ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path 67 | 68 | def configure_logger(self) -> logging.Logger: 69 | """ 70 | Configures and returns a logger instance for the scraper class. 71 | 72 | This method sets up the logger with a unique name based on the process ID 73 | and configures the logging level to INFO. 74 | 75 | :returns: Configured logger instance. 76 | """ 77 | # Configure the logger 78 | logger = logging.getLogger(name=f"pid: {os.getpid()}") 79 | 80 | # Set up basic configuration for the logging system 81 | logging.basicConfig(level=logging.INFO) 82 | 83 | return logger 84 | 85 | def format_county(self, county: str) -> str: 86 | """ 87 | Normalizes the county name by lowercasing it and stripping non-alphanumeric characters. 88 | 89 | :param county: The name of the county to be formatted. 90 | :returns: The normalized county name. 91 | :raises AttributeError: If the provided county name is not a string. 92 | """ 93 | 94 | return re.sub(r'[^\w]+', '', county.lower()) 95 | 96 | def create_session(self, logger: logging.Logger, ssl: bool) -> requests.sessions.Session: 97 | """ 98 | Sets up a `requests.Session` with or without SSL verification and suppresses 99 | related warnings when verification is disabled. 100 | 101 | SSL verification is enabled by default. 102 | 103 | :param logger: Logger instance for logging errors. :param ssl: Whether to verify SSL certificates. 104 | :returns: Configured session object. 105 | """ 106 | # Create and configure the session 107 | session = requests.Session() 108 | 109 | # Toggle SSL certificate verification; defaults to True unless False is passed 110 | session.verify = ssl 111 | if not session.verify: requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) 112 | 113 | return session 114 | 115 | def make_directories(self, county: str, logger: logging.Logger, case_html_path: str) -> str: 116 | """ 117 | Creates necessary directories for storing case HTML files. 118 | 119 | This method ensures that every directory in the provided case HTML path 120 | exists, creating them as needed. If the directories already 121 | exist, no action is taken. 122 | 123 | :param county: The name of the county, kept for interface consistency. :param case_html_path: The directory path to create. 124 | :param logger: Logger instance for logging errors. 125 | :returns: The path to the created directories. 126 | :raises OSError: If there is an error creating the directories. 127 | """ 128 | 129 | # Create the directories if they do not exist 130 | os.makedirs(case_html_path, exist_ok=True) 131 | 132 | return case_html_path 133 | 134 | # get county portal URL, Odyssey version, and notes from csv file 135 | def get_ody_link(self, 136 | county: str, 137 | logger: logging.Logger 138 | ) -> Tuple[str, int, str]: 139 | """ 140 | Retrieves Odyssey-related information for a given county from a CSV file.
141 | 142 | This function reads county-specific data from a CSV file located in the `resources` directory. 143 | It searches for the county name in the CSV file, extracts the corresponding base URL, Odyssey 144 | version, and any additional notes. The base URL is formatted with a trailing slash if necessary. 145 | 146 | :param county: The name of the county for which to retrieve Odyssey information. 147 | :param logger: Logger instance for logging errors and information. 148 | :returns: A tuple containing: 149 | - base_url (str): The base URL for the county’s portal. 150 | - odyssey_version (int): The major version of Odyssey associated with the county. 151 | - notes (str): Additional notes related to the county. 152 | :raises Exception: If the county is not found in the CSV file or if required data is missing. 153 | """ 154 | 155 | try: 156 | base_url = odyssey_version = notes = None 157 | # CSV is located in 'resources' folder 158 | with open( 159 | os.path.join(os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv"), 160 | mode="r", 161 | ) as file_handle: 162 | csv_file = csv.DictReader(file_handle) 163 | for row in csv_file: 164 | if row["county"].lower() == county.lower(): 165 | base_url = row["portal"] 166 | # add trailing slash if not present, otherwise urljoin breaks 167 | if base_url[-1] != "/": 168 | base_url += "/" 169 | logger.info(f"{base_url} - scraping this url") 170 | odyssey_version = int(row["version"].split(".")[0]) 171 | notes = row["notes"] 172 | break 173 | if not base_url or not odyssey_version: 174 | raise Exception("The required data to scrape this county is not in /resources/texas_county_data.csv") 175 | except Exception: 176 | logger.exception("Error getting county-specific information from csv.") 177 | raise 178 | return base_url, odyssey_version, notes 179 | 180 | def get_class_and_method( 181 | self, 182 | county: str, 183 | logger: logging.Logger 184 | ) -> Tuple[Type[object], Callable]: 185 | 186 | """ 187 | Dynamically imports a module, retrieves a class, and gets a method from it based on the county name. 188 | 189 | :param county: The name of the county, used to construct module, class, and method names. 190 | :param logger: Logger instance for logging errors. 191 | :returns: A tuple containing the instance of the class and the method callable. 192 | :raises ImportError: If the module cannot be imported. 193 | :raises AttributeError: If the class or method cannot be found.
194 | """ 195 | 196 | module_name = county 197 | class_name = f"Scraper{county.capitalize()}" 198 | method_name = f"scraper_{county}" 199 | 200 | # Add the current directory to the system path 201 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 202 | 203 | try: 204 | # Dynamically import the module 205 | module = importlib.import_module(module_name) 206 | 207 | # Retrieve the class from the module 208 | cls = getattr(module, class_name, None) 209 | if cls is None: 210 | raise AttributeError(f"Class '{class_name}' not found in module '{module_name}'") 211 | 212 | # Instantiate the class 213 | instance = cls() 214 | 215 | # Retrieve the method with the specified name 216 | method = getattr(instance, method_name, None) 217 | if method is None: 218 | raise AttributeError(f"Method '{method_name}' not found in class '{class_name}'") 219 | 220 | return instance, method 221 | 222 | except (FileNotFoundError, ImportError, AttributeError): 223 | logger.exception("Error dynamically loading module or retrieving class/method.") 224 | raise 225 | 226 | def scrape_main_page(self, 227 | base_url: str, 228 | odyssey_version: int, 229 | session: requests.sessions.Session, 230 | notes: str, 231 | logger: logging.Logger, 232 | ms_wait: int 233 | ) -> Tuple[str, BeautifulSoup]: 234 | """ 235 | Scrapes the main page of the Odyssey site, handling login if required, and returns the page's HTML and parsed content. 236 | 237 | This function handles a special case where some sites may require a public guest login. If the `notes` parameter 238 | contains a "PUBLICLOGIN#" identifier, it will extract the username and password from the `notes`, perform the login, 239 | and then proceed to scrape the main page. 240 | 241 | :param base_url: The base URL of the main page to scrape. 242 | :param odyssey_version: The version of Odyssey; currently not used in this function. 243 | :param session: The `requests` session object used for making HTTP requests. 244 | :param notes: A string containing notes that may include login credentials in the format "PUBLICLOGIN#username/password". 245 | :param logger: Logger instance for logging errors and debug information. 246 | :param ms_wait: The number of milliseconds to wait between retry attempts. 247 | :returns: A tuple containing: 248 | - main_page_html (str): The raw HTML content of the main page. 249 | - main_soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML content. 250 | :raises Exception: If any error occurs during the HTTP requests or HTML parsing.
251 | """ 252 | 253 | try: 254 | # some sites have a public guest login that must be used 255 | if "PUBLICLOGIN#" in notes: 256 | userpass = notes.split("#")[1].split("/") 257 | data = { 258 | "UserName": userpass[0], 259 | "Password": userpass[1], 260 | "ValidateUser": "1", 261 | "dbKeyAuth": "Justice", 262 | "SignOn": "Sign On", 263 | } 264 | 265 | request_page_with_retry( 266 | session=session, 267 | url=urllib.parse.urljoin(base_url, "login.aspx"), 268 | logger=logger, 269 | http_method=HTTPMethod.GET, 270 | ms_wait=ms_wait, 271 | data=data, 272 | ) 273 | 274 | main_page_html = request_page_with_retry( 275 | session=session, 276 | url=base_url, 277 | verification_text="ssSearchHyperlink", 278 | logger=logger, 279 | http_method=HTTPMethod.GET, 280 | ms_wait=ms_wait, 281 | ) 282 | main_soup = BeautifulSoup(main_page_html, "html.parser") 283 | except Exception: 284 | logger.exception("Error scraping main page HTML.") 285 | raise 286 | return main_page_html, main_soup 287 | 288 | def scrape_search_page( 289 | self, 290 | base_url: str, 291 | odyssey_version: int, 292 | main_page_html: str, 293 | main_soup: BeautifulSoup, 294 | session: requests.sessions.Session, 295 | logger: logging.Logger, 296 | ms_wait: int, 297 | court_calendar_link_text: str 298 | ) -> Tuple[str, str, BeautifulSoup]: 299 | """ 300 | Scrapes the search page URL and data based on the main page content. 301 | 302 | This method extracts the search page ID from the court calendar link, constructs the URL for the search page, 303 | and retrieves the search page HTML. Depending on the Odyssey version, it either uses the extracted URL or a 304 | default URL. It then parses the search page HTML into a BeautifulSoup object. 305 | 306 | :param base_url: The base URL for constructing full URLs. 307 | :param odyssey_version: The version of Odyssey, used to determine the correct URL and verification text. 308 | :param main_page_html: The HTML content of the main page. 309 | :param main_soup: Parsed BeautifulSoup object of the main page HTML. 310 | :param session: The session object for making HTTP requests. 311 | :param logger: Logger instance for logging errors and information. 312 | :param ms_wait: Milliseconds to wait before making requests. 313 | :param court_calendar_link_text: Text to search for in the court calendar link. 314 | :returns: A tuple containing the search page URL, search page HTML, and the BeautifulSoup object of the search page. 315 | :raises ValueError: If the court calendar link is not found on the main page.
316 | """ 317 | 318 | # Extract the search page ID from the court calendar link 319 | search_page_id = None 320 | for link in main_soup.select("a.ssSearchHyperlink"): 321 | if court_calendar_link_text in link.text: 322 | search_page_id = link["href"].split("?ID=")[1].split("'")[0] 323 | break # Exit loop once the link is found 324 | 325 | if not search_page_id: 326 | write_debug_and_quit( 327 | verification_text="Court Calendar link", 328 | page_text=main_page_html, 329 | logger=logger, 330 | ) 331 | raise ValueError("Court Calendar link not found on the main page.") 332 | 333 | # Build the URL for the search page 334 | search_url = f"{base_url}Search.aspx?ID={search_page_id}" 335 | 336 | # Determine the correct URL and verification text based on Odyssey version 337 | if odyssey_version < 2017: 338 | # Pre-2017 portals use the Search.aspx URL built above as-is 339 | verification_text = "Court Calendar" 340 | else: 341 | search_url = urllib.parse.urljoin(base_url, "Home/Dashboard/26") 342 | verification_text = "SearchCriteria.SelectedCourt" 343 | 344 | # Hit the search page to gather initial data 345 | search_page_html = request_page_with_retry( 346 | session=session, 347 | url=search_url, 348 | verification_text=verification_text, 349 | http_method=HTTPMethod.GET, 350 | logger=logger, 351 | ms_wait=ms_wait, 352 | ) 353 | search_soup = BeautifulSoup(search_page_html, "html.parser") 354 | 355 | return search_url, search_page_html, search_soup 356 | 357 | def get_hidden_values( 358 | self, 359 | odyssey_version: int, 360 | main_soup: BeautifulSoup, 361 | search_soup: BeautifulSoup, 362 | logger: logging.Logger 363 | ) -> Dict[str, str]: 364 | """ 365 | Extracts hidden input values and additional data from the search page. 366 | 367 | :param odyssey_version: The version of Odyssey to determine logic. 368 | :param main_soup: Parsed BeautifulSoup object of the main page HTML. 369 | :param search_soup: Parsed BeautifulSoup object of the search page HTML. 370 | :param logger: Logger instance for logging information. 371 | :returns: Dictionary of hidden input names and their values. 372 | """ 373 | 374 | # Extract hidden input values 375 | hidden_values = { 376 | hidden["name"]: hidden["value"] 377 | for hidden in search_soup.select('input[type="hidden"]') 378 | if hidden.has_attr("name") 379 | } 380 | 381 | # Get NodeDesc and NodeID information based on Odyssey version 382 | if odyssey_version < 2017: 383 | location_option = main_soup.find_all("option")[0] 384 | logger.info(f"Location: {location_option.text}") 385 | hidden_values.update({ 386 | "NodeDesc": location_option.text, 387 | "NodeID": location_option["value"] 388 | }) 389 | else: 390 | hidden_values["SearchCriteria.SelectedCourt"] = hidden_values.get("Settings.DefaultLocation", "") 391 | 392 | return hidden_values 393 | 394 | def get_search_results( 395 | self, 396 | session: requests.sessions.Session, 397 | search_url: str, 398 | logger: logging.Logger, 399 | ms_wait: int, 400 | hidden_values: Dict[str, str], 401 | case_number: Optional[str] 402 | ) -> BeautifulSoup: 403 | """ 404 | Retrieves search results from the search page. 405 | 406 | :param session: The session object for making HTTP requests. 407 | :param search_url: The URL to request search results from. 408 | :param logger: Logger instance for logging information. 409 | :param ms_wait: Milliseconds to wait before making requests. 410 | :param hidden_values: Dictionary of hidden input values. 411 | :param case_number: Case number for searching.
412 | :returns: Parsed BeautifulSoup object of the search results page HTML. 413 | """ 414 | 415 | results_page_html = request_page_with_retry( 416 | session=session, 417 | url=search_url, 418 | verification_text="Record Count", 419 | logger=logger, 420 | data=create_single_case_search_form_data(hidden_values, case_number), 421 | ms_wait=ms_wait, 422 | ) 423 | return BeautifulSoup(results_page_html, "html.parser") 424 | 425 | def scrape_individual_case( 426 | self, 427 | base_url: str, 428 | search_url: str, 429 | hidden_values: Dict[str, str], 430 | case_number: Optional[str], 431 | case_html_path: str, 432 | session: requests.sessions.Session, 433 | logger: logging.Logger, 434 | ms_wait: int 435 | ) -> None: 436 | """Searches for a single case number and saves the first matching case detail page as HTML in case_html_path.""" 437 | results_soup = self.get_search_results(session, search_url, logger, ms_wait, hidden_values, case_number) 438 | case_urls = [ 439 | base_url + anchor["href"] 440 | for anchor in results_soup.select('a[href^="CaseDetail"]') 441 | ] 442 | 443 | logger.info(f"{len(case_urls)} entries found") 444 | 445 | if case_urls: 446 | case_id = case_urls[0].split("=")[1] 447 | logger.info(f"{case_id} - scraping case") 448 | 449 | case_html = request_page_with_retry( 450 | session=session, 451 | url=case_urls[0], 452 | verification_text="Date Filed", 453 | logger=logger, 454 | ms_wait=ms_wait, 455 | ) 456 | 457 | logger.info(f"{len(case_html)} response string length") 458 | 459 | with open( 460 | os.path.join(case_html_path, f"{case_id}.html"), "w" 461 | ) as file_handle: 462 | file_handle.write(case_html) 463 | else: 464 | logger.warning("No case URLs found.") 465 | 466 | def scrape_jo_list( 467 | self, 468 | odyssey_version: int, 469 | search_soup: BeautifulSoup, 470 | judicial_officers: Optional[List[str]], 471 | logger: logging.Logger 472 | ) -> Tuple[List[str], Dict[str, str]]: 473 | """ 474 | Scrapes a list of judicial officers and their IDs from the search page. 475 | 476 | Optionally receives a list of judicial officers to scrape. 477 | 478 | :param odyssey_version: The version of Odyssey to determine the selector. 479 | :param search_soup: Parsed BeautifulSoup object of the search page HTML. 480 | :param judicial_officers: List of specific judicial officers to use. 481 | :param logger: Logger instance for logging information. 482 | :returns: Tuple containing a list of judicial officers to use and a dictionary of judicial officers and their IDs. 483 | """ 484 | 485 | selector = 'select[labelname="Judicial Officer:"] > option' if odyssey_version < 2017 else 'select[id="selHSJudicialOfficer"] > option' 486 | judicial_officer_to_ID = { 487 | option.text: option["value"] 488 | for option in search_soup.select(selector) 489 | if option.text 490 | } 491 | 492 | if not judicial_officers: 493 | judicial_officers = list(judicial_officer_to_ID.keys()) 494 | logger.info(f"No judicial officers specified, so scraping all of them: {len(judicial_officers)}") 495 | else: 496 | logger.info(f"Judicial officers were specified, so only scraping these: {judicial_officers}") 497 | 498 | return judicial_officers, judicial_officer_to_ID 499 | 500 | def scrape_results_page( 501 | self, 502 | odyssey_version: int, 503 | base_url: str, 504 | search_url: str, 505 | hidden_values: Dict[str, str], 506 | jo_id: str, 507 | date_string: str, 508 | session: requests.sessions.Session, 509 | logger: logging.Logger, 510 | ms_wait: int 511 | ) -> Tuple[str, BeautifulSoup]: 512 | """ 513 | Scrapes the results page based on Odyssey version and search criteria.
514 | 515 | :param odyssey_version: The version of Odyssey to determine the URL and verification text. 516 | :param base_url: The base URL for constructing full URLs. 517 | :param search_url: The URL to request search results from. 518 | :param hidden_values: Dictionary of hidden input values. 519 | :param jo_id: Judicial officer ID for searching. 520 | :param date_string: Date string for searching. 521 | :param session: The session object for making HTTP requests. 522 | :param logger: Logger instance for logging information. 523 | :param ms_wait: Milliseconds to wait before making requests. 524 | :returns: A tuple containing the HTML of the results page and the parsed BeautifulSoup object. 525 | """ 526 | 527 | search_url = ( 528 | search_url 529 | if odyssey_version < 2017 530 | else urllib.parse.urljoin(base_url, "Hearing/SearchHearings/HearingSearch") 531 | ) 532 | 533 | verification_text = ( 534 | "Record Count" 535 | if odyssey_version < 2017 536 | else "Search Results" 537 | ) 538 | 539 | results_page_html = request_page_with_retry( 540 | session=session, 541 | url=search_url, 542 | verification_text=verification_text, 543 | logger=logger, 544 | data=create_search_form_data(date_string, jo_id, hidden_values, odyssey_version), 545 | ms_wait=ms_wait, 546 | ) 547 | 548 | results_soup = BeautifulSoup(results_page_html, "html.parser") 549 | 550 | return results_page_html, results_soup 551 | 552 | def scrape_multiple_cases( 553 | self, 554 | county: str, 555 | odyssey_version: int, 556 | base_url: str, 557 | search_url: str, 558 | hidden_values: Dict[str, str], 559 | judicial_officers: List[str], 560 | judicial_officer_to_ID: Dict[str, str], 561 | case_html_path: Optional[str], 562 | logger: logging.Logger, 563 | session: requests.Session, 564 | ms_wait: int, 565 | start_date: str, 566 | end_date: str 567 | ) -> None: 568 | start_date = datetime.strptime(start_date, '%Y-%m-%d').date() 569 | end_date = datetime.strptime(end_date, '%Y-%m-%d').date() 570 | 571 | for date in (start_date + timedelta(n) for n in range((end_date - start_date).days + 1)): 572 | date_string = date.strftime("%m/%d/%Y") 573 | 574 | for JO_name in judicial_officers: 575 | if JO_name not in judicial_officer_to_ID: 576 | logger.error(f"Judicial officer {JO_name} not found on search page. Continuing.") 577 | continue 578 | 579 | jo_id = judicial_officer_to_ID[JO_name] 580 | logger.info(f"Searching cases on {date_string} for {JO_name}") 581 | 582 | results_page_html, results_soup = self.scrape_results_page( 583 | odyssey_version, base_url, search_url, hidden_values, jo_id, date_string, session, logger, ms_wait 584 | ) 585 | 586 | scraper_instance, scraper_function = self.get_class_and_method(county, logger) 587 | logger.debug(f"Resolved county scraper method: {scraper_function}") 588 | scraper_function(base_url, results_soup, case_html_path, logger, session, ms_wait) 589 | 590 | def scrape( 591 | self, 592 | county: str, 593 | judicial_officers: List[str], 594 | ms_wait: int, 595 | start_date: str, 596 | end_date: str, 597 | court_calendar_link_text: Optional[str], 598 | case_number: Optional[str], ssl: Optional[bool], 599 | case_html_path: Optional[str] 600 | ) -> None: 601 | ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = self.set_defaults( 602 | ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path 603 | ) 604 | 605 | logger = self.configure_logger() 606 | county = self.format_county(county) 607 | session = self.create_session(logger, ssl) 608 | 609 | # set_defaults always returns a case_html_path, so just ensure the directory exists 610 | self.make_directories(county, logger, case_html_path) 611 | 612 | base_url, odyssey_version, notes = self.get_ody_link(county, logger) 613 | main_page_html, main_soup = self.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) 614 | search_url, search_page_html, search_soup = self.scrape_search_page( 615 | base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text 616 | ) 617 | 618 | hidden_values = self.get_hidden_values(odyssey_version, main_soup, search_soup, logger) 619 | 620 | if case_number: 621 | self.scrape_individual_case( 622 | base_url, search_url, hidden_values, case_number, case_html_path, session, logger, ms_wait 623 | ) 624 | else: 625 | judicial_officers, judicial_officer_to_ID = self.scrape_jo_list( 626 | odyssey_version, search_soup, judicial_officers, logger 627 | ) 628 | scraper_start_time = time() 629 | self.scrape_multiple_cases( 630 | county, odyssey_version, base_url, search_url, hidden_values, judicial_officers, judicial_officer_to_ID, 631 | case_html_path, logger, session, ms_wait, start_date, end_date 632 | ) 633 | logger.info(f"\nTime to run script: {round(time() - scraper_start_time, 2)} seconds") 634 | --------------------------------------------------------------------------------
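A minimal usage sketch for the Scraper class (illustrative only, not a file in this repository; it assumes the scrape signature above, where ssl is an explicit parameter and None arguments fall back to the defaults in set_defaults):

from src.scraper import Scraper

scraper = Scraper()
scraper.scrape(
    county="hays",                  # the only county marked scrape=yes in resources/texas_county_data.csv
    judicial_officers=[],           # an empty list makes scrape_jo_list fall back to every officer on the search page
    ms_wait=None,                   # -> 200 ms between requests
    start_date="2024-07-01",
    end_date="2024-07-03",
    court_calendar_link_text=None,  # -> "Court Calendar"
    case_number=None,               # None runs the calendar search; a case number would trigger scrape_individual_case
    ssl=None,                       # -> SSL verification enabled
    case_html_path=None,            # -> data/hays/case_html under the repository root
)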