├── prepline_sec_filings ├── __init__.py ├── api │ ├── __init__.py │ ├── app.py │ └── section.py ├── sections.py └── fetch.py ├── preprocessing-pipeline-family.yaml ├── setup.cfg ├── scripts ├── shellcheck.sh ├── docker-build.sh ├── test-doc-pipeline-apis-consistent.sh ├── check-and-format-notebooks.py └── version-sync.sh ├── img └── unstructured_logo.png ├── requirements ├── dev.in ├── base.in ├── test.in ├── test.txt ├── base.txt └── dev.txt ├── .github ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ └── ci.yml ├── test_real_docs ├── fixtures │ └── list-item-counts.json ├── generate_first_last.py └── test_real_examples.py ├── logger_config.yaml ├── test_utils ├── README-generating-validation-csvs.md ├── symbols-for-validation-csvs.txt ├── examples.json ├── get_sec_docs_from_edgar.py └── create_validation_csv_files.py ├── test_sec_filings_integration └── test_notebooks.py ├── Dockerfile ├── CHANGELOG.md ├── .gitignore ├── sample-docs └── sample-sec-docs.sha256 ├── Makefile ├── test_sec_filings ├── test_fetch.py ├── sec_filings │ └── test_section_api.py └── test_sec_document.py ├── exploration-notebooks ├── exploration-s1-risks.ipynb ├── exploration-TOC-action.ipynb └── exploration-10k-risks.ipynb ├── LICENSE.md └── README.md /prepline_sec_filings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prepline_sec_filings/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: sec-filings 2 | version: 0.2.1 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = 4 | prepline_sec_filings/api 5 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-sec-filings/HEAD/img/unstructured_logo.png -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | jupyter 4 | mypy 5 | pip-tools 6 | # NOTE(crag): consistency with unstructured-api-tools. pinned for a reason, see there. 
7 | ipython==8.8.0 8 | 9 | # NOTE(robinson) - Required pins for security scans 10 | jupyter-core>=4.11.2 11 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | unstructured==0.2.5 2 | unstructured_api_tools>=0.10.6 3 | 4 | ratelimit 5 | requests 6 | numpy 7 | scikit-learn 8 | 9 | # NOTE(robinson) - Required pins for security scans 10 | jupyter-core>=5.3.0 11 | 12 | # We need newer versions of these for deps 13 | traitlets>=5.6.0 14 | packaging>=22.0 15 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | DOCKER_BUILDKIT=1 docker buildx build --load --platform=linux/amd64 -f Dockerfile \ 6 | --build-arg PIP_VERSION="$PIP_VERSION" \ 7 | --build-arg PIPELINE_PACKAGE="$PIPELINE_PACKAGE" \ 8 | --progress plain \ 9 | -t pipeline-family-"$PIPELINE_FAMILY"-dev:latest . 10 | -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | black>=22.3.0 2 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black 3 | # can remove after black drops support for Python 3.6 4 | # ref: https://github.com/psf/black/issues/2964 5 | click>=8.1 6 | flake8 7 | httpx 8 | mypy 9 | pytest-cov 10 | nbdev 11 | ipykernel 12 | 13 | # NOTE(robinson) - Required pins for security scans 14 | jupyter-core>=4.11.2 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/requirements" 5 | schedule: 6 | interval: "monthly" 7 | 8 | - package-ecosystem: "github-actions" 9 | # NOTE(robinson) - Workflow files stored in the 10 | # default location of `.github/workflows` 11 | directory: "/" 12 | schedule: 13 | interval: "monthly" 14 | -------------------------------------------------------------------------------- /test_real_docs/fixtures/list-item-counts.json: -------------------------------------------------------------------------------- 1 | { 2 | "hlvx": 13, 3 | "blco": 13, 4 | "mrk": 0, 5 | "aust": 3, 6 | "ee": 5, 7 | "nke": 6, 8 | "pepg": 9, 9 | "msex": 0, 10 | "v": 6, 11 | "cvs": 42, 12 | "doc": 0, 13 | "smtc": 0, 14 | "cl": 7, 15 | "ava": 0, 16 | "bc": 4, 17 | "f": 0, 18 | "lmt": 0, 19 | "cri": 12, 20 | "asns": 4, 21 | "aig": 3, 22 | "rgld": 29, 23 | "apld": 9, 24 | "omcl": 0, 25 | "mmm": 1, 26 | "bgs": 3, 27 | "ehc": 11, 28 | "dis": 7, 29 | "wetg": 4, 30 | "bj": 7, 31 | "brks": 0 32 | } 33 | -------------------------------------------------------------------------------- /logger_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | default_format: 5 | "()": uvicorn.logging.DefaultFormatter 6 | format: '%(asctime)s %(name)s %(levelname)s %(message)s' 7 | access: 8 | "()": uvicorn.logging.AccessFormatter 9 | format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s' 10 | handlers: 11 | access_handler: 12 | formatter: access 13 | class: logging.StreamHandler 14 | stream: ext://sys.stderr 15 | standard_handler: 16 | formatter: 
default_format 17 | class: logging.StreamHandler 18 | stream: ext://sys.stderr 19 | loggers: 20 | uvicorn.error: 21 | level: INFO 22 | handlers: 23 | - standard_handler 24 | propagate: no 25 | # to disable logging for uvicorn.error, remove its handler above 26 | uvicorn.access: 27 | level: INFO 28 | handlers: 29 | - access_handler 30 | propagate: no 31 | # to disable logging for uvicorn.access, remove its handler above 32 | unstructured: 33 | level: INFO 34 | handlers: 35 | - standard_handler 36 | propagate: no 37 | 38 | -------------------------------------------------------------------------------- /test_utils/README-generating-validation-csvs.md: -------------------------------------------------------------------------------- 1 | # Downloading CSVs with all sections extracted for filings 2 | 3 | ## Step 1: Download filings from Edgar 4 | 5 | Given a list of symbols (tickers or CIKs) and the form type to download in $FILINGS_MANIFEST_FILE, save the resulting files and a manifest JSON in $SEC_DOCS_DIR. 6 | 7 | ``` 8 | # needed for Edgar's API 9 | export SEC_API_ORGANIZATION= 10 | export SEC_API_EMAIL= 11 | 12 | PYTHONPATH=. SEC_DOCS_DIR=sec-filing-downloads \ 13 | FILINGS_MANIFEST_FILE=test_utils/symbols-for-validation-csvs.txt \ 14 | python test_utils/get_sec_docs_from_edgar.py 15 | ``` 16 | 17 | ## Step 2: Generate validation CSVs from the downloaded files and manifest JSON 18 | 19 | ``` 20 | PYTHONPATH=. SEC_DOCS_DIR=sec-filing-downloads/ CSV_FILES_DIR=validation-csvs python \ 21 | test_utils/create_validation_csv_files.py 22 | ``` 23 | 24 | Note that you may also provide the following env vars in the command above: 25 | 26 | * `PIPELINE_SECTION_API_URL` - defaults to the local API. 27 | * `FILINGS_MANIFEST_JSON` - the list of filings to create CSVs for. Defaults to $SEC_DOCS_DIR/sec_docs_manifest.json, which is written in step 1. -------------------------------------------------------------------------------- /prepline_sec_filings/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
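# (Generated from the notebooks in pipeline-notebooks/ via `unstructured_api_tools convert-pipeline-notebooks`; edit those and regenerate instead.)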
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | import logging 9 | import os 10 | 11 | from .section import router as section_router 12 | 13 | 14 | app = FastAPI( 15 | title="Unstructured Pipeline API", 16 | description="""""", 17 | version="1.0.0", 18 | docs_url="/sec-filings/docs", 19 | openapi_url="/sec-filings/openapi.json", 20 | ) 21 | 22 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 23 | if allowed_origins: 24 | from fastapi.middleware.cors import CORSMiddleware 25 | 26 | app.add_middleware( 27 | CORSMiddleware, 28 | allow_origins=allowed_origins.split(","), 29 | allow_methods=["OPTIONS", "POST"], 30 | allow_headers=["Content-Type"], 31 | ) 32 | 33 | app.include_router(section_router) 34 | 35 | 36 | # Filter out /healthcheck noise 37 | class HealthCheckFilter(logging.Filter): 38 | def filter(self, record: logging.LogRecord) -> bool: 39 | return record.getMessage().find("/healthcheck") == -1 40 | 41 | 42 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 43 | 44 | 45 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 46 | def healthcheck(request: Request): 47 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 48 | -------------------------------------------------------------------------------- /test_sec_filings_integration/test_notebooks.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import pytest 4 | from typing import List 5 | import sys 6 | 7 | if sys.version_info < (3, 8): 8 | from typing_extensions import Final 9 | else: 10 | from typing import Final 11 | 12 | import nbformat 13 | from nbconvert.preprocessors import ExecutePreprocessor 14 | 15 | TIMEOUT: Final[int] = 600 # in seconds 16 | 17 | DIRECTORY: Final[Path] = Path(__file__).absolute().parent 18 | PIPELINE_NB_DIR: Final[str] = os.path.join(DIRECTORY, "..", "pipeline-notebooks") 19 | 20 | 21 | def run_notebook_directory(directory: str): 22 | """Executes all of the notebooks in a test directory.
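Each notebook is run top to bottom with nbconvert's ExecutePreprocessor, with a TIMEOUT-second limit per notebook.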
Tests that at least one cell 23 | was executed in every notebook.""" 24 | notebook_files = [file for file in os.listdir(directory) if file.endswith(".ipynb")] 25 | for notebook_file in notebook_files: 26 | filename = os.path.join(directory, notebook_file) 27 | 28 | with open(filename) as f: 29 | notebook = nbformat.read(f, as_version=4) 30 | 31 | executor = ExecutePreprocessor(timeout=TIMEOUT) 32 | executed_notebook, _ = executor.preprocess(notebook) 33 | 34 | execution_counts: List[int] = list() 35 | for cell in executed_notebook["cells"]: 36 | execution_count = cell.get("execution_count", None) 37 | if isinstance(execution_count, int): 38 | execution_counts.append(execution_count) 39 | 40 | assert len(execution_counts) > 0 41 | 42 | 43 | @pytest.mark.parametrize("directory", [(PIPELINE_NB_DIR)]) 44 | def test_notebooks(directory): 45 | # NOTE(robinson) - The expectation is that all the notebooks will execute completely 46 | # without errors 47 | run_notebook_directory(directory) 48 | -------------------------------------------------------------------------------- /test_utils/symbols-for-validation-csvs.txt: -------------------------------------------------------------------------------- 1 | # large cap 10-Q 2 | abt 10-Q 3 | amzn 10-Q 4 | mo 10-Q 5 | c 10-Q 6 | cat 10-Q 7 | dis 10-Q 8 | nflx 10-Q 9 | tmus 10-Q 10 | # mid cap 10-Q 11 | wolf 10-Q 12 | jazz 10-Q 13 | seic 10-Q 14 | rh 10-Q 15 | pdce 10-Q 16 | amkr 10-Q 17 | wen 10-Q 18 | tdc 10-Q 19 | fl 10-Q 20 | enr 10-Q 21 | # small cap 10-Q 22 | lthm 10-Q 23 | skyw 10-Q 24 | kfy 10-Q 25 | oi 10-Q 26 | b 10-Q 27 | ktb 10-Q 28 | chuy 10-Q 29 | lpsn 10-Q 30 | gci 10-Q 31 | abtx 10-Q 32 | # selected since more recent filing is a 10-Q/A as of Sept 2022 33 | adra 10-Q 34 | # large cap 10-K 35 | exc 10-K 36 | pkg 10-K 37 | hpe 10-K 38 | aiz 10-K 39 | rok 10-K 40 | ben 10-K 41 | gl 10-K 42 | all 10-K 43 | rost 10-K 44 | sivb 10-K 45 | # mid cap 10-K 46 | syna 10-K 47 | x 10-K 48 | oln 10-K 49 | sfm 10-K 50 | smg 10-K 51 | wso 10-K 52 | sam 10-K 53 | wwd 10-K 54 | mms 10-K 55 | mlkn 10-K 56 | # small cap 10-K 57 | tbbk 10-K 58 | rdnt 10-K 59 | ueic 10-K 60 | atni 10-K 61 | cwt 10-K 62 | pke 10-K 63 | zyxi 10-K 64 | klic 10-K 65 | mdc 10-K 66 | nbhc 10-K 67 | # S-1 recent filing CIK's per search 2022-09-18 68 | # https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=s-1&company=&dateb=&owner=include&start=0&count=80&output=atom 69 | 0001156784 S-1 70 | 0001912287 S-1 71 | 0001398805 S-1 72 | 0001886894 S-1 73 | 0001638287 S-1 74 | 0001893448 S-1 75 | 0001144879 S-1 76 | 0001839412 S-1 77 | 0001707079 S-1 78 | 0001704795 S-1 79 | 0001425627 S-1 80 | 0001861063 S-1 81 | 0001726711 S-1 82 | 0001841800 S-1 83 | 0001074828 S-1 84 | 0001895144 S-1 85 | 0001450704 S-1 86 | 0001076262 S-1 87 | 0001726711 S-1 88 | 0001527352 S-1 89 | 90 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | from centos:centos7.9.2009 4 | 5 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 6 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 7 | ARG NB_USER=notebook-user 8 | ARG NB_UID=1000 9 | ARG PIP_VERSION 10 | ARG PIPELINE_PACKAGE 11 | 12 | RUN yum -y update && \ 13 | yum -y install gcc openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \ 14 | curl -O https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \ 15 
| cd Python-3.8.15/ && ./configure --enable-optimizations && make altinstall && \ 16 | cd .. && rm -rf Python-3.8.15* && \ 17 | ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 18 | 19 | # create user with a home directory 20 | ENV USER ${NB_USER} 21 | ENV HOME /home/${NB_USER} 22 | 23 | RUN groupadd --gid ${NB_UID} ${NB_USER} 24 | RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} 25 | USER ${NB_USER} 26 | WORKDIR ${HOME} 27 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 28 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 29 | 30 | COPY logger_config.yaml logger_config.yaml 31 | COPY requirements/dev.txt requirements-dev.txt 32 | COPY requirements/base.txt requirements-base.txt 33 | COPY prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 34 | COPY exploration-notebooks exploration-notebooks 35 | COPY pipeline-notebooks pipeline-notebooks 36 | 37 | 38 | # NOTE(robinson) - Can remove the secret mount once the unstructured repo is public 39 | # NOTE(crag) - Cannot use an ARG in the dst= path (so it seems), hence no ${NB_USER}, ${NB_UID} 40 | RUN python3.8 -m pip install pip==${PIP_VERSION} \ 41 | && pip3.8 install --no-cache -r requirements-base.txt \ 42 | && pip3.8 install --no-cache -r requirements-dev.txt \ 43 | && python3.8 -c "import nltk; nltk.download('punkt')" \ 44 | && python3.8 -c "import nltk; nltk.download('averaged_perceptron_tagger')" 45 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.2.1 2 | 3 | * Supports json responses suitable for Label Studio. 4 | * Allows a json list instead of a multipart response for multi-file requests 5 | * Supports text/csv responses instead of just json 6 | * More general (non-pipeline-specific) way of starting the app 7 | * Add alternative way of importing `Final` to support google colab 8 | * Dependency bumps 9 | 10 | ## 0.2.0 11 | 12 | * Updated section API to accept multiple text file uploads as `text_files` parameter.
13 | 14 | ## 0.1.0 15 | 16 | * Updated FastAPI param m_section -> section 17 | * API updated to support known filing sections rather than just risk factors 18 | * Updated interface to be compatible with new version of unstructured 19 | 20 | ## 0.0.3 21 | 22 | * Updated `match_s1_toc_title_to_section` for an exact match 23 | * Enumerated and added patterns for common 10-K/Q and S-1 sections 24 | * Refactor get risk narrative to allow capture of variable section 25 | * Naming conventions updated with "pipeline" terminology (no longer "recipe") 26 | * Various tweaks to parsing methods to improve capturing of risk section and TOC 27 | * Auto-generated api risk_narrative.py now lints (unstructured-api-tools) 28 | * Added get_table_of_contents to find TOC elements within SEC document (and tests) 29 | * Added helper functions for retrieving/opening documents from the SEC 30 | * Changed `unstructured_api` package to `unstructured_api_tools` 31 | * Rewrote `get_risk_narrative` to use the TOC 32 | * Added integration tests to verify capture of risk factors section 33 | 34 | ## 0.0.2 35 | 36 | * Pipeline now generates a FastAPI web application 37 | * Added logic to skip risk section if risk section is empty upon completion 38 | * Added different form types to unit tests, and added variation of forms that use a table of contents 39 | 40 | ## 0.0.1 41 | 42 | * Added make target to build the pipeline scripts 43 | * Change `doc_prep` package name to `unstructured` 44 | * Created pipeline for extracting the risk section from 10-K, 10-Q, and S-1 filings 45 | * Initial repo setup for SEC filings 46 | -------------------------------------------------------------------------------- /scripts/test-doc-pipeline-apis-consistent.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eu -o pipefail 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "$SCRIPT_DIR"/.. 7 | 8 | PIPELINE_OUTPUT_DIR=tmp-api-check-output-$RANDOM 9 | FILE_INDICATING_FAILURE="$PIPELINE_OUTPUT_DIR"-has-failures 10 | mkdir -p $PIPELINE_OUTPUT_DIR 11 | touch $PIPELINE_OUTPUT_DIR/__init__.py 12 | 13 | function tmp_pipeline_comp_cleanup () { 14 | cd "$SCRIPT_DIR"/.. 15 | rm -f "$FILE_INDICATING_FAILURE" 16 | if [[ "$1" -eq 0 ]]; then 17 | rm -rf $PIPELINE_OUTPUT_DIR 18 | fi 19 | exit "$1" 20 | } 21 | 22 | unstructured_api_tools convert-pipeline-notebooks \ 23 | --input-directory ./pipeline-notebooks \ 24 | --output-directory "$PIPELINE_OUTPUT_DIR" 25 | 26 | NUM_PIPELINE_API_FILES_GENERATED=$(find "$PIPELINE_OUTPUT_DIR" -name "*.py" | wc -l) 27 | 28 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -eq 0 ]]; then 29 | echo "No pipelines were created by unstructured_api_tools convert-pipeline-notebooks" 30 | tmp_pipeline_comp_cleanup 1 31 | fi 32 | 33 | NUM_EXISTING_PIPELINE_API_FILES=$(find "$PACKAGE_NAME"/api -name "*.py" | wc -l) 34 | 35 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -gt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 36 | echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 37 | tmp_pipeline_comp_cleanup 1 38 | elif [[ "$NUM_PIPELINE_API_FILES_GENERATED" -lt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 39 | echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 40 | tmp_pipeline_comp_cleanup 1 41 | fi 42 | 43 | cd "$PACKAGE_NAME"/api 44 | find . -name "*.py" -print0 | while IFS= read -r -d '' pipeline_file; do 45 | set +o pipefail 46 | if !
diff -u "$pipeline_file" ../../"$PIPELINE_OUTPUT_DIR/$pipeline_file"; then 47 | touch "../../$FILE_INDICATING_FAILURE" 48 | fi 49 | set -o pipefail 50 | done 51 | cd - 52 | 53 | if [ -r "$FILE_INDICATING_FAILURE" ]; then 54 | echo 55 | echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diffs" 56 | echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/" 57 | tmp_pipeline_comp_cleanup 1 58 | fi 59 | tmp_pipeline_comp_cleanup 0 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # VSCode 132 | .vscode/ 133 | 134 | # Mac 135 | .DS_Store 136 | 137 | # Example forms 138 | sample-sec-docs/ 139 | 140 | nbs/ 141 | 142 | # Celery files that are created when the mercury dashboard is run 143 | celery.sqlite 144 | celerybeat-schedule.db 145 | 146 | # temporarily generated files by project-specific Makefile 147 | tmp* 148 | 149 | # downloaded filings from experimental notebooks, for example 150 | *xbrl 151 | 152 | *.csv 153 | -------------------------------------------------------------------------------- /test_real_docs/generate_first_last.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | # File used to generate 'first' and 'last' for sample-first-last.json 5 | # from the downloaded forms (through "make dl-test-artifacts") 6 | 7 | # NOTE: This file is run from the root path of the repository 8 | 9 | from prepline_sec_filings.sec_document import SECDocument 10 | 11 | from prepline_sec_filings.sections import section_string_to_enum 12 | 13 | DIRECTORY = os.getcwd() 14 | 15 | RISK_FACTOR_XFAILS = ["aig", "bgs"] 16 | 17 | with open( 18 | os.path.join(DIRECTORY, "test_real_docs", "fixtures", "sample-first-last.json"), 19 | "r", 20 | ) as f: 21 | sample_first_last = json.load(f) 22 | 23 | with open(os.path.join("test_real_docs", "test_utils", "examples.json")) as f: 24 | examples = json.load(f) 25 | 26 | 27 | def get_file_from_ticker(ticker): 28 | cik = examples[ticker]["cik"] 29 | formtype = next(iter(examples[ticker]["forms"])) 30 | accession_number = examples[ticker]["forms"][formtype] 31 | with open( 32 | os.path.join( 33 | "test_real_docs", 34 | "sample-docs", 35 | f"{ticker}-{formtype}-{cik}-{accession_number}.xbrl", 36 | ) 37 | ) as f: 38 | out = f.read() 39 | return out 40 | 41 | 42 | tickers_10q = [ 43 | ticker for ticker in sample_first_last if "10-Q" in examples[ticker]["forms"] 44 | ] # filter only 10-Q docs 45 | 46 | 47 | def get_doc_elements(tickers): 48 | docs_all = {} 49 | for ticker in tickers: 50 | print("at ticker", ticker) 51 | text = get_file_from_ticker(ticker) 52 | doc = SECDocument.from_string(text).doc_after_cleaners(skip_headers_and_footers=True) 53 | docs_all[ticker] = {} 54 | docs_all[ticker]["doc"] = doc 55 | docs_all[ticker]["elements"] = doc.elements 56 | return docs_all 57 | 58 | 59 | def get_doc(docs_all, ticker): 60 | return docs_all[ticker]["doc"], docs_all[ticker]["elements"] 61 | 62 | 63 | sections = [ 64 | "FINANCIAL_STATEMENTS", # ITEM 1 65 | "MANAGEMENT_DISCUSSION", # ITEM 2 66 | "MARKET_RISK_DISCLOSURES", # ITEM 3 67 | "CONTROLS_AND_PROCEDURES", 68 | ] # ITEM 4 69 | 70 | 71 | def print_ticker(docs_all, ticker, sections=sections): 72 | doc, _ = get_doc(docs_all, ticker) 73 | print("### ", ticker, " ###") 74 | for section in sections: 75 | print("----", section, "-----") 76 | # skip if nothing is extracted 77 | if len(doc.get_section_narrative(section_string_to_enum[section])) == 0:
78 | continue 79 | print(doc.get_section_narrative(section_string_to_enum[section])[0]) # first 80 | print(doc.get_section_narrative(section_string_to_enum[section])[-1]) # last 81 | # for el in doc.get_section_narrative(section_string_to_enum[section]): 82 | # print('+',clean_sec_text(el.text)) 83 | 84 | 85 | docs_all = get_doc_elements(tickers_10q) 86 | 87 | for ticker in tickers_10q: 88 | print_ticker(docs_all, ticker) 89 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '23 21 * * 3' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v4 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /sample-docs/sample-sec-docs.sha256: -------------------------------------------------------------------------------- 1 | 5ecd0a02875508c69b46c8cc558b8b9901bc5a9bc3068401680f248d00167e7f sample-docs/aig-10-K-5272-000110465922024701.xbrl 2 | 34339a8914828791bbf0fad3d619b031d6fd428135a7069b237d716ee79052df sample-docs/apld-S-1-1144879-000110465921142627.xbrl 3 | 697bdcde08961f9288a89b965468369d0c0df852d77259f3c5729a7e0177bf7f sample-docs/asns-S-1-1141284-000121390022020064.xbrl 4 | e2e7c4ebb4006ef8f2efbeeb0bdab0f2e924bbd753c11ff907c44a012efeb22b sample-docs/aust-S-1-1817740-000110465921128425.xbrl 5 | 6db5134f2231d37f0b5d744b2bc3db413a32ac5555370a3b5c8f2ffe32553dfa sample-docs/ava-10-Q-104918-000095017021000739.xbrl 6 | 85181a1b87583159f86bf98127225ad27779e7d48ded2b1de4d51332c4658eb4 sample-docs/bc-10-Q-14930-000001493021000103.xbrl 7 | 429ff6e82fee2d54c0b51b75386d09fee74b68e003ae3a10aa7d8f921bb227f4 sample-docs/bgs-S-1-1278027-000104746904003937.xbrl 8 | 70a7883d199ff125eb767424a017320591c82f9b6bb9bbf4141760d1dfdfe1ab sample-docs/bj-S-1-1531152-000119312519032591.xbrl 9 | 396de04f2546df697f5b45334a2328556eaeaf8bcdc5dd88d55039c8f79f7b94 sample-docs/blco-S-1-1860742-000119312522008667.xbrl 10 | 1a810c8b67be0debf18f4111e88d48b5f50d7ff0f5db9bf20f5df6f890b9a266 sample-docs/brks-10-Q-933974-000155837021006699.xbrl 11 | 9c673ad2f1b446d1b82d11207c6305b55220bdffb6cf403977dcb9f006601781 sample-docs/cl-10-Q-21665-000002166522000010.xbrl 12 | 3a41f0cef88bea6848ab68b80d1379ffe52becaab236feb550a9eee18402bce1 sample-docs/cri-10-K-1060822-000106082222000096.xbrl 13 | 1e7bf289bb40be4befff79c3a9a2d6d14a1c49fff54e094a0855cc51a66dd613 sample-docs/cvs-10-K-64803-000006480322000008.xbrl 14 | 6d105033564a5e3be0575a7b0488c3bf60a4204b724c2b761d6ada23589080be sample-docs/dis-10-K-1744489-000174448921000220.xbrl 15 | 0e861b443d4c6e8e1d0f81f319ff7aae79af7b0d365c97646e03f56a391ff432 sample-docs/doc-10-Q-1574540-000157454021000146.xbrl 16 | cc78e478fc7cf839e3327227d73b1d197e12eca42f7bf615c77f84a167a4333b sample-docs/ee-S-1-1888447-000114036122000986.xbrl 17 | 73bf9bf74776a6ac5b27757c617c0a9392c4a1d7bc13bc952738a66fad878fae sample-docs/ehc-10-K-785161-000078516122000008.xbrl 18 | 48bbe980eafb779417e08cb1772a60043c095cee26ef6864d821ecde647bd70d sample-docs/f-10-Q-37996-000003799622000024.xbrl 19 | af29fd3e7d51f4c535814ed226513197658800d4348c9245e18a868f00427a2d sample-docs/hlvx-S-1-1888012-000119312522097505.xbrl 20 | 69dbdf8134fc8933dbe8fbb7d793ea3f02d9bd8dfafdd327651e65abd3bddad7 sample-docs/lmt-10-K-936468-000093646822000008.xbrl 21 | bc9e17ff46da6e6017c4d1bc0307c20a5dfcfd4b455cdf2c1adb737fe5b03f12 sample-docs/mmm-10-Q-66740-000006674022000065.xbrl 22 | b705207be1f9164e7f731fa5738e15205e3ab8d589f3fc170b88d50d2335aed7 sample-docs/mrk-10-Q-310158-000031015821000028.xbrl 23 | 18069c61245d5110f983407829149592acbc0932d5e715f02bd45705e8dfdd6e sample-docs/msex-10-K-66004-000117494722000283.xbrl 24 | 8994d75c07ce9e66a968b0cbe0146b265805a7544d10087ff7c68085e6b06f7c sample-docs/nke-10-K-320187-000032018722000038.xbrl 25 | d3e500d6b861c291fd8a0eb53b98470d2108c432688a27614f4468e4f4514091 sample-docs/omcl-10-Q-926326-000092632622000014.xbrl 26 | 
9fd0e54840a65723ff82f325cbf9e19697fdea99c066334be0580777ecfb469a sample-docs/pepg-S-1-1835597-000119312522106884.xbrl 27 | 893d69a8ae134aa723da7df0b4e4fd93598ca0d44e4954eabf7ad00636e47763 sample-docs/rgld-10-K-85535-000155837021011343.xbrl 28 | a127c6759870e49ed0e687ad4b907f2bdbbb8a15424859dfd86042b173e36aad sample-docs/smtc-10-K-88941-000008894122000006.xbrl 29 | db77a55421cbbb33dcb1ff02e5e7508eef78956565004871c42c8d495e1b5e36 sample-docs/v-10-Q-1403161-000140316122000027.xbrl 30 | 02b834ff2c0ab4e7768d52811ab963e7cb74d0cadb4b06d8be5d9f9d143ea04d sample-docs/wetg-S-1-1784970-000147793221000299.xbrl 31 | -------------------------------------------------------------------------------- /test_utils/examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "mmm": { 3 | "cik": "66740", 4 | "forms": { 5 | "10-Q": "000006674022000065" 6 | } 7 | }, 8 | "aig": { 9 | "cik": "5272", 10 | "forms": { 11 | "10-K": "000110465922024701" 12 | } 13 | }, 14 | "cl": { 15 | "cik": "21665", 16 | "forms": { 17 | "10-Q": "000002166522000010" 18 | } 19 | }, 20 | "cvs": { 21 | "cik": "64803", 22 | "forms": { 23 | "10-K": "000006480322000008" 24 | } 25 | }, 26 | "f": { 27 | "cik": "37996", 28 | "forms": { 29 | "10-Q": "000003799622000024" 30 | } 31 | }, 32 | "lmt": { 33 | "cik": "936468", 34 | "forms": { 35 | "10-K": "000093646822000008" 36 | } 37 | }, 38 | "mrk": { 39 | "cik": "310158", 40 | "forms": { 41 | "10-Q": "000031015821000028" 42 | } 43 | }, 44 | "nke": { 45 | "cik": "320187", 46 | "forms": { 47 | "10-K": "000032018722000038" 48 | } 49 | }, 50 | "v": { 51 | "cik": "1403161", 52 | "forms": { 53 | "10-Q": "000140316122000027" 54 | } 55 | }, 56 | "dis": { 57 | "cik": "1744489", 58 | "forms": { 59 | "10-K": "000174448921000220" 60 | } 61 | }, 62 | "brks": { 63 | "cik": "933974", 64 | "forms": { 65 | "10-Q": "000155837021006699" 66 | } 67 | }, 68 | "rgld": { 69 | "cik": "85535", 70 | "forms": { 71 | "10-K": "000155837021011343" 72 | } 73 | }, 74 | "bc": { 75 | "cik": "14930", 76 | "forms": { 77 | "10-Q": "000001493021000103" 78 | } 79 | }, 80 | "cri": { 81 | "cik": "1060822", 82 | "forms": { 83 | "10-K": "000106082222000096" 84 | } 85 | }, 86 | "doc": { 87 | "cik": "1574540", 88 | "forms": { 89 | "10-Q": "000157454021000146" 90 | } 91 | }, 92 | "pepg": { 93 | "cik": "1835597", 94 | "forms": { 95 | "S-1": "000119312522106884" 96 | } 97 | }, 98 | "ehc": { 99 | "cik": "785161", 100 | "forms": { 101 | "10-K": "000078516122000008" 102 | } 103 | }, 104 | "bj": { 105 | "cik": "1531152", 106 | "forms": { 107 | "S-1": "000119312519032591" 108 | } 109 | }, 110 | "omcl": { 111 | "cik": "926326", 112 | "forms": { 113 | "10-Q": "000092632622000014" 114 | } 115 | }, 116 | "smtc": { 117 | "cik": "88941", 118 | "forms": { 119 | "10-K": "000008894122000006" 120 | } 121 | }, 122 | "ava": { 123 | "cik": "104918", 124 | "forms": { 125 | "10-Q": "000095017021000739" 126 | } 127 | }, 128 | "msex": { 129 | "cik": "66004", 130 | "forms": { 131 | "10-K": "000117494722000283" 132 | } 133 | }, 134 | "bgs": { 135 | "cik": "1278027", 136 | "forms": { 137 | "S-1": "000104746904003937" 138 | } 139 | }, 140 | "aust": { 141 | "cik": "1817740", 142 | "forms": { 143 | "S-1": "000110465921128425" 144 | } 145 | }, 146 | "wetg": { 147 | "cik": "1784970", 148 | "forms": { 149 | "S-1": "000147793221000299" 150 | } 151 | }, 152 | "hlvx": { 153 | "cik": "1888012", 154 | "forms": { 155 | "S-1": "000119312522097505" 156 | } 157 | }, 158 | "apld": { 159 | "cik": "1144879", 160 | "forms": { 161 | "S-1": 
"000110465921142627" 162 | } 163 | }, 164 | "asns": { 165 | "cik": "1141284", 166 | "forms": { 167 | "S-1": "000121390022020064" 168 | } 169 | }, 170 | "ee": { 171 | "cik": "1888447", 172 | "forms": { 173 | "S-1": "000114036122000986" 174 | } 175 | }, 176 | "blco": { 177 | "cik": "1860742", 178 | "forms": { 179 | "S-1": "000119312522008667" 180 | } 181 | } 182 | } -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/test.in 6 | # 7 | anyio==3.7.0 8 | # via httpcore 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | asttokens==2.2.1 14 | # via 15 | # nbdev 16 | # stack-data 17 | astunparse==1.6.3 18 | # via nbdev 19 | backcall==0.2.0 20 | # via ipython 21 | black==23.3.0 22 | # via -r requirements/test.in 23 | certifi==2023.5.7 24 | # via 25 | # httpcore 26 | # httpx 27 | click==8.1.3 28 | # via 29 | # -r requirements/test.in 30 | # black 31 | comm==0.1.3 32 | # via ipykernel 33 | coverage[toml]==7.2.7 34 | # via pytest-cov 35 | debugpy==1.6.7 36 | # via ipykernel 37 | decorator==5.1.1 38 | # via ipython 39 | exceptiongroup==1.1.1 40 | # via 41 | # anyio 42 | # pytest 43 | execnb==0.1.5 44 | # via nbdev 45 | executing==1.2.0 46 | # via stack-data 47 | fastcore==1.5.29 48 | # via 49 | # execnb 50 | # ghapi 51 | # nbdev 52 | flake8==6.0.0 53 | # via -r requirements/test.in 54 | ghapi==1.0.3 55 | # via nbdev 56 | h11==0.14.0 57 | # via httpcore 58 | httpcore==0.17.2 59 | # via httpx 60 | httpx==0.24.1 61 | # via -r requirements/test.in 62 | idna==3.4 63 | # via 64 | # anyio 65 | # httpx 66 | importlib-metadata==6.6.0 67 | # via jupyter-client 68 | iniconfig==2.0.0 69 | # via pytest 70 | ipykernel==6.23.1 71 | # via -r requirements/test.in 72 | ipython==8.12.2 73 | # via 74 | # execnb 75 | # ipykernel 76 | jedi==0.18.2 77 | # via ipython 78 | jupyter-client==8.2.0 79 | # via ipykernel 80 | jupyter-core==5.3.0 81 | # via 82 | # -r requirements/test.in 83 | # ipykernel 84 | # jupyter-client 85 | matplotlib-inline==0.1.6 86 | # via 87 | # ipykernel 88 | # ipython 89 | mccabe==0.7.0 90 | # via flake8 91 | mypy==1.3.0 92 | # via -r requirements/test.in 93 | mypy-extensions==1.0.0 94 | # via 95 | # black 96 | # mypy 97 | nbdev==2.3.12 98 | # via -r requirements/test.in 99 | nest-asyncio==1.5.6 100 | # via ipykernel 101 | packaging==23.1 102 | # via 103 | # black 104 | # fastcore 105 | # ghapi 106 | # ipykernel 107 | # pytest 108 | parso==0.8.3 109 | # via jedi 110 | pathspec==0.11.1 111 | # via black 112 | pexpect==4.8.0 113 | # via ipython 114 | pickleshare==0.7.5 115 | # via ipython 116 | platformdirs==3.5.1 117 | # via 118 | # black 119 | # jupyter-core 120 | pluggy==1.0.0 121 | # via pytest 122 | prompt-toolkit==3.0.38 123 | # via ipython 124 | psutil==5.9.5 125 | # via ipykernel 126 | ptyprocess==0.7.0 127 | # via pexpect 128 | pure-eval==0.2.2 129 | # via stack-data 130 | pycodestyle==2.10.0 131 | # via flake8 132 | pyflakes==3.0.1 133 | # via flake8 134 | pygments==2.15.1 135 | # via ipython 136 | pytest==7.3.1 137 | # via pytest-cov 138 | pytest-cov==4.1.0 139 | # via -r requirements/test.in 140 | python-dateutil==2.8.2 141 | # via jupyter-client 142 | pyyaml==6.0 143 | # via nbdev 144 | pyzmq==25.1.0 145 | # via 146 | # ipykernel 147 | # jupyter-client 148 | six==1.16.0 149 | # via 150 | # asttokens 151 | # 
astunparse 152 | # python-dateutil 153 | sniffio==1.3.0 154 | # via 155 | # anyio 156 | # httpcore 157 | # httpx 158 | stack-data==0.6.2 159 | # via ipython 160 | tomli==2.0.1 161 | # via 162 | # black 163 | # coverage 164 | # mypy 165 | # pytest 166 | tornado==6.3.2 167 | # via 168 | # ipykernel 169 | # jupyter-client 170 | traitlets==5.9.0 171 | # via 172 | # comm 173 | # ipykernel 174 | # ipython 175 | # jupyter-client 176 | # jupyter-core 177 | # matplotlib-inline 178 | typing-extensions==4.6.3 179 | # via 180 | # black 181 | # ipython 182 | # mypy 183 | watchdog==3.0.0 184 | # via nbdev 185 | wcwidth==0.2.6 186 | # via prompt-toolkit 187 | wheel==0.40.0 188 | # via astunparse 189 | zipp==3.15.0 190 | # via importlib-metadata 191 | 192 | # The following packages are considered to be unsafe in a requirements file: 193 | # pip 194 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # NOTE(robinson) - We are limiting when we run CI to avoid exceeding our 2,000 min/month limit. 5 | # We can switch to running on push if we make this repo public or are fine with 6 | # paying for CI minutes. 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | env: 13 | PYTHON_VERSION: "3.8" 14 | 15 | jobs: 16 | setup: 17 | runs-on: ubuntu-latest 18 | env: 19 | NLTK_DATA: ${{ github.workspace }}/nltk_data 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: actions/cache@v3 23 | id: virtualenv-cache 24 | with: 25 | path: | 26 | .venv 27 | sample-docs 28 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 29 | - name: Set up Python ${{ env.PYTHON_VERSION }} 30 | uses: actions/setup-python@v4 31 | with: 32 | python-version: ${{ env.PYTHON_VERSION }} 33 | - name: Setup virtual environment (no cache hit) 34 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 35 | run: | 36 | python${{ env.PYTHON_VERSION }} -m venv .venv 37 | source .venv/bin/activate 38 | make install-ci 39 | make dl-test-artifacts 40 | - uses: actions/cache@v3 41 | id: nltk-cache 42 | with: 43 | path: /home/runner/nltk_data 44 | key: ci-nltk-${{ hashFiles('requirements/*.txt') }} 45 | - name: Download NLTK (no cache hit) 46 | if: steps.nltk-cache.outputs.cache-hit != 'true' 47 | run: | 48 | source .venv/bin/activate 49 | make install-nltk-models 50 | 51 | lint: 52 | runs-on: ubuntu-latest 53 | needs: setup 54 | steps: 55 | - uses: actions/checkout@v4 56 | - uses: actions/cache@v3 57 | id: virtualenv-cache 58 | with: 59 | path: | 60 | .venv 61 | sample-docs 62 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 63 | - uses: actions/cache@v3 64 | id: nltk-cache 65 | with: 66 | path: /home/runner/nltk_data 67 | key: ci-nltk-${{ hashFiles('requirements/*.txt') }} 68 | - name: Set up Python ${{ env.PYTHON_VERSION }} 69 | uses: actions/setup-python@v4 70 | with: 71 | python-version: ${{ env.PYTHON_VERSION }} 72 | - name: Lint 73 | run: | 74 | source .venv/bin/activate 75 | make check 76 | make check-notebooks 77 | 78 | shellcheck: 79 | runs-on: ubuntu-latest 80 | steps: 81 | - uses: actions/checkout@v4 82 | - name: ShellCheck 83 | uses: ludeeus/action-shellcheck@master 84 | 85 | test: 86 | runs-on: ubuntu-latest 87 | env: 88 | NLTK_DATA: ${{ github.workspace }}/nltk_data 89 | needs: [setup, lint] 90 | steps: 91 | - uses: actions/checkout@v4 92 | - uses:
actions/cache@v3 93 | id: virtualenv-cache 94 | with: 95 | path: | 96 | .venv 97 | sample-docs 98 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 99 | - uses: actions/cache@v3 100 | id: nltk-cache 101 | with: 102 | path: /home/runner/nltk_data 103 | key: ci-nltk-${{ hashFiles('requirements/*.txt') }} 104 | - name: Run core tests 105 | run: | 106 | source .venv/bin/activate 107 | make test 108 | make check-coverage 109 | - name: Run sample SEC documents tests 110 | run: | 111 | source .venv/bin/activate 112 | make test-sample-docs 113 | 114 | changelog: 115 | runs-on: ubuntu-latest 116 | steps: 117 | - uses: actions/checkout@v4 118 | - if: github.ref != 'refs/heads/main' 119 | uses: dorny/paths-filter@v2 120 | id: changes 121 | with: 122 | filters: | 123 | src: 124 | - 'doc_recipe/**' 125 | - 'recipe-notebooks/**' 126 | 127 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 128 | uses: dangoslen/changelog-enforcer@v3 129 | 130 | api_consistency: 131 | runs-on: ubuntu-latest 132 | needs: setup 133 | steps: 134 | - uses: actions/checkout@v4 135 | - uses: actions/cache@v3 136 | id: virtualenv-cache 137 | with: 138 | path: | 139 | .venv 140 | sample-docs 141 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 142 | - name: API Consistency 143 | run: | 144 | source .venv/bin/activate 145 | make api-check 146 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/base.in 6 | # 7 | anyio==3.7.0 8 | # via 9 | # starlette 10 | # watchfiles 11 | attrs==23.1.0 12 | # via jsonschema 13 | beautifulsoup4==4.12.2 14 | # via nbconvert 15 | bleach==6.0.0 16 | # via nbconvert 17 | certifi==2023.5.7 18 | # via requests 19 | charset-normalizer==3.1.0 20 | # via requests 21 | click==8.1.3 22 | # via 23 | # nltk 24 | # unstructured-api-tools 25 | # uvicorn 26 | defusedxml==0.7.1 27 | # via nbconvert 28 | exceptiongroup==1.1.1 29 | # via anyio 30 | fastapi==0.95.2 31 | # via unstructured-api-tools 32 | fastjsonschema==2.17.1 33 | # via nbformat 34 | h11==0.14.0 35 | # via uvicorn 36 | httptools==0.5.0 37 | # via uvicorn 38 | idna==3.4 39 | # via 40 | # anyio 41 | # requests 42 | importlib-metadata==6.6.0 43 | # via 44 | # jupyter-client 45 | # nbconvert 46 | importlib-resources==5.12.0 47 | # via jsonschema 48 | jinja2==3.1.2 49 | # via 50 | # nbconvert 51 | # unstructured-api-tools 52 | joblib==1.2.0 53 | # via 54 | # nltk 55 | # scikit-learn 56 | jsonschema==4.17.3 57 | # via nbformat 58 | jupyter-client==8.2.0 59 | # via nbclient 60 | jupyter-core==5.3.0 61 | # via 62 | # -r requirements/base.in 63 | # jupyter-client 64 | # nbclient 65 | # nbconvert 66 | # nbformat 67 | jupyterlab-pygments==0.2.2 68 | # via nbconvert 69 | lxml==4.9.2 70 | # via unstructured 71 | markupsafe==2.1.2 72 | # via 73 | # jinja2 74 | # nbconvert 75 | mistune==2.0.5 76 | # via nbconvert 77 | mypy==1.3.0 78 | # via unstructured-api-tools 79 | mypy-extensions==1.0.0 80 | # via mypy 81 | nbclient==0.8.0 82 | # via nbconvert 83 | nbconvert==7.4.0 84 | # via unstructured-api-tools 85 | nbformat==5.9.0 86 | # via 87 | # nbclient 88 | # nbconvert 89 | nltk==3.8.1 90 | # via unstructured 91 | numpy==1.24.3 92 | # via 93 | # -r requirements/base.in 94 | # 
scikit-learn 95 | # scipy 96 | packaging==23.1 97 | # via 98 | # -r requirements/base.in 99 | # nbconvert 100 | pandocfilters==1.5.0 101 | # via nbconvert 102 | pkgutil-resolve-name==1.3.10 103 | # via jsonschema 104 | platformdirs==3.5.1 105 | # via jupyter-core 106 | pydantic==1.10.8 107 | # via fastapi 108 | pygments==2.15.1 109 | # via nbconvert 110 | pyrsistent==0.19.3 111 | # via jsonschema 112 | python-dateutil==2.8.2 113 | # via jupyter-client 114 | python-dotenv==1.0.0 115 | # via uvicorn 116 | python-multipart==0.0.6 117 | # via unstructured-api-tools 118 | pyyaml==6.0 119 | # via uvicorn 120 | pyzmq==25.1.0 121 | # via jupyter-client 122 | ratelimit==2.2.1 123 | # via -r requirements/base.in 124 | regex==2023.5.5 125 | # via nltk 126 | requests==2.31.0 127 | # via -r requirements/base.in 128 | scikit-learn==1.2.2 129 | # via -r requirements/base.in 130 | scipy==1.10.1 131 | # via scikit-learn 132 | six==1.16.0 133 | # via 134 | # bleach 135 | # python-dateutil 136 | sniffio==1.3.0 137 | # via anyio 138 | soupsieve==2.4.1 139 | # via beautifulsoup4 140 | starlette==0.27.0 141 | # via fastapi 142 | threadpoolctl==3.1.0 143 | # via scikit-learn 144 | tinycss2==1.2.1 145 | # via nbconvert 146 | tomli==2.0.1 147 | # via mypy 148 | tornado==6.3.2 149 | # via jupyter-client 150 | tqdm==4.65.0 151 | # via nltk 152 | traitlets==5.9.0 153 | # via 154 | # -r requirements/base.in 155 | # jupyter-client 156 | # jupyter-core 157 | # nbclient 158 | # nbconvert 159 | # nbformat 160 | types-requests==2.31.0.1 161 | # via unstructured-api-tools 162 | types-ujson==5.7.0.5 163 | # via unstructured-api-tools 164 | types-urllib3==1.26.25.13 165 | # via types-requests 166 | typing-extensions==4.6.3 167 | # via 168 | # mypy 169 | # pydantic 170 | # starlette 171 | unstructured==0.2.5 172 | # via -r requirements/base.in 173 | unstructured-api-tools==0.10.6 174 | # via -r requirements/base.in 175 | urllib3==2.0.2 176 | # via requests 177 | uvicorn[standard]==0.22.0 178 | # via unstructured-api-tools 179 | uvloop==0.17.0 180 | # via uvicorn 181 | watchfiles==0.19.0 182 | # via uvicorn 183 | webencodings==0.5.1 184 | # via 185 | # bleach 186 | # tinycss2 187 | websockets==11.0.3 188 | # via uvicorn 189 | zipp==3.15.0 190 | # via 191 | # importlib-metadata 192 | # importlib-resources 193 | -------------------------------------------------------------------------------- /test_utils/get_sec_docs_from_edgar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads example SEC filings from the SEC EDGAR API as specified by examples.json. 3 | Not normally intended to be called by users as it hits EDGAR directly. 4 | Filings for testing/CI instead will be downloaded from s3. 
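Requires the SEC_API_ORGANIZATION and SEC_API_EMAIL environment variables, which EDGAR uses to identify the caller (see the check under __main__ below).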
5 | """ 6 | import json 7 | import os 8 | import re 9 | from pathlib import Path 10 | 11 | 12 | from prepline_sec_filings.fetch import ( 13 | get_filing, 14 | get_recent_acc_by_cik, 15 | get_recent_cik_and_acc_by_ticker, 16 | ) 17 | 18 | 19 | SEC_DOCS_DIR = os.environ.get("SEC_DOCS_DIR", "sample-docs") 20 | SEC_API_ORGANIZATION = os.environ.get("SEC_API_ORGANIZATION") 21 | SEC_API_EMAIL = os.environ.get("SEC_API_EMAIL") 22 | # only 1 of these 2 manifests types determines what gets downloaded 23 | FILINGS_MANIFEST_JSON = os.path.join("test_utils", "examples.json") 24 | FILINGS_MANIFEST_FILE = os.environ.get("FILINGS_MANIFEST_FILE") 25 | 26 | 27 | def fetch_filing_xbrl(ticker, form_type, cik, accession_number, skip_fetch_if_file_exists=True): 28 | "Fetch a single filing from edgar and write it to $SEC_DOCS_DIR" 29 | _doc_name = f"{ticker}-{form_type}-{cik}-{accession_number}.xbrl".replace("/", "") 30 | sec_doc_filename = os.path.join(SEC_DOCS_DIR, _doc_name) 31 | if skip_fetch_if_file_exists and Path(sec_doc_filename).is_file(): 32 | print(f"skipping download since {sec_doc_filename} exists") 33 | return 34 | 35 | text = get_filing(cik, accession_number, SEC_API_ORGANIZATION, SEC_API_EMAIL) 36 | with open(sec_doc_filename, "w+") as f: 37 | f.write(text) 38 | 39 | 40 | def parse_examples_json(): 41 | with open(FILINGS_MANIFEST_JSON, "r") as f: 42 | manifest_json_obj = json.load(f) 43 | return manifest_json_obj 44 | 45 | 46 | def parse_manifest_text_file(): 47 | ticker_form_type_pairs = [] 48 | with open(FILINGS_MANIFEST_FILE, "r") as f: 49 | for line in f.readlines(): 50 | line = line.strip() 51 | if line and not line.startswith("#"): 52 | m = re.match(r"(\w+)\s+(\S+)\s*", line) 53 | ticker_form_type_pairs.append(m.groups()) 54 | return ticker_form_type_pairs 55 | 56 | 57 | def fetch_filings(manifest_json_obj): 58 | """Given json like: 59 | { 60 | "mmm": { 61 | "cik": "66740", 62 | "forms": { 63 | "10-Q": "000006674022000065" 64 | } 65 | }, 66 | download the indicated xbrl documents from edgar. 
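Filings already present in $SEC_DOCS_DIR are skipped via fetch_filing_xbrl's skip_fetch_if_file_exists flag.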
67 | """ 68 | for ticker, filing_info in manifest_json_obj.items(): 69 | cik = filing_info["cik"] 70 | for form_type, accession_number in filing_info["forms"].items(): 71 | fetch_filing_xbrl(ticker, form_type, cik, accession_number) 72 | print(f"fetched {ticker}") 73 | 74 | 75 | def get_sample_docs(): 76 | """Fetch filings from edgar ultimately to be used for 'make test-sample-docs'.""" 77 | fetch_filings(parse_examples_json()) 78 | 79 | 80 | def _add_to_manifest_json_obj(manifest_json_obj, ticker, form_type, cik, acc_num): 81 | if ticker not in manifest_json_obj: 82 | manifest_json_obj[ticker] = {"forms": {}} 83 | if "cik" in manifest_json_obj[ticker]: 84 | assert manifest_json_obj[ticker]["cik"] == cik 85 | else: 86 | manifest_json_obj[ticker]["cik"] = cik 87 | manifest_json_obj[ticker]["forms"][form_type] = acc_num 88 | 89 | 90 | def get_latest_docs(): 91 | """Fetch filings from edgar, but unlike get_sample_docs() the 92 | accession_number and cik that correspond to the most recent filing are 93 | determined at runtime.""" 94 | 95 | manifest_json_obj = {} 96 | for ticker_or_cik, _form_type in parse_manifest_text_file(): 97 | ticker_or_cik = ticker_or_cik.lower() 98 | _form_type = _form_type.upper() # just following the convention :) 99 | print(f"{ticker_or_cik}-{_form_type}...", end="", flush=True) 100 | if re.search(r"^\d+$", ticker_or_cik): 101 | cik = ticker_or_cik 102 | acc_num, form_type = get_recent_acc_by_cik(cik, _form_type) 103 | else: 104 | ticker = ticker_or_cik 105 | cik, acc_num, form_type = get_recent_cik_and_acc_by_ticker(ticker, _form_type) 106 | _add_to_manifest_json_obj(manifest_json_obj, ticker_or_cik, form_type, cik, acc_num) 107 | fetch_filing_xbrl(ticker_or_cik, form_type, cik, acc_num) 108 | 109 | with open(os.path.join(SEC_DOCS_DIR, "sec_docs_manifest.json"), "w") as f: 110 | json.dump(manifest_json_obj, f, indent=2) 111 | 112 | 113 | if __name__ == "__main__": 114 | if SEC_API_ORGANIZATION is None or SEC_API_EMAIL is None: 115 | raise RuntimeError( 116 | "Environment variables SEC_API_ORGANIZATION and SEC_API_EMAIL " 117 | "must be set for SEC EDGAR API call (allows them to identify the consumer)" 118 | ) 119 | Path(SEC_DOCS_DIR).mkdir(exist_ok=True) 120 | 121 | if not FILINGS_MANIFEST_FILE: 122 | # documents related to python tests in test_real_docs/ 123 | print("env var FILINGS_MANIFEST_FILE not defined, fetching docs for python tests") 124 | get_sample_docs() 125 | else: 126 | # pull latest filings in FILINGS_MANIFEST_FILE for reasons best known to the user 127 | get_latest_docs() 128 | -------------------------------------------------------------------------------- /test_utils/create_validation_csv_files.py: -------------------------------------------------------------------------------- 1 | """Given an $SEC_DOCS_DIR with a sec_docs_manifest.json file, create 2 | a CSV with all extracted sections, one row per section.""" 3 | import json 4 | import os 5 | import subprocess 6 | from pathlib import Path 7 | import time 8 | 9 | import pandas as pd 10 | 11 | 12 | from prepline_sec_filings.fetch import archive_url 13 | from prepline_sec_filings.sections import SECTIONS_10K, SECTIONS_10Q, SECTIONS_S1 14 | from prepline_sec_filings.sec_document import SECDocument 15 | from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path 16 | 17 | 18 | SEC_DOCS_DIR = os.environ.get("SEC_DOCS_DIR") 19 | CSV_FILES_DIR = os.environ.get("CSV_FILES_DIR") 20 | FILINGS_MANIFEST_JSON = os.environ.get( 21 | "FILINGS_MANIFEST_JSON", os.path.join(SEC_DOCS_DIR,
"sec_docs_manifest.json") 22 | ) 23 | PIPELINE_SECTION_API_URL = os.environ.get( 24 | "PIPELINE_SECTION_API_URL", f"http://127.0.0.1:8000{get_pipeline_path('section')}" 25 | ) 26 | 27 | 28 | def _fetch_response_from_api_curl(sec_doc_filename): 29 | time.sleep(1) 30 | command = [ 31 | "curl", 32 | "-s", 33 | f"{PIPELINE_SECTION_API_URL}", 34 | "-H", 35 | "Accept: application/json", 36 | "-H", 37 | "Content-Type: multipart/form-data", 38 | "-F", 39 | f"file=@{sec_doc_filename}", 40 | "-F", 41 | "section=_ALL", 42 | ] 43 | proc = subprocess.run(command, capture_output=True) 44 | 45 | resp_data = {} 46 | if proc.returncode != 0: 47 | print(f"Failed to get results for {sec_doc_filename}", flush=True) 48 | print(proc.stderr) 49 | else: 50 | try: 51 | resp_data = json.loads(proc.stdout.decode("utf-8")) 52 | if "error" in resp_data: 53 | print(f"Error in response for api for {sec_doc_filename}", flush=True) 54 | print(resp_data) 55 | resp_data = {} 56 | except json.decoder.JSONDecodeError: 57 | print(f"failed to create json obj from the response for {command}") 58 | return resp_data 59 | 60 | 61 | def parse_manifest_json(): 62 | with open(FILINGS_MANIFEST_JSON, "r") as f: 63 | manifest_json_obj = json.load(f) 64 | return manifest_json_obj 65 | 66 | 67 | def _bookkeeping_info(keys, values, ticker_or_cik, cik, acc_num): 68 | """Add convenience lookup keys/values to row.""" 69 | keys.append("url_for_xbrl") 70 | values.append(archive_url(cik, acc_num)) 71 | keys.append("url_for_all_filings") 72 | values.append(f"https://www.sec.gov/edgar/browse/?CIK={cik}") 73 | keys.append("identifier") 74 | values.append(ticker_or_cik) 75 | 76 | 77 | def _csv_filename(ticker_or_cik, form_type, cik, acc_num): 78 | return os.path.join( 79 | CSV_FILES_DIR, f"{ticker_or_cik}-{form_type}-{cik}-{acc_num}.csv".replace("/", "") 80 | ) 81 | 82 | 83 | def _write_csv(keys, values, ticker_or_cik, form_type, cik, acc_num): 84 | df = pd.DataFrame({"key": pd.Series(keys), "value": pd.Series(values)}) 85 | df.to_csv( 86 | _csv_filename(ticker_or_cik, form_type, cik, acc_num), 87 | sep="\t", 88 | encoding="utf-8", 89 | index=False, 90 | ) 91 | 92 | 93 | def gen_csv(sec_doc_filename, ticker_or_cik, form_type, cik, acc_num): 94 | keys = [] 95 | values = [] 96 | 97 | _bookkeeping_info(keys, values, ticker_or_cik, cik, acc_num) 98 | resp_data = _fetch_response_from_api_curl(sec_doc_filename) 99 | if not resp_data: 100 | return 101 | for _key, _value in resp_data.items(): 102 | keys.append(_key) 103 | values.append("\n".join([elem["text"] for elem in _value])) 104 | _write_csv(keys, values, ticker_or_cik, form_type, cik, acc_num) 105 | 106 | 107 | def _gen_csv_no_api(filing_file_handle, ticker_or_cik, form_type, cik, acc_num): 108 | keys = [] 109 | values = [] 110 | filing_content = filing_file_handle.read() 111 | 112 | _bookkeeping_info(keys, values, ticker_or_cik, cik, acc_num) 113 | 114 | sec_document = SECDocument.from_string(filing_content) 115 | if "K" in form_type: 116 | sections = SECTIONS_10K 117 | elif "Q" in form_type: 118 | sections = SECTIONS_10Q 119 | else: 120 | sections = SECTIONS_S1 121 | 122 | for section in sections: 123 | print(section) 124 | result = "\n".join([str(elem) for elem in sec_document.get_section_narrative(section)]) 125 | keys.append(section.name) 126 | values.append(result) 127 | _write_csv(keys, values, ticker_or_cik, form_type, cik, acc_num) 128 | 129 | 130 | def gen_csvs(manifest_json_obj): 131 | """create CSVs given a manifest_json_obj which looks like: 132 | { 133 | "mmm": { 134 | "cik": "66740", 
135 | "forms": { 136 | "10-Q": "000006674022000065" 137 | } 138 | }, 139 | "0001156784": { 140 | "forms": { 141 | "S-1/A": "000149315222026129" 142 | }, 143 | "cik": "0001156784" 144 | }, 145 | """ 146 | Path(CSV_FILES_DIR).mkdir(exist_ok=True) 147 | 148 | for ticker_or_cik in manifest_json_obj: 149 | cik = manifest_json_obj[ticker_or_cik]["cik"] 150 | for form_type in manifest_json_obj[ticker_or_cik]["forms"]: 151 | acc_num = manifest_json_obj[ticker_or_cik]["forms"][form_type] 152 | no_dir_filename = f"{ticker_or_cik}-{form_type}-{cik}-{acc_num}.xbrl".replace("/", "") 153 | sec_doc_filename = os.path.join(SEC_DOCS_DIR, no_dir_filename) 154 | csv_filename = _csv_filename(ticker_or_cik, form_type, cik, acc_num) 155 | if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0: 156 | print(f"skipping api call for existing csv: {sec_doc_filename}", flush=True) 157 | continue 158 | print(f"{ticker_or_cik}", flush=True) 159 | gen_csv(sec_doc_filename, ticker_or_cik, form_type, cik, acc_num) 160 | 161 | 162 | if __name__ == "__main__": 163 | if SEC_DOCS_DIR is None or CSV_FILES_DIR is None: 164 | raise RuntimeError("Environment vaiables SEC_DOCS_DIR and CSV_FILES_DIR must be set.") 165 | gen_csvs(parse_manifest_json()) 166 | -------------------------------------------------------------------------------- /scripts/check-and-format-notebooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from copy import deepcopy 5 | import difflib 6 | import json 7 | from pathlib import Path 8 | import sys 9 | from typing import List, Tuple, Union 10 | 11 | from nbdev import clean 12 | from nbconvert.preprocessors import ExecutePreprocessor 13 | import nbformat 14 | from unstructured_api_tools.pipelines.convert import read_notebook 15 | 16 | 17 | def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode: 18 | """Execute cells in nb using working_dir as the working directory for imports, modifying the 19 | notebook in place (in memory).""" 20 | # Clear existing outputs before executing the notebook 21 | for cell in nb.cells: 22 | if cell.cell_type == "code": 23 | cell.outputs = [] 24 | ep = ExecutePreprocessor(timeout=600) 25 | ep.preprocess(nb, {"metadata": {"path": working_dir}}) 26 | # Merge adjacent text outputs after executing the notebook 27 | for cell in nb.cells: 28 | merge_adjacent_text_outputs(cell) 29 | return nb 30 | 31 | def merge_adjacent_text_outputs(cell: nbformat.NotebookNode) -> nbformat.NotebookNode: 32 | """Merges adjacent text stream outputs to avoid non-deterministic splits in output.""" 33 | if cell.cell_type != "code": 34 | return cell 35 | 36 | new_outputs = [] 37 | current_output = None 38 | 39 | for output in cell.outputs: 40 | if output.output_type == "stream": 41 | if current_output is None: 42 | current_output = output 43 | elif current_output.name == output.name: 44 | current_output.text += output.text 45 | else: 46 | new_outputs.append(current_output) 47 | current_output = output 48 | else: 49 | if current_output is not None: 50 | new_outputs.append(current_output) 51 | current_output = None 52 | new_outputs.append(output) 53 | 54 | if current_output is not None: 55 | new_outputs.append(current_output) 56 | 57 | cell.outputs = new_outputs 58 | return cell 59 | 60 | def nb_paths(root_path: Union[str, Path]) -> List[Path]: 61 | """Fetches all .ipynb filenames that belong to subdirectories of root_path (1 level deep) with 62 | 'notebooks' in 
the name.""" 63 | root_path = Path(root_path) 64 | return [ 65 | fn 66 | for dir in root_path.iterdir() 67 | # NOTE(alan): Search only in paths with 'notebooks' in the title such as pipeline-notebooks 68 | # and exploration-notebooks 69 | if "notebooks" in dir.stem and dir.is_dir() 70 | for fn in dir.iterdir() 71 | if fn.suffix == ".ipynb" 72 | ] 73 | 74 | 75 | def to_results_str(fns: List[Path], nonmatching_nbs: List[Path]) -> Tuple[str, str]: 76 | """Given files that were checked and list of files that would be changed, produces a summary of 77 | changes as well as a list of files to be changed""" 78 | unchanged = len(fns) - len(nonmatching_nbs) 79 | results = [] 80 | if nonmatching_nbs: 81 | results.append( 82 | f"{len(nonmatching_nbs)} " 83 | f"{'file' if len(nonmatching_nbs) == 1 else 'files'} " 84 | f"{'would be ' if check else ''}changed" 85 | ) 86 | if unchanged: 87 | results.append( 88 | f"{unchanged} " 89 | f"{'file' if unchanged == 1 else 'files'} " 90 | f"{'would be ' if check else ''}left unchanged" 91 | ) 92 | summary_str = ", ".join(results) + ".\n" 93 | if nonmatching_nbs: 94 | details_str = ( 95 | f"The following notebooks {'would have been' if check else 'were'} " 96 | "changed when executed and cleaned:\n* " + "\n* ".join(nonmatching_nbs) + "\n" 97 | ) 98 | else: 99 | details_str = "" 100 | 101 | return summary_str, details_str 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument( 107 | "--check", 108 | default=False, 109 | action="store_true", 110 | help="Check notebook format without making changes. Return code 0 means formatting would " 111 | "produce no changes. Return code 1 means some files would be changed.", 112 | ) 113 | parser.add_argument( 114 | "notebooks", 115 | metavar="notebook", 116 | nargs="*", 117 | help="Path(s) to notebook(s) to format (or check). 
If you don't pass any paths, " 118 | "notebooks in any subfolders with 'notebooks' in the name will be processed.", 119 | default=[], 120 | ) 121 | args = parser.parse_args() 122 | check = args.check 123 | notebooks = args.notebooks 124 | 125 | root_path = Path(__file__).parent.parent 126 | nonmatching_nbs = [] 127 | fns = notebooks if notebooks else nb_paths(root_path) 128 | for fn in fns: 129 | print(f"{'checking' if check else 'processing'} {fn}") 130 | nb = read_notebook(fn) 131 | modified_nb = deepcopy(nb) 132 | process_nb(modified_nb, root_path) 133 | clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"]) 134 | if nb != modified_nb: 135 | nonmatching_nbs.append(str(fn)) 136 | nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True) 137 | modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True) 138 | sys.stderr.write(f"The following diff shows the modifications made to {fn}\n") 139 | sys.stderr.writelines( 140 | ( 141 | difflib.unified_diff( 142 | nb_json.splitlines(keepends=True), 143 | modified_nb_json.splitlines(keepends=True), 144 | ) 145 | ) 146 | ) 147 | if not check: 148 | nbformat.write(modified_nb, fn) 149 | 150 | summary_str, details_str = to_results_str(fns, nonmatching_nbs) 151 | print(summary_str) 152 | if check: 153 | sys.stderr.write(details_str) 154 | if nonmatching_nbs: 155 | sys.exit(1) 156 | else: 157 | print(details_str) 158 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.' 13 | } 14 | 15 | function getopts-extra () { 16 | declare i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | i+=1 21 | OPTIND+=1 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." 
>&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 
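# NOTE: illustrative invocation (mirrors the Makefile's check-version target):
#   scripts/version-sync.sh -c \
#     -s CHANGELOG.md \
#     -f README.md api-release \
#     -f preprocessing-pipeline-family.yaml release
# With -c the script only reports the changes it would make and exits nonzero
# when a file is out of sync; without -c the files are rewritten in place.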
150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /prepline_sec_filings/sections.py: -------------------------------------------------------------------------------- 1 | """Module for defining/enumerating the common sections from SEC forms""" 2 | from enum import Enum 3 | import re 4 | from typing import List 5 | 6 | 7 | class SECSection(Enum): 8 | PROSPECTUS_SUMMARY = re.compile(r"^(?:prospectus )?summary$") 9 | ABOUT_PROSPECTUS = re.compile(r"about this prospectus") 10 | FORWARD_LOOKING_STATEMENTS = re.compile(r"forward[ -]looking statements") 11 | RISK_FACTORS = re.compile(r"risk factors") 12 | USE_OF_PROCEEDS = re.compile(r"use of proceeds") 13 | DIVIDEND_POLICY = re.compile(r"^dividend policy") 14 | CAPITALIZATION = re.compile(r"^capitalization$") 15 | DILUTION = re.compile(r"^dilution$") 16 | MANAGEMENT_DISCUSSION = re.compile(r"^management(?:[\u2019']s)? discussion") 17 | BUSINESS = re.compile(r"^business$") 18 | MANAGEMENT = re.compile(r"^(?:(?:our )?management)|(?:executive officers)$") 19 | COMPENSATION = re.compile(r"compensation") 20 | RELATED_PARTY_TRANSACTIONS = re.compile(r"(?:relationships|related).*transactions") 21 | PRINCIPAL_STOCKHOLDERS = re.compile( 22 | r"(?:principal.*(?:stockholder|shareholder)s?)|(?:(security|stock|share) " 23 | r"ownership .*certain)" 24 | ) 25 | DESCRIPTION_OF_STOCK = re.compile(r"^description of (?:capital stock|share capital|securities)") 26 | DESCRIPTION_OF_DEBT = re.compile(r"^description of .*debt") 27 | FUTURE_SALE = re.compile(r"(?:shares|stock) eligible for future sale") 28 | US_TAX = re.compile( 29 | r"(?:us|u\.s\.|united states|material federal).* tax (?:consideration|consequence)" 30 | ) 31 | UNDERWRITING = re.compile(r"underwrit") 32 | LEGAL_MATTERS = re.compile(r"legal matters") 33 | EXPERTS = re.compile(r"^experts$") 34 | MORE_INFORMATION = re.compile(r"(?:additional|more) information") 35 | FINANCIAL_STATEMENTS = r"financial statements" 36 | MARKET_RISK_DISCLOSURES = r"(?:quantitative|qualitative) disclosures? about market risk" 37 | CONTROLS_AND_PROCEDURES = r"controls and procedures" 38 | LEGAL_PROCEEDINGS = r"legal proceedings" 39 | DEFAULTS = r"defaults (?:up)?on .*securities" 40 | MINE_SAFETY = r"mine safety disclosures?" 41 | OTHER_INFORMATION = r"other information" 42 | UNRESOLVED_STAFF_COMMENTS = r"unresolved staff comments" 43 | PROPERTIES = r"^properties$" 44 | MARKET_FOR_REGISTRANT_COMMON_EQUITY = ( 45 | r"market for(?: the)? (?:registrant|company)(?:['\u2019]s)? common equity" 46 | ) 47 | ACCOUNTING_DISAGREEMENTS = r"disagreements with accountants" 48 | FOREIGN_JURISDICTIONS = r"disclosure .*foreign jurisdictions .*inspection" 49 | EXECUTIVE_OFFICERS = r"executive officers" 50 | ACCOUNTING_FEES = r"accounting fees" 51 | EXHIBITS = r"^exhibits?(.*financial statement schedules)?$" 52 | FORM_SUMMARY = r"^form .*summary$" 53 | # NOTE(yuming): Additional section titles used in test_real_examples.py, 54 | # maybe change this when custom regex string param is allowed.
55 | CERTAIN_TRADEMARKS = r"certain trademarks" 56 | OFFER_PRICE = r"(?:determination of )offering price" 57 | 58 | @property 59 | def pattern(self): 60 | return self.value 61 | 62 | 63 | ALL_SECTIONS = "_ALL" 64 | 65 | section_string_to_enum = {enum.name: enum for enum in SECSection} 66 | 67 | # NOTE(robinson) - Sections are listed in the following document from SEC 68 | # ref: https://www.sec.gov/files/form10-k.pdf 69 | SECTIONS_10K = ( 70 | SECSection.BUSINESS, # ITEM 1 71 | SECSection.RISK_FACTORS, # ITEM 1A 72 | SECSection.UNRESOLVED_STAFF_COMMENTS, # ITEM 1B 73 | SECSection.PROPERTIES, # ITEM 2 74 | SECSection.LEGAL_PROCEEDINGS, # ITEM 3 75 | SECSection.MINE_SAFETY, # ITEM 4 76 | SECSection.MARKET_FOR_REGISTRANT_COMMON_EQUITY, # ITEM 5 77 | # NOTE(robinson) - ITEM 6 is "RESERVED" 78 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 7 79 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 7A 80 | SECSection.FINANCIAL_STATEMENTS, # ITEM 8 81 | SECSection.ACCOUNTING_DISAGREEMENTS, # ITEM 9 82 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 9A 83 | # NOTE(robinson) - ITEM 9B is other information 84 | SECSection.FOREIGN_JURISDICTIONS, # ITEM 9C 85 | SECSection.MANAGEMENT, # ITEM 10 86 | SECSection.COMPENSATION, # ITEM 11 87 | SECSection.PRINCIPAL_STOCKHOLDERS, # ITEM 12 88 | SECSection.RELATED_PARTY_TRANSACTIONS, # ITEM 13 89 | SECSection.ACCOUNTING_FEES, # ITEM 14 90 | SECSection.EXHIBITS, # ITEM 15 91 | SECSection.FORM_SUMMARY, # ITEM 16 92 | ) 93 | 94 | # NOTE(robinson) - Sections are listed in the following document from SEC 95 | # ref: https://www.sec.gov/files/form10-q.pdf 96 | SECTIONS_10Q = ( 97 | # Part I - Financial information 98 | SECSection.FINANCIAL_STATEMENTS, # ITEM 1 99 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 2 100 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 3 101 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 4 102 | # Part II - Other information 103 | SECSection.LEGAL_PROCEEDINGS, # ITEM 1 104 | SECSection.RISK_FACTORS, # ITEM 1A 105 | SECSection.USE_OF_PROCEEDS, # ITEM 2 106 | SECSection.DEFAULTS, # ITEM 3 107 | SECSection.MINE_SAFETY, # ITEM 4 108 | SECSection.OTHER_INFORMATION, # ITEM 5 109 | ) 110 | 111 | SECTIONS_S1 = ( 112 | SECSection.PROSPECTUS_SUMMARY, 113 | SECSection.ABOUT_PROSPECTUS, 114 | SECSection.FORWARD_LOOKING_STATEMENTS, 115 | SECSection.RISK_FACTORS, 116 | SECSection.USE_OF_PROCEEDS, 117 | SECSection.DIVIDEND_POLICY, 118 | SECSection.CAPITALIZATION, 119 | SECSection.DILUTION, 120 | SECSection.MANAGEMENT_DISCUSSION, 121 | SECSection.BUSINESS, 122 | SECSection.MANAGEMENT, 123 | SECSection.COMPENSATION, 124 | SECSection.RELATED_PARTY_TRANSACTIONS, 125 | SECSection.PRINCIPAL_STOCKHOLDERS, 126 | SECSection.DESCRIPTION_OF_STOCK, 127 | SECSection.DESCRIPTION_OF_DEBT, 128 | SECSection.FUTURE_SALE, 129 | SECSection.US_TAX, 130 | SECSection.UNDERWRITING, 131 | SECSection.LEGAL_MATTERS, 132 | SECSection.EXPERTS, 133 | SECSection.MORE_INFORMATION, 134 | ) 135 | 136 | 137 | def validate_section_names(section_names: List[str]): 138 | """Raise a ValueError for section names that don't correspond to a defined enum.""" 139 | if len(section_names) == 1 and section_names[0] == ALL_SECTIONS: 140 | return None 141 | elif len(section_names) > 1 and ALL_SECTIONS in section_names: 142 | raise ValueError(f"{ALL_SECTIONS} may not be specified with other sections") 143 | 144 | invalid_names = [name for name in section_names if name not in section_string_to_enum] 145 | if invalid_names: 146 | raise ValueError(f"The following section names are not valid: {invalid_names}") 147 | return None 148 |
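149 | 
150 | 
151 | # NOTE: illustrative usage of the helpers above (hypothetical caller; the
152 | # section API accepts the same section-name strings, e.g. section=_ALL):
153 | #
154 | #   names = ["RISK_FACTORS", "USE_OF_PROCEEDS"]
155 | #   validate_section_names(names)  # raises ValueError on any unknown name
156 | #   sections = [section_string_to_enum[name] for name in names]
157 | #
158 | # Passing [ALL_SECTIONS] instead is also valid and selects every section.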
-------------------------------------------------------------------------------- /test_real_docs/test_real_examples.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from prepline_sec_filings.sec_document import SECDocument, clean_sec_text 8 | from unstructured.documents.html import HTMLListItem 9 | 10 | from prepline_sec_filings.sections import SECSection, section_string_to_enum 11 | 12 | DIRECTORY = Path(__file__).absolute().parent 13 | 14 | RISK_FACTOR_XFAILS = ["aig", "bgs"] 15 | 16 | 17 | with open(os.path.join("test_utils", "examples.json")) as f: 18 | examples = json.load(f) 19 | 20 | 21 | with open( 22 | os.path.join(DIRECTORY, "fixtures", "sample-first-last.json"), 23 | "r", 24 | ) as f: 25 | sample_first_last = json.load(f) 26 | 27 | 28 | @pytest.fixture(scope="module") 29 | def docs_all(): 30 | return {} 31 | 32 | 33 | @pytest.fixture 34 | def doc_elements(ticker, docs_all): 35 | if ticker not in docs_all: 36 | text = get_file_from_ticker(ticker) 37 | doc = SECDocument.from_string(text).doc_after_cleaners(skip_headers_and_footers=True) 38 | docs_all[ticker] = {} 39 | docs_all[ticker]["doc"] = doc 40 | docs_all[ticker]["elements"] = doc.elements 41 | return (docs_all[ticker]["doc"], docs_all[ticker]["elements"]) 42 | 43 | 44 | @pytest.fixture 45 | def xfail(ticker, section, first_or_last): 46 | if ticker in RISK_FACTOR_XFAILS: 47 | return True 48 | elif ticker == "cl" and section in [ 49 | SECSection.MANAGEMENT_DISCUSSION, 50 | SECSection.MARKET_RISK_DISCLOSURES, 51 | ]: 52 | return True 53 | elif ticker == "bc" and section == SECSection.USE_OF_PROCEEDS: 54 | return True 55 | elif ticker == "doc" and section == SECSection.OTHER_INFORMATION: 56 | return True 57 | elif ( 58 | ticker == "cvs" and section == SECSection.PRINCIPAL_STOCKHOLDERS and first_or_last == "last" 59 | ): 60 | return True 61 | # TODO(yuming): The issue of this xfail is the same as the one in core-241 62 | elif ticker == "ehc" and section == SECSection.BUSINESS: 63 | return True 64 | return False 65 | 66 | 67 | @pytest.fixture 68 | def risk_samples(): 69 | with open(os.path.join(os.path.dirname(__file__), "fixtures", "risk-samples.json"), "r") as f: 70 | out = json.load(f) 71 | return out 72 | 73 | 74 | def get_file_from_ticker(ticker): 75 | cik = examples[ticker]["cik"] 76 | formtype = next(iter(examples[ticker]["forms"])) 77 | accession_number = examples[ticker]["forms"][formtype] 78 | with open( 79 | os.path.join("sample-docs", f"{ticker}-{formtype}-{cik}-{accession_number}.xbrl") 80 | ) as f: 81 | out = f.read() 82 | return out 83 | 84 | 85 | @pytest.mark.parametrize("ticker", [ticker for ticker in examples]) 86 | def test_samples_found(ticker, risk_samples, doc_elements): 87 | samples = risk_samples[ticker] 88 | if ticker in ( 89 | "mmm", 90 | "aig", 91 | "rgld", 92 | "cri", 93 | "pepg", 94 | "ehc", 95 | "bj", 96 | "smtc", 97 | "bgs", 98 | "blco", 99 | ): 100 | pytest.xfail(reason="Need to re-examine test failure reasons") 101 | 102 | doc, _ = doc_elements 103 | parsed_risk_narratives = doc.get_risk_narrative() 104 | # The expected samples will be empty only when there is no risk factors section, so 105 | # the parsed narratives and samples to find should either both be empty or both be 106 | # populated. 
107 | assert bool(parsed_risk_narratives) == bool(samples) 108 | for sample in samples: 109 | assert any( 110 | ( 111 | # TODO(alan): Do cleaning directly in risk-samples.json and define cleaning 112 | # specifically for this test. 113 | clean_sec_text(sample) in clean_sec_text(risk_narrative.text) 114 | for risk_narrative in parsed_risk_narratives 115 | ) 116 | ) 117 | 118 | 119 | @pytest.mark.parametrize( 120 | "ticker, section, first_or_last", 121 | [ 122 | (ticker, section_string_to_enum[section], first_or_last) 123 | for ticker in sample_first_last 124 | for section in sample_first_last[ticker] 125 | for first_or_last in sample_first_last[ticker][section] 126 | ], 127 | ) 128 | def test_first_last(ticker, doc_elements, section, first_or_last, xfail): 129 | if xfail: 130 | pytest.xfail() 131 | doc, _ = doc_elements 132 | parsed_risk_narratives = doc.get_section_narrative(section) 133 | sample = sample_first_last[ticker][section.name][first_or_last] 134 | idx = 0 if first_or_last == "first" else -1 135 | assert clean_sec_text(parsed_risk_narratives[idx].text) == clean_sec_text(sample) 136 | 137 | 138 | def list_item_test_values(): 139 | list_item_count_file = os.path.join(DIRECTORY, "fixtures", "list-item-counts.json") 140 | with open(list_item_count_file, "r") as f: 141 | list_item_counts = json.load(f) 142 | 143 | list_item_content_file = os.path.join(DIRECTORY, "fixtures", "list-item-content.json") 144 | with open(list_item_content_file, "r") as f: 145 | list_item_content = json.load(f) 146 | 147 | list_item_tests = list() 148 | for ticker, count in list_item_counts.items(): 149 | content = list_item_content.get(ticker, None) 150 | list_item_tests.append((ticker, count, content)) 151 | 152 | return list_item_tests 153 | 154 | 155 | def check_first_list_item_section(section, expected_count, expected_content): 156 | count = 0 157 | in_list_item_section = False 158 | for i, element in enumerate(section): 159 | if not in_list_item_section and isinstance(element, HTMLListItem): 160 | in_list_item_section = True 161 | if expected_content: 162 | section_text = clean_sec_text(section[i].text) 163 | expected_text = clean_sec_text(expected_content[count]) 164 | assert section_text == expected_text 165 | count += 1 166 | elif in_list_item_section and isinstance(element, HTMLListItem): 167 | if expected_content: 168 | section_text = clean_sec_text(section[i].text) 169 | expected_text = clean_sec_text(expected_content[count]) 170 | assert section_text == expected_text 171 | count += 1 172 | elif in_list_item_section and not isinstance(element, HTMLListItem): 173 | return count 174 | 175 | assert count == expected_count 176 | 177 | return count 178 | 179 | 180 | @pytest.mark.parametrize("ticker, expected_count, expected_content", list_item_test_values()) 181 | def test_list_items(ticker, expected_count, expected_content): 182 | if ticker in RISK_FACTOR_XFAILS: 183 | pytest.xfail(reason="xfail for risk factor section. 
therefore can't count list items") 184 | text = get_file_from_ticker(ticker) 185 | doc = SECDocument.from_string(text) 186 | risk_section = doc.get_section_narrative(SECSection.RISK_FACTORS) 187 | check_first_list_item_section(risk_section, expected_count, expected_content) 188 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PIPELINE_FAMILY := sec-filings 2 | PIPELINE_PACKAGE := sec_filings 3 | PACKAGE_NAME := prepline_${PIPELINE_PACKAGE} 4 | PIP_VERSION := 23.1.2 5 | 6 | .PHONY: help 7 | help: Makefile 8 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 9 | 10 | 11 | ########### 12 | # Install # 13 | ########### 14 | 15 | ## install-base: installs minimum requirements to run the API 16 | .PHONY: install-base 17 | install-base: install-base-pip-packages install-nltk-models 18 | 19 | ## install: installs all test and dev requirements 20 | .PHONY: install 21 | install: install-base install-test install-dev 22 | 23 | .PHONY: install-base-pip-packages 24 | install-base-pip-packages: 25 | python3 -m pip install pip==${PIP_VERSION} 26 | pip install -r requirements/base.txt 27 | 28 | .PHONY: install-nltk-models 29 | install-nltk-models: 30 | python -c "import nltk; nltk.download('punkt')" 31 | python -c "import nltk; nltk.download('averaged_perceptron_tagger')" 32 | 33 | .PHONY: install-test 34 | install-test: 35 | pip install -r requirements/test.txt 36 | 37 | .PHONY: install-dev 38 | install-dev: 39 | pip install -r requirements/dev.txt 40 | 41 | .PHONY: install-ipython-kernel 42 | install-ipython-kernel: 43 | ipython kernel install --name "python3" --sys-prefix 44 | 45 | .PHONY: install-ci 46 | install-ci: install-base install-test install-ipython-kernel 47 | 48 | ## pip-compile: compiles all base/dev/test requirements 49 | .PHONY: pip-compile 50 | pip-compile: 51 | pip-compile --upgrade requirements/base.in 52 | pip-compile --upgrade requirements/dev.in 53 | pip-compile --upgrade requirements/test.in 54 | 55 | 56 | ######### 57 | # Build # 58 | ######### 59 | 60 | ## generate-api: generates the FastAPI python APIs from notebooks 61 | .PHONY: generate-api 62 | generate-api: 63 | PYTHONPATH=. 
unstructured_api_tools convert-pipeline-notebooks \ 64 | --input-directory ./pipeline-notebooks \ 65 | --output-directory ./${PACKAGE_NAME}/api 66 | 67 | 68 | ########## 69 | # Docker # 70 | ########## 71 | 72 | # Docker targets are provided for convenience only and are not required in a standard development environment 73 | 74 | # Note that the image has notebooks baked in, however the current working directory 75 | # is mounted under /home/notebook-user/local/ when the image is started with 76 | # docker-start-api or docker-start-jupyter 77 | 78 | .PHONY: docker-build 79 | docker-build: 80 | PIP_VERSION=${PIP_VERSION} PIPELINE_FAMILY=${PIPELINE_FAMILY} PIPELINE_PACKAGE=${PIPELINE_PACKAGE} ./scripts/docker-build.sh 81 | 82 | .PHONY: docker-start-api 83 | docker-start-api: 84 | docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest uvicorn ${PACKAGE_NAME}.api.app:app --log-config logger_config.yaml --host 0.0.0.0 --port 8000 85 | 86 | .PHONY: docker-start-jupyter 87 | docker-start-jupyter: 88 | docker run -p 8888:8888 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest jupyter-notebook --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password='' 89 | 90 | 91 | ######### 92 | # Local # 93 | ######### 94 | 95 | ## run-jupyter: starts jupyter notebook 96 | .PHONY: run-jupyter 97 | run-jupyter: 98 | PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) jupyter-notebook --NotebookApp.token='' --NotebookApp.password='' 99 | 100 | ## run-web-app: runs the FastAPI api with hot reloading 101 | .PHONY: run-web-app 102 | run-web-app: 103 | PYTHONPATH=. uvicorn ${PACKAGE_NAME}.api.app:app --log-config logger_config.yaml --reload 104 | 105 | 106 | ################# 107 | # Test and Lint # 108 | ################# 109 | 110 | ## test: runs core tests 111 | .PHONY: test 112 | test: 113 | PYTHONPATH=. pytest test_${PIPELINE_PACKAGE} --cov=${PACKAGE_NAME} --cov-report term-missing 114 | 115 | .PHONY: check-coverage 116 | check-coverage: 117 | coverage report --fail-under=93 118 | 119 | ## test-integration: runs integration tests 120 | .PHONY: test-integration 121 | test-integration: 122 | PYTHONPATH=. pytest test_${PIPELINE_PACKAGE}_integration 123 | 124 | ## test-sample-docs: runs the pipeline on a set of sample SEC documents 125 | .PHONY: test-sample-docs 126 | test-sample-docs: verify-artifacts 127 | PYTHONPATH=. pytest test_real_docs 128 | 129 | ## api-check: verifies auto-generated pipeline APIs match the existing ones 130 | .PHONY: api-check 131 | api-check: 132 | PYTHONPATH=. PACKAGE_NAME=${PACKAGE_NAME} ./scripts/test-doc-pipeline-apis-consistent.sh 133 | 134 | ## dl-test-artifacts: downloads external artifacts used for testing 135 | .PHONY: dl-test-artifacts 136 | dl-test-artifacts: 137 | wget -r -nH -O sample-docs/sample-sec-docs.tar.gz https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/sample-sec-docs/sample-sec-docs.tar.gz 138 | tar -xf sample-docs/sample-sec-docs.tar.gz -C sample-docs/ && rm sample-docs/sample-sec-docs.tar.gz 139 | $(MAKE) verify-artifacts 140 | 141 | .PHONY: verify-artifacts 142 | verify-artifacts: 143 | sha256sum --check --status sample-docs/sample-sec-docs.sha256 144 | 145 | .PHONY: dl-test-artifacts-source 146 | dl-test-artifacts-source: 147 | # Downloads directly from SEC website. Not normally needed, see script. 148 | PYTHONPATH=. 
python3 test_utils/get_sec_docs_from_edgar.py 149 | 150 | 151 | ## check: runs linters (includes tests) 152 | .PHONY: check 153 | check: check-src check-tests check-version 154 | 155 | ## check-src: runs linters (source only, no tests) 156 | .PHONY: check-src 157 | check-src: 158 | black --line-length 100 ${PACKAGE_NAME} --check --exclude ${PACKAGE_NAME}/api 159 | flake8 ${PACKAGE_NAME} 160 | mypy ${PACKAGE_NAME} --ignore-missing-imports --implicit-optional --install-types --non-interactive 161 | 162 | .PHONY: check-tests 163 | check-tests: 164 | black --line-length 100 test_${PIPELINE_PACKAGE} --check 165 | flake8 test_${PIPELINE_PACKAGE} 166 | black --line-length 100 test_${PIPELINE_PACKAGE}_integration --check 167 | flake8 test_${PIPELINE_PACKAGE}_integration 168 | black --line-length 100 test_real_docs --check 169 | flake8 test_real_docs 170 | black --line-length 100 test_utils --check 171 | flake8 test_utils 172 | 173 | ## check-scripts: run shellcheck 174 | .PHONY: check-scripts 175 | check-scripts: 176 | # Fail if any of these files have warnings 177 | scripts/shellcheck.sh 178 | 179 | ## check-version: run check to ensure version in CHANGELOG.md matches references in files 180 | .PHONY: check-version 181 | check-version: 182 | # Fail if syncing version would produce changes 183 | scripts/version-sync.sh -c \ 184 | -s CHANGELOG.md \ 185 | -f README.md api-release \ 186 | -f preprocessing-pipeline-family.yaml release \ 187 | -f exploration-notebooks/exploration-10q-amended.ipynb api-release 188 | 189 | ## check-notebooks: check that executing and cleaning notebooks doesn't produce changes 190 | .PHONY: check-notebooks 191 | check-notebooks: 192 | scripts/check-and-format-notebooks.py --check 193 | 194 | ## tidy: run black 195 | .PHONY: tidy 196 | tidy: 197 | black --line-length 100 ${PACKAGE_NAME} 198 | black --line-length 100 test_${PIPELINE_PACKAGE} 199 | black --line-length 100 test_${PIPELINE_PACKAGE}_integration 200 | black --line-length 100 test_real_docs 201 | black --line-length 100 test_utils 202 | 203 | ## tidy-notebooks: execute notebooks and remove metadata 204 | .PHONY: tidy-notebooks 205 | tidy-notebooks: 206 | scripts/check-and-format-notebooks.py 207 | 208 | ## version-sync: update references to version with most recent version from CHANGELOG.md 209 | .PHONY: version-sync 210 | version-sync: 211 | scripts/version-sync.sh \ 212 | -s CHANGELOG.md \ 213 | -f README.md api-release \ 214 | -f preprocessing-pipeline-family.yaml release \ 215 | -f exploration-notebooks/exploration-10q-amended.ipynb api-release 216 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/dev.in 6 | # 7 | anyio==3.7.0 8 | # via jupyter-server 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | argon2-cffi==21.3.0 14 | # via 15 | # jupyter-server 16 | # nbclassic 17 | # notebook 18 | argon2-cffi-bindings==21.2.0 19 | # via argon2-cffi 20 | arrow==1.2.3 21 | # via isoduration 22 | asttokens==2.2.1 23 | # via stack-data 24 | attrs==23.1.0 25 | # via jsonschema 26 | backcall==0.2.0 27 | # via ipython 28 | beautifulsoup4==4.12.2 29 | # via nbconvert 30 | black==23.3.0 31 | # via -r requirements/dev.in 32 | bleach==6.0.0 33 | # via nbconvert 34 | build==0.10.0 35 | # via pip-tools 36 | cffi==1.15.1 37 | # via 
argon2-cffi-bindings 38 | click==8.1.3 39 | # via 40 | # black 41 | # pip-tools 42 | comm==0.1.3 43 | # via ipykernel 44 | debugpy==1.6.7 45 | # via ipykernel 46 | decorator==5.1.1 47 | # via ipython 48 | defusedxml==0.7.1 49 | # via nbconvert 50 | exceptiongroup==1.1.1 51 | # via anyio 52 | executing==1.2.0 53 | # via stack-data 54 | fastjsonschema==2.17.1 55 | # via nbformat 56 | flake8==6.0.0 57 | # via -r requirements/dev.in 58 | fqdn==1.5.1 59 | # via jsonschema 60 | idna==3.4 61 | # via 62 | # anyio 63 | # jsonschema 64 | importlib-metadata==6.6.0 65 | # via 66 | # jupyter-client 67 | # nbconvert 68 | importlib-resources==5.12.0 69 | # via jsonschema 70 | ipykernel==6.23.1 71 | # via 72 | # ipywidgets 73 | # jupyter 74 | # jupyter-console 75 | # nbclassic 76 | # notebook 77 | # qtconsole 78 | ipython==8.8.0 79 | # via 80 | # -r requirements/dev.in 81 | # ipykernel 82 | # ipywidgets 83 | # jupyter-console 84 | ipython-genutils==0.2.0 85 | # via 86 | # nbclassic 87 | # notebook 88 | # qtconsole 89 | ipywidgets==8.0.6 90 | # via jupyter 91 | isoduration==20.11.0 92 | # via jsonschema 93 | jedi==0.18.2 94 | # via ipython 95 | jinja2==3.1.2 96 | # via 97 | # jupyter-server 98 | # nbclassic 99 | # nbconvert 100 | # notebook 101 | jsonpointer==2.3 102 | # via jsonschema 103 | jsonschema[format-nongpl]==4.17.3 104 | # via 105 | # jupyter-events 106 | # nbformat 107 | jupyter==1.0.0 108 | # via -r requirements/dev.in 109 | jupyter-client==8.2.0 110 | # via 111 | # ipykernel 112 | # jupyter-console 113 | # jupyter-server 114 | # nbclassic 115 | # nbclient 116 | # notebook 117 | # qtconsole 118 | jupyter-console==6.6.3 119 | # via jupyter 120 | jupyter-core==5.3.0 121 | # via 122 | # -r requirements/dev.in 123 | # ipykernel 124 | # jupyter-client 125 | # jupyter-console 126 | # jupyter-server 127 | # nbclassic 128 | # nbclient 129 | # nbconvert 130 | # nbformat 131 | # notebook 132 | # qtconsole 133 | jupyter-events==0.6.3 134 | # via jupyter-server 135 | jupyter-server==2.6.0 136 | # via 137 | # nbclassic 138 | # notebook-shim 139 | jupyter-server-terminals==0.4.4 140 | # via jupyter-server 141 | jupyterlab-pygments==0.2.2 142 | # via nbconvert 143 | jupyterlab-widgets==3.0.7 144 | # via ipywidgets 145 | markupsafe==2.1.2 146 | # via 147 | # jinja2 148 | # nbconvert 149 | matplotlib-inline==0.1.6 150 | # via 151 | # ipykernel 152 | # ipython 153 | mccabe==0.7.0 154 | # via flake8 155 | mistune==2.0.5 156 | # via nbconvert 157 | mypy==1.3.0 158 | # via -r requirements/dev.in 159 | mypy-extensions==1.0.0 160 | # via 161 | # black 162 | # mypy 163 | nbclassic==1.0.0 164 | # via notebook 165 | nbclient==0.8.0 166 | # via nbconvert 167 | nbconvert==7.4.0 168 | # via 169 | # jupyter 170 | # jupyter-server 171 | # nbclassic 172 | # notebook 173 | nbformat==5.9.0 174 | # via 175 | # jupyter-server 176 | # nbclassic 177 | # nbclient 178 | # nbconvert 179 | # notebook 180 | nest-asyncio==1.5.6 181 | # via 182 | # ipykernel 183 | # nbclassic 184 | # notebook 185 | notebook==6.5.4 186 | # via jupyter 187 | notebook-shim==0.2.3 188 | # via nbclassic 189 | overrides==7.3.1 190 | # via jupyter-server 191 | packaging==23.1 192 | # via 193 | # black 194 | # build 195 | # ipykernel 196 | # jupyter-server 197 | # nbconvert 198 | # qtconsole 199 | # qtpy 200 | pandocfilters==1.5.0 201 | # via nbconvert 202 | parso==0.8.3 203 | # via jedi 204 | pathspec==0.11.1 205 | # via black 206 | pexpect==4.8.0 207 | # via ipython 208 | pickleshare==0.7.5 209 | # via ipython 210 | pip-tools==6.13.0 211 | # via -r 
requirements/dev.in 212 | pkgutil-resolve-name==1.3.10 213 | # via jsonschema 214 | platformdirs==3.5.1 215 | # via 216 | # black 217 | # jupyter-core 218 | prometheus-client==0.17.0 219 | # via 220 | # jupyter-server 221 | # nbclassic 222 | # notebook 223 | prompt-toolkit==3.0.38 224 | # via 225 | # ipython 226 | # jupyter-console 227 | psutil==5.9.5 228 | # via ipykernel 229 | ptyprocess==0.7.0 230 | # via 231 | # pexpect 232 | # terminado 233 | pure-eval==0.2.2 234 | # via stack-data 235 | pycodestyle==2.10.0 236 | # via flake8 237 | pycparser==2.21 238 | # via cffi 239 | pyflakes==3.0.1 240 | # via flake8 241 | pygments==2.15.1 242 | # via 243 | # ipython 244 | # jupyter-console 245 | # nbconvert 246 | # qtconsole 247 | pyproject-hooks==1.0.0 248 | # via build 249 | pyrsistent==0.19.3 250 | # via jsonschema 251 | python-dateutil==2.8.2 252 | # via 253 | # arrow 254 | # jupyter-client 255 | python-json-logger==2.0.7 256 | # via jupyter-events 257 | pyyaml==6.0 258 | # via jupyter-events 259 | pyzmq==25.1.0 260 | # via 261 | # ipykernel 262 | # jupyter-client 263 | # jupyter-console 264 | # jupyter-server 265 | # nbclassic 266 | # notebook 267 | # qtconsole 268 | qtconsole==5.4.3 269 | # via jupyter 270 | qtpy==2.3.1 271 | # via qtconsole 272 | rfc3339-validator==0.1.4 273 | # via 274 | # jsonschema 275 | # jupyter-events 276 | rfc3986-validator==0.1.1 277 | # via 278 | # jsonschema 279 | # jupyter-events 280 | send2trash==1.8.2 281 | # via 282 | # jupyter-server 283 | # nbclassic 284 | # notebook 285 | six==1.16.0 286 | # via 287 | # asttokens 288 | # bleach 289 | # python-dateutil 290 | # rfc3339-validator 291 | sniffio==1.3.0 292 | # via anyio 293 | soupsieve==2.4.1 294 | # via beautifulsoup4 295 | stack-data==0.6.2 296 | # via ipython 297 | terminado==0.17.1 298 | # via 299 | # jupyter-server 300 | # jupyter-server-terminals 301 | # nbclassic 302 | # notebook 303 | tinycss2==1.2.1 304 | # via nbconvert 305 | tomli==2.0.1 306 | # via 307 | # black 308 | # build 309 | # mypy 310 | # pyproject-hooks 311 | tornado==6.3.2 312 | # via 313 | # ipykernel 314 | # jupyter-client 315 | # jupyter-server 316 | # nbclassic 317 | # notebook 318 | # terminado 319 | traitlets==5.9.0 320 | # via 321 | # comm 322 | # ipykernel 323 | # ipython 324 | # ipywidgets 325 | # jupyter-client 326 | # jupyter-console 327 | # jupyter-core 328 | # jupyter-events 329 | # jupyter-server 330 | # matplotlib-inline 331 | # nbclassic 332 | # nbclient 333 | # nbconvert 334 | # nbformat 335 | # notebook 336 | # qtconsole 337 | typing-extensions==4.6.3 338 | # via 339 | # black 340 | # mypy 341 | uri-template==1.2.0 342 | # via jsonschema 343 | wcwidth==0.2.6 344 | # via prompt-toolkit 345 | webcolors==1.13 346 | # via jsonschema 347 | webencodings==0.5.1 348 | # via 349 | # bleach 350 | # tinycss2 351 | websocket-client==1.5.2 352 | # via jupyter-server 353 | wheel==0.40.0 354 | # via pip-tools 355 | widgetsnbextension==4.0.7 356 | # via ipywidgets 357 | zipp==3.15.0 358 | # via 359 | # importlib-metadata 360 | # importlib-resources 361 | 362 | # The following packages are considered to be unsafe in a requirements file: 363 | # pip 364 | # setuptools 365 | -------------------------------------------------------------------------------- /test_sec_filings/test_fetch.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import webbrowser 4 | import requests 5 | from unittest import mock 6 | 7 | import pytest 8 | import prepline_sec_filings.fetch as fetch 9 | 10 
| 11 | response_content = { 12 | "filings": { 13 | "recent": { 14 | "accessionNumber": [ 15 | "1234567890-12-345678", 16 | "1234567890-12-345679", 17 | "1234567890-12-345680", 18 | "1234567890-12-345681", 19 | ], 20 | "form": ["10-K", "S-1", "10-K", "10-Q"], 21 | } 22 | } 23 | } 24 | 25 | 26 | class MockSession: 27 | def __init__(self): 28 | self.headers = dict() 29 | 30 | def get(self, url, **kwargs): 31 | if url.startswith(fetch.SEC_ARCHIVE_URL): 32 | if url.endswith("txt"): 33 | filename = url.split("/")[-1] 34 | return MockResponse(f"{filename}") 35 | elif url.endswith("html"): 36 | return MockResponse("") 37 | elif url.startswith(fetch.SEC_SEARCH_URL): 38 | return MockResponse("CIK=1234567890") 39 | elif url.startswith(fetch.SEC_SUBMISSIONS_URL): 40 | return MockResponse( 41 | "", 42 | content=json.dumps(response_content), 43 | ) 44 | else: 45 | raise ValueError 46 | 47 | 48 | class MockResponse: 49 | def __init__(self, text, content=None): 50 | self.text = text 51 | self.content = content 52 | 53 | def raise_for_status(self): 54 | pass 55 | 56 | 57 | def test_get_filing(monkeypatch): 58 | monkeypatch.setattr(requests, "Session", MockSession) 59 | filing = fetch.get_filing("949874", "000119312511215661", "Giant", "parker@giant.com") 60 | assert filing == "0001193125-11-215661.txt" 61 | 62 | 63 | def test_archive_url(): 64 | url = fetch.archive_url("949874", "000119312511215661") 65 | assert url == f"{fetch.SEC_ARCHIVE_URL}/949874/000119312511215661/0001193125-11-215661.txt" 66 | 67 | 68 | def test_add_dashes(): 69 | accession_number = fetch._add_dashes("000119312511215661") 70 | assert accession_number == "0001193125-11-215661" 71 | 72 | 73 | def test_drop_dashes(): 74 | accession_number = fetch._drop_dashes("0001193125-11-215661") 75 | assert accession_number == "000119312511215661" 76 | 77 | 78 | def test_get_session(monkeypatch): 79 | monkeypatch.setattr(requests, "Session", MockSession) 80 | session = fetch._get_session("Giant", "parker@giant.com") 81 | assert session.headers["User-Agent"] == "Giant parker@giant.com" 82 | 83 | 84 | @mock.patch.dict( 85 | os.environ, 86 | {"SEC_API_ORGANIZATION": "OtherOrg", "SEC_API_EMAIL": "person@otherorg.io"}, 87 | ) 88 | def test_get_session_default(monkeypatch): 89 | monkeypatch.setattr(requests, "Session", MockSession) 90 | session = fetch._get_session() 91 | assert session.headers["User-Agent"] == "OtherOrg person@otherorg.io" 92 | 93 | 94 | def test_get_cik_by_ticker(monkeypatch): 95 | monkeypatch.setattr(requests, "Session", MockSession) 96 | session = MockSession() 97 | cik = fetch.get_cik_by_ticker(session, "noice") 98 | assert cik == "1234567890" 99 | 100 | 101 | def test_get_forms_by_cik(monkeypatch): 102 | monkeypatch.setattr(requests, "Session", MockSession) 103 | session = MockSession() 104 | forms = fetch.get_forms_by_cik(session, "1234567890") 105 | assert forms["1234567890-12-345678"] == "10-K" 106 | assert forms["1234567890-12-345679"] == "S-1" 107 | assert forms["1234567890-12-345680"] == "10-K" 108 | assert forms["1234567890-12-345681"] == "10-Q" 109 | 110 | 111 | def test_get_recent_acc_num_by_cik(monkeypatch): 112 | monkeypatch.setattr(requests, "Session", MockSession) 113 | session = MockSession() 114 | assert fetch._get_recent_acc_num_by_cik(session, "1234567890", ["10-K"]) == ( 115 | "123456789012345678", 116 | "10-K", 117 | ) 118 | assert fetch._get_recent_acc_num_by_cik(session, "1234567890", ["S-1"]) == ( 119 | "123456789012345679", 120 | "S-1", 121 | ) 122 | assert fetch._get_recent_acc_num_by_cik(session, 
"1234567890", ["10-Q"]) == ( 123 | "123456789012345681", 124 | "10-Q", 125 | ) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "form_type, expected", 130 | [ 131 | ("10-K", "1234567890-12-345678.txt"), 132 | ("10-Q", "1234567890-12-345681.txt"), 133 | ("S-1", "1234567890-12-345679.txt"), 134 | ], 135 | ) 136 | def test_get_form_by_ticker(monkeypatch, form_type, expected): 137 | monkeypatch.setattr(requests, "Session", MockSession) 138 | assert ( 139 | fetch.get_form_by_ticker("1234567890", form_type, company="Giant", email="parker@giant.com") 140 | == expected 141 | ) 142 | 143 | 144 | @pytest.mark.parametrize( 145 | "form_type, expected", 146 | [ 147 | ( 148 | "10-K", 149 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345678/" 150 | "1234567890-12-345678-index.html", 151 | ), 152 | ( 153 | "10-Q", 154 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345681/" 155 | "1234567890-12-345681-index.html", 156 | ), 157 | ( 158 | "S-1", 159 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345679/" 160 | "1234567890-12-345679-index.html", 161 | ), 162 | ], 163 | ) 164 | @mock.patch("webbrowser.open_new_tab") 165 | @mock.patch("requests.Session", MockSession) 166 | def test_open_form_by_ticker(monkeypatch, form_type, expected): 167 | fetch.open_form_by_ticker("noice", form_type, False, company="Giant", email="parker@giant.com") 168 | webbrowser.open_new_tab.assert_called_once_with(expected) 169 | 170 | 171 | @pytest.mark.parametrize( 172 | "cik, acc_num, expected", 173 | [ 174 | ( 175 | "1234567890", 176 | "123456789012345678", 177 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345678/" 178 | "1234567890-12-345678-index.html", 179 | ), 180 | ( 181 | "1234567890", 182 | "123456789012345681", 183 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345681/" 184 | "1234567890-12-345681-index.html", 185 | ), 186 | ( 187 | "1234567890", 188 | "123456789012345679", 189 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345679/" 190 | "1234567890-12-345679-index.html", 191 | ), 192 | ], 193 | ) 194 | @mock.patch("webbrowser.open_new_tab") 195 | @mock.patch("requests.Session", MockSession) 196 | def test_open_form(monkeypatch, cik, acc_num, expected): 197 | fetch.open_form(cik, acc_num) 198 | webbrowser.open_new_tab.assert_called_once_with(expected) 199 | 200 | 201 | @pytest.mark.parametrize( 202 | "formtype, expected_cid, expected_acc_num", 203 | [ 204 | ( 205 | "10-K", 206 | "1234567890", 207 | "123456789012345678", 208 | ), 209 | ( 210 | "10-Q", 211 | "1234567890", 212 | "123456789012345681", 213 | ), 214 | ( 215 | "S-1", 216 | "1234567890", 217 | "123456789012345679", 218 | ), 219 | ], 220 | ) 221 | def test_get_recent_cik_and_acc_by_ticker(monkeypatch, formtype, expected_cid, expected_acc_num): 222 | monkeypatch.setattr(requests, "Session", MockSession) 223 | cik, acc_num, retrieved_form_type = fetch.get_recent_cik_and_acc_by_ticker( 224 | "noice", formtype, "Giant", "parker@giant.com" 225 | ) 226 | assert cik == expected_cid 227 | assert acc_num == expected_acc_num 228 | assert retrieved_form_type == formtype 229 | 230 | 231 | @pytest.mark.parametrize( 232 | "formtype, cid, expected_acc_num", 233 | [ 234 | ( 235 | "10-K", 236 | "1234567890", 237 | "123456789012345678", 238 | ), 239 | ( 240 | "10-Q", 241 | "1234567890", 242 | "123456789012345681", 243 | ), 244 | ( 245 | "S-1", 246 | "1234567890", 247 | "123456789012345679", 248 | ), 249 | ], 250 | ) 251 | def test_get_recent_acc_by_cik(monkeypatch, formtype, cid, expected_acc_num): 252 | monkeypatch.setattr(requests, "Session", MockSession) 
253 | acc_num, recvd_formtype = fetch.get_recent_acc_by_cik( 254 | cid, formtype, "Giant", "parker@giant.com" 255 | ) 256 | assert acc_num == expected_acc_num 257 | assert recvd_formtype == formtype 258 | -------------------------------------------------------------------------------- /prepline_sec_filings/fetch.py: -------------------------------------------------------------------------------- 1 | """Module for fetching data from the SEC EDGAR Archives""" 2 | import json 3 | import os 4 | import re 5 | import requests 6 | from typing import List, Optional, Tuple, Union 7 | import sys 8 | 9 | if sys.version_info < (3, 8): 10 | from typing_extensions import Final 11 | else: 12 | from typing import Final 13 | 14 | import webbrowser 15 | 16 | from ratelimit import limits, sleep_and_retry 17 | 18 | from prepline_sec_filings.sec_document import VALID_FILING_TYPES 19 | 20 | SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data" 21 | SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar" 22 | SEC_SUBMISSIONS_URL: Final[str] = "https://data.sec.gov/submissions" 23 | 24 | 25 | def get_filing( 26 | cik: Union[str, int], accession_number: Union[str, int], company: str, email: str 27 | ) -> str: 28 | """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate 29 | limits specified on the SEC website. 30 | ref: https://www.sec.gov/os/accessing-edgar-data""" 31 | session = _get_session(company, email) 32 | return _get_filing(session, cik, accession_number) 33 | 34 | 35 | @sleep_and_retry 36 | @limits(calls=10, period=1) 37 | def _get_filing( 38 | session: requests.Session, cik: Union[str, int], accession_number: Union[str, int] 39 | ) -> str: 40 | """Wrapped so filings can be retrieved with an existing session.""" 41 | url = archive_url(cik, accession_number) 42 | response = session.get(url) 43 | response.raise_for_status() 44 | return response.text 45 | 46 | 47 | @sleep_and_retry 48 | @limits(calls=10, period=1) 49 | def get_cik_by_ticker(session: requests.Session, ticker: str) -> str: 50 | """Gets a CIK number from a stock ticker by running a search on the SEC website.""" 51 | cik_re = re.compile(r".*CIK=(\d{10}).*") 52 | url = _search_url(ticker) 53 | response = session.get(url, stream=True) 54 | response.raise_for_status() 55 | results = cik_re.findall(response.text) 56 | return str(results[0]) 57 | 58 | 59 | @sleep_and_retry 60 | @limits(calls=10, period=1) 61 | def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict: 62 | """Retrieves a dict of recent SEC form filings for a given cik number.""" 63 | json_name = f"CIK{cik}.json" 64 | response = session.get(f"{SEC_SUBMISSIONS_URL}/{json_name}") 65 | response.raise_for_status() 66 | content = json.loads(response.content) 67 | recent_forms = content["filings"]["recent"] 68 | form_types = {k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"])} 69 | return form_types 70 | 71 | 72 | def _get_recent_acc_num_by_cik( 73 | session: requests.Session, cik: Union[str, int], form_types: List[str] 74 | ) -> Tuple[str, str]: 75 | """Returns accession number and form type for the most recent filing for one of the 76 | given form_types (AKA filing types) for a given cik.""" 77 | retrieved_form_types = get_forms_by_cik(session, cik) 78 | for acc_num, form_type_ in retrieved_form_types.items(): 79 | if form_type_ in form_types: 80 | return _drop_dashes(acc_num), form_type_ 81 | raise ValueError(f"No filings found for {cik}, looking for any of: {form_types}") 82 | 83 | 84 | def
get_recent_acc_by_cik( 85 | cik: str, 86 | form_type: str, 87 | company: Optional[str] = None, 88 | email: Optional[str] = None, 89 | ) -> Tuple[str, str]: 90 | """Returns (accession_number, retrieved_form_type) for the given cik and form_type. 91 | The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q. 92 | """ 93 | session = _get_session(company, email) 94 | return _get_recent_acc_num_by_cik(session, cik, _form_types(form_type)) 95 | 96 | 97 | def get_recent_cik_and_acc_by_ticker( 98 | ticker: str, 99 | form_type: str, 100 | company: Optional[str] = None, 101 | email: Optional[str] = None, 102 | ) -> Tuple[str, str, str]: 103 | """Returns (cik, accession_number, retrieved_form_type) for the given ticker and form_type. 104 | The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q. 105 | """ 106 | session = _get_session(company, email) 107 | cik = get_cik_by_ticker(session, ticker) 108 | acc_num, retrieved_form_type = _get_recent_acc_num_by_cik(session, cik, _form_types(form_type)) 109 | return cik, acc_num, retrieved_form_type 110 | 111 | 112 | def get_form_by_ticker( 113 | ticker: str, 114 | form_type: str, 115 | allow_amended_filing: Optional[bool] = True, 116 | company: Optional[str] = None, 117 | email: Optional[str] = None, 118 | ) -> str: 119 | """For a given ticker, gets the most recent form of a given form_type.""" 120 | session = _get_session(company, email) 121 | cik = get_cik_by_ticker(session, ticker) 122 | return get_form_by_cik( 123 | cik, form_type, allow_amended_filing=allow_amended_filing, company=company, email=email 124 | ) 125 | 126 | 127 | def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True): 128 | """Potentially expand to include amended filing, e.g.: 129 | "10-Q" -> "10-Q/A" 130 | """ 131 | assert form_type in VALID_FILING_TYPES 132 | if allow_amended_filing and not form_type.endswith("/A"): 133 | return [form_type, f"{form_type}/A"] 134 | else: 135 | return [form_type] 136 | 137 | 138 | def get_form_by_cik( 139 | cik: str, 140 | form_type: str, 141 | allow_amended_filing: Optional[bool] = True, 142 | company: Optional[str] = None, 143 | email: Optional[str] = None, 144 | ) -> str: 145 | """For a given CIK, returns the most recent form of a given form_type. By default 146 | an amended version of the form_type may be retrieved (allow_amended_filing=True). 147 | E.g., if form_type is "10-Q", the retrieved form could be a 10-Q or 10-Q/A.
148 | """ 149 | session = _get_session(company, email) 150 | acc_num, _ = _get_recent_acc_num_by_cik( 151 | session, cik, _form_types(form_type, allow_amended_filing) 152 | ) 153 | text = _get_filing(session, cik, acc_num) 154 | return text 155 | 156 | 157 | def open_form(cik, acc_num): 158 | """For a given cik and accession number, opens the index page in default browser for the 159 | associated SEC form""" 160 | acc_num = _drop_dashes(acc_num) 161 | webbrowser.open_new_tab(f"{SEC_ARCHIVE_URL}/{cik}/{acc_num}/{_add_dashes(acc_num)}-index.html") 162 | 163 | 164 | def open_form_by_ticker( 165 | ticker: str, 166 | form_type: str, 167 | allow_amended_filing: Optional[bool] = True, 168 | company: Optional[str] = None, 169 | email: Optional[str] = None, 170 | ): 171 | """For a given ticker, opens the index page in default browser for the most recent form of a 172 | given form_type.""" 173 | session = _get_session(company, email) 174 | cik = get_cik_by_ticker(session, ticker) 175 | acc_num, _ = _get_recent_acc_num_by_cik( 176 | session, cik, _form_types(form_type, allow_amended_filing) 177 | ) 178 | open_form(cik, acc_num) 179 | 180 | 181 | def archive_url(cik: Union[str, int], accession_number: Union[str, int]) -> str: 182 | """Builds the archive URL for the SEC accession number. Looks for the .txt file for the 183 | filing, while follows a {accession_number}.txt format.""" 184 | filename = f"{_add_dashes(accession_number)}.txt" 185 | accession_number = _drop_dashes(accession_number) 186 | return f"{SEC_ARCHIVE_URL}/{cik}/{accession_number}/{filename}" 187 | 188 | 189 | def _search_url(cik: Union[str, int]) -> str: 190 | search_string = f"CIK={cik}&Find=Search&owner=exclude&action=getcompany" 191 | url = f"{SEC_SEARCH_URL}?{search_string}" 192 | return url 193 | 194 | 195 | def _add_dashes(accession_number: Union[str, int]) -> str: 196 | """Adds the dashes back into the accession number""" 197 | accession_number = str(accession_number) 198 | return f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}" 199 | 200 | 201 | def _drop_dashes(accession_number: Union[str, int]) -> str: 202 | """Converts the accession number to the no dash representation.""" 203 | accession_number = str(accession_number).replace("-", "") 204 | return accession_number.zfill(18) 205 | 206 | 207 | def _get_session(company: Optional[str] = None, email: Optional[str] = None) -> requests.Session: 208 | """Creates a requests sessions with the appropriate headers set. If these headers are not 209 | set, SEC will reject your request. 210 | ref: https://www.sec.gov/os/accessing-edgar-data""" 211 | if company is None: 212 | company = os.environ.get("SEC_API_ORGANIZATION") 213 | if email is None: 214 | email = os.environ.get("SEC_API_EMAIL") 215 | assert company 216 | assert email 217 | session = requests.Session() 218 | session.headers.update( 219 | { 220 | "User-Agent": f"{company} {email}", 221 | "Content-Type": "text/html", 222 | } 223 | ) 224 | return session 225 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-s1-risks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ae311bc4", 6 | "metadata": {}, 7 | "source": [ 8 | "## Exploration Notebooks - S1 Documents\n", 9 | "\n", 10 | "The purpose of this notebook is to demonstrate the logic for extracting narrative text from the risk factors section in S1 filings. 
\n", 11 | "\n", 12 | "#### Table of Contents\n", 13 | "\n", 14 | "1. [Palantir Filing](#palantir)\n", 15 | "2. [Tesla Filing](#tesla)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "f89372ab", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%load_ext autoreload\n", 26 | "%autoreload 2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "18f90b55", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from prepline_sec_filings.fetch import get_filing\n", 37 | "from prepline_sec_filings.sec_document import SECDocument" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "bbaf7232", 43 | "metadata": {}, 44 | "source": [ 45 | "### Palantir Filing \n", 46 | "\n", 47 | "This section pulls in the Palantir S-1 filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1321655/000119312520230013/d904406ds1.htm). The goal is to identify the [risk factors](https://www.sec.gov/Archives/edgar/data/1321655/000119312520230013/d904406ds1.htm#rom904406_3) section." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "1aef6e6d", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "text = get_filing(\"1321655\",\n", 58 | " \"000119312520230013\", \n", 59 | " \"Unstructured Technologies\", \n", 60 | " \"support@unstructured.io\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "71848be5", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "sec_document = SECDocument.from_string(text)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "3ff29c73", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "'S-1'" 83 | ] 84 | }, 85 | "execution_count": null, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "sec_document.filing_type" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "1d4ac11a", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from prepline_sec_filings.sections import SECSection\n", 102 | "risk_narrative = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "4d95c612", 108 | "metadata": {}, 109 | "source": [ 110 | "From the cells below, we can see that the `get_risk_narrative` method section successfully identified the risk section in the document." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "821c431a", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Investing in our Class A common stock involves a high degree of risk. You should carefully consider the risks and uncertainties\n", 124 | "described below, together with all of the other information in this prospectus, including the section titled “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and our consolidated financial\n", 125 | "statements and accompanying notes, before making a decision to invest in our Class A common stock. Our business, financial condition, results of operations, or prospects could also be harmed by risks and uncertainties not currently known to us\n", 126 | "or that we currently do not believe are material. 
If any of the risks actually occur, our business, financial condition, results of operations, and prospects could be adversely affected. In that event, the trading price of our Class A common\n", 127 | "stock could decline, and you could lose part or all of your investment.\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "print(risk_narrative[0])" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "8b31a840", 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "We have never declared nor paid cash dividends on our capital stock. We currently intend to retain any future earnings to finance the\n", 146 | "operation and expansion of our business, and we do not anticipate declaring or paying any dividends to holders of our capital stock in the foreseeable future. In addition, our credit facility contains restrictions on our ability to pay dividends.\n", 147 | "Any determination to pay dividends in the future will be at the discretion of our Board of Directors. Consequently, stockholders must rely on sales of their Class A common stock after price appreciation, which may never occur, as the only way\n", 148 | "to realize any future gains on their investment.\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "print(risk_narrative[-1])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "6fa2c95d", 159 | "metadata": {}, 160 | "source": [ 161 | "### Tesla Filing \n", 162 | "\n", 163 | "This section tests the risk narrative logic on the Tesla S-1 filing, which can be found [here](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm). The goal is to identify the narrative text in the Risk Factors section, which can be found [here](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm#toc188115_4)." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "d203ec3e", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "text = get_filing(\"1318605\",\n", 174 | " \"000119312511149963\", \n", 175 | " \"Unstructured Technologies\", \n", 176 | " \"support@unstructured.io\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "1a26f776", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "sec_document = SECDocument.from_string(text)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "2de728f5", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "'S-1'" 199 | ] 200 | }, 201 | "execution_count": null, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "sec_document.filing_type" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "a37d12e7", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "risk_narrative = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "5bd7e2a8", 223 | "metadata": {}, 224 | "source": [ 225 | "From the cells below, we can see that the `get_section_narrative` method successfully identified the risk section in the document." 
226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "58d5258a", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "You should carefully consider the risks described below together with the other information set forth in this prospectus, which could\n", 239 | "materially affect our business, financial condition and future results. The risks described below are not the only risks facing our company. Risks and uncertainties not currently known to us or that we currently deem to be immaterial also may\n", 240 | "materially adversely affect our business, financial condition and operating results.\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "print(risk_narrative[0])" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "f48ea22d", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "We do not anticipate declaring any cash dividends to holders of our common stock in the foreseeable future. Consequently, investors may\n", 259 | "need to rely on sales of their common stock after price appreciation, which may never occur, as the only way to realize any future gains on their investment. Investors seeking cash dividends should not purchase our common stock.\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "print(risk_narrative[-1])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "770ca0f3", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "python3", 279 | "language": "python", 280 | "name": "python3" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-TOC-action.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exploration Notebooks - TOC in action\n", 8 | "\n", 9 | "The purpose of this notebook is to demonstrate the logic for identifying the Table of Contents section for both 10-K/10-Q and S-1 filings. \n", 10 | "\n", 11 | "#### Table of Contents\n", 12 | "\n", 13 | "1. [TOC action for 10-K/10-Q filings](#10-K-10-Q)\n", 14 | "2. [TOC action for S-1 filings](#S-1)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "%load_ext autoreload\n", 24 | "%autoreload 2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from prepline_sec_filings.fetch import get_filing\n", 34 | "from prepline_sec_filings.sec_document import SECDocument" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### 10-K/10-Q Filing \n", 42 | "\n", 43 | "This section pulls in the Palantir 10-Q filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm). The goal is to identify the [table of contents](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm#toc) section." 
44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "text = get_filing(\"1321655\",\n", 53 | " \"000119312520292177\", \n", 54 | " \"Unstructured Technologies\",\n", 55 | " \"support@unstructured.io\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "sec_document = SECDocument.from_string(text)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "elements = sec_document.elements" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "toc = sec_document.get_table_of_contents()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "From the cells below, we can see that the `get_table_of_contents` method identified the table of contents section in the document. However, there is still extra junk at the end." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "PART I. FINANCIAL INFORMATION\n", 102 | "Item 1\n", 103 | "Financial Statements (unaudited)\n", 104 | "Condensed Consolidated Balance Sheets\n", 105 | "Condensed Consolidated Statements of Operations\n", 106 | "Condensed Consolidated Statements of Comprehensive Loss\n", 107 | "Condensed Consolidated Statements of Redeemable Convertible and Convertible\n", 108 | " Preferred Stock and Stockholders’ Equity (Deficit)\n", 109 | "Condensed Consolidated Statements of Cash Flows\n", 110 | "Notes to Unaudited Condensed Consolidated Financial\n", 111 | "Statements\n", 112 | "Item 2\n", 113 | "Management’s Discussion and Analysis of Financial Condition and Results\n", 114 | " of Operations\n", 115 | "Item 3\n", 116 | "Quantitative and Qualitative Disclosures About Market Risk\n", 117 | "Item 4\n", 118 | "Controls and Procedures\n", 119 | "PART II. 
OTHER INFORMATION\n", 120 | "Item 1\n", 121 | "Legal Proceedings\n", 122 | "Item 1A\n", 123 | "Risk Factors\n", 124 | "Item 2\n", 125 | "Unregistered Sales of Equity Securities\n", 126 | "Item 3\n", 127 | "Defaults Upon Senior Securities\n", 128 | "Item 4\n", 129 | "Mine Safety Disclosures\n", 130 | "Item 5\n", 131 | "Other Information\n", 132 | "Item 6\n", 133 | "Exhibits\n", 134 | "Table of Contents\n", 135 | "SPECIAL NOTE REGARDING FORWARD-LOOKING STATEMENTS\n", 136 | "our expectations regarding financial performance, including but not limited to our expectations regarding\n", 137 | "revenue, cost of revenue, operating expenses, stock-based compensation, and our ability to achieve and maintain future profitability;\n", 138 | "our ability to successfully execute our business and growth strategy;\n", 139 | "the sufficiency of our cash and cash equivalents to meet our liquidity needs;\n", 140 | "the demand for our platforms in general;\n", 141 | "our ability to increase our number of customers and revenue generated from customers;\n", 142 | "our expectations regarding the future contribution margin of our existing and future customers;\n", 143 | "our expectations regarding our ability to quickly and effectively integrate our platforms for our existing and\n", 144 | "future customers;\n", 145 | "our ability to develop new platforms, and enhancements to existing platforms, and bring them to market in a\n", 146 | "timely manner;\n", 147 | "the size of our addressable markets, market share, category positions, and market trends, including our\n", 148 | "ability to grow our business in large government and commercial organizations, including our expectations regarding the impact of FASA;\n", 149 | "our ability to compete with existing and new competitors in existing and new markets and products;\n", 150 | "our expectations regarding anticipated technology needs and developments and our ability to address those\n", 151 | "needs and developments with our platforms;\n", 152 | "our expectations regarding litigation and legal and regulatory matters;\n", 153 | "our expectations regarding our ability to meet existing performance obligations and maintain the operability\n", 154 | "of our products;\n", 155 | "our expectations regarding the effects of existing and developing laws and regulations, including with respect\n", 156 | "to taxation, privacy and data protection;\n", 157 | "our expectations regarding new and evolving markets;\n", 158 | "our ability to develop and protect our brand;\n", 159 | "our ability to maintain the security and availability of our platforms;\n", 160 | "our expectations and management of future growth;\n", 161 | "our expectations concerning relationships with third parties, including our customers, equity method\n", 162 | "investment partners, and vendors;\n", 163 | "our ability to maintain, protect, and enhance our intellectual property;\n", 164 | "our expectations regarding our multi-class stock and governance structure and the benefits thereof;\n", 165 | "Table of Contents\n", 166 | "the impact of the ongoing COVID-19 pandemic, including on our and our\n", 167 | "customers’, vendors’, and partners’ respective businesses and the markets in which we and our customers, vendors, and partners operate; and\n", 168 | "the increased expenses associated with being a public company.\n", 169 | "We caution you that the foregoing list may not contain all of the forward-looking statements made in this Quarterly Report on Form 10-Q.\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | 
"for element in toc.elements:\n", 175 | " print(element.text)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### S-1 Filing \n", 183 | "\n", 184 | "This section pulls in the Tesla S-1 filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm). The goal is to identify the [table of contents](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm#toc) section." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "text = get_filing(\"1318605\",\n", 194 | " \"000119312511149963\", \n", 195 | " \"Unstructured Technologies\", \n", 196 | " \"support@unstructured.io\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "sec_document = SECDocument.from_string(text)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "elements = sec_document.elements" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "toc = sec_document.get_table_of_contents()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "From the cells below, we can see that the `get_table_of_contents` method section identified the table of contents section in the document. However, there is still extra junk at the end." 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Prospectus Summary\n", 243 | "The Offering\n", 244 | "Summary Consolidated Financial Data\n", 245 | "Risk Factors\n", 246 | "Special Note Regarding Forward Looking Statements\n", 247 | "Market, Industry and Other Data\n", 248 | "Use of Proceeds\n", 249 | "Price Range of Common Stock\n", 250 | "Dividend Policy\n", 251 | "Capitalization\n", 252 | "Dilution\n", 253 | "Selected Consolidated Financial Data\n", 254 | "Management’s Discussion and Analysis of Financial Condition and Results of\n", 255 | "Operations\n", 256 | "Business\n", 257 | "Management\n", 258 | "Executive Compensation\n", 259 | "Certain Relationships and Related Party Transactions\n", 260 | "Principal Stockholders\n", 261 | "Description of Capital Stock\n", 262 | "Shares Eligible for Future Sale\n", 263 | "Material United States Tax Considerations for Non-United States Holders\n", 264 | "Underwriting\n", 265 | "Concurrent Private Placement\n", 266 | "Legal Matters\n", 267 | "Experts\n", 268 | "Where You Can Find Additional Information\n", 269 | "Index to Consolidated Financial Statements\n", 270 | "F-1\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "for element in toc.elements:\n", 276 | " print(element.text)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "python3", 290 | "language": "python", 291 | "name": "python3" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 2 296 | } 297 | -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Unstructured Technologies, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-10k-risks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "14a418c0", 6 | "metadata": {}, 7 | "source": [ 8 | "## Exploration Notebooks - 10-K/10-Q Documents\n", 9 | "\n", 10 | "The purpose of this notebook is to demonstrate the logic for extracting narrative text from the risk factors section in 10-K and 10-Q filings. \n", 11 | "\n", 12 | "#### Table of Contents\n", 13 | "\n", 14 | "1. [WABC Filing](#wabc)\n", 15 | "2. [Palantir Filing](#palantir)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "60bfe980", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%load_ext autoreload\n", 26 | "%autoreload 2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "768fa8c6", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from prepline_sec_filings.fetch import get_filing, get_form_by_ticker\n", 37 | "from prepline_sec_filings.sec_document import SECDocument" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "62c97cdc", 43 | "metadata": {}, 44 | "source": [ 45 | "### Westamerica Bancorp Filing \n", 46 | "\n", 47 | "This section pulls in the 2022 WABC 10-K filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/311094/000117184322001403/wabc20211231_10k.htm). The goal is to identify the [risk factors](https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/wabc20201231_10k.htm#i1a) section." 
48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "a8998e87", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# This would get the most recent 10-K filing\n", 58 | "#text = get_form_by_ticker(ticker=\"WABC\",\n", 59 | "# form_type=\"10-K\",\n", 60 | "# company=\"Unstructured Technologies\",\n", 61 | "# email=\"support@unstructured.io\")\n", 62 | "\n", 63 | "# This gets the 2022 filing\n", 64 | "text = get_filing(\"311094\",\n", 65 | " \"000117184322001403\", \n", 66 | " \"Unstructured Technologies\",\n", 67 | " \"support@unstructured.io\")\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "6b9303a6", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "sec_document = SECDocument.from_string(text)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "35c0e709", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "'10-K'" 90 | ] 91 | }, 92 | "execution_count": null, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "sec_document.filing_type" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "c8e0cad6", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from prepline_sec_filings.sections import SECSection\n", 109 | "risk_sections = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "a2360e6c", 115 | "metadata": {}, 116 | "source": [ 117 | "From the cells below, we can see that the `get_section_narrative` method successfully identified the risk section in the document." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "ab4c2c4f", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Readers and prospective investors in the Company’s securities should carefully consider the following risk factors as well as the other information contained or incorporated by reference in this Report.\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "print(risk_sections[0])" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "f6aa34d0", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Management regularly reviews and updates the Company’s internal control over financial reporting, disclosure controls and procedures, and corporate governance policies and procedures. The Company maintains controls and procedures to mitigate against risks such as processing system failures and errors, and customer or employee fraud, and maintains insurance coverage for certain of these risks. Any system of controls and procedures, however well designed and operated, is based in part on certain assumptions and can provide only reasonable, not absolute, assurances that the objectives of the system are met. Events could occur which are not prevented or detected by the Company’s internal controls or are not insured against or are in excess of the Company’s insurance limits or insurance underwriters’ financial capacity. 
Any failure or circumvention of the Company’s controls and procedures or failure to comply with regulations related to controls and procedures could have a material adverse effect on the Company’s business, results of operations and financial condition.\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "print(risk_sections[-1])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "d763903a", 159 | "metadata": {}, 160 | "source": [ 161 | "### Palantir Filing \n", 162 | "\n", 163 | "This section pulls in an old version of the Palantir 10-Q filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm). The goal is to identify the [risk factors](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm#fin31861_13) section." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "8f16fa12", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "text = get_filing(\"1321655\",\n", 174 | " \"000119312520292177\", \n", 175 | " \"Unstructured Technologies\",\n", 176 | " \"support@unstructured.io\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "adaeaea9", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "sec_document = SECDocument.from_string(text)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "7f999efc", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "'10-Q'" 199 | ] 200 | }, 201 | "execution_count": null, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "sec_document.filing_type" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "31c6690a", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "risk_sections = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "a322b2c8", 223 | "metadata": {}, 224 | "source": [ 225 | "From the cells below, we can see that the `get_section_narrative` method successfully identified the risk section in the document." 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "aeda557f", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Investing in our Class A common stock involves a high degree of risk. You should carefully consider the risks and uncertainties\n", 239 | "described below, together with all of the other information in this Quarterly Report on Form 10-Q, including the section titled “Management’s Discussion and Analysis of Financial Condition and\n", 240 | "Results of Operations” and our consolidated financial statements and accompanying notes, before making a decision to invest in our Class A common stock. Our business, financial condition, results of operations, or prospects could also be\n", 241 | "harmed by risks and uncertainties not currently known to us or that we currently do not believe are material. If any of the risks actually occur, our business, financial condition, results of operations, and prospects could be adversely affected. 
In\n", 242 | "that event, the trading price of our Class A common stock could decline, and you could lose part or all of your investment.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "print(risk_sections[0])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "b6fb276c", 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "Future issuances of our Class A common stock will dilute the voting power of our Class A common stockholders and future issuances to\n", 261 | "stockholders other than our Founders who are then party to the Founder Voting Agreement will dilute the economic interests of our Founders. However, because the shares of Class F common stock have variable voting rights, in the event that our\n", 262 | "Founders have less than 49.999999% of the voting power of our capital stock prior to giving effect to the voting power of the Class F common stock, future issuances of Class A common stock to stockholders other than our Founders will not\n", 263 | "result in dilution of the voting power of our Founders who are then party to the Founder Voting Agreement, but rather, will correspondingly increase the voting power of the Class F common stock. For instance, if the Founders who are party to\n", 264 | "the Founder Voting Agreement have 30% of the voting power of our outstanding capital stock in aggregate prior to giving effect to the voting power of the Class F common stock, the Class F common stock would have up to 19.999999% of our\n", 265 | "voting power resulting in such Founders having up to 49.999999% of our voting power. If we were to issue additional shares of our capital stock entitled to 10% of our voting power in aggregate to stockholders other than our Founders, then our\n", 266 | "Founders who are party to the Founder Voting Agreement would have approximately 27% of our voting power, and the Class F common stock would have up to approximately 22.999999% of our voting power, resulting in such Founders having up to\n", 267 | "49.999999% of our voting power. Any future issuances of additional shares of Class A common stock will not be subject to approval by our stockholders except as required by the listing standards of the NYSE. In addition, it may be very difficult\n", 268 | "for our Class A common stockholders to determine from time to time, including in advance of a meeting of stockholders, their individual or aggregate voting power due to the unique features of our multi-class capital structure, such as the\n", 269 | "variable number of votes per share of our Class F common stock.\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "print(risk_sections[-1])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "69a6681e", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "python3", 289 | "language": "python", 290 | "name": "python3" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 5 295 | } 296 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | 6 | Pre-Processing Pipeline for SEC Filings 7 |
8 | 9 | 10 | This repo implements a document pre-processing pipeline for SEC filings. Currently, the pipeline is capable of extracting narrative text from user-specified sections in 10-K, 10-Q, and S-1 filings. 11 | 12 | ## Developer Quick Start 13 | 14 | * Using `pyenv` to manage virtualenvs is recommended 15 | * Mac install instructions. See [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detailed instructions. 16 | * `brew install pyenv-virtualenv` 17 | * `pyenv install 3.8.15` 18 | * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux). 19 | 20 | * Create a virtualenv to work in and activate it, e.g. for one named `sec-filings`: 21 | 22 | `pyenv virtualenv 3.8.15 sec-filings`
23 | `pyenv activate sec-filings` 24 | 25 | * Run `make install` 26 | * Start a local jupyter notebook server with `make run-jupyter`
27 | **OR**
28 | just start the FastAPI app locally with `make run-web-app` 29 | 30 | ## Quick Tour 31 | 32 | You can run this [Colab notebook](https://colab.research.google.com/drive/1W9jCOGbIrE43f7fHMUSn3g3xXhOIjx_v) to see how [pipeline-section.ipynb](/pipeline-notebooks/pipeline-section.ipynb) extracts the narrative text sections from an SEC filing and defines an API. 33 | 34 | ## Extracting Narrative Text from an SEC Filing 35 | 36 | To retrieve narrative text section(s) from an iXBRL S-1, 10-K, or 10-Q document (or amended versions S-1/A, 10-K/A, or 10-Q/A), post the document to the `/section` API. You can try this out by downloading the sample documents using `make dl-test-artifacts`. Then, from 37 | the `sample-docs` folder, run: 38 | 39 | ``` 40 | curl -X 'POST' \ 41 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 42 | -H 'accept: application/json' \ 43 | -H 'Content-Type: multipart/form-data' \ 44 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 45 | -F section=RISK_FACTORS | jq -C . | less -R 46 | ``` 47 | 48 | Note that additional `-F section` parameters may be included in the curl request to fetch 49 | multiple sections at once. Valid sections for [10-Ks](https://www.sec.gov/files/form10-k.pdf), 50 | [10-Qs](https://www.sec.gov/files/form10-q.pdf), and [S-1s](https://www.sec.gov/files/forms-1.pdf) 51 | are available on the SEC website. You can also reference 52 | [this file](https://github.com/Unstructured-IO/pipeline-sec-filings/blob/main/prepline_sec_filings/sections.py) 53 | for a list of valid `section` parameters, e.g. `RISK_FACTORS` or `MANAGEMENT_DISCUSSION`. 54 | 55 | 56 | You'll get back a response that looks like the following. Piping through `jq` and `less` 57 | formats/colors the outputs and lets you scroll through the results. 58 | 59 | ``` 60 | { 61 | "RISK_FACTORS": [ 62 | { 63 | "text": "You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.", 64 | "type": "NarrativeText" 65 | }, 66 | { 67 | "text": "Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.", 68 | "type": "NarrativeText" 69 | }, 70 | { 71 | "text": "Market prices for gold, silver, copper, nickel, and other metals may fluctuate widely over time and are affected by numerous factors beyond our control. These factors include metal supply and demand, industrial and jewelry fabrication, investment demand, central banking actions, inflation expectations, currency values, interest rates, forward sales by metal producers, and political, trade, economic, or banking conditions.", 72 | "type": "NarrativeText" 73 | }, 74 | ... 75 | ] 76 | } 77 | ``` 78 | 79 | 80 | You can also pass in custom section regex patterns using the `section_regex` parameter. For 81 | example, you can run the following command to request the risk factors section: 82 | 83 | ``` 84 | curl -X 'POST' \ 85 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 86 | -H 'accept: application/json' \ 87 | -H 'Content-Type: multipart/form-data' \ 88 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 89 | -F 'section_regex=risk factors' | jq -C . 
| less -R 90 | ``` 91 | 92 | The result will be: 93 | 94 | ``` 95 | { 96 | "REGEX_0": [ 97 | { 98 | "text": "You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.", 99 | "type": "NarrativeText" 100 | }, 101 | { 102 | "text": "Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.", 103 | "type": "NarrativeText" 104 | }, 105 | { 106 | "text": "Market prices for gold, silver, copper, nickel, and other metals may fluctuate widely over time and are affected by numerous factors beyond our control. These factors include metal supply and demand, industrial and jewelry fabrication, investment demand, central banking actions, inflation expectations, currency values, interest rates, forward sales by metal producers, and political, trade, economic, or banking conditions.", 107 | "type": "NarrativeText" 108 | }, 109 | ... 110 | ] 111 | } 112 | ``` 113 | 114 | As with the `section` parameter, you can request multiple regexes by passing in multiple values 115 | for the `section_regex` parameter. The requested pattern will be treated as a raw string. 116 | 117 | You can also use special regex characters in your pattern, as shown in the example below: 118 | 119 | ``` 120 | curl -X 'POST' \ 121 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 122 | -H 'accept: application/json' \ 123 | -H 'Content-Type: multipart/form-data' \ 124 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 125 | -F "section_regex=^(\S+\W?)+$" 126 | ``` 127 | 128 | You can always replace the header `-H 'accept: application/json'` with `-H 'accept: text/csv'` depending on the format you want to fetch from the API as follows: 129 | 130 | ``` 131 | curl -X 'POST' \ 132 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 133 | -H 'accept: text/csv' \ 134 | -H 'Content-Type: multipart/form-data' \ 135 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 136 | -F section=RISK_FACTORS | jq -C . | less -R 137 | ``` 138 | The result will be: 139 | ``` 140 | "section,element_type,text\r\nRISK_FACTORS,NarrativeText,\"You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.\"\r\nRISK_FACTORS,NarrativeText,\"Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.\"\r\nRISK_FACTORS,NarrativeText,\"Market prices for gold, silver, copper, nickel, and other metals may fluctuate widely over time and are affected by numerous factors beyond our control. 
These factors include metal supply and demand, industrial and jewelry fabrication, investment demand, central banking actions, inflation expectations, currency values, interest rates, forward sales by metal producers, and political, trade, economic, or banking conditions.\"\r\n 141 | ``` 142 | 143 | In addition, you can add the form field `-F 'output_schema=labelstudio'` if you want an output to be compatible with [labelstudio](https://labelstud.io) as follows: 144 | 145 | ``` 146 | curl -X 'POST' \ 147 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 148 | -H 'accept: application/json' \ 149 | -H 'Content-Type: multipart/form-data' \ 150 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 151 | -F 'output_schema=labelstudio' \ 152 | -F section=RISK_FACTORS | jq -C . | less -R 153 | 154 | ``` 155 | The result will be: 156 | ``` 157 | { 158 | "RISK_FACTORS": [ 159 | { 160 | "data": { 161 | "text": "You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.", 162 | "ref_id": "7a912bb639b547404be4ceaf5d9083a9" 163 | } 164 | }, 165 | { 166 | "data": { 167 | "text": "Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.", 168 | "ref_id": "d4cc8e0e0c2b68ef69282c5250b721c9" 169 | } 170 | }, 171 | ... 172 | ] 173 | } 174 | ``` 175 | 176 | ### Helper functions for SEC EDGAR API 177 | 178 | You can use some of the functions provided in `prepline_sec_filings.fetch` to directly view or manipulate the filings available from the SEC's [EDGAR API](https://www.sec.gov/edgar/searchedgar/companysearch.html). 179 | For example, `get_filing(cik, accession_number, your_organization_name, your_email)` will return the text of the filing with accession number `accession_number` for the organization with CIK number `cik`. 180 | `your_organization_name` and `your_email` should be your information. 181 | The parameters `your_organization_name` and `your_email` are passed along to EDGAR's API to identify the caller and are required by EDGAR. 182 | Alternatively, the parameters may be omitted if the environment variables `SEC_API_ORGANIZATION` and `SEC_API_EMAIL` are defined. 183 | 184 | 185 | Helper functions are also provided for cases where the CIK and/or accession numbers are not known. For example, 186 | `get_form_by_ticker('mmm', '10-K', your_organization_name, your_email)` returns the text of the latest 10-K filing from 3M, 187 | and `open_form_by_ticker('mmm', '10-K', your_organization_name, your_email)` opens the SEC index page for the same filing in a web browser. A short end-to-end sketch is included after the next section. 188 | 189 | ### Generating Python files from the pipeline notebooks 190 | 191 | The Python module [section.py](/prepline_sec_filings/api/section.py) contains the FastAPI code needed to serve the API. It's created with `make generate-api`, which derives the API from the notebook [pipeline-section.ipynb](/pipeline-notebooks/pipeline-section.ipynb). 192 | 193 | You can generate the FastAPI APIs from all [pipeline-notebooks/](/pipeline-notebooks) by running `make generate-api`. 
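### Putting the helper functions together

Here is a minimal, illustrative sketch of fetching a filing with the helper functions described above. The ticker, organization name, and email below are placeholder values; substitute your own identification, or set the `SEC_API_ORGANIZATION` and `SEC_API_EMAIL` environment variables instead:

```
from prepline_sec_filings.fetch import (
    get_filing,
    get_form_by_ticker,
    get_recent_cik_and_acc_by_ticker,
)

# NOTE: placeholder identification values. EDGAR requires a real
# organization name and email to identify the caller.
ORG, EMAIL = "Your Organization", "you@example.com"

# Look up the CIK and the most recent 10-K accession number for a ticker,
# then fetch that filing's text
cik, acc_num, retrieved_form_type = get_recent_cik_and_acc_by_ticker(
    "mmm", "10-K", company=ORG, email=EMAIL
)
text = get_filing(cik, acc_num, ORG, EMAIL)

# Or do both steps in one call
text = get_form_by_ticker("mmm", "10-K", company=ORG, email=EMAIL)
```

As with the curl examples above, the returned text can then be posted to the `/section` API or parsed directly with `SECDocument.from_string`.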
194 | 195 | ## Docker 196 | 197 | It is not necessary to run Docker in a local development environment; however, a Dockerfile and 198 | make targets of `docker-build`, `docker-start-api`, and `docker-start-jupyter` are provided for convenience. 199 | 200 | You can also launch a Jupyter instance to try out the notebooks with [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Unstructured-IO/pipeline-sec-filings/HEAD). 201 | 202 | ## Security Policy 203 | 204 | See our [security policy](https://github.com/Unstructured-IO/pipeline-sec-filings/security/policy) for 205 | information on how to report security vulnerabilities. 206 | 207 | ## Learn more 208 | 209 | | Section | Description | 210 | |-|-| 211 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | 212 | | [EDGAR API](https://www.sec.gov/edgar/searchedgar/companysearch.html) | Documentation for the SEC | 213 | | [10-K Filings](https://www.sec.gov/files/form10-k.pdf) | Detailed documentation on 10-K filings | 214 | | [10-Q Filings](https://www.sec.gov/files/form10-q.pdf) | Detailed documentation on 10-Q filings | 215 | | [S-1 Filings](https://www.sec.gov/files/forms-1.pdf) | Detailed documentation on S-1 filings | 216 | -------------------------------------------------------------------------------- /test_sec_filings/sec_filings/test_section_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import csv 4 | from io import StringIO 5 | 6 | from fastapi.testclient import TestClient 7 | 8 | from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path 9 | 10 | from prepline_sec_filings.api.app import app as core_app 11 | from prepline_sec_filings.api.section import app 12 | 13 | SECTION_ROUTE = get_pipeline_path("section") 14 | 15 | 16 | def generate_sample_document(form_type): 17 | is_s1 = form_type == "S-1" 18 | return f""" 19 | {form_type} 20 | Proctor & Gamble 21 | 22 |

SECURITIES AND EXCHANGE COMMISSION FILING

23 |

ITEM 1. BUSINESS

24 |

This is a section and great and wonderful business dealings.

25 |

{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS

26 |

Wolverines

27 |

The business could be attacked by wolverines.

28 |

Bears

29 |

The business could be attacked by bears.

30 |

{'ITEM 1B. ' if not is_s1 else ''}UNRESOLVED STAFF COMMENTS

31 |

None

32 |

PROSPECTUS SUMMARY

33 |

Here is a summary of the prospectus

34 | 35 |
""" 36 | 37 | 38 | @pytest.mark.parametrize( 39 | "form_type, section", 40 | [ 41 | ("10-K", "RISK_FACTORS"), 42 | ("10-Q", "RISK_FACTORS"), 43 | ("S-1", "RISK_FACTORS"), 44 | ("10-K", "_ALL"), 45 | ("10-Q", "_ALL"), 46 | ("S-1", "_ALL"), 47 | ], 48 | ) 49 | def test_section_narrative_api(form_type, section, tmpdir): 50 | sample_document = generate_sample_document(form_type) 51 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 52 | with open(filename, "w") as f: 53 | f.write(sample_document) 54 | 55 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 56 | client = TestClient(app) 57 | response = client.post( 58 | SECTION_ROUTE, 59 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 60 | data={"section": [section]}, 61 | ) 62 | 63 | assert response.status_code == 200 64 | response_dict = response.json() 65 | 66 | assert response_dict["RISK_FACTORS"] == [ 67 | { 68 | "text": "The business could be attacked by wolverines.", 69 | "type": "NarrativeText", 70 | }, 71 | { 72 | "text": "The business could be attacked by bears.", 73 | "type": "NarrativeText", 74 | }, 75 | ] 76 | 77 | 78 | @pytest.mark.parametrize( 79 | "form_type, section", 80 | [ 81 | ("10-K", "RISK_FACTORS"), 82 | ("10-Q", "RISK_FACTORS"), 83 | ("S-1", "RISK_FACTORS"), 84 | ("10-K", "_ALL"), 85 | ("10-Q", "_ALL"), 86 | ("S-1", "_ALL"), 87 | ], 88 | ) 89 | def test_section_narrative_api_labelstudio(form_type, section, tmpdir): 90 | sample_document = generate_sample_document(form_type) 91 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 92 | with open(filename, "w") as f: 93 | f.write(sample_document) 94 | 95 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 96 | client = TestClient(app) 97 | response = client.post( 98 | SECTION_ROUTE, 99 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 100 | data={"output_schema": "labelstudio", "section": [section]}, 101 | ) 102 | 103 | assert response.status_code == 200 104 | response_dict = response.json() 105 | 106 | assert response_dict["RISK_FACTORS"][0] == { 107 | "data": { 108 | "text": "The business could be attacked by wolverines.", 109 | "ref_id": "bd91f9f2e43cf85a8ce9b7a19c2e63e5", 110 | } 111 | } 112 | 113 | assert response_dict["RISK_FACTORS"][1] == { 114 | "data": { 115 | "text": "The business could be attacked by bears.", 116 | "ref_id": "e731c6ec715fedfe8d07fe84a7e02efb", 117 | } 118 | } 119 | 120 | 121 | @pytest.mark.parametrize( 122 | "form_type, section", 123 | [ 124 | ("10-K", "RISK_FACTORS"), 125 | ("10-Q", "RISK_FACTORS"), 126 | ("S-1", "RISK_FACTORS"), 127 | ("10-K", "_ALL"), 128 | ("10-Q", "_ALL"), 129 | ("S-1", "_ALL"), 130 | ], 131 | ) 132 | def test_section_narrative_api_with_unsupported_response_schema(form_type, section, tmpdir): 133 | sample_document = generate_sample_document(form_type) 134 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 135 | with open(filename, "w") as f: 136 | f.write(sample_document) 137 | 138 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 139 | client = TestClient(app) 140 | 141 | # FIXME(nyoon): need to handle ValueError in a better way in unstructured-api-tools 142 | with pytest.raises(ValueError): 143 | response = client.post( 144 | SECTION_ROUTE, 145 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 146 | data={"output_schema": "unsupported", "section": [section]}, 147 | ) 148 | assert response.status_code == 406 149 | assert response.content == "Unsupported response schema unsupported.\n" 
150 | 151 | 152 | @pytest.mark.parametrize( 153 | "form_type", 154 | [ 155 | ("10-K"), 156 | ("10-Q"), 157 | ("S-1"), 158 | ], 159 | ) 160 | def test_section_narrative_api_with_custom_regex(form_type, tmpdir): 161 | sample_document = generate_sample_document(form_type) 162 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 163 | with open(filename, "w") as f: 164 | f.write(sample_document) 165 | 166 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 167 | client = TestClient(app) 168 | response = client.post( 169 | SECTION_ROUTE, 170 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 171 | data={"section_regex": ["risk factors"]}, 172 | ) 173 | 174 | assert response.status_code == 200 175 | response_dict = response.json() 176 | 177 | assert response_dict["REGEX_0"] == [ 178 | { 179 | "text": "The business could be attacked by wolverines.", 180 | "type": "NarrativeText", 181 | }, 182 | { 183 | "text": "The business could be attacked by bears.", 184 | "type": "NarrativeText", 185 | }, 186 | ] 187 | 188 | 189 | @pytest.mark.parametrize( 190 | "form_type", 191 | [ 192 | ("10-K"), 193 | ("10-Q"), 194 | ("S-1"), 195 | ], 196 | ) 197 | def test_section_narrative_api_with_custom_regex_with_special_chars(form_type, tmpdir): 198 | sample_document = generate_sample_document(form_type) 199 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 200 | with open(filename, "w") as f: 201 | f.write(sample_document) 202 | 203 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 204 | client = TestClient(app) 205 | response = client.post( 206 | SECTION_ROUTE, 207 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 208 | data={"section_regex": ["^(?:prospectus )?summary$"]}, 209 | ) 210 | 211 | assert response.status_code == 200 212 | response_dict = response.json() 213 | 214 | assert response_dict["REGEX_0"] == [ 215 | { 216 | "text": "Here is a summary of the prospectus", 217 | "type": "NarrativeText", 218 | }, 219 | ] 220 | 221 | 222 | @pytest.mark.parametrize( 223 | "form_types, section", 224 | [ 225 | (["10-K", "10-Q"], "RISK_FACTORS"), 226 | (["10-K", "10-Q"], "_ALL"), 227 | ], 228 | ) 229 | def test_section_narrative_api_with_multiple_uploads(form_types, section, tmpdir): 230 | filenames = [] 231 | for idx, form_type in enumerate(form_types): 232 | sample_document = generate_sample_document(form_type) 233 | filename = os.path.join(tmpdir.dirname, f"wilderness_{idx}.xbrl") 234 | with open(filename, "w") as f: 235 | f.write(sample_document) 236 | filenames.append(filename) 237 | 238 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 239 | client = TestClient(app) 240 | files = [ 241 | ("text_files", (filename, open(filename, "rb"), "text/plain")) for filename in filenames 242 | ] 243 | response = client.post( 244 | SECTION_ROUTE, 245 | files=files, 246 | headers={ 247 | "Accept": "multipart/mixed", 248 | }, 249 | data={"section": [section]}, 250 | ) 251 | 252 | assert response.status_code == 200 253 | 254 | if len(filenames) > 1: 255 | assert "multipart/mixed" in response.headers["content-type"] 256 | else: 257 | response_dict = response.json() 258 | 259 | assert response_dict["RISK_FACTORS"] == [ 260 | { 261 | "text": "The business could be attacked by wolverines.", 262 | "type": "NarrativeText", 263 | }, 264 | { 265 | "text": "The business could be attacked by bears.", 266 | "type": "NarrativeText", 267 | }, 268 | ] 269 | 270 | 271 | @pytest.mark.parametrize( 272 | "form_types, section, accept_header, 
response_status", 273 | [ 274 | (["10-K", "10-Q"], "RISK_FACTORS", "multipart/mixed", 200), 275 | (["10-K", "10-Q"], "_ALL", "application/json", 200), 276 | ( 277 | ["10-K", "10-Q"], 278 | "_ALL", 279 | "text/csv", # Accept header must be multipart/mixed or application/json 280 | 406, 281 | ), 282 | ([], "_ALL", "application/json", 400), 283 | ], 284 | ) 285 | def test_section_narrative_api_with_headers( 286 | form_types, section, accept_header, response_status, tmpdir 287 | ): 288 | filenames = [] 289 | for idx, form_type in enumerate(form_types): 290 | sample_document = generate_sample_document(form_type) 291 | filename = os.path.join(tmpdir.dirname, f"wilderness_{idx}.xbrl") 292 | with open(filename, "w") as f: 293 | f.write(sample_document) 294 | filenames.append(filename) 295 | 296 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 297 | client = TestClient(app) 298 | files = [ 299 | ("text_files", (filename, open(filename, "rb"), "text/plain")) for filename in filenames 300 | ] 301 | response = client.post( 302 | SECTION_ROUTE, 303 | files=files, 304 | headers={ 305 | "Accept": accept_header, 306 | }, 307 | data={"section": [section]}, 308 | ) 309 | 310 | assert response.status_code == response_status 311 | 312 | 313 | @pytest.mark.parametrize( 314 | "form_type, response_type, section", 315 | [ 316 | ("10-K", "text/csv", "RISK_FACTORS"), 317 | ("10-Q", "text/csv", "RISK_FACTORS"), 318 | ("S-1", "text/csv", "RISK_FACTORS"), 319 | ("10-K", "text/csv", "_ALL"), 320 | ("10-Q", "text/csv", "_ALL"), 321 | ("S-1", "text/csv", "_ALL"), 322 | ], 323 | ) 324 | def test_section_narrative_api_csv_response(form_type, response_type, section, tmpdir): 325 | sample_document = generate_sample_document(form_type) 326 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 327 | with open(filename, "w") as f: 328 | f.write(sample_document) 329 | 330 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 331 | client = TestClient(app) 332 | response = client.post( 333 | SECTION_ROUTE, 334 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 335 | data={"output_format": response_type, "section": [section]}, 336 | ) 337 | assert response.status_code == 200 338 | 339 | response_csv = csv.DictReader(StringIO(response.json()), delimiter=",") 340 | response_list = list(response_csv) 341 | 342 | assert [x["section"] for x in response_list] 343 | assert [x["element_type"] for x in response_list] 344 | assert [x["text"] for x in response_list] 345 | 346 | 347 | @pytest.mark.parametrize( 348 | "form_type, response_type, section", 349 | [ 350 | ("10-K", "text/csv", "RISK_FACTORS"), 351 | ("10-Q", "text/csv", "RISK_FACTORS"), 352 | ("S-1", "text/csv", "RISK_FACTORS"), 353 | ("10-K", "text/csv", "_ALL"), 354 | ("10-Q", "text/csv", "_ALL"), 355 | ("S-1", "text/csv", "_ALL"), 356 | ], 357 | ) 358 | def test_section_narrative_api_csv_response_with_unsupported_response_schema( 359 | form_type, response_type, section, tmpdir 360 | ): 361 | sample_document = generate_sample_document(form_type) 362 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 363 | with open(filename, "w") as f: 364 | f.write(sample_document) 365 | 366 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 367 | client = TestClient(app) 368 | 369 | # FIXME(nyoon): need to handle ValueError in a better way in unstructured-api-tools 370 | with pytest.raises(ValueError): 371 | response = client.post( 372 | SECTION_ROUTE, 373 | files=[("text_files", (filename, open(filename, "rb"), 
"text/plain"))], 374 | data={ 375 | "output_format": response_type, 376 | "output_schema": "unsupported", 377 | "section": [section], 378 | }, 379 | ) 380 | assert response.status_code == 406 381 | assert response.content == "Unsupported response schema unsupported.\n" 382 | 383 | 384 | def test_core_app_health_check(): 385 | # NOTE(crag): switch all tests to core_app when rate limiting is removed 386 | client = TestClient(core_app) 387 | response = client.get("/healthcheck") 388 | 389 | assert response.status_code == 200 390 | -------------------------------------------------------------------------------- /prepline_sec_filings/api/section.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping, Iterator, Tuple 19 | import secrets 20 | from prepline_sec_filings.sections import section_string_to_enum, validate_section_names, SECSection 21 | from prepline_sec_filings.sec_document import SECDocument, REPORT_TYPES, VALID_FILING_TYPES 22 | from enum import Enum 23 | import re 24 | import signal 25 | from unstructured.staging.base import convert_to_isd 26 | from prepline_sec_filings.sections import ( 27 | ALL_SECTIONS, 28 | SECTIONS_10K, 29 | SECTIONS_10Q, 30 | SECTIONS_S1, 31 | ) 32 | import csv 33 | from typing import Dict 34 | from unstructured.documents.elements import Text, NarrativeText, Title, ListItem 35 | from unstructured.staging.label_studio import stage_for_label_studio 36 | 37 | 38 | app = FastAPI() 39 | router = APIRouter() 40 | 41 | 42 | def is_expected_response_type(media_type, response_type): 43 | if media_type == "application/json" and response_type not in [dict, list]: 44 | return True 45 | elif media_type == "text/csv" and response_type != str: 46 | return True 47 | else: 48 | return False 49 | 50 | 51 | # pipeline-api 52 | 53 | 54 | class timeout: 55 | def __init__(self, seconds=1, error_message="Timeout"): 56 | self.seconds = seconds 57 | self.error_message = error_message 58 | 59 | def handle_timeout(self, signum, frame): 60 | raise TimeoutError(self.error_message) 61 | 62 | def __enter__(self): 63 | try: 64 | signal.signal(signal.SIGALRM, self.handle_timeout) 65 | signal.alarm(self.seconds) 66 | except ValueError: 67 | pass 68 | 69 | def __exit__(self, type, value, traceback): 70 | try: 71 | signal.alarm(0) 72 | except ValueError: 73 | pass 74 | 75 | 76 | def get_regex_enum(section_regex): 77 | class CustomSECSection(Enum): 78 | CUSTOM = re.compile(section_regex) 79 | 80 | @property 81 | def pattern(self): 82 | return self.value 83 | 84 | return CustomSECSection.CUSTOM 85 | 86 | 87 | def convert_to_isd_csv(results: dict) -> str: 88 | """ 89 | Returns the representation of document elements as an Initial Structured Document (ISD) 90 | in CSV Format. 
91 | """ 92 | csv_fieldnames: List[str] = ["section", "element_type", "text"] 93 | new_rows = [] 94 | for section, section_narrative in results.items(): 95 | rows: List[Dict[str, str]] = convert_to_isd(section_narrative) 96 | for row in rows: 97 | new_row_item = dict() 98 | new_row_item["section"] = section 99 | new_row_item["element_type"] = row["type"] 100 | new_row_item["text"] = row["text"] 101 | new_rows.append(new_row_item) 102 | 103 | with io.StringIO() as buffer: 104 | csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames) 105 | csv_writer.writeheader() 106 | csv_writer.writerows(new_rows) 107 | return buffer.getvalue() 108 | 109 | 110 | # List of valid response schemas 111 | LABELSTUDIO = "labelstudio" 112 | ISD = "isd" 113 | 114 | 115 | def pipeline_api( 116 | text, response_type="application/json", response_schema="isd", m_section=[], m_section_regex=[] 117 | ): 118 | """Many supported sections including: RISK_FACTORS, MANAGEMENT_DISCUSSION, and many more""" 119 | validate_section_names(m_section) 120 | 121 | sec_document = SECDocument.from_string(text) 122 | if sec_document.filing_type not in VALID_FILING_TYPES: 123 | raise ValueError( 124 | f"SEC document filing type {sec_document.filing_type} is not supported, " 125 | f"must be one of {','.join(VALID_FILING_TYPES)}" 126 | ) 127 | results = {} 128 | if m_section == [ALL_SECTIONS]: 129 | filing_type = sec_document.filing_type 130 | if filing_type in REPORT_TYPES: 131 | if filing_type.startswith("10-K"): 132 | m_section = [enum.name for enum in SECTIONS_10K] 133 | elif filing_type.startswith("10-Q"): 134 | m_section = [enum.name for enum in SECTIONS_10Q] 135 | else: 136 | raise ValueError(f"Invalid report type: {filing_type}") 137 | 138 | else: 139 | m_section = [enum.name for enum in SECTIONS_S1] 140 | for section in m_section: 141 | results[section] = sec_document.get_section_narrative(section_string_to_enum[section]) 142 | for i, section_regex in enumerate(m_section_regex): 143 | regex_enum = get_regex_enum(section_regex) 144 | with timeout(seconds=5): 145 | section_elements = sec_document.get_section_narrative(regex_enum) 146 | results[f"REGEX_{i}"] = section_elements 147 | if response_type == "application/json": 148 | if response_schema == LABELSTUDIO: 149 | return { 150 | section: stage_for_label_studio(section_narrative) 151 | for section, section_narrative in results.items() 152 | } 153 | elif response_schema == ISD: 154 | return { 155 | section: convert_to_isd(section_narrative) 156 | for section, section_narrative in results.items() 157 | } 158 | else: 159 | raise ValueError( 160 | f"output_schema '{response_schema}' is not supported for {response_type}" 161 | ) 162 | elif response_type == "text/csv": 163 | if response_schema != ISD: 164 | raise ValueError( 165 | f"output_schema '{response_schema}' is not supported for {response_type}" 166 | ) 167 | return convert_to_isd_csv(results) 168 | else: 169 | raise ValueError(f"response_type '{response_type}' is not supported") 170 | 171 | 172 | def get_validated_mimetype(file): 173 | """ 174 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 175 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 176 | return HTTP 400 for an invalid type. 
177 | """ 178 | content_type = file.content_type 179 | if not content_type or content_type == "application/octet-stream": 180 | content_type = mimetypes.guess_type(str(file.filename))[0] 181 | 182 | # Some filetypes missing for this library, just hardcode them for now 183 | if not content_type: 184 | if file.filename.endswith(".md"): 185 | content_type = "text/markdown" 186 | elif file.filename.endswith(".msg"): 187 | content_type = "message/rfc822" 188 | 189 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 190 | if allowed_mimetypes_str is not None: 191 | allowed_mimetypes = allowed_mimetypes_str.split(",") 192 | 193 | if content_type not in allowed_mimetypes: 194 | raise HTTPException( 195 | status_code=400, 196 | detail=( 197 | f"Unable to process {file.filename}: " 198 | f"File type {content_type} is not supported." 199 | ), 200 | ) 201 | 202 | return content_type 203 | 204 | 205 | class MultipartMixedResponse(StreamingResponse): 206 | CRLF = b"\r\n" 207 | 208 | def __init__(self, *args, content_type: str = None, **kwargs): 209 | super().__init__(*args, **kwargs) 210 | self.content_type = content_type 211 | 212 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 213 | super().init_headers(headers) 214 | self.boundary_value = secrets.token_hex(16) 215 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 216 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 217 | 218 | @property 219 | def boundary(self): 220 | return b"--" + self.boundary_value.encode() 221 | 222 | def _build_part_headers(self, headers: dict) -> bytes: 223 | header_bytes = b"" 224 | for header, value in headers.items(): 225 | header_bytes += f"{header}: {value}".encode() + self.CRLF 226 | return header_bytes 227 | 228 | def build_part(self, chunk: bytes) -> bytes: 229 | part = self.boundary + self.CRLF 230 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 231 | if self.content_type is not None: 232 | part_headers["Content-Type"] = self.content_type 233 | part += self._build_part_headers(part_headers) 234 | part += self.CRLF + chunk + self.CRLF 235 | return part 236 | 237 | async def stream_response(self, send: Send) -> None: 238 | await send( 239 | { 240 | "type": "http.response.start", 241 | "status": self.status_code, 242 | "headers": self.raw_headers, 243 | } 244 | ) 245 | async for chunk in self.body_iterator: 246 | if not isinstance(chunk, bytes): 247 | chunk = chunk.encode(self.charset) 248 | chunk = b64encode(chunk) 249 | await send( 250 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 251 | ) 252 | 253 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 254 | 255 | 256 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 257 | def return_content_type(filename): 258 | if gz_uncompressed_content_type: 259 | return gz_uncompressed_content_type 260 | else: 261 | return str(mimetypes.guess_type(filename)[0]) 262 | 263 | filename = str(file.filename) if file.filename else "" 264 | if filename.endswith(".gz"): 265 | filename = filename[:-3] 266 | 267 | gzip_file = gzip.open(file.file).read() 268 | return UploadFile( 269 | file=io.BytesIO(gzip_file), 270 | size=len(gzip_file), 271 | filename=filename, 272 | headers=Headers({"content-type": return_content_type(filename)}), 273 | ) 274 | 275 | 276 | @router.post("/sec-filings/v0/section") 277 | @router.post("/sec-filings/v0.2.1/section") 278 | def 
pipeline_1( 279 | request: Request, 280 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 281 | text_files: Union[List[UploadFile], None] = File(default=None), 282 | output_format: Union[str, None] = Form(default=None), 283 | output_schema: str = Form(default=None), 284 | section: List[str] = Form(default=[]), 285 | section_regex: List[str] = Form(default=[]), 286 | ): 287 | if text_files: 288 | for file_index in range(len(text_files)): 289 | if text_files[file_index].content_type == "application/gzip": 290 | text_files[file_index] = ungz_file(text_files[file_index]) 291 | 292 | content_type = request.headers.get("Accept") 293 | 294 | default_response_type = output_format or "application/json" 295 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 296 | media_type = default_response_type 297 | else: 298 | media_type = content_type 299 | 300 | default_response_schema = output_schema or "isd" 301 | 302 | if isinstance(text_files, list) and len(text_files): 303 | if len(text_files) > 1: 304 | if content_type and content_type not in ["*/*", "multipart/mixed", "application/json"]: 305 | raise HTTPException( 306 | detail=( 307 | f"Conflict in media type {content_type}" 308 | ' with response type "multipart/mixed".\n' 309 | ), 310 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 311 | ) 312 | 313 | def response_generator(is_multipart): 314 | for file in text_files: 315 | get_validated_mimetype(file) 316 | 317 | text = file.file.read().decode("utf-8") 318 | 319 | response = pipeline_api( 320 | text, 321 | m_section=section, 322 | m_section_regex=section_regex, 323 | response_type=media_type, 324 | response_schema=default_response_schema, 325 | ) 326 | 327 | if is_expected_response_type(media_type, type(response)): 328 | raise HTTPException( 329 | detail=( 330 | f"Conflict in media type {media_type}" 331 | f" with response type {type(response)}.\n" 332 | ), 333 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 334 | ) 335 | 336 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 337 | if media_type in valid_response_types: 338 | if is_multipart: 339 | if type(response) not in [str, bytes]: 340 | response = json.dumps(response) 341 | yield response 342 | else: 343 | raise HTTPException( 344 | detail=f"Unsupported media type {media_type}.\n", 345 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 346 | ) 347 | 348 | if content_type == "multipart/mixed": 349 | return MultipartMixedResponse( 350 | response_generator(is_multipart=True), content_type=media_type 351 | ) 352 | else: 353 | return ( 354 | list(response_generator(is_multipart=False))[0] 355 | if len(text_files) == 1 356 | else response_generator(is_multipart=False) 357 | ) 358 | else: 359 | raise HTTPException( 360 | detail='Request parameter "text_files" is required.\n', 361 | status_code=status.HTTP_400_BAD_REQUEST, 362 | ) 363 | 364 | 365 | app.include_router(router) 366 | -------------------------------------------------------------------------------- /test_sec_filings/test_sec_document.py: -------------------------------------------------------------------------------- 1 | from itertools import product, combinations 2 | import pytest 3 | 4 | from unstructured.documents.base import NarrativeText 5 | from unstructured.documents.elements import Title 6 | 7 | from prepline_sec_filings.sec_document import ( 8 | SECDocument, 9 | first, 10 | get_element_by_title, 11 | is_item_title, 12 | is_risk_title, 13 | _raise_for_invalid_filing_type, 14 | is_toc_title, 15 | 
match_s1_toc_title_to_section, 16 | match_10k_toc_title_to_section, 17 | remove_item_from_section_text, 18 | get_narrative_texts, 19 | ) 20 | from prepline_sec_filings.sections import SECSection, ALL_SECTIONS, validate_section_names 21 | 22 | 23 | @pytest.fixture 24 | def table_toc(form_type): 25 | is_s1 = form_type == "S-1" 26 | return f"""
27 | <div>{'Part I. OTHER INFORMATION' if not is_s1 else 'None'}</div>
28 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
29 | <table>
30 | <tr><td>TABLE OF CONTENTS</td></tr>
31 | <tr><td>{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS</td><td>1</td></tr>
32 | <tr><td>{'ITEM 1B. ' if not is_s1 else ''}UNRESOLVED STAFF COMMENTS</td><td>1</td></tr>
33 | <tr><td>{'ITEM 2 ' if not is_s1 else ''}DIVIDEND POLICY</td><td>1</td></tr>
34 | <tr><td>{'ITEM 3 ' if not is_s1 else ''}CAPITALIZATION</td><td>1</td></tr>
35 | <tr><td>{'ITEM 4 ' if not is_s1 else ''}DILUTION</td><td>1</td></tr>
36 | <tr><td>{'ITEM 5 ' if not is_s1 else ''}WOLVERINES AND BEARS</td><td>1</td></tr>
37 | <tr><td>{'ITEM 6 ' if not is_s1 else ''}PROPERTIES</td><td>1</td></tr>
38 | </table>""" 39 | 40 | 41 | @pytest.fixture 42 | def sample_document(form_type, table_toc, use_toc): 43 | is_s1 = form_type == "S-1" 44 | return f""" 45 | {form_type} 46 | Proctor & Gamble 47 | 48 | {table_toc if use_toc else ''}
49 | <div>SECURITY AND EXCHANGE COMISSION FILING</div>
50 | <div>{'Part I.' if not is_s1 else 'None'}</div>
51 | <div>{'OTHER INFORMATION' if not is_s1 else 'None'}</div>
52 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
53 | <div>This is a section on prospectus.</div>
54 | <div>{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS</div>
55 | <div>Wolverines</div>
56 | <div>The business could be attacked by wolverines.</div>
57 | <div>Bears</div>
58 | <div>The business could be attacked by bears.</div>
59 | <div>{'ITEM 1B. ' if not is_s1 else ''}UNRESOLVED STAFF COMMENTS</div>
60 | <div>None</div>
61 | <div>{'ITEM 2 ' if not is_s1 else ''}DIVIDEND POLICY</div>
62 | <div>Dispersing Dividends</div>
63 | <div>Sometimes we disperse dividends, and everyone gets money.</div>
64 | <div>Uh Oh</div>
65 | <div>Sometimes we don't disperse dividends, and nobody gets money.</div>
66 | <div>{'ITEM 3 ' if not is_s1 else ''}CAPITALIZATION</div>
67 | <div>None</div>
68 | <div>{'ITEM 4 ' if not is_s1 else ''}DILUTION</div>
69 | <div>None</div>
70 | <div>{'ITEM 5 ' if not is_s1 else ''}WOLVERINES AND BEARS</div>
71 | <div>Just to reiterate, our business could be the victim of a wolverine attack.</div>
72 | <div>Also bears attack us literally twice a week.</div>
73 | <div>{'ITEM 6 ' if not is_s1 else ''}PROPERTIES</div>
74 | <div>One building in the middle of the woods.</div>
75 | <div>Why did we build it here?</div>
76 | <div>We really should not have done this.</div>
77 | <div>It was Steve's idea.</div>
78 | 79 | """ 80 | 81 | 82 | @pytest.fixture 83 | def sample_document_with_last_sections(form_type, has_form_summary_section, has_exhibits_section): 84 | is_s1 = form_type == "S-1" 85 | show_exhibit = (not is_s1) and has_exhibits_section 86 | show_form_summary = (not is_s1) and has_form_summary_section 87 | 88 | table_toc = f"""
89 | <div>{'Part I. OTHER INFORMATION' if not is_s1 else 'None'}</div>
90 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
91 | <table>
92 | <tr><td>TABLE OF CONTENTS</td></tr>
93 | {'<tr><td>ITEM 7 EXHIBIT</td><td>1</td></tr>' if show_exhibit else ''}
94 | {'<tr><td>ITEM 8 FORM 10-K SUMMARY</td><td>1</td></tr>' if show_form_summary else ''}
95 | </table>""" 96 | 97 | return f""" 98 | {form_type} 99 | Proctor & Gamble 100 | 101 | {table_toc}
102 | <div>SECURITY AND EXCHANGE COMISSION FILING</div>
103 | <div>{'Part I.' if not is_s1 else 'None'}</div>
104 | <div>{'OTHER INFORMATION' if not is_s1 else 'None'}</div>
105 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
106 | <div>This is a section on prospectus.</div>
107 | {'<div>ITEM 7 EXHIBIT</div>' if show_exhibit else ''}
108 | {'<div>This is a section on exhibit.</div>' if show_exhibit else ''}
109 | {'<div>ITEM 8 FORM 10-K SUMMARY</div>' if show_form_summary else ''}
110 | {'<div>This is a section on form summary.</div>' if show_form_summary else ''}
111 | 112 |
""" 113 | 114 | 115 | class MockElement: 116 | def __init__(self, text): 117 | self.text = text 118 | 119 | 120 | @pytest.fixture 121 | def elements(): 122 | texts = ["Risk Factors:", "ITEM 1a. risk factors", "ITEM 3. Cats", "Summary"] 123 | return [MockElement(text) for text in texts] 124 | 125 | 126 | @pytest.mark.parametrize( 127 | "section_name, form_type, use_toc", 128 | product( 129 | [SECSection.DIVIDEND_POLICY], 130 | ["10-Q", "10-K", "S-1"], 131 | [True, False], 132 | ), 133 | ) 134 | def test_get_dividend_narrative(section_name, sample_document): 135 | sec_document = SECDocument.from_string(sample_document) 136 | sections = sec_document.get_section_narrative(section_name) 137 | assert sections == [ 138 | NarrativeText(text="Sometimes we disperse dividends, and everyone gets money."), 139 | NarrativeText(text="Sometimes we don't disperse dividends, and nobody gets money."), 140 | ] 141 | 142 | 143 | @pytest.mark.parametrize("form_type, use_toc", product(("10-Q", "10-K", "S-1"), (True, False))) 144 | def test_get_risk_narrative(sample_document): 145 | sec_document = SECDocument.from_string(sample_document) 146 | risk_sections = sec_document.get_risk_narrative() 147 | assert risk_sections == [ 148 | NarrativeText(text="The business could be attacked by wolverines."), 149 | NarrativeText(text="The business could be attacked by bears."), 150 | ] 151 | 152 | 153 | @pytest.mark.parametrize("form_type, use_toc", product(("10-Q", "10-K", "S-1"), (True, False))) 154 | def test_get_table_of_contents(sample_document, form_type, use_toc): 155 | is_s1 = form_type == "S-1" 156 | sec_document = SECDocument.from_string(sample_document) 157 | toc_elements = sec_document.get_table_of_contents().elements 158 | if use_toc: 159 | assert Title(text=f"{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS") in toc_elements 160 | else: 161 | assert toc_elements == [] 162 | 163 | 164 | def test_get_10k_table_of_contents_processes_empty_doc(): 165 | sec_document = SECDocument.from_string("10-K") 166 | risk_sections = sec_document.get_table_of_contents().elements 167 | assert risk_sections == list() 168 | 169 | 170 | def test_get_risk_narrative_raises_with_wrong_type(): 171 | sec_document = SECDocument.from_string("999-ZZZ") 172 | with pytest.raises(ValueError): 173 | sec_document.get_risk_narrative() 174 | 175 | 176 | @pytest.mark.parametrize("form_type, use_toc", product(["10-K", "10-Q", "S-1"], [True])) 177 | def test__get_toc_sections(sample_document, form_type): 178 | is_s1 = form_type == "S-1" 179 | sec_document = SECDocument.from_string(sample_document) 180 | toc = sec_document.get_table_of_contents() 181 | # finds the section titles 182 | section_toc, next_section_toc = sec_document._get_toc_sections( 183 | SECSection.PROSPECTUS_SUMMARY, toc 184 | ) 185 | assert ( 186 | section_toc.text == f"{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY" 187 | and next_section_toc.text == f"{'ITEM 1A. 
' if not is_s1 else ''}RISK FACTORS" 188 | ) 189 | # fails to find the section_toc because it's not in the document 190 | section_toc, next_section_toc = sec_document._get_toc_sections(SECSection.EXHIBITS, toc) 191 | assert (section_toc, next_section_toc) == (None, None) 192 | assert sec_document.get_section_narrative(SECSection.EXHIBITS) == [] 193 | 194 | 195 | @pytest.mark.parametrize( 196 | "form_type, has_form_summary_section, has_exhibits_section, expected_last_section", 197 | [ 198 | ("10-K", True, False, SECSection.FORM_SUMMARY), 199 | ("10-K", False, True, SECSection.EXHIBITS), 200 | ("10-K", True, True, SECSection.FORM_SUMMARY), 201 | ("10-Q", False, True, SECSection.EXHIBITS), 202 | ], 203 | ) 204 | def test__is_last_section_in_report(sample_document_with_last_sections, expected_last_section): 205 | sec_document = SECDocument.from_string(sample_document_with_last_sections) 206 | toc = sec_document.get_table_of_contents() 207 | assert sec_document._is_last_section_in_report(expected_last_section, toc) 208 | assert len(sec_document.get_section_narrative(expected_last_section)) == 1 209 | 210 | 211 | @pytest.mark.parametrize( 212 | "section", [SECSection.RISK_FACTORS, SECSection.CAPITALIZATION, SECSection.DIVIDEND_POLICY] 213 | ) 214 | def test_get_10k_section_narrative_processes_empty_doc(section): 215 | sec_document = SECDocument.from_string("10-K") 216 | sections = sec_document.get_section_narrative(section) 217 | assert sections == list() 218 | 219 | 220 | @pytest.mark.parametrize("form_type, use_toc", product(["10-K", "10-Q", "S-1"], [False])) 221 | def test_get_filing_type(sample_document, form_type): 222 | sec_document = SECDocument.from_string(sample_document) 223 | assert sec_document.filing_type == form_type 224 | 225 | 226 | def test_get_filing_type_is_none_when_missing(): 227 | sec_document = SECDocument.from_string("") 228 | assert sec_document.filing_type is None 229 | 230 | 231 | def test_get_narrative_texts_up_to_next_title(): 232 | document_starts_with_narrative_text = """ 233 | 234 | 10-K 235 | Proctor & Gamble 236 | 237 |

<div>this is a narrative text.</div>
238 | <div>'NEXT TITLE'</div>
239 | 240 |
""" 241 | sec_document = SECDocument.from_string(document_starts_with_narrative_text) 242 | narrative_texts_up_to_next_title = get_narrative_texts(sec_document, up_to_next_title=True) 243 | assert narrative_texts_up_to_next_title == [NarrativeText(text="this is a narrative text.")] 244 | 245 | 246 | @pytest.mark.parametrize( 247 | "title, expected", 248 | [ 249 | ("ITEM 1A.", True), 250 | ("item 1a.", True), 251 | ("Item 1.", True), 252 | ("Item 3:", True), 253 | ("Item 3(a):", True), 254 | ("Item 3(a): ", True), 255 | ( 256 | "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND " 257 | "ISSUER PURCHASES OF EQUITY SECURITIES", 258 | True, 259 | ), 260 | ("Item 12A.", True), 261 | ("This is a paragraph about an item", False), 262 | ("RISK FACTORS", False), 263 | ("Risk Factors", False), 264 | ], 265 | ) 266 | def test_is_10k_item_title(title, expected): 267 | assert is_item_title(title, "10-K") == expected 268 | 269 | 270 | @pytest.mark.parametrize( 271 | "title, expected", 272 | [ 273 | ("ITEM 1A.", True), 274 | ("item 1a.", True), 275 | ("Item 1.", False), 276 | ("Item 12A.", False), 277 | ("This is a paragraph about an item", False), 278 | ("RISK FACTORS", True), 279 | ("Risk Factors", True), 280 | ("DISCLOSURES", False), 281 | ("Disclosures", False), 282 | ("SUMMARY OF RISK FACTORS", False), 283 | ], 284 | ) 285 | def test_is_10_k_risk_title(title, expected): 286 | assert is_risk_title(title, "10-K") == expected 287 | 288 | 289 | @pytest.mark.parametrize( 290 | "title, expected", 291 | [ 292 | ("RISK FACTORS", True), 293 | ("SPECIAL NOTE", True), 294 | ("Risk Factors Summary", False), 295 | ], 296 | ) 297 | def test_is_s1_item_title(title, expected): 298 | assert is_item_title(title, "S-1") == expected 299 | 300 | 301 | @pytest.mark.parametrize( 302 | "title, expected", 303 | [ 304 | ("RISK FACTORS", True), 305 | ("SPECIAL NOTE", False), 306 | ("Risk Factors Summary", False), 307 | ], 308 | ) 309 | def test_is_s1_risk_title(title, expected): 310 | assert is_risk_title(title, "S-1") == expected 311 | 312 | 313 | @pytest.mark.parametrize( 314 | "text, title, expected", 315 | [ 316 | ("risk factors", "risk factors", True), 317 | ("risk factors", "something else", False), 318 | ("summary of risk factors", "risk factors", False), 319 | ], 320 | ) 321 | def test_match_s1_toc_title_to_section(text, title, expected): 322 | assert match_s1_toc_title_to_section(text, title) == expected 323 | 324 | 325 | @pytest.mark.parametrize( 326 | "text, title, expected", 327 | [ 328 | ("risk factors", "risk factors", True), 329 | ("summary of risk factors", "risk factors", False), 330 | ("item 1a. risk factors", "item 1a", True), 331 | ("item 1a.", "item 1a", True), 332 | ("item 1a. risk factors", "risk factors", True), 333 | ("item 1a. summary of risk factors", "risk factors", False), 334 | ("item 1a. summary of risk factors", "something else", False), 335 | ], 336 | ) 337 | def test_match_10k_toc_title_to_section(text, title, expected): 338 | assert match_10k_toc_title_to_section(text, title) == expected 339 | 340 | 341 | @pytest.mark.parametrize( 342 | "text, expected", 343 | [("Item 1a. 
Risk Factors", "Risk Factors"), ("Risk Factors", "Risk Factors")], 344 | ) 345 | def test_remove_item_from_section_text(text, expected): 346 | assert remove_item_from_section_text(text) == expected 347 | 348 | 349 | @pytest.mark.parametrize( 350 | "title, expected", 351 | [("Table of contents", True), ("Risk Factors", False), ("Index", True)], 352 | ) 353 | def test_is_toc_title(title, expected): 354 | assert is_toc_title(title) == expected 355 | 356 | 357 | def test_invalid_item_title_returns_false(): 358 | assert is_item_title("TEST", "INVALID") is False 359 | 360 | 361 | def test_invalid_risk_title_returns_false(): 362 | assert is_risk_title("TEST", "INVALID") is False 363 | 364 | 365 | def test_empty_filing_type_raises(): 366 | with pytest.raises(ValueError): 367 | _raise_for_invalid_filing_type(None) 368 | 369 | 370 | @pytest.mark.parametrize("it, expected", [(["a"], "a"), (["b", "a"], "b"), ([], None)]) 371 | def test_first(it, expected): 372 | result = first(it) 373 | if result is None: 374 | assert expected is None 375 | else: 376 | assert result == expected 377 | 378 | 379 | @pytest.mark.parametrize( 380 | "title, filing_type, expected", 381 | [ 382 | ("risk factors", "S-1", "Risk Factors:"), 383 | ("item 1a", "10-Q", "ITEM 1a. risk factors"), 384 | ("cats", "10-Q", "ITEM 3. Cats"), 385 | ("cats", "S-1", None), 386 | ("summary", "S-1", "Summary"), 387 | ("another title", "10-K", None), 388 | ], 389 | ) 390 | def test_get_element_by_title(elements, title, filing_type, expected): 391 | result = get_element_by_title(elements, title, filing_type) 392 | if result is None: 393 | assert expected is None 394 | else: 395 | assert result.text == expected 396 | 397 | 398 | @pytest.mark.parametrize("form_type, use_toc", [("10-Q", True)]) 399 | def test_doc_after_cleaners_keeps_filing_type(form_type, sample_document): 400 | sec_document = SECDocument.from_string(sample_document).doc_after_cleaners() 401 | assert sec_document.filing_type == form_type 402 | 403 | 404 | @pytest.mark.parametrize( 405 | "section_names", 406 | [ 407 | [section.name for section in combo] 408 | for i in range(1, 3) 409 | for combo in combinations(SECSection, i) 410 | ] 411 | + [[ALL_SECTIONS]], 412 | ) 413 | def test_validate_section_names(section_names): 414 | assert validate_section_names(section_names) is None 415 | 416 | 417 | def test_validate_section_names_raises_for_nonsingleton_all(): 418 | with pytest.raises(ValueError): 419 | validate_section_names([ALL_SECTIONS, SECSection.ABOUT_PROSPECTUS]) 420 | 421 | 422 | def test_validate_section_names_raises_for_invalid_section(): 423 | with pytest.raises(ValueError): 424 | validate_section_names(["invalidsection"]) 425 | --------------------------------------------------------------------------------