├── prepline_sec_filings ├── __init__.py ├── api │ ├── __init__.py │ ├── app.py │ └── section.py ├── sections.py └── fetch.py ├── preprocessing-pipeline-family.yaml ├── setup.cfg ├── scripts ├── shellcheck.sh ├── docker-build.sh ├── test-doc-pipeline-apis-consistent.sh ├── check-and-format-notebooks.py └── version-sync.sh ├── img └── unstructured_logo.png ├── requirements ├── dev.in ├── base.in ├── test.in ├── test.txt ├── base.txt └── dev.txt ├── .github ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ └── ci.yml ├── test_real_docs ├── fixtures │ └── list-item-counts.json ├── generate_first_last.py └── test_real_examples.py ├── logger_config.yaml ├── test_utils ├── README-generating-validation-csvs.md ├── symbols-for-validation-csvs.txt ├── examples.json ├── get_sec_docs_from_edgar.py └── create_validation_csv_files.py ├── test_sec_filings_integration └── test_notebooks.py ├── Dockerfile ├── CHANGELOG.md ├── .gitignore ├── sample-docs └── sample-sec-docs.sha256 ├── Makefile ├── test_sec_filings ├── test_fetch.py ├── sec_filings │ └── test_section_api.py └── test_sec_document.py ├── exploration-notebooks ├── exploration-s1-risks.ipynb ├── exploration-TOC-action.ipynb └── exploration-10k-risks.ipynb ├── LICENSE.md └── README.md /prepline_sec_filings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prepline_sec_filings/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: sec-filings 2 | version: 0.2.1 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = 4 | prepline_sec_filings/api 5 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-sec-filings/HEAD/img/unstructured_logo.png -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | jupyter 4 | mypy 5 | pip-tools 6 | # NOTE(crag): consistency with unstructured-api-tools. pinned for a reason, see there. 
7 | ipython==8.8.0 8 | 9 | # NOTE(robinson) - Required pins for security scans 10 | jupyter-core>=4.11.2 11 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | unstructured==0.2.5 2 | unstructured_api_tools>=0.10.6 3 | 4 | ratelimit 5 | requests 6 | numpy 7 | scikit-learn 8 | 9 | # NOTE(robinson) - Required pins for security scans 10 | jupyter-core>=5.3.0 11 | 12 | # We need newer versions of these for deps 13 | traitlets>=5.6.0 14 | packaging>=22.0 15 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | DOCKER_BUILDKIT=1 docker buildx build --load --platform=linux/amd64 -f Dockerfile \ 6 | --build-arg PIP_VERSION="$PIP_VERSION" \ 7 | --build-arg PIPELINE_PACKAGE="$PIPELINE_PACKAGE" \ 8 | --progress plain \ 9 | -t pipeline-family-"$PIPELINE_FAMILY"-dev:latest . 10 | -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | black>=22.3.0 2 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black 3 | # can remove after black drops support for Python 3.6 4 | # ref: https://github.com/psf/black/issues/2964 5 | click>=8.1 6 | flake8 7 | httpx 8 | mypy 9 | pytest-cov 10 | nbdev 11 | ipykernel 12 | 13 | # NOTE(robinson) - Required pins for security scans 14 | jupyter-core>=4.11.2 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/requirements" 5 | schedule: 6 | interval: "monthly" 7 | 8 | - package-ecosystem: "github-actions" 9 | # NOTE(robinson) - Workflow files stored in the 10 | # default location of `.github/workflows` 11 | directory: "/" 12 | schedule: 13 | interval: "monthly" 14 | -------------------------------------------------------------------------------- /test_real_docs/fixtures/list-item-counts.json: -------------------------------------------------------------------------------- 1 | { 2 | "hlvx": 13, 3 | "blco": 13, 4 | "mrk": 0, 5 | "aust": 3, 6 | "ee": 5, 7 | "nke": 6, 8 | "pepg": 9, 9 | "msex": 0, 10 | "v": 6, 11 | "cvs": 42, 12 | "doc": 0, 13 | "smtc": 0, 14 | "cl": 7, 15 | "ava": 0, 16 | "bc": 4, 17 | "f": 0, 18 | "lmt": 0, 19 | "cri": 12, 20 | "asns": 4, 21 | "aig": 3, 22 | "rgld": 29, 23 | "apld": 9, 24 | "omcl": 0, 25 | "mmm": 1, 26 | "bgs": 3, 27 | "ehc": 11, 28 | "dis": 7, 29 | "wetg": 4, 30 | "bj": 7, 31 | "brks": 0 32 | } 33 | -------------------------------------------------------------------------------- /logger_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | default_format: 5 | "()": uvicorn.logging.DefaultFormatter 6 | format: '%(asctime)s %(name)s %(levelname)s %(message)s' 7 | access: 8 | "()": uvicorn.logging.AccessFormatter 9 | format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s' 10 | handlers: 11 | access_handler: 12 | formatter: access 13 | class: logging.StreamHandler 14 | stream: ext://sys.stderr 15 | standard_handler: 16 | formatter: 
default_format 17 | class: logging.StreamHandler 18 | stream: ext://sys.stderr 19 | loggers: 20 | uvicorn.error: 21 | level: INFO 22 | handlers: 23 | - standard_handler 24 | propagate: no 25 | # to disable logging for uvicorn.error, remove its handler above 26 | uvicorn.access: 27 | level: INFO 28 | handlers: 29 | - access_handler 30 | propagate: no 31 | # to disable logging for uvicorn.access, remove its handler above 32 | unstructured: 33 | level: INFO 34 | handlers: 35 | - standard_handler 36 | propagate: no 37 | 38 | -------------------------------------------------------------------------------- /test_utils/README-generating-validation-csvs.md: -------------------------------------------------------------------------------- 1 | # Downloading CSVs with all sections extracted for filings 2 | 3 | ## Step 1: Download filings from Edgar 4 | 5 | Given a list of symbols (tickers or CIKs) and the form type to download in $FILINGS_MANIFEST_FILE, save the resulting files and a manifest JSON in $SEC_DOCS_DIR. 6 | 7 | ``` 8 | # needed for Edgar's API 9 | export SEC_API_ORGANIZATION= 10 | export SEC_API_EMAIL= 11 | 12 | PYTHONPATH=. SEC_DOCS_DIR=sec-filing-downloads \ 13 | FILINGS_MANIFEST_FILE=test_utils/symbols-for-validation-csvs.txt \ 14 | python test_utils/get_sec_docs_from_edgar.py 15 | ``` 16 | 17 | ## Step 2: Generate validation CSVs from the downloaded files and manifest JSON 18 | 19 | ``` 20 | PYTHONPATH=. SEC_DOCS_DIR=sec-filing-downloads/ CSV_FILES_DIR=validation-csvs python \ 21 | test_utils/create_validation_csv_files.py 22 | ``` 23 | 24 | Note that you may also provide the following env vars in the command above: 25 | 26 | * `PIPELINE_SECTION_API_URL` - defaults to the local API. 27 | * `FILINGS_MANIFEST_JSON` - the list of filings to create CSVs for. Defaults to $SEC_DOCS_DIR/sec_docs_manifest.json, which is written in step 1. -------------------------------------------------------------------------------- /prepline_sec_filings/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
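# (Generated from the notebooks in pipeline-notebooks/ via `unstructured_api_tools convert-pipeline-notebooks`; edit those and regenerate instead.)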
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | import logging 9 | import os 10 | 11 | from .section import router as section_router 12 | 13 | 14 | app = FastAPI( 15 | title="Unstructured Pipeline API", 16 | description="""""", 17 | version="1.0.0", 18 | docs_url="/sec-filings/docs", 19 | openapi_url="/sec-filings/openapi.json", 20 | ) 21 | 22 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 23 | if allowed_origins: 24 | from fastapi.middleware.cors import CORSMiddleware 25 | 26 | app.add_middleware( 27 | CORSMiddleware, 28 | allow_origins=allowed_origins.split(","), 29 | allow_methods=["OPTIONS", "POST"], 30 | allow_headers=["Content-Type"], 31 | ) 32 | 33 | app.include_router(section_router) 34 | 35 | 36 | # Filter out /healthcheck noise 37 | class HealthCheckFilter(logging.Filter): 38 | def filter(self, record: logging.LogRecord) -> bool: 39 | return record.getMessage().find("/healthcheck") == -1 40 | 41 | 42 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 43 | 44 | 45 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 46 | def healthcheck(request: Request): 47 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 48 | -------------------------------------------------------------------------------- /test_sec_filings_integration/test_notebooks.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import pytest 4 | from typing import List 5 | import sys 6 | 7 | if sys.version_info < (3, 8): 8 | from typing_extensions import Final 9 | else: 10 | from typing import Final 11 | 12 | import nbformat 13 | from nbconvert.preprocessors import ExecutePreprocessor 14 | 15 | TIMEOUT: Final[int] = 600 # in seconds 16 | 17 | DIRECTORY: Final[Path] = Path(__file__).absolute().parent 18 | PIPELINE_NB_DIR: Final[str] = os.path.join(DIRECTORY, "..", "pipeline-notebooks") 19 | 20 | 21 | def run_notebook_directory(directory: str): 22 | """Executes all of the notebooks in a test directory.
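Each notebook is run top to bottom with nbconvert's ExecutePreprocessor, with a TIMEOUT-second limit per notebook.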
Tests that at least one cell 23 | was executed in every notebook.""" 24 | notebook_files = [file for file in os.listdir(directory) if file.endswith(".ipynb")] 25 | for notebook_file in notebook_files: 26 | filename = os.path.join(directory, notebook_file) 27 | 28 | with open(filename) as f: 29 | notebook = nbformat.read(f, as_version=4) 30 | 31 | executor = ExecutePreprocessor(timeout=TIMEOUT) 32 | executed_notebook, _ = executor.preprocess(notebook) 33 | 34 | execution_counts: List[int] = list() 35 | for cell in executed_notebook["cells"]: 36 | execution_count = cell.get("execution_count", None) 37 | if isinstance(execution_count, int): 38 | execution_counts.append(execution_count) 39 | 40 | assert len(execution_counts) > 0 41 | 42 | 43 | @pytest.mark.parametrize("directory", [(PIPELINE_NB_DIR)]) 44 | def test_notebooks(directory): 45 | # NOTE(robinson) - The expectation is that all the notebooks will execute completely 46 | # without errors 47 | run_notebook_directory(directory) 48 | -------------------------------------------------------------------------------- /test_utils/symbols-for-validation-csvs.txt: -------------------------------------------------------------------------------- 1 | # large cap 10-Q 2 | abt 10-Q 3 | amzn 10-Q 4 | mo 10-Q 5 | c 10-Q 6 | cat 10-Q 7 | dis 10-Q 8 | nflx 10-Q 9 | tmus 10-Q 10 | # mid cap 10-Q 11 | wolf 10-Q 12 | jazz 10-Q 13 | seic 10-Q 14 | rh 10-Q 15 | pdce 10-Q 16 | amkr 10-Q 17 | wen 10-Q 18 | tdc 10-Q 19 | fl 10-Q 20 | enr 10-Q 21 | # small cap 10-Q 22 | lthm 10-Q 23 | skyw 10-Q 24 | kfy 10-Q 25 | oi 10-Q 26 | b 10-Q 27 | ktb 10-Q 28 | chuy 10-Q 29 | lpsn 10-Q 30 | gci 10-Q 31 | abtx 10-Q 32 | # selected since more recent filing is a 10-Q/A as of Sept 2022 33 | adra 10-Q 34 | # large cap 10-K 35 | exc 10-K 36 | pkg 10-K 37 | hpe 10-K 38 | aiz 10-K 39 | rok 10-K 40 | ben 10-K 41 | gl 10-K 42 | all 10-K 43 | rost 10-K 44 | sivb 10-K 45 | # mid cap 10-K 46 | syna 10-K 47 | x 10-K 48 | oln 10-K 49 | sfm 10-K 50 | smg 10-K 51 | wso 10-K 52 | sam 10-K 53 | wwd 10-K 54 | mms 10-K 55 | mlkn 10-K 56 | # small cap 10-K 57 | tbbk 10-K 58 | rdnt 10-K 59 | ueic 10-K 60 | atni 10-K 61 | cwt 10-K 62 | pke 10-K 63 | zyxi 10-K 64 | klic 10-K 65 | mdc 10-K 66 | nbhc 10-K 67 | # S-1 recent filing CIK's per search 2022-09-18 68 | # https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=s-1&company=&dateb=&owner=include&start=0&count=80&output=atom 69 | 0001156784 S-1 70 | 0001912287 S-1 71 | 0001398805 S-1 72 | 0001886894 S-1 73 | 0001638287 S-1 74 | 0001893448 S-1 75 | 0001144879 S-1 76 | 0001839412 S-1 77 | 0001707079 S-1 78 | 0001704795 S-1 79 | 0001425627 S-1 80 | 0001861063 S-1 81 | 0001726711 S-1 82 | 0001841800 S-1 83 | 0001074828 S-1 84 | 0001895144 S-1 85 | 0001450704 S-1 86 | 0001076262 S-1 87 | 0001726711 S-1 88 | 0001527352 S-1 89 | 90 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | from centos:centos7.9.2009 4 | 5 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 6 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 7 | ARG NB_USER=notebook-user 8 | ARG NB_UID=1000 9 | ARG PIP_VERSION 10 | ARG PIPELINE_PACKAGE 11 | 12 | RUN yum -y update && \ 13 | yum -y install gcc openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \ 14 | curl -O https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \ 15 
| cd Python-3.8.15/ && ./configure --enable-optimizations && make altinstall && \ 16 | cd .. && rm -rf Python-3.8.15* && \ 17 | ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 18 | 19 | # create user with a home directory 20 | ENV USER ${NB_USER} 21 | ENV HOME /home/${NB_USER} 22 | 23 | RUN groupadd --gid ${NB_UID} ${NB_USER} 24 | RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} 25 | USER ${NB_USER} 26 | WORKDIR ${HOME} 27 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 28 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 29 | 30 | COPY logger_config.yaml logger_config.yaml 31 | COPY requirements/dev.txt requirements-dev.txt 32 | COPY requirements/base.txt requirements-base.txt 33 | COPY prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 34 | COPY exploration-notebooks exploration-notebooks 35 | COPY pipeline-notebooks pipeline-notebooks 36 | 37 | 38 | # NOTE(robinson) - Can remove the secret mount once the unstructured repo is public 39 | # NOTE(crag) - Cannot use an ARG in the dst= path (so it seems), hence no ${NB_USER}, ${NB_UID} 40 | RUN python3.8 -m pip install pip==${PIP_VERSION} \ 41 | && pip3.8 install --no-cache -r requirements-base.txt \ 42 | && pip3.8 install --no-cache -r requirements-dev.txt \ 43 | && python3.8 -c "import nltk; nltk.download('punkt')" \ 44 | && python3.8 -c "import nltk; nltk.download('averaged_perceptron_tagger')" 45 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.2.1 2 | 3 | * Supports json responses suitable for Label Studio. 4 | * Allows a json list instead of a multipart response for multi-file requests 5 | * Supports text/csv responses instead of just json 6 | * More general (non-pipeline-specific) way of starting the app 7 | * Add alternative way of importing `Final` to support google colab 8 | * Dependency bumps 9 | 10 | ## 0.2.0 11 | 12 | * Updated section API to accept multiple text file uploads as `text_files` parameter.
13 | 14 | ## 0.1.0 15 | 16 | * Updated FastAPI param m_section -> section 17 | * API updated to support known filing sections rather than just risk factors 18 | * Updated interface to be compatible with new version of unstructured 19 | 20 | ## 0.0.3 21 | 22 | * Updated `match_s1_toc_title_to_section` for an exact match 23 | * Enumerated and added patterns for common 10-K/Q and S-1 sections 24 | * Refactor get risk narrative to allow capture of variable section 25 | * Naming conventions updated with "pipeline" terminology (no longer "recipe") 26 | * Various tweaks to parsing methods to improve capturing of risk section and TOC 27 | * Auto-generated api risk_narrative.py now lints (unstructured-api-tools) 28 | * Added get_table_of_contents to find TOC elements within SEC document (and tests) 29 | * Added helper functions for retrieving/opening documents from the SEC 30 | * Changed `unstructured_api` package to `unstructured_api_tools` 31 | * Rewrote `get_risk_narrative` to use the TOC 32 | * Added integration tests to verify capture of risk factors section 33 | 34 | ## 0.0.2 35 | 36 | * Pipeline now generates a FastAPI web application 37 | * Added logic to skip risk section if risk section is empty upon completion 38 | * Added different form types to unit tests, and added variation of forms that use a table of contents 39 | 40 | ## 0.0.1 41 | 42 | * Added make target to build the pipeline scripts 43 | * Change `doc_prep` package name to `unstructured` 44 | * Created pipeline for extracting the risk section from 10-K, 10-Q, and S-1 filings 45 | * Initial repo setup for SEC filings 46 | -------------------------------------------------------------------------------- /scripts/test-doc-pipeline-apis-consistent.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eu -o pipefail 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "$SCRIPT_DIR"/.. 7 | 8 | PIPELINE_OUTPUT_DIR=tmp-api-check-output-$RANDOM 9 | FILE_INDICATING_FAILURE="$PIPELINE_OUTPUT_DIR"-has-failures 10 | mkdir -p $PIPELINE_OUTPUT_DIR 11 | touch $PIPELINE_OUTPUT_DIR/__init__.py 12 | 13 | function tmp_pipeline_comp_cleanup () { 14 | cd "$SCRIPT_DIR"/.. 15 | rm -f "$FILE_INDICATING_FAILURE" 16 | if [[ "$1" -eq 0 ]]; then 17 | rm -rf $PIPELINE_OUTPUT_DIR 18 | fi 19 | exit "$1" 20 | } 21 | 22 | unstructured_api_tools convert-pipeline-notebooks \ 23 | --input-directory ./pipeline-notebooks \ 24 | --output-directory "$PIPELINE_OUTPUT_DIR" 25 | 26 | NUM_PIPELINE_API_FILES_GENERATED=$(find "$PIPELINE_OUTPUT_DIR" -name "*.py" | wc -l) 27 | 28 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -eq 0 ]]; then 29 | echo "No pipelines were created by unstructured_api_tools convert-pipeline-notebooks" 30 | tmp_pipeline_comp_cleanup 1 31 | fi 32 | 33 | NUM_EXISTING_PIPELINE_API_FILES=$(find "$PACKAGE_NAME"/api -name "*.py" | wc -l) 34 | 35 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -gt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 36 | echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 37 | tmp_pipeline_comp_cleanup 1 38 | elif [[ "$NUM_PIPELINE_API_FILES_GENERATED" -lt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 39 | echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 40 | tmp_pipeline_comp_cleanup 1 41 | fi 42 | 43 | cd "$PACKAGE_NAME"/api 44 | find . -name "*.py" -print0 | while IFS= read -r -d '' pipeline_file; do 45 | set +o pipefail 46 | if !
diff -u "$pipeline_file" ../../"$PIPELINE_OUTPUT_DIR/$pipeline_file"; then 47 | touch "../../$FILE_INDICATING_FAILURE" 48 | fi 49 | set -o pipefail 50 | done 51 | cd - 52 | 53 | if [ -r "$FILE_INDICATING_FAILURE" ]; then 54 | echo 55 | echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diffs" 56 | echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/" 57 | tmp_pipeline_comp_cleanup 1 58 | fi 59 | tmp_pipeline_comp_cleanup 0 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # VSCode 132 | .vscode/ 133 | 134 | # Mac 135 | .DS_Store 136 | 137 | # Example forms 138 | sample-sec-docs/ 139 | 140 | nbs/ 141 | 142 | # Celery files that are created when the mercury dashboard is run 143 | celery.sqlite 144 | celerybeat-schedule.db 145 | 146 | # temporarily generated files by project-specific Makefile 147 | tmp* 148 | 149 | # downloaded filings from experimental notebooks, for example 150 | *xbrl 151 | 152 | *.csv 153 | -------------------------------------------------------------------------------- /test_real_docs/generate_first_last.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | # File used to generate 'first' and 'last' for sample-first-last.json 5 | # from the downloaded forms (through "make dl-test-artifacts") 6 | 7 | # NOTE: This file is run from the root path of the repository 8 | 9 | from prepline_sec_filings.sec_document import SECDocument 10 | 11 | from prepline_sec_filings.sections import section_string_to_enum 12 | 13 | DIRECTORY = os.getcwd() 14 | 15 | RISK_FACTOR_XFAILS = ["aig", "bgs"] 16 | 17 | with open( 18 | os.path.join(DIRECTORY, "test_real_docs", "fixtures", "sample-first-last.json"), 19 | "r", 20 | ) as f: 21 | sample_first_last = json.load(f) 22 | 23 | with open(os.path.join("test_real_docs", "test_utils", "examples.json")) as f: 24 | examples = json.load(f) 25 | 26 | 27 | def get_file_from_ticker(ticker): 28 | cik = examples[ticker]["cik"] 29 | formtype = next(iter(examples[ticker]["forms"])) 30 | accession_number = examples[ticker]["forms"][formtype] 31 | with open( 32 | os.path.join( 33 | "test_real_docs", 34 | "sample-docs", 35 | f"{ticker}-{formtype}-{cik}-{accession_number}.xbrl", 36 | ) 37 | ) as f: 38 | out = f.read() 39 | return out 40 | 41 | 42 | tickers_10q = [ 43 | ticker for ticker in sample_first_last if "10-Q" in examples[ticker]["forms"] 44 | ] # filter only 10-Q docs 45 | 46 | 47 | def get_doc_elements(tickers): 48 | docs_all = {} 49 | for ticker in tickers: 50 | print("at ticker", ticker) 51 | text = get_file_from_ticker(ticker) 52 | doc = SECDocument.from_string(text).doc_after_cleaners(skip_headers_and_footers=True) 53 | docs_all[ticker] = {} 54 | docs_all[ticker]["doc"] = doc 55 | docs_all[ticker]["elements"] = doc.elements 56 | return docs_all 57 | 58 | 59 | def get_doc(docs_all, ticker): 60 | return docs_all[ticker]["doc"], docs_all[ticker]["elements"] 61 | 62 | 63 | sections = [ 64 | "FINANCIAL_STATEMENTS", # ITEM 1 65 | "MANAGEMENT_DISCUSSION", # ITEM 2 66 | "MARKET_RISK_DISCLOSURES", # ITEM 3 67 | "CONTROLS_AND_PROCEDURES", 68 | ] # ITEM 4 69 | 70 | 71 | def print_ticker(docs_all, ticker, sections=sections): 72 | doc, _ = get_doc(docs_all, ticker) 73 | print("### ", ticker, " ###") 74 | for section in sections: 75 | print("----", section, "-----") 76 | # skip if nothing is extracted 77 | if len(doc.get_section_narrative(section_string_to_enum[section])) == 0:
78 | continue 79 | print(doc.get_section_narrative(section_string_to_enum[section])[0]) # first 80 | print(doc.get_section_narrative(section_string_to_enum[section])[-1]) # last 81 | # for el in doc.get_section_narrative(section_string_to_enum[section]): 82 | # print('+',clean_sec_text(el.text)) 83 | 84 | 85 | docs_all = get_doc_elements(tickers_10q) 86 | 87 | for ticker in tickers_10q: 88 | print_ticker(docs_all, ticker) 89 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '23 21 * * 3' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v4 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /sample-docs/sample-sec-docs.sha256: -------------------------------------------------------------------------------- 1 | 5ecd0a02875508c69b46c8cc558b8b9901bc5a9bc3068401680f248d00167e7f sample-docs/aig-10-K-5272-000110465922024701.xbrl 2 | 34339a8914828791bbf0fad3d619b031d6fd428135a7069b237d716ee79052df sample-docs/apld-S-1-1144879-000110465921142627.xbrl 3 | 697bdcde08961f9288a89b965468369d0c0df852d77259f3c5729a7e0177bf7f sample-docs/asns-S-1-1141284-000121390022020064.xbrl 4 | e2e7c4ebb4006ef8f2efbeeb0bdab0f2e924bbd753c11ff907c44a012efeb22b sample-docs/aust-S-1-1817740-000110465921128425.xbrl 5 | 6db5134f2231d37f0b5d744b2bc3db413a32ac5555370a3b5c8f2ffe32553dfa sample-docs/ava-10-Q-104918-000095017021000739.xbrl 6 | 85181a1b87583159f86bf98127225ad27779e7d48ded2b1de4d51332c4658eb4 sample-docs/bc-10-Q-14930-000001493021000103.xbrl 7 | 429ff6e82fee2d54c0b51b75386d09fee74b68e003ae3a10aa7d8f921bb227f4 sample-docs/bgs-S-1-1278027-000104746904003937.xbrl 8 | 70a7883d199ff125eb767424a017320591c82f9b6bb9bbf4141760d1dfdfe1ab sample-docs/bj-S-1-1531152-000119312519032591.xbrl 9 | 396de04f2546df697f5b45334a2328556eaeaf8bcdc5dd88d55039c8f79f7b94 sample-docs/blco-S-1-1860742-000119312522008667.xbrl 10 | 1a810c8b67be0debf18f4111e88d48b5f50d7ff0f5db9bf20f5df6f890b9a266 sample-docs/brks-10-Q-933974-000155837021006699.xbrl 11 | 9c673ad2f1b446d1b82d11207c6305b55220bdffb6cf403977dcb9f006601781 sample-docs/cl-10-Q-21665-000002166522000010.xbrl 12 | 3a41f0cef88bea6848ab68b80d1379ffe52becaab236feb550a9eee18402bce1 sample-docs/cri-10-K-1060822-000106082222000096.xbrl 13 | 1e7bf289bb40be4befff79c3a9a2d6d14a1c49fff54e094a0855cc51a66dd613 sample-docs/cvs-10-K-64803-000006480322000008.xbrl 14 | 6d105033564a5e3be0575a7b0488c3bf60a4204b724c2b761d6ada23589080be sample-docs/dis-10-K-1744489-000174448921000220.xbrl 15 | 0e861b443d4c6e8e1d0f81f319ff7aae79af7b0d365c97646e03f56a391ff432 sample-docs/doc-10-Q-1574540-000157454021000146.xbrl 16 | cc78e478fc7cf839e3327227d73b1d197e12eca42f7bf615c77f84a167a4333b sample-docs/ee-S-1-1888447-000114036122000986.xbrl 17 | 73bf9bf74776a6ac5b27757c617c0a9392c4a1d7bc13bc952738a66fad878fae sample-docs/ehc-10-K-785161-000078516122000008.xbrl 18 | 48bbe980eafb779417e08cb1772a60043c095cee26ef6864d821ecde647bd70d sample-docs/f-10-Q-37996-000003799622000024.xbrl 19 | af29fd3e7d51f4c535814ed226513197658800d4348c9245e18a868f00427a2d sample-docs/hlvx-S-1-1888012-000119312522097505.xbrl 20 | 69dbdf8134fc8933dbe8fbb7d793ea3f02d9bd8dfafdd327651e65abd3bddad7 sample-docs/lmt-10-K-936468-000093646822000008.xbrl 21 | bc9e17ff46da6e6017c4d1bc0307c20a5dfcfd4b455cdf2c1adb737fe5b03f12 sample-docs/mmm-10-Q-66740-000006674022000065.xbrl 22 | b705207be1f9164e7f731fa5738e15205e3ab8d589f3fc170b88d50d2335aed7 sample-docs/mrk-10-Q-310158-000031015821000028.xbrl 23 | 18069c61245d5110f983407829149592acbc0932d5e715f02bd45705e8dfdd6e sample-docs/msex-10-K-66004-000117494722000283.xbrl 24 | 8994d75c07ce9e66a968b0cbe0146b265805a7544d10087ff7c68085e6b06f7c sample-docs/nke-10-K-320187-000032018722000038.xbrl 25 | d3e500d6b861c291fd8a0eb53b98470d2108c432688a27614f4468e4f4514091 sample-docs/omcl-10-Q-926326-000092632622000014.xbrl 26 | 
9fd0e54840a65723ff82f325cbf9e19697fdea99c066334be0580777ecfb469a sample-docs/pepg-S-1-1835597-000119312522106884.xbrl 27 | 893d69a8ae134aa723da7df0b4e4fd93598ca0d44e4954eabf7ad00636e47763 sample-docs/rgld-10-K-85535-000155837021011343.xbrl 28 | a127c6759870e49ed0e687ad4b907f2bdbbb8a15424859dfd86042b173e36aad sample-docs/smtc-10-K-88941-000008894122000006.xbrl 29 | db77a55421cbbb33dcb1ff02e5e7508eef78956565004871c42c8d495e1b5e36 sample-docs/v-10-Q-1403161-000140316122000027.xbrl 30 | 02b834ff2c0ab4e7768d52811ab963e7cb74d0cadb4b06d8be5d9f9d143ea04d sample-docs/wetg-S-1-1784970-000147793221000299.xbrl 31 | -------------------------------------------------------------------------------- /test_utils/examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "mmm": { 3 | "cik": "66740", 4 | "forms": { 5 | "10-Q": "000006674022000065" 6 | } 7 | }, 8 | "aig": { 9 | "cik": "5272", 10 | "forms": { 11 | "10-K": "000110465922024701" 12 | } 13 | }, 14 | "cl": { 15 | "cik": "21665", 16 | "forms": { 17 | "10-Q": "000002166522000010" 18 | } 19 | }, 20 | "cvs": { 21 | "cik": "64803", 22 | "forms": { 23 | "10-K": "000006480322000008" 24 | } 25 | }, 26 | "f": { 27 | "cik": "37996", 28 | "forms": { 29 | "10-Q": "000003799622000024" 30 | } 31 | }, 32 | "lmt": { 33 | "cik": "936468", 34 | "forms": { 35 | "10-K": "000093646822000008" 36 | } 37 | }, 38 | "mrk": { 39 | "cik": "310158", 40 | "forms": { 41 | "10-Q": "000031015821000028" 42 | } 43 | }, 44 | "nke": { 45 | "cik": "320187", 46 | "forms": { 47 | "10-K": "000032018722000038" 48 | } 49 | }, 50 | "v": { 51 | "cik": "1403161", 52 | "forms": { 53 | "10-Q": "000140316122000027" 54 | } 55 | }, 56 | "dis": { 57 | "cik": "1744489", 58 | "forms": { 59 | "10-K": "000174448921000220" 60 | } 61 | }, 62 | "brks": { 63 | "cik": "933974", 64 | "forms": { 65 | "10-Q": "000155837021006699" 66 | } 67 | }, 68 | "rgld": { 69 | "cik": "85535", 70 | "forms": { 71 | "10-K": "000155837021011343" 72 | } 73 | }, 74 | "bc": { 75 | "cik": "14930", 76 | "forms": { 77 | "10-Q": "000001493021000103" 78 | } 79 | }, 80 | "cri": { 81 | "cik": "1060822", 82 | "forms": { 83 | "10-K": "000106082222000096" 84 | } 85 | }, 86 | "doc": { 87 | "cik": "1574540", 88 | "forms": { 89 | "10-Q": "000157454021000146" 90 | } 91 | }, 92 | "pepg": { 93 | "cik": "1835597", 94 | "forms": { 95 | "S-1": "000119312522106884" 96 | } 97 | }, 98 | "ehc": { 99 | "cik": "785161", 100 | "forms": { 101 | "10-K": "000078516122000008" 102 | } 103 | }, 104 | "bj": { 105 | "cik": "1531152", 106 | "forms": { 107 | "S-1": "000119312519032591" 108 | } 109 | }, 110 | "omcl": { 111 | "cik": "926326", 112 | "forms": { 113 | "10-Q": "000092632622000014" 114 | } 115 | }, 116 | "smtc": { 117 | "cik": "88941", 118 | "forms": { 119 | "10-K": "000008894122000006" 120 | } 121 | }, 122 | "ava": { 123 | "cik": "104918", 124 | "forms": { 125 | "10-Q": "000095017021000739" 126 | } 127 | }, 128 | "msex": { 129 | "cik": "66004", 130 | "forms": { 131 | "10-K": "000117494722000283" 132 | } 133 | }, 134 | "bgs": { 135 | "cik": "1278027", 136 | "forms": { 137 | "S-1": "000104746904003937" 138 | } 139 | }, 140 | "aust": { 141 | "cik": "1817740", 142 | "forms": { 143 | "S-1": "000110465921128425" 144 | } 145 | }, 146 | "wetg": { 147 | "cik": "1784970", 148 | "forms": { 149 | "S-1": "000147793221000299" 150 | } 151 | }, 152 | "hlvx": { 153 | "cik": "1888012", 154 | "forms": { 155 | "S-1": "000119312522097505" 156 | } 157 | }, 158 | "apld": { 159 | "cik": "1144879", 160 | "forms": { 161 | "S-1": 
"000110465921142627" 162 | } 163 | }, 164 | "asns": { 165 | "cik": "1141284", 166 | "forms": { 167 | "S-1": "000121390022020064" 168 | } 169 | }, 170 | "ee": { 171 | "cik": "1888447", 172 | "forms": { 173 | "S-1": "000114036122000986" 174 | } 175 | }, 176 | "blco": { 177 | "cik": "1860742", 178 | "forms": { 179 | "S-1": "000119312522008667" 180 | } 181 | } 182 | } -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/test.in 6 | # 7 | anyio==3.7.0 8 | # via httpcore 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | asttokens==2.2.1 14 | # via 15 | # nbdev 16 | # stack-data 17 | astunparse==1.6.3 18 | # via nbdev 19 | backcall==0.2.0 20 | # via ipython 21 | black==23.3.0 22 | # via -r requirements/test.in 23 | certifi==2023.5.7 24 | # via 25 | # httpcore 26 | # httpx 27 | click==8.1.3 28 | # via 29 | # -r requirements/test.in 30 | # black 31 | comm==0.1.3 32 | # via ipykernel 33 | coverage[toml]==7.2.7 34 | # via pytest-cov 35 | debugpy==1.6.7 36 | # via ipykernel 37 | decorator==5.1.1 38 | # via ipython 39 | exceptiongroup==1.1.1 40 | # via 41 | # anyio 42 | # pytest 43 | execnb==0.1.5 44 | # via nbdev 45 | executing==1.2.0 46 | # via stack-data 47 | fastcore==1.5.29 48 | # via 49 | # execnb 50 | # ghapi 51 | # nbdev 52 | flake8==6.0.0 53 | # via -r requirements/test.in 54 | ghapi==1.0.3 55 | # via nbdev 56 | h11==0.14.0 57 | # via httpcore 58 | httpcore==0.17.2 59 | # via httpx 60 | httpx==0.24.1 61 | # via -r requirements/test.in 62 | idna==3.4 63 | # via 64 | # anyio 65 | # httpx 66 | importlib-metadata==6.6.0 67 | # via jupyter-client 68 | iniconfig==2.0.0 69 | # via pytest 70 | ipykernel==6.23.1 71 | # via -r requirements/test.in 72 | ipython==8.12.2 73 | # via 74 | # execnb 75 | # ipykernel 76 | jedi==0.18.2 77 | # via ipython 78 | jupyter-client==8.2.0 79 | # via ipykernel 80 | jupyter-core==5.3.0 81 | # via 82 | # -r requirements/test.in 83 | # ipykernel 84 | # jupyter-client 85 | matplotlib-inline==0.1.6 86 | # via 87 | # ipykernel 88 | # ipython 89 | mccabe==0.7.0 90 | # via flake8 91 | mypy==1.3.0 92 | # via -r requirements/test.in 93 | mypy-extensions==1.0.0 94 | # via 95 | # black 96 | # mypy 97 | nbdev==2.3.12 98 | # via -r requirements/test.in 99 | nest-asyncio==1.5.6 100 | # via ipykernel 101 | packaging==23.1 102 | # via 103 | # black 104 | # fastcore 105 | # ghapi 106 | # ipykernel 107 | # pytest 108 | parso==0.8.3 109 | # via jedi 110 | pathspec==0.11.1 111 | # via black 112 | pexpect==4.8.0 113 | # via ipython 114 | pickleshare==0.7.5 115 | # via ipython 116 | platformdirs==3.5.1 117 | # via 118 | # black 119 | # jupyter-core 120 | pluggy==1.0.0 121 | # via pytest 122 | prompt-toolkit==3.0.38 123 | # via ipython 124 | psutil==5.9.5 125 | # via ipykernel 126 | ptyprocess==0.7.0 127 | # via pexpect 128 | pure-eval==0.2.2 129 | # via stack-data 130 | pycodestyle==2.10.0 131 | # via flake8 132 | pyflakes==3.0.1 133 | # via flake8 134 | pygments==2.15.1 135 | # via ipython 136 | pytest==7.3.1 137 | # via pytest-cov 138 | pytest-cov==4.1.0 139 | # via -r requirements/test.in 140 | python-dateutil==2.8.2 141 | # via jupyter-client 142 | pyyaml==6.0 143 | # via nbdev 144 | pyzmq==25.1.0 145 | # via 146 | # ipykernel 147 | # jupyter-client 148 | six==1.16.0 149 | # via 150 | # asttokens 151 | # 
astunparse 152 | # python-dateutil 153 | sniffio==1.3.0 154 | # via 155 | # anyio 156 | # httpcore 157 | # httpx 158 | stack-data==0.6.2 159 | # via ipython 160 | tomli==2.0.1 161 | # via 162 | # black 163 | # coverage 164 | # mypy 165 | # pytest 166 | tornado==6.3.2 167 | # via 168 | # ipykernel 169 | # jupyter-client 170 | traitlets==5.9.0 171 | # via 172 | # comm 173 | # ipykernel 174 | # ipython 175 | # jupyter-client 176 | # jupyter-core 177 | # matplotlib-inline 178 | typing-extensions==4.6.3 179 | # via 180 | # black 181 | # ipython 182 | # mypy 183 | watchdog==3.0.0 184 | # via nbdev 185 | wcwidth==0.2.6 186 | # via prompt-toolkit 187 | wheel==0.40.0 188 | # via astunparse 189 | zipp==3.15.0 190 | # via importlib-metadata 191 | 192 | # The following packages are considered to be unsafe in a requirements file: 193 | # pip 194 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # NOTE(robinson) - We are limiting when we run CI to avoid exceeding our 2,000 min/month limit. 5 | # We can switch to running on push if we make this repo public or are fine with 6 | # paying for CI minutes. 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | env: 13 | PYTHON_VERSION: "3.8" 14 | 15 | jobs: 16 | setup: 17 | runs-on: ubuntu-latest 18 | env: 19 | NLTK_DATA: ${{ github.workspace }}/nltk_data 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: actions/cache@v3 23 | id: virtualenv-cache 24 | with: 25 | path: | 26 | .venv 27 | sample-docs 28 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 29 | - name: Set up Python ${{ env.PYTHON_VERSION }} 30 | uses: actions/setup-python@v4 31 | with: 32 | python-version: ${{ env.PYTHON_VERSION }} 33 | - name: Setup virtual environment (no cache hit) 34 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 35 | run: | 36 | python${{ env.PYTHON_VERSION }} -m venv .venv 37 | source .venv/bin/activate 38 | make install-ci 39 | make dl-test-artifacts 40 | - uses: actions/cache@v3 41 | id: nltk-cache 42 | with: 43 | path: /home/runner/nltk_data 44 | key: ci-nltk-${{ hashFiles('requirements/*.txt') }} 45 | - name: Download NLTK (no cache hit) 46 | if: steps.nltk-cache.outputs.cache-hit != 'true' 47 | run: | 48 | source .venv/bin/activate 49 | make install-nltk-models 50 | 51 | lint: 52 | runs-on: ubuntu-latest 53 | needs: setup 54 | steps: 55 | - uses: actions/checkout@v4 56 | - uses: actions/cache@v3 57 | id: virtualenv-cache 58 | with: 59 | path: | 60 | .venv 61 | sample-docs 62 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 63 | - uses: actions/cache@v3 64 | id: nltk-cache 65 | with: 66 | path: /home/runner/nltk_data 67 | key: ci-nltk-${{ hashFiles('requirements/*.txt') }} 68 | - name: Set up Python ${{ env.PYTHON_VERSION }} 69 | uses: actions/setup-python@v4 70 | with: 71 | python-version: ${{ env.PYTHON_VERSION }} 72 | - name: Lint 73 | run: | 74 | source .venv/bin/activate 75 | make check 76 | make check-notebooks 77 | 78 | shellcheck: 79 | runs-on: ubuntu-latest 80 | steps: 81 | - uses: actions/checkout@v4 82 | - name: ShellCheck 83 | uses: ludeeus/action-shellcheck@master 84 | 85 | test: 86 | runs-on: ubuntu-latest 87 | env: 88 | NLTK_DATA: ${{ github.workspace }}/nltk_data 89 | needs: [setup, lint] 90 | steps: 91 | - uses: actions/checkout@v4 92 | - uses:
actions/cache@v3 93 | id: virtualenv-cache 94 | with: 95 | path: | 96 | .venv 97 | sample-docs 98 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 99 | - uses: actions/cache@v3 100 | id: nltk-cache 101 | with: 102 | path: /home/runner/nltk_data 103 | key: ci-nltk-${{ hashFiles('requirements/*.txt') }} 104 | - name: Run core tests 105 | run: | 106 | source .venv/bin/activate 107 | make test 108 | make check-coverage 109 | - name: Run sample SEC documents tests 110 | run: | 111 | source .venv/bin/activate 112 | make test-sample-docs 113 | 114 | changelog: 115 | runs-on: ubuntu-latest 116 | steps: 117 | - uses: actions/checkout@v4 118 | - if: github.ref != 'refs/heads/main' 119 | uses: dorny/paths-filter@v2 120 | id: changes 121 | with: 122 | filters: | 123 | src: 124 | - 'doc_recipe/**' 125 | - 'recipe-notebooks/**' 126 | 127 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 128 | uses: dangoslen/changelog-enforcer@v3 129 | 130 | api_consistency: 131 | runs-on: ubuntu-latest 132 | needs: setup 133 | steps: 134 | - uses: actions/checkout@v4 135 | - uses: actions/cache@v3 136 | id: virtualenv-cache 137 | with: 138 | path: | 139 | .venv 140 | sample-docs 141 | key: ci-venv-sec-${{ hashFiles('requirements/*.txt', 'sample-docs/sample-sec-docs.sha256') }} 142 | - name: API Consistency 143 | run: | 144 | source .venv/bin/activate 145 | make api-check 146 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/base.in 6 | # 7 | anyio==3.7.0 8 | # via 9 | # starlette 10 | # watchfiles 11 | attrs==23.1.0 12 | # via jsonschema 13 | beautifulsoup4==4.12.2 14 | # via nbconvert 15 | bleach==6.0.0 16 | # via nbconvert 17 | certifi==2023.5.7 18 | # via requests 19 | charset-normalizer==3.1.0 20 | # via requests 21 | click==8.1.3 22 | # via 23 | # nltk 24 | # unstructured-api-tools 25 | # uvicorn 26 | defusedxml==0.7.1 27 | # via nbconvert 28 | exceptiongroup==1.1.1 29 | # via anyio 30 | fastapi==0.95.2 31 | # via unstructured-api-tools 32 | fastjsonschema==2.17.1 33 | # via nbformat 34 | h11==0.14.0 35 | # via uvicorn 36 | httptools==0.5.0 37 | # via uvicorn 38 | idna==3.4 39 | # via 40 | # anyio 41 | # requests 42 | importlib-metadata==6.6.0 43 | # via 44 | # jupyter-client 45 | # nbconvert 46 | importlib-resources==5.12.0 47 | # via jsonschema 48 | jinja2==3.1.2 49 | # via 50 | # nbconvert 51 | # unstructured-api-tools 52 | joblib==1.2.0 53 | # via 54 | # nltk 55 | # scikit-learn 56 | jsonschema==4.17.3 57 | # via nbformat 58 | jupyter-client==8.2.0 59 | # via nbclient 60 | jupyter-core==5.3.0 61 | # via 62 | # -r requirements/base.in 63 | # jupyter-client 64 | # nbclient 65 | # nbconvert 66 | # nbformat 67 | jupyterlab-pygments==0.2.2 68 | # via nbconvert 69 | lxml==4.9.2 70 | # via unstructured 71 | markupsafe==2.1.2 72 | # via 73 | # jinja2 74 | # nbconvert 75 | mistune==2.0.5 76 | # via nbconvert 77 | mypy==1.3.0 78 | # via unstructured-api-tools 79 | mypy-extensions==1.0.0 80 | # via mypy 81 | nbclient==0.8.0 82 | # via nbconvert 83 | nbconvert==7.4.0 84 | # via unstructured-api-tools 85 | nbformat==5.9.0 86 | # via 87 | # nbclient 88 | # nbconvert 89 | nltk==3.8.1 90 | # via unstructured 91 | numpy==1.24.3 92 | # via 93 | # -r requirements/base.in 94 | # 
scikit-learn 95 | # scipy 96 | packaging==23.1 97 | # via 98 | # -r requirements/base.in 99 | # nbconvert 100 | pandocfilters==1.5.0 101 | # via nbconvert 102 | pkgutil-resolve-name==1.3.10 103 | # via jsonschema 104 | platformdirs==3.5.1 105 | # via jupyter-core 106 | pydantic==1.10.8 107 | # via fastapi 108 | pygments==2.15.1 109 | # via nbconvert 110 | pyrsistent==0.19.3 111 | # via jsonschema 112 | python-dateutil==2.8.2 113 | # via jupyter-client 114 | python-dotenv==1.0.0 115 | # via uvicorn 116 | python-multipart==0.0.6 117 | # via unstructured-api-tools 118 | pyyaml==6.0 119 | # via uvicorn 120 | pyzmq==25.1.0 121 | # via jupyter-client 122 | ratelimit==2.2.1 123 | # via -r requirements/base.in 124 | regex==2023.5.5 125 | # via nltk 126 | requests==2.31.0 127 | # via -r requirements/base.in 128 | scikit-learn==1.2.2 129 | # via -r requirements/base.in 130 | scipy==1.10.1 131 | # via scikit-learn 132 | six==1.16.0 133 | # via 134 | # bleach 135 | # python-dateutil 136 | sniffio==1.3.0 137 | # via anyio 138 | soupsieve==2.4.1 139 | # via beautifulsoup4 140 | starlette==0.27.0 141 | # via fastapi 142 | threadpoolctl==3.1.0 143 | # via scikit-learn 144 | tinycss2==1.2.1 145 | # via nbconvert 146 | tomli==2.0.1 147 | # via mypy 148 | tornado==6.3.2 149 | # via jupyter-client 150 | tqdm==4.65.0 151 | # via nltk 152 | traitlets==5.9.0 153 | # via 154 | # -r requirements/base.in 155 | # jupyter-client 156 | # jupyter-core 157 | # nbclient 158 | # nbconvert 159 | # nbformat 160 | types-requests==2.31.0.1 161 | # via unstructured-api-tools 162 | types-ujson==5.7.0.5 163 | # via unstructured-api-tools 164 | types-urllib3==1.26.25.13 165 | # via types-requests 166 | typing-extensions==4.6.3 167 | # via 168 | # mypy 169 | # pydantic 170 | # starlette 171 | unstructured==0.2.5 172 | # via -r requirements/base.in 173 | unstructured-api-tools==0.10.6 174 | # via -r requirements/base.in 175 | urllib3==2.0.2 176 | # via requests 177 | uvicorn[standard]==0.22.0 178 | # via unstructured-api-tools 179 | uvloop==0.17.0 180 | # via uvicorn 181 | watchfiles==0.19.0 182 | # via uvicorn 183 | webencodings==0.5.1 184 | # via 185 | # bleach 186 | # tinycss2 187 | websockets==11.0.3 188 | # via uvicorn 189 | zipp==3.15.0 190 | # via 191 | # importlib-metadata 192 | # importlib-resources 193 | -------------------------------------------------------------------------------- /test_utils/get_sec_docs_from_edgar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads example SEC filings from the SEC EDGAR API as specified by examples.json. 3 | Not normally intended to be called by users as it hits EDGAR directly. 4 | Filings for testing/CI instead will be downloaded from s3. 
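Requires the SEC_API_ORGANIZATION and SEC_API_EMAIL environment variables, which EDGAR uses to identify the caller (see the check under __main__ below).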
5 | """ 6 | import json 7 | import os 8 | import re 9 | from pathlib import Path 10 | 11 | 12 | from prepline_sec_filings.fetch import ( 13 | get_filing, 14 | get_recent_acc_by_cik, 15 | get_recent_cik_and_acc_by_ticker, 16 | ) 17 | 18 | 19 | SEC_DOCS_DIR = os.environ.get("SEC_DOCS_DIR", "sample-docs") 20 | SEC_API_ORGANIZATION = os.environ.get("SEC_API_ORGANIZATION") 21 | SEC_API_EMAIL = os.environ.get("SEC_API_EMAIL") 22 | # only 1 of these 2 manifests types determines what gets downloaded 23 | FILINGS_MANIFEST_JSON = os.path.join("test_utils", "examples.json") 24 | FILINGS_MANIFEST_FILE = os.environ.get("FILINGS_MANIFEST_FILE") 25 | 26 | 27 | def fetch_filing_xbrl(ticker, form_type, cik, accession_number, skip_fetch_if_file_exists=True): 28 | "Fetch a single filing from edgar and write it to $SEC_DOCS_DIR" 29 | _doc_name = f"{ticker}-{form_type}-{cik}-{accession_number}.xbrl".replace("/", "") 30 | sec_doc_filename = os.path.join(SEC_DOCS_DIR, _doc_name) 31 | if skip_fetch_if_file_exists and Path(sec_doc_filename).is_file(): 32 | print(f"skipping download since {sec_doc_filename} exists") 33 | return 34 | 35 | text = get_filing(cik, accession_number, SEC_API_ORGANIZATION, SEC_API_EMAIL) 36 | with open(sec_doc_filename, "w+") as f: 37 | f.write(text) 38 | 39 | 40 | def parse_examples_json(): 41 | with open(FILINGS_MANIFEST_JSON, "r") as f: 42 | manifest_json_obj = json.load(f) 43 | return manifest_json_obj 44 | 45 | 46 | def parse_manifest_text_file(): 47 | ticker_form_type_pairs = [] 48 | with open(FILINGS_MANIFEST_FILE, "r") as f: 49 | for line in f.readlines(): 50 | line = line.strip() 51 | if line and not line.startswith("#"): 52 | m = re.match(r"(\w+)\s+(\S+)\s*", line) 53 | ticker_form_type_pairs.append(m.groups()) 54 | return ticker_form_type_pairs 55 | 56 | 57 | def fetch_filings(manifest_json_obj): 58 | """Given json like: 59 | { 60 | "mmm": { 61 | "cik": "66740", 62 | "forms": { 63 | "10-Q": "000006674022000065" 64 | } 65 | }, 66 | download the indicated xbrl documents from edgar. 
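Filings already present in $SEC_DOCS_DIR are skipped via fetch_filing_xbrl's skip_fetch_if_file_exists flag.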
67 | """ 68 | for ticker, filing_info in manifest_json_obj.items(): 69 | cik = filing_info["cik"] 70 | for form_type, accession_number in filing_info["forms"].items(): 71 | fetch_filing_xbrl(ticker, form_type, cik, accession_number) 72 | print(f"fetched {ticker}") 73 | 74 | 75 | def get_sample_docs(): 76 | """Fetch filings from edgar ultimately to be used for 'make test-sample-docs'.""" 77 | fetch_filings(parse_examples_json()) 78 | 79 | 80 | def _add_to_manifest_json_obj(manifest_json_obj, ticker, form_type, cik, acc_num): 81 | if ticker not in manifest_json_obj: 82 | manifest_json_obj[ticker] = {"forms": {}} 83 | if "cik" in manifest_json_obj[ticker]: 84 | assert manifest_json_obj[ticker]["cik"] == cik 85 | else: 86 | manifest_json_obj[ticker]["cik"] = cik 87 | manifest_json_obj[ticker]["forms"][form_type] = acc_num 88 | 89 | 90 | def get_latest_docs(): 91 | """Fetch filings from edgar, but unlike get_sample_docs() the 92 | accession_number and cik that correspond to the most recent filing are 93 | determined at runtime.""" 94 | 95 | manifest_json_obj = {} 96 | for ticker_or_cik, _form_type in parse_manifest_text_file(): 97 | ticker_or_cik = ticker_or_cik.lower() 98 | _form_type = _form_type.upper() # just following the convention :) 99 | print(f"{ticker_or_cik}-{_form_type}...", end="", flush=True) 100 | if re.search(r"^\d+$", ticker_or_cik): 101 | cik = ticker_or_cik 102 | acc_num, form_type = get_recent_acc_by_cik(cik, _form_type) 103 | else: 104 | ticker = ticker_or_cik 105 | cik, acc_num, form_type = get_recent_cik_and_acc_by_ticker(ticker, _form_type) 106 | _add_to_manifest_json_obj(manifest_json_obj, ticker_or_cik, form_type, cik, acc_num) 107 | fetch_filing_xbrl(ticker_or_cik, form_type, cik, acc_num) 108 | 109 | with open(os.path.join(SEC_DOCS_DIR, "sec_docs_manifest.json"), "w") as f: 110 | json.dump(manifest_json_obj, f, indent=2) 111 | 112 | 113 | if __name__ == "__main__": 114 | if SEC_API_ORGANIZATION is None or SEC_API_EMAIL is None: 115 | raise RuntimeError( 116 | "Environment variables SEC_API_ORGANIZATION and SEC_API_EMAIL " 117 | "must be set for SEC EDGAR API call (allows them to identify the consumer)" 118 | ) 119 | Path(SEC_DOCS_DIR).mkdir(exist_ok=True) 120 | 121 | if not FILINGS_MANIFEST_FILE: 122 | # documents related to python tests in test_real_docs/ 123 | print("env var FILINGS_MANIFEST_FILE not defined, fetching docs for python tests") 124 | get_sample_docs() 125 | else: 126 | # pull latest filings in FILINGS_MANIFEST_FILE for reasons best known to the user 127 | get_latest_docs() 128 | -------------------------------------------------------------------------------- /test_utils/create_validation_csv_files.py: -------------------------------------------------------------------------------- 1 | """Given an $SEC_DOCS_DIR with a sec_docs_manifest.json file, create 2 | a CSV with all extracted sections, one row per section.""" 3 | import json 4 | import os 5 | import subprocess 6 | from pathlib import Path 7 | import time 8 | 9 | import pandas as pd 10 | 11 | 12 | from prepline_sec_filings.fetch import archive_url 13 | from prepline_sec_filings.sections import SECTIONS_10K, SECTIONS_10Q, SECTIONS_S1 14 | from prepline_sec_filings.sec_document import SECDocument 15 | from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path 16 | 17 | 18 | SEC_DOCS_DIR = os.environ.get("SEC_DOCS_DIR") 19 | CSV_FILES_DIR = os.environ.get("CSV_FILES_DIR") 20 | FILINGS_MANIFEST_JSON = os.environ.get( 21 | "FILINGS_MANIFEST_JSON", os.path.join(SEC_DOCS_DIR,
"sec_docs_manifest.json") 22 | ) 23 | PIPELINE_SECTION_API_URL = os.environ.get( 24 | "PIPELINE_SECTION_API_URL", f"http://127.0.0.1:8000{get_pipeline_path('section')}" 25 | ) 26 | 27 | 28 | def _fetch_response_from_api_curl(sec_doc_filename): 29 | time.sleep(1) 30 | command = [ 31 | "curl", 32 | "-s", 33 | f"{PIPELINE_SECTION_API_URL}", 34 | "-H", 35 | "Accept: application/json", 36 | "-H", 37 | "Content-Type: multipart/form-data", 38 | "-F", 39 | f"file=@{sec_doc_filename}", 40 | "-F", 41 | "section=_ALL", 42 | ] 43 | proc = subprocess.run(command, capture_output=True) 44 | 45 | resp_data = {} 46 | if proc.returncode != 0: 47 | print(f"Failed to get results for {sec_doc_filename}", flush=True) 48 | print(proc.stderr) 49 | else: 50 | try: 51 | resp_data = json.loads(proc.stdout.decode("utf-8")) 52 | if "error" in resp_data: 53 | print(f"Error in response for api for {sec_doc_filename}", flush=True) 54 | print(resp_data) 55 | resp_data = {} 56 | except json.decoder.JSONDecodeError: 57 | print(f"failed to create json obj from the response for {command}") 58 | return resp_data 59 | 60 | 61 | def parse_manifest_json(): 62 | with open(FILINGS_MANIFEST_JSON, "r") as f: 63 | manifest_json_obj = json.load(f) 64 | return manifest_json_obj 65 | 66 | 67 | def _bookkeeping_info(keys, values, ticker_or_cik, cik, acc_num): 68 | """Add convenience lookup keys/values to row.""" 69 | keys.append("url_for_xbrl") 70 | values.append(archive_url(cik, acc_num)) 71 | keys.append("url_for_all_filings") 72 | values.append(f"https://www.sec.gov/edgar/browse/?CIK={cik}") 73 | keys.append("identifier") 74 | values.append(ticker_or_cik) 75 | 76 | 77 | def _csv_filename(ticker_or_cik, form_type, cik, acc_num): 78 | return os.path.join( 79 | CSV_FILES_DIR, f"{ticker_or_cik}-{form_type}-{cik}-{acc_num}.csv".replace("/", "") 80 | ) 81 | 82 | 83 | def _write_csv(keys, values, ticker_or_cik, form_type, cik, acc_num): 84 | df = pd.DataFrame({"key": pd.Series(keys), "value": pd.Series(values)}) 85 | df.to_csv( 86 | _csv_filename(ticker_or_cik, form_type, cik, acc_num), 87 | sep="\t", 88 | encoding="utf-8", 89 | index=False, 90 | ) 91 | 92 | 93 | def gen_csv(sec_doc_filename, ticker_or_cik, form_type, cik, acc_num): 94 | keys = [] 95 | values = [] 96 | 97 | _bookkeeping_info(keys, values, ticker_or_cik, cik, acc_num) 98 | resp_data = _fetch_response_from_api_curl(sec_doc_filename) 99 | if not resp_data: 100 | return 101 | for _key, _value in resp_data.items(): 102 | keys.append(_key) 103 | values.append("\n".join([elem["text"] for elem in _value])) 104 | _write_csv(keys, values, ticker_or_cik, form_type, cik, acc_num) 105 | 106 | 107 | def _gen_csv_no_api(filing_file_handle, ticker_or_cik, form_type, cik, acc_num): 108 | keys = [] 109 | values = [] 110 | filing_content = filing_file_handle.read() 111 | 112 | _bookkeeping_info(keys, values, ticker_or_cik, cik, acc_num) 113 | 114 | sec_document = SECDocument.from_string(filing_content) 115 | if "K" in form_type: 116 | sections = SECTIONS_10K 117 | elif "Q" in form_type: 118 | sections = SECTIONS_10Q 119 | else: 120 | sections = SECTIONS_S1 121 | 122 | for section in sections: 123 | print(section) 124 | result = "\n".join([str(elem) for elem in sec_document.get_section_narrative(section)]) 125 | keys.append(section.name) 126 | values.append(result) 127 | _write_csv(keys, values, ticker_or_cik, form_type, cik, acc_num) 128 | 129 | 130 | def gen_csvs(manifest_json_obj): 131 | """create CSVs given a manifest_json_obj which looks like: 132 | { 133 | "mmm": { 134 | "cik": "66740", 
135 | "forms": { 136 | "10-Q": "000006674022000065" 137 | } 138 | }, 139 | "0001156784": { 140 | "forms": { 141 | "S-1/A": "000149315222026129" 142 | }, 143 | "cik": "0001156784" 144 | }, 145 | """ 146 | Path(CSV_FILES_DIR).mkdir(exist_ok=True) 147 | 148 | for ticker_or_cik in manifest_json_obj: 149 | cik = manifest_json_obj[ticker_or_cik]["cik"] 150 | for form_type in manifest_json_obj[ticker_or_cik]["forms"]: 151 | acc_num = manifest_json_obj[ticker_or_cik]["forms"][form_type] 152 | no_dir_filename = f"{ticker_or_cik}-{form_type}-{cik}-{acc_num}.xbrl".replace("/", "") 153 | sec_doc_filename = os.path.join(SEC_DOCS_DIR, no_dir_filename) 154 | csv_filename = _csv_filename(ticker_or_cik, form_type, cik, acc_num) 155 | if os.path.exists(csv_filename) and os.path.getsize(csv_filename) > 0: 156 | print(f"skipping api call for existing csv: {sec_doc_filename}", flush=True) 157 | continue 158 | print(f"{ticker_or_cik}", flush=True) 159 | gen_csv(sec_doc_filename, ticker_or_cik, form_type, cik, acc_num) 160 | 161 | 162 | if __name__ == "__main__": 163 | if SEC_DOCS_DIR is None or CSV_FILES_DIR is None: 164 | raise RuntimeError("Environment vaiables SEC_DOCS_DIR and CSV_FILES_DIR must be set.") 165 | gen_csvs(parse_manifest_json()) 166 | -------------------------------------------------------------------------------- /scripts/check-and-format-notebooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from copy import deepcopy 5 | import difflib 6 | import json 7 | from pathlib import Path 8 | import sys 9 | from typing import List, Tuple, Union 10 | 11 | from nbdev import clean 12 | from nbconvert.preprocessors import ExecutePreprocessor 13 | import nbformat 14 | from unstructured_api_tools.pipelines.convert import read_notebook 15 | 16 | 17 | def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode: 18 | """Execute cells in nb using working_dir as the working directory for imports, modifying the 19 | notebook in place (in memory).""" 20 | # Clear existing outputs before executing the notebook 21 | for cell in nb.cells: 22 | if cell.cell_type == "code": 23 | cell.outputs = [] 24 | ep = ExecutePreprocessor(timeout=600) 25 | ep.preprocess(nb, {"metadata": {"path": working_dir}}) 26 | # Merge adjacent text outputs after executing the notebook 27 | for cell in nb.cells: 28 | merge_adjacent_text_outputs(cell) 29 | return nb 30 | 31 | def merge_adjacent_text_outputs(cell: nbformat.NotebookNode) -> nbformat.NotebookNode: 32 | """Merges adjacent text stream outputs to avoid non-deterministic splits in output.""" 33 | if cell.cell_type != "code": 34 | return cell 35 | 36 | new_outputs = [] 37 | current_output = None 38 | 39 | for output in cell.outputs: 40 | if output.output_type == "stream": 41 | if current_output is None: 42 | current_output = output 43 | elif current_output.name == output.name: 44 | current_output.text += output.text 45 | else: 46 | new_outputs.append(current_output) 47 | current_output = output 48 | else: 49 | if current_output is not None: 50 | new_outputs.append(current_output) 51 | current_output = None 52 | new_outputs.append(output) 53 | 54 | if current_output is not None: 55 | new_outputs.append(current_output) 56 | 57 | cell.outputs = new_outputs 58 | return cell 59 | 60 | def nb_paths(root_path: Union[str, Path]) -> List[Path]: 61 | """Fetches all .ipynb filenames that belong to subdirectories of root_path (1 level deep) with 62 | 'notebooks' in 
the name.""" 63 | root_path = Path(root_path) 64 | return [ 65 | fn 66 | for dir in root_path.iterdir() 67 | # NOTE(alan): Search only in paths with 'notebooks' in the title such as pipeline-notebooks 68 | # and exploration-notebooks 69 | if "notebooks" in dir.stem and dir.is_dir() 70 | for fn in dir.iterdir() 71 | if fn.suffix == ".ipynb" 72 | ] 73 | 74 | 75 | def to_results_str(fns: List[Path], nonmatching_nbs: List[Path]) -> Tuple[str, str]: 76 | """Given files that were checked and list of files that would be changed, produces a summary of 77 | changes as well as a list of files to be changed""" 78 | unchanged = len(fns) - len(nonmatching_nbs) 79 | results = [] 80 | if nonmatching_nbs: 81 | results.append( 82 | f"{len(nonmatching_nbs)} " 83 | f"{'file' if len(nonmatching_nbs) == 1 else 'files'} " 84 | f"{'would be ' if check else ''}changed" 85 | ) 86 | if unchanged: 87 | results.append( 88 | f"{unchanged} " 89 | f"{'file' if unchanged == 1 else 'files'} " 90 | f"{'would be ' if check else ''}left unchanged" 91 | ) 92 | summary_str = ", ".join(results) + ".\n" 93 | if nonmatching_nbs: 94 | details_str = ( 95 | f"The following notebooks {'would have been' if check else 'were'} " 96 | "changed when executed and cleaned:\n* " + "\n* ".join(nonmatching_nbs) + "\n" 97 | ) 98 | else: 99 | details_str = "" 100 | 101 | return summary_str, details_str 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument( 107 | "--check", 108 | default=False, 109 | action="store_true", 110 | help="Check notebook format without making changes. Return code 0 means formatting would " 111 | "produce no changes. Return code 1 means some files would be changed.", 112 | ) 113 | parser.add_argument( 114 | "notebooks", 115 | metavar="notebook", 116 | nargs="*", 117 | help="Path(s) to notebook(s) to format (or check). 
If you don't pass any paths, " 118 | "notebooks in any subfolders with 'notebooks' in the name will be processed.", 119 | default=[], 120 | ) 121 | args = parser.parse_args() 122 | check = args.check 123 | notebooks = args.notebooks 124 | 125 | root_path = Path(__file__).parent.parent 126 | nonmatching_nbs = [] 127 | fns = notebooks if notebooks else nb_paths(root_path) 128 | for fn in fns: 129 | print(f"{'checking' if check else 'processing'} {fn}") 130 | nb = read_notebook(fn) 131 | modified_nb = deepcopy(nb) 132 | process_nb(modified_nb, root_path) 133 | clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"]) 134 | if nb != modified_nb: 135 | nonmatching_nbs.append(str(fn)) 136 | nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True) 137 | modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True) 138 | sys.stderr.write(f"The following diff shows the modifications made to {fn}\n") 139 | sys.stderr.writelines( 140 | ( 141 | difflib.unified_diff( 142 | nb_json.splitlines(keepends=True), 143 | modified_nb_json.splitlines(keepends=True), 144 | ) 145 | ) 146 | ) 147 | if not check: 148 | nbformat.write(modified_nb, fn) 149 | 150 | summary_str, details_str = to_results_str(fns, nonmatching_nbs) 151 | print(summary_str) 152 | if check: 153 | sys.stderr.write(details_str) 154 | if nonmatching_nbs: 155 | sys.exit(1) 156 | else: 157 | print(details_str) 158 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.' 13 | } 14 | 15 | function getopts-extra () { 16 | declare i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | i+=1 21 | OPTIND+=1 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." 
>&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 
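# NOTE: illustrative invocation (mirrors the Makefile's check-version target):
#   scripts/version-sync.sh -c \
#     -s CHANGELOG.md \
#     -f README.md api-release \
#     -f preprocessing-pipeline-family.yaml release
# With -c the script only reports the changes it would make and exits nonzero
# when a file is out of sync; without -c the files are rewritten in place.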
150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /prepline_sec_filings/sections.py: -------------------------------------------------------------------------------- 1 | """Module for defining/enumerating the common sections from SEC forms""" 2 | from enum import Enum 3 | import re 4 | from typing import List 5 | 6 | 7 | class SECSection(Enum): 8 | PROSPECTUS_SUMMARY = re.compile(r"^(?:prospectus )?summary$") 9 | ABOUT_PROSPECTUS = re.compile(r"about this prospectus") 10 | FORWARD_LOOKING_STATEMENTS = re.compile(r"forward[ -]looking statements") 11 | RISK_FACTORS = re.compile(r"risk factors") 12 | USE_OF_PROCEEDS = re.compile(r"use of proceeds") 13 | DIVIDEND_POLICY = re.compile(r"^dividend policy") 14 | CAPITALIZATION = re.compile(r"^capitalization$") 15 | DILUTION = re.compile(r"^dilution$") 16 | MANAGEMENT_DISCUSSION = re.compile(r"^management(?:[\u2019']s)? discussion") 17 | BUSINESS = re.compile(r"^business$") 18 | MANAGEMENT = re.compile(r"^(?:(?:our )?management)|(?:executive officers)$") 19 | COMPENSATION = re.compile(r"compensation") 20 | RELATED_PARTY_TRANSACTIONS = re.compile(r"(?:relationships|related).*transactions") 21 | PRINCIPAL_STOCKHOLDERS = re.compile( 22 | r"(?:principal.*(?:stockholder|shareholder)s?)|(?:(security|stock|share) " 23 | r"ownership .*certain)" 24 | ) 25 | DESCRIPTION_OF_STOCK = re.compile(r"^description of (?:capital stock|share capital|securities)") 26 | DESCRIPTION_OF_DEBT = re.compile(r"^description of .*debt") 27 | FUTURE_SALE = re.compile(r"(?:shares|stock) eligible for future sale") 28 | US_TAX = re.compile( 29 | r"(?:us|u\.s\.|united states|material federal).* tax (?:consideration|consequence)" 30 | ) 31 | UNDERWRITING = re.compile(r"underwrit") 32 | LEGAL_MATTERS = re.compile(r"legal matters") 33 | EXPERTS = re.compile(r"^experts$") 34 | MORE_INFORMATION = re.compile(r"(?:additional|more) information") 35 | FINANCIAL_STATEMENTS = r"financial statements" 36 | MARKET_RISK_DISCLOSURES = r"(?:quantitative|qualitative) disclosures? about market risk" 37 | CONTROLS_AND_PROCEDURES = r"controls and procedures" 38 | LEGAL_PROCEEDINGS = r"legal proceedings" 39 | DEFAULTS = r"defaults (?:up)?on .*securities" 40 | MINE_SAFETY = r"mine safety disclosures?" 41 | OTHER_INFORMATION = r"other information" 42 | UNRESOLVED_STAFF_COMMENTS = r"unresolved staff comments" 43 | PROPERTIES = r"^properties$" 44 | MARKET_FOR_REGISTRANT_COMMON_EQUITY = ( 45 | r"market for(?: the)? (?:registrant|company)(?:['\u2019]s)? common equity" 46 | ) 47 | ACCOUNTING_DISAGREEMENTS = r"disagreements with accountants" 48 | FOREIGN_JURISDICTIONS = r"disclosure .*foreign jurisdictions .*inspection" 49 | EXECUTIVE_OFFICERS = r"executive officers" 50 | ACCOUNTING_FEES = r"accounting fees" 51 | EXHIBITS = r"^exhibits?(.*financial statement schedules)?$" 52 | FORM_SUMMARY = r"^form .*summary$" 53 | # NOTE(yuming): Additional section titles used in test_real_examples.py, 54 | # maybe change this when custom regex string param is allowed.
55 | CERTAIN_TRADEMARKS = r"certain trademarks" 56 | OFFER_PRICE = r"(?:determination of )offering price" 57 | 58 | @property 59 | def pattern(self): 60 | return self.value 61 | 62 | 63 | ALL_SECTIONS = "_ALL" 64 | 65 | section_string_to_enum = {enum.name: enum for enum in SECSection} 66 | 67 | # NOTE(robinson) - Sections are listed in the following document from SEC 68 | # ref: https://www.sec.gov/files/form10-k.pdf 69 | SECTIONS_10K = ( 70 | SECSection.BUSINESS, # ITEM 1 71 | SECSection.RISK_FACTORS, # ITEM 1A 72 | SECSection.UNRESOLVED_STAFF_COMMENTS, # ITEM 1B 73 | SECSection.PROPERTIES, # ITEM 2 74 | SECSection.LEGAL_PROCEEDINGS, # ITEM 3 75 | SECSection.MINE_SAFETY, # ITEM 4 76 | SECSection.MARKET_FOR_REGISTRANT_COMMON_EQUITY, # ITEM 5 77 | # NOTE(robinson) - ITEM 6 is "RESERVED" 78 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 7 79 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 7A 80 | SECSection.FINANCIAL_STATEMENTS, # ITEM 8 81 | SECSection.ACCOUNTING_DISAGREEMENTS, # ITEM 9 82 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 9A 83 | # NOTE(robinson) - ITEM 9B is other information 84 | SECSection.FOREIGN_JURISDICTIONS, # ITEM 9C 85 | SECSection.MANAGEMENT, # ITEM 10 86 | SECSection.COMPENSATION, # ITEM 11 87 | SECSection.PRINCIPAL_STOCKHOLDERS, # ITEM 12 88 | SECSection.RELATED_PARTY_TRANSACTIONS, # ITEM 13 89 | SECSection.ACCOUNTING_FEES, # ITEM 14 90 | SECSection.EXHIBITS, # ITEM 15 91 | SECSection.FORM_SUMMARY, # ITEM 16 92 | ) 93 | 94 | # NOTE(robinson) - Sections are listed in the following document from SEC 95 | # ref: https://www.sec.gov/files/form10-q.pdf 96 | SECTIONS_10Q = ( 97 | # Part I - Financial information 98 | SECSection.FINANCIAL_STATEMENTS, # ITEM 1 99 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 2 100 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 3 101 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 4 102 | # Part II - Other information 103 | SECSection.LEGAL_PROCEEDINGS, # ITEM 1 104 | SECSection.RISK_FACTORS, # ITEM 1A 105 | SECSection.USE_OF_PROCEEDS, # ITEM 2 106 | SECSection.DEFAULTS, # ITEM 3 107 | SECSection.MINE_SAFETY, # ITEM 4 108 | SECSection.OTHER_INFORMATION, # ITEM 5 109 | ) 110 | 111 | SECTIONS_S1 = ( 112 | SECSection.PROSPECTUS_SUMMARY, 113 | SECSection.ABOUT_PROSPECTUS, 114 | SECSection.FORWARD_LOOKING_STATEMENTS, 115 | SECSection.RISK_FACTORS, 116 | SECSection.USE_OF_PROCEEDS, 117 | SECSection.DIVIDEND_POLICY, 118 | SECSection.CAPITALIZATION, 119 | SECSection.DILUTION, 120 | SECSection.MANAGEMENT_DISCUSSION, 121 | SECSection.BUSINESS, 122 | SECSection.MANAGEMENT, 123 | SECSection.COMPENSATION, 124 | SECSection.RELATED_PARTY_TRANSACTIONS, 125 | SECSection.PRINCIPAL_STOCKHOLDERS, 126 | SECSection.DESCRIPTION_OF_STOCK, 127 | SECSection.DESCRIPTION_OF_DEBT, 128 | SECSection.FUTURE_SALE, 129 | SECSection.US_TAX, 130 | SECSection.UNDERWRITING, 131 | SECSection.LEGAL_MATTERS, 132 | SECSection.EXPERTS, 133 | SECSection.MORE_INFORMATION, 134 | ) 135 | 136 | 137 | def validate_section_names(section_names: List[str]): 138 | """Raise a ValueError for section names that don't correspond to a defined enum.""" 139 | if len(section_names) == 1 and section_names[0] == ALL_SECTIONS: 140 | return None 141 | elif len(section_names) > 1 and ALL_SECTIONS in section_names: 142 | raise ValueError(f"{ALL_SECTIONS} may not be specified with other sections") 143 | 144 | invalid_names = [name for name in section_names if name not in section_string_to_enum] 145 | if invalid_names: 146 | raise ValueError(f"The following section names are not valid: {invalid_names}") 147 | return None 148 |
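149 | 
150 | 
151 | # NOTE: illustrative usage of the helpers above (hypothetical caller; the
152 | # section API accepts the same section-name strings, e.g. section=_ALL):
153 | #
154 | #   names = ["RISK_FACTORS", "USE_OF_PROCEEDS"]
155 | #   validate_section_names(names)  # raises ValueError on any unknown name
156 | #   sections = [section_string_to_enum[name] for name in names]
157 | #
158 | # Passing [ALL_SECTIONS] instead is also valid and selects every section.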
-------------------------------------------------------------------------------- /test_real_docs/test_real_examples.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from prepline_sec_filings.sec_document import SECDocument, clean_sec_text 8 | from unstructured.documents.html import HTMLListItem 9 | 10 | from prepline_sec_filings.sections import SECSection, section_string_to_enum 11 | 12 | DIRECTORY = Path(__file__).absolute().parent 13 | 14 | RISK_FACTOR_XFAILS = ["aig", "bgs"] 15 | 16 | 17 | with open(os.path.join("test_utils", "examples.json")) as f: 18 | examples = json.load(f) 19 | 20 | 21 | with open( 22 | os.path.join(DIRECTORY, "fixtures", "sample-first-last.json"), 23 | "r", 24 | ) as f: 25 | sample_first_last = json.load(f) 26 | 27 | 28 | @pytest.fixture(scope="module") 29 | def docs_all(): 30 | return {} 31 | 32 | 33 | @pytest.fixture 34 | def doc_elements(ticker, docs_all): 35 | if ticker not in docs_all: 36 | text = get_file_from_ticker(ticker) 37 | doc = SECDocument.from_string(text).doc_after_cleaners(skip_headers_and_footers=True) 38 | docs_all[ticker] = {} 39 | docs_all[ticker]["doc"] = doc 40 | docs_all[ticker]["elements"] = doc.elements 41 | return (docs_all[ticker]["doc"], docs_all[ticker]["elements"]) 42 | 43 | 44 | @pytest.fixture 45 | def xfail(ticker, section, first_or_last): 46 | if ticker in RISK_FACTOR_XFAILS: 47 | return True 48 | elif ticker == "cl" and section in [ 49 | SECSection.MANAGEMENT_DISCUSSION, 50 | SECSection.MARKET_RISK_DISCLOSURES, 51 | ]: 52 | return True 53 | elif ticker == "bc" and section == SECSection.USE_OF_PROCEEDS: 54 | return True 55 | elif ticker == "doc" and section == SECSection.OTHER_INFORMATION: 56 | return True 57 | elif ( 58 | ticker == "cvs" and section == SECSection.PRINCIPAL_STOCKHOLDERS and first_or_last == "last" 59 | ): 60 | return True 61 | # TODO(yuming): The issue of this xfail is the same as the one in core-241 62 | elif ticker == "ehc" and section == SECSection.BUSINESS: 63 | return True 64 | return False 65 | 66 | 67 | @pytest.fixture 68 | def risk_samples(): 69 | with open(os.path.join(os.path.dirname(__file__), "fixtures", "risk-samples.json"), "r") as f: 70 | out = json.load(f) 71 | return out 72 | 73 | 74 | def get_file_from_ticker(ticker): 75 | cik = examples[ticker]["cik"] 76 | formtype = next(iter(examples[ticker]["forms"])) 77 | accession_number = examples[ticker]["forms"][formtype] 78 | with open( 79 | os.path.join("sample-docs", f"{ticker}-{formtype}-{cik}-{accession_number}.xbrl") 80 | ) as f: 81 | out = f.read() 82 | return out 83 | 84 | 85 | @pytest.mark.parametrize("ticker", [ticker for ticker in examples]) 86 | def test_samples_found(ticker, risk_samples, doc_elements): 87 | samples = risk_samples[ticker] 88 | if ticker in ( 89 | "mmm", 90 | "aig", 91 | "rgld", 92 | "cri", 93 | "pepg", 94 | "ehc", 95 | "bj", 96 | "smtc", 97 | "bgs", 98 | "blco", 99 | ): 100 | pytest.xfail(reason="Need to re-examine test failure reasons") 101 | 102 | doc, _ = doc_elements 103 | parsed_risk_narratives = doc.get_risk_narrative() 104 | # The expected samples will be empty only when there is no risk factors section, so 105 | # the parsed narratives and samples to find should either both be empty or both be 106 | # populated. 
107 | assert bool(parsed_risk_narratives) == bool(samples) 108 | for sample in samples: 109 | assert any( 110 | ( 111 | # TODO(alan): Do cleaning directly in risk-samples.json and define cleaning 112 | # specifically for this test. 113 | clean_sec_text(sample) in clean_sec_text(risk_narrative.text) 114 | for risk_narrative in parsed_risk_narratives 115 | ) 116 | ) 117 | 118 | 119 | @pytest.mark.parametrize( 120 | "ticker, section, first_or_last", 121 | [ 122 | (ticker, section_string_to_enum[section], first_or_last) 123 | for ticker in sample_first_last 124 | for section in sample_first_last[ticker] 125 | for first_or_last in sample_first_last[ticker][section] 126 | ], 127 | ) 128 | def test_first_last(ticker, doc_elements, section, first_or_last, xfail): 129 | if xfail: 130 | pytest.xfail() 131 | doc, _ = doc_elements 132 | parsed_risk_narratives = doc.get_section_narrative(section) 133 | sample = sample_first_last[ticker][section.name][first_or_last] 134 | idx = 0 if first_or_last == "first" else -1 135 | assert clean_sec_text(parsed_risk_narratives[idx].text) == clean_sec_text(sample) 136 | 137 | 138 | def list_item_test_values(): 139 | list_item_count_file = os.path.join(DIRECTORY, "fixtures", "list-item-counts.json") 140 | with open(list_item_count_file, "r") as f: 141 | list_item_counts = json.load(f) 142 | 143 | list_item_content_file = os.path.join(DIRECTORY, "fixtures", "list-item-content.json") 144 | with open(list_item_content_file, "r") as f: 145 | list_item_content = json.load(f) 146 | 147 | list_item_tests = list() 148 | for ticker, count in list_item_counts.items(): 149 | content = list_item_content.get(ticker, None) 150 | list_item_tests.append((ticker, count, content)) 151 | 152 | return list_item_tests 153 | 154 | 155 | def check_first_list_item_section(section, expected_count, expected_content): 156 | count = 0 157 | in_list_item_section = False 158 | for i, element in enumerate(section): 159 | if not in_list_item_section and isinstance(element, HTMLListItem): 160 | in_list_item_section = True 161 | if expected_content: 162 | section_text = clean_sec_text(section[i].text) 163 | expected_text = clean_sec_text(expected_content[count]) 164 | assert section_text == expected_text 165 | count += 1 166 | elif in_list_item_section and isinstance(element, HTMLListItem): 167 | if expected_content: 168 | section_text = clean_sec_text(section[i].text) 169 | expected_text = clean_sec_text(expected_content[count]) 170 | assert section_text == expected_text 171 | count += 1 172 | elif in_list_item_section and not isinstance(element, HTMLListItem): 173 | return count 174 | 175 | assert count == expected_count 176 | 177 | return count 178 | 179 | 180 | @pytest.mark.parametrize("ticker, expected_count, expected_content", list_item_test_values()) 181 | def test_list_items(ticker, expected_count, expected_content): 182 | if ticker in RISK_FACTOR_XFAILS: 183 | pytest.xfail(reason="xfail for risk factor section. 
therefore can't count list items") 184 | text = get_file_from_ticker(ticker) 185 | doc = SECDocument.from_string(text) 186 | risk_section = doc.get_section_narrative(SECSection.RISK_FACTORS) 187 | check_first_list_item_section(risk_section, expected_count, expected_content) 188 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PIPELINE_FAMILY := sec-filings 2 | PIPELINE_PACKAGE := sec_filings 3 | PACKAGE_NAME := prepline_${PIPELINE_PACKAGE} 4 | PIP_VERSION := 23.1.2 5 | 6 | .PHONY: help 7 | help: Makefile 8 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 9 | 10 | 11 | ########### 12 | # Install # 13 | ########### 14 | 15 | ## install-base: installs minimum requirements to run the API 16 | .PHONY: install-base 17 | install-base: install-base-pip-packages install-nltk-models 18 | 19 | ## install: installs all test and dev requirements 20 | .PHONY: install 21 | install: install-base install-test install-dev 22 | 23 | .PHONY: install-base-pip-packages 24 | install-base-pip-packages: 25 | python3 -m pip install pip==${PIP_VERSION} 26 | pip install -r requirements/base.txt 27 | 28 | .PHONY: install-nltk-models 29 | install-nltk-models: 30 | python -c "import nltk; nltk.download('punkt')" 31 | python -c "import nltk; nltk.download('averaged_perceptron_tagger')" 32 | 33 | .PHONY: install-test 34 | install-test: 35 | pip install -r requirements/test.txt 36 | 37 | .PHONY: install-dev 38 | install-dev: 39 | pip install -r requirements/dev.txt 40 | 41 | .PHONY: install-ipython-kernel 42 | install-ipython-kernel: 43 | ipython kernel install --name "python3" --sys-prefix 44 | 45 | .PHONY: install-ci 46 | install-ci: install-base install-test install-ipython-kernel 47 | 48 | ## pip-compile: compiles all base/dev/test requirements 49 | .PHONY: pip-compile 50 | pip-compile: 51 | pip-compile --upgrade requirements/base.in 52 | pip-compile --upgrade requirements/dev.in 53 | pip-compile --upgrade requirements/test.in 54 | 55 | 56 | ######### 57 | # Build # 58 | ######### 59 | 60 | ## generate-api: generates the FastAPI python APIs from notebooks 61 | .PHONY: generate-api 62 | generate-api: 63 | PYTHONPATH=. 
unstructured_api_tools convert-pipeline-notebooks \ 64 | --input-directory ./pipeline-notebooks \ 65 | --output-directory ./${PACKAGE_NAME}/api 66 | 67 | 68 | ########## 69 | # Docker # 70 | ########## 71 | 72 | # Docker targets are provided for convenience only and are not required in a standard development environment 73 | 74 | # Note that the image has notebooks baked in, however the current working directory 75 | # is mounted under /home/notebook-user/local/ when the image is started with 76 | # docker-start-api or docker-start-jupyter 77 | 78 | .PHONY: docker-build 79 | docker-build: 80 | PIP_VERSION=${PIP_VERSION} PIPELINE_FAMILY=${PIPELINE_FAMILY} PIPELINE_PACKAGE=${PIPELINE_PACKAGE} ./scripts/docker-build.sh 81 | 82 | .PHONY: docker-start-api 83 | docker-start-api: 84 | docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest uvicorn ${PACKAGE_NAME}.api.app:app --log-config logger_config.yaml --host 0.0.0.0 --port 8000 85 | 86 | .PHONY: docker-start-jupyter 87 | docker-start-jupyter: 88 | docker run -p 8888:8888 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest jupyter-notebook --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password='' 89 | 90 | 91 | ######### 92 | # Local # 93 | ######### 94 | 95 | ## run-jupyter: starts jupyter notebook 96 | .PHONY: run-jupyter 97 | run-jupyter: 98 | PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) jupyter-notebook --NotebookApp.token='' --NotebookApp.password='' 99 | 100 | ## run-web-app: runs the FastAPI api with hot reloading 101 | .PHONY: run-web-app 102 | run-web-app: 103 | PYTHONPATH=. uvicorn ${PACKAGE_NAME}.api.app:app --log-config logger_config.yaml --reload 104 | 105 | 106 | ################# 107 | # Test and Lint # 108 | ################# 109 | 110 | ## test: runs core tests 111 | .PHONY: test 112 | test: 113 | PYTHONPATH=. pytest test_${PIPELINE_PACKAGE} --cov=${PACKAGE_NAME} --cov-report term-missing 114 | 115 | .PHONY: check-coverage 116 | check-coverage: 117 | coverage report --fail-under=93 118 | 119 | ## test-integration: runs integration tests 120 | .PHONY: test-integration 121 | test-integration: 122 | PYTHONPATH=. pytest test_${PIPELINE_PACKAGE}_integration 123 | 124 | ## test-sample-docs: runs the pipeline on a set of sample SEC documents 125 | .PHONY: test-sample-docs 126 | test-sample-docs: verify-artifacts 127 | PYTHONPATH=. pytest test_real_docs 128 | 129 | ## api-check: verifies auto-generated pipeline APIs match the existing ones 130 | .PHONY: api-check 131 | api-check: 132 | PYTHONPATH=. PACKAGE_NAME=${PACKAGE_NAME} ./scripts/test-doc-pipeline-apis-consistent.sh 133 | 134 | ## dl-test-artifacts: downloads external artifacts used for testing 135 | .PHONY: dl-test-artifacts 136 | dl-test-artifacts: 137 | wget -r -nH -O sample-docs/sample-sec-docs.tar.gz https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/sample-sec-docs/sample-sec-docs.tar.gz 138 | tar -xf sample-docs/sample-sec-docs.tar.gz -C sample-docs/ && rm sample-docs/sample-sec-docs.tar.gz 139 | $(MAKE) verify-artifacts 140 | 141 | .PHONY: verify-artifacts 142 | verify-artifacts: 143 | sha256sum --check --status sample-docs/sample-sec-docs.sha256 144 | 145 | .PHONY: dl-test-artifacts-source 146 | dl-test-artifacts-source: 147 | # Downloads directly from SEC website. Not normally needed, see script. 148 | PYTHONPATH=. 
python3 test_utils/get_sec_docs_from_edgar.py 149 | 150 | 151 | ## check: runs linters (includes tests) 152 | .PHONY: check 153 | check: check-src check-tests check-version 154 | 155 | ## check-src: runs linters (source only, no tests) 156 | .PHONY: check-src 157 | check-src: 158 | black --line-length 100 ${PACKAGE_NAME} --check --exclude ${PACKAGE_NAME}/api 159 | flake8 ${PACKAGE_NAME} 160 | mypy ${PACKAGE_NAME} --ignore-missing-imports --implicit-optional --install-types --non-interactive 161 | 162 | .PHONY: check-tests 163 | check-tests: 164 | black --line-length 100 test_${PIPELINE_PACKAGE} --check 165 | flake8 test_${PIPELINE_PACKAGE} 166 | black --line-length 100 test_${PIPELINE_PACKAGE}_integration --check 167 | flake8 test_${PIPELINE_PACKAGE}_integration 168 | black --line-length 100 test_real_docs --check 169 | flake8 test_real_docs 170 | black --line-length 100 test_utils --check 171 | flake8 test_utils 172 | 173 | ## check-scripts: run shellcheck 174 | .PHONY: check-scripts 175 | check-scripts: 176 | # Fail if any of these files have warnings 177 | scripts/shellcheck.sh 178 | 179 | ## check-version: run check to ensure version in CHANGELOG.md matches references in files 180 | .PHONY: check-version 181 | check-version: 182 | # Fail if syncing version would produce changes 183 | scripts/version-sync.sh -c \ 184 | -s CHANGELOG.md \ 185 | -f README.md api-release \ 186 | -f preprocessing-pipeline-family.yaml release \ 187 | -f exploration-notebooks/exploration-10q-amended.ipynb api-release 188 | 189 | ## check-notebooks: check that executing and cleaning notebooks doesn't produce changes 190 | .PHONY: check-notebooks 191 | check-notebooks: 192 | scripts/check-and-format-notebooks.py --check 193 | 194 | ## tidy: run black 195 | .PHONY: tidy 196 | tidy: 197 | black --line-length 100 ${PACKAGE_NAME} 198 | black --line-length 100 test_${PIPELINE_PACKAGE} 199 | black --line-length 100 test_${PIPELINE_PACKAGE}_integration 200 | black --line-length 100 test_real_docs 201 | black --line-length 100 test_utils 202 | 203 | ## tidy-notebooks: execute notebooks and remove metadata 204 | .PHONY: tidy-notebooks 205 | tidy-notebooks: 206 | scripts/check-and-format-notebooks.py 207 | 208 | ## version-sync: update references to version with most recent version from CHANGELOG.md 209 | .PHONY: version-sync 210 | version-sync: 211 | scripts/version-sync.sh \ 212 | -s CHANGELOG.md \ 213 | -f README.md api-release \ 214 | -f preprocessing-pipeline-family.yaml release \ 215 | -f exploration-notebooks/exploration-10q-amended.ipynb api-release 216 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile requirements/dev.in 6 | # 7 | anyio==3.7.0 8 | # via jupyter-server 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | argon2-cffi==21.3.0 14 | # via 15 | # jupyter-server 16 | # nbclassic 17 | # notebook 18 | argon2-cffi-bindings==21.2.0 19 | # via argon2-cffi 20 | arrow==1.2.3 21 | # via isoduration 22 | asttokens==2.2.1 23 | # via stack-data 24 | attrs==23.1.0 25 | # via jsonschema 26 | backcall==0.2.0 27 | # via ipython 28 | beautifulsoup4==4.12.2 29 | # via nbconvert 30 | black==23.3.0 31 | # via -r requirements/dev.in 32 | bleach==6.0.0 33 | # via nbconvert 34 | build==0.10.0 35 | # via pip-tools 36 | cffi==1.15.1 37 | # via 
argon2-cffi-bindings 38 | click==8.1.3 39 | # via 40 | # black 41 | # pip-tools 42 | comm==0.1.3 43 | # via ipykernel 44 | debugpy==1.6.7 45 | # via ipykernel 46 | decorator==5.1.1 47 | # via ipython 48 | defusedxml==0.7.1 49 | # via nbconvert 50 | exceptiongroup==1.1.1 51 | # via anyio 52 | executing==1.2.0 53 | # via stack-data 54 | fastjsonschema==2.17.1 55 | # via nbformat 56 | flake8==6.0.0 57 | # via -r requirements/dev.in 58 | fqdn==1.5.1 59 | # via jsonschema 60 | idna==3.4 61 | # via 62 | # anyio 63 | # jsonschema 64 | importlib-metadata==6.6.0 65 | # via 66 | # jupyter-client 67 | # nbconvert 68 | importlib-resources==5.12.0 69 | # via jsonschema 70 | ipykernel==6.23.1 71 | # via 72 | # ipywidgets 73 | # jupyter 74 | # jupyter-console 75 | # nbclassic 76 | # notebook 77 | # qtconsole 78 | ipython==8.8.0 79 | # via 80 | # -r requirements/dev.in 81 | # ipykernel 82 | # ipywidgets 83 | # jupyter-console 84 | ipython-genutils==0.2.0 85 | # via 86 | # nbclassic 87 | # notebook 88 | # qtconsole 89 | ipywidgets==8.0.6 90 | # via jupyter 91 | isoduration==20.11.0 92 | # via jsonschema 93 | jedi==0.18.2 94 | # via ipython 95 | jinja2==3.1.2 96 | # via 97 | # jupyter-server 98 | # nbclassic 99 | # nbconvert 100 | # notebook 101 | jsonpointer==2.3 102 | # via jsonschema 103 | jsonschema[format-nongpl]==4.17.3 104 | # via 105 | # jupyter-events 106 | # nbformat 107 | jupyter==1.0.0 108 | # via -r requirements/dev.in 109 | jupyter-client==8.2.0 110 | # via 111 | # ipykernel 112 | # jupyter-console 113 | # jupyter-server 114 | # nbclassic 115 | # nbclient 116 | # notebook 117 | # qtconsole 118 | jupyter-console==6.6.3 119 | # via jupyter 120 | jupyter-core==5.3.0 121 | # via 122 | # -r requirements/dev.in 123 | # ipykernel 124 | # jupyter-client 125 | # jupyter-console 126 | # jupyter-server 127 | # nbclassic 128 | # nbclient 129 | # nbconvert 130 | # nbformat 131 | # notebook 132 | # qtconsole 133 | jupyter-events==0.6.3 134 | # via jupyter-server 135 | jupyter-server==2.6.0 136 | # via 137 | # nbclassic 138 | # notebook-shim 139 | jupyter-server-terminals==0.4.4 140 | # via jupyter-server 141 | jupyterlab-pygments==0.2.2 142 | # via nbconvert 143 | jupyterlab-widgets==3.0.7 144 | # via ipywidgets 145 | markupsafe==2.1.2 146 | # via 147 | # jinja2 148 | # nbconvert 149 | matplotlib-inline==0.1.6 150 | # via 151 | # ipykernel 152 | # ipython 153 | mccabe==0.7.0 154 | # via flake8 155 | mistune==2.0.5 156 | # via nbconvert 157 | mypy==1.3.0 158 | # via -r requirements/dev.in 159 | mypy-extensions==1.0.0 160 | # via 161 | # black 162 | # mypy 163 | nbclassic==1.0.0 164 | # via notebook 165 | nbclient==0.8.0 166 | # via nbconvert 167 | nbconvert==7.4.0 168 | # via 169 | # jupyter 170 | # jupyter-server 171 | # nbclassic 172 | # notebook 173 | nbformat==5.9.0 174 | # via 175 | # jupyter-server 176 | # nbclassic 177 | # nbclient 178 | # nbconvert 179 | # notebook 180 | nest-asyncio==1.5.6 181 | # via 182 | # ipykernel 183 | # nbclassic 184 | # notebook 185 | notebook==6.5.4 186 | # via jupyter 187 | notebook-shim==0.2.3 188 | # via nbclassic 189 | overrides==7.3.1 190 | # via jupyter-server 191 | packaging==23.1 192 | # via 193 | # black 194 | # build 195 | # ipykernel 196 | # jupyter-server 197 | # nbconvert 198 | # qtconsole 199 | # qtpy 200 | pandocfilters==1.5.0 201 | # via nbconvert 202 | parso==0.8.3 203 | # via jedi 204 | pathspec==0.11.1 205 | # via black 206 | pexpect==4.8.0 207 | # via ipython 208 | pickleshare==0.7.5 209 | # via ipython 210 | pip-tools==6.13.0 211 | # via -r 
requirements/dev.in 212 | pkgutil-resolve-name==1.3.10 213 | # via jsonschema 214 | platformdirs==3.5.1 215 | # via 216 | # black 217 | # jupyter-core 218 | prometheus-client==0.17.0 219 | # via 220 | # jupyter-server 221 | # nbclassic 222 | # notebook 223 | prompt-toolkit==3.0.38 224 | # via 225 | # ipython 226 | # jupyter-console 227 | psutil==5.9.5 228 | # via ipykernel 229 | ptyprocess==0.7.0 230 | # via 231 | # pexpect 232 | # terminado 233 | pure-eval==0.2.2 234 | # via stack-data 235 | pycodestyle==2.10.0 236 | # via flake8 237 | pycparser==2.21 238 | # via cffi 239 | pyflakes==3.0.1 240 | # via flake8 241 | pygments==2.15.1 242 | # via 243 | # ipython 244 | # jupyter-console 245 | # nbconvert 246 | # qtconsole 247 | pyproject-hooks==1.0.0 248 | # via build 249 | pyrsistent==0.19.3 250 | # via jsonschema 251 | python-dateutil==2.8.2 252 | # via 253 | # arrow 254 | # jupyter-client 255 | python-json-logger==2.0.7 256 | # via jupyter-events 257 | pyyaml==6.0 258 | # via jupyter-events 259 | pyzmq==25.1.0 260 | # via 261 | # ipykernel 262 | # jupyter-client 263 | # jupyter-console 264 | # jupyter-server 265 | # nbclassic 266 | # notebook 267 | # qtconsole 268 | qtconsole==5.4.3 269 | # via jupyter 270 | qtpy==2.3.1 271 | # via qtconsole 272 | rfc3339-validator==0.1.4 273 | # via 274 | # jsonschema 275 | # jupyter-events 276 | rfc3986-validator==0.1.1 277 | # via 278 | # jsonschema 279 | # jupyter-events 280 | send2trash==1.8.2 281 | # via 282 | # jupyter-server 283 | # nbclassic 284 | # notebook 285 | six==1.16.0 286 | # via 287 | # asttokens 288 | # bleach 289 | # python-dateutil 290 | # rfc3339-validator 291 | sniffio==1.3.0 292 | # via anyio 293 | soupsieve==2.4.1 294 | # via beautifulsoup4 295 | stack-data==0.6.2 296 | # via ipython 297 | terminado==0.17.1 298 | # via 299 | # jupyter-server 300 | # jupyter-server-terminals 301 | # nbclassic 302 | # notebook 303 | tinycss2==1.2.1 304 | # via nbconvert 305 | tomli==2.0.1 306 | # via 307 | # black 308 | # build 309 | # mypy 310 | # pyproject-hooks 311 | tornado==6.3.2 312 | # via 313 | # ipykernel 314 | # jupyter-client 315 | # jupyter-server 316 | # nbclassic 317 | # notebook 318 | # terminado 319 | traitlets==5.9.0 320 | # via 321 | # comm 322 | # ipykernel 323 | # ipython 324 | # ipywidgets 325 | # jupyter-client 326 | # jupyter-console 327 | # jupyter-core 328 | # jupyter-events 329 | # jupyter-server 330 | # matplotlib-inline 331 | # nbclassic 332 | # nbclient 333 | # nbconvert 334 | # nbformat 335 | # notebook 336 | # qtconsole 337 | typing-extensions==4.6.3 338 | # via 339 | # black 340 | # mypy 341 | uri-template==1.2.0 342 | # via jsonschema 343 | wcwidth==0.2.6 344 | # via prompt-toolkit 345 | webcolors==1.13 346 | # via jsonschema 347 | webencodings==0.5.1 348 | # via 349 | # bleach 350 | # tinycss2 351 | websocket-client==1.5.2 352 | # via jupyter-server 353 | wheel==0.40.0 354 | # via pip-tools 355 | widgetsnbextension==4.0.7 356 | # via ipywidgets 357 | zipp==3.15.0 358 | # via 359 | # importlib-metadata 360 | # importlib-resources 361 | 362 | # The following packages are considered to be unsafe in a requirements file: 363 | # pip 364 | # setuptools 365 | -------------------------------------------------------------------------------- /test_sec_filings/test_fetch.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import webbrowser 4 | import requests 5 | from unittest import mock 6 | 7 | import pytest 8 | import prepline_sec_filings.fetch as fetch 9 | 10 
| 11 | response_content = { 12 | "filings": { 13 | "recent": { 14 | "accessionNumber": [ 15 | "1234567890-12-345678", 16 | "1234567890-12-345679", 17 | "1234567890-12-345680", 18 | "1234567890-12-345681", 19 | ], 20 | "form": ["10-K", "S-1", "10-K", "10-Q"], 21 | } 22 | } 23 | } 24 | 25 | 26 | class MockSession: 27 | def __init__(self): 28 | self.headers = dict() 29 | 30 | def get(self, url, **kwargs): 31 | if url.startswith(fetch.SEC_ARCHIVE_URL): 32 | if url.endswith("txt"): 33 | filename = url.split("/")[-1] 34 | return MockResponse(f"{filename}") 35 | elif url.endswith("html"): 36 | return MockResponse("") 37 | elif url.startswith(fetch.SEC_SEARCH_URL): 38 | return MockResponse("CIK=1234567890") 39 | elif url.startswith(fetch.SEC_SUBMISSIONS_URL): 40 | return MockResponse( 41 | "", 42 | content=json.dumps(response_content), 43 | ) 44 | else: 45 | raise ValueError 46 | 47 | 48 | class MockResponse: 49 | def __init__(self, text, content=None): 50 | self.text = text 51 | self.content = content 52 | 53 | def raise_for_status(self): 54 | pass 55 | 56 | 57 | def test_get_filing(monkeypatch): 58 | monkeypatch.setattr(requests, "Session", MockSession) 59 | filing = fetch.get_filing("949874", "000119312511215661", "Giant", "parker@giant.com") 60 | assert filing == "0001193125-11-215661.txt" 61 | 62 | 63 | def test_archive_url(): 64 | url = fetch.archive_url("949874", "000119312511215661") 65 | assert url == f"{fetch.SEC_ARCHIVE_URL}/949874/000119312511215661/0001193125-11-215661.txt" 66 | 67 | 68 | def test_add_dashes(): 69 | accession_number = fetch._add_dashes("000119312511215661") 70 | assert accession_number == "0001193125-11-215661" 71 | 72 | 73 | def test_drop_dashes(): 74 | accession_number = fetch._drop_dashes("0001193125-11-215661") 75 | assert accession_number == "000119312511215661" 76 | 77 | 78 | def test_get_session(monkeypatch): 79 | monkeypatch.setattr(requests, "Session", MockSession) 80 | session = fetch._get_session("Giant", "parker@giant.com") 81 | assert session.headers["User-Agent"] == "Giant parker@giant.com" 82 | 83 | 84 | @mock.patch.dict( 85 | os.environ, 86 | {"SEC_API_ORGANIZATION": "OtherOrg", "SEC_API_EMAIL": "person@otherorg.io"}, 87 | ) 88 | def test_get_session_default(monkeypatch): 89 | monkeypatch.setattr(requests, "Session", MockSession) 90 | session = fetch._get_session() 91 | assert session.headers["User-Agent"] == "OtherOrg person@otherorg.io" 92 | 93 | 94 | def test_get_cik_by_ticker(monkeypatch): 95 | monkeypatch.setattr(requests, "Session", MockSession) 96 | session = MockSession() 97 | cik = fetch.get_cik_by_ticker(session, "noice") 98 | assert cik == "1234567890" 99 | 100 | 101 | def test_get_forms_by_cik(monkeypatch): 102 | monkeypatch.setattr(requests, "Session", MockSession) 103 | session = MockSession() 104 | forms = fetch.get_forms_by_cik(session, "1234567890") 105 | assert forms["1234567890-12-345678"] == "10-K" 106 | assert forms["1234567890-12-345679"] == "S-1" 107 | assert forms["1234567890-12-345680"] == "10-K" 108 | assert forms["1234567890-12-345681"] == "10-Q" 109 | 110 | 111 | def test_get_recent_acc_num_by_cik(monkeypatch): 112 | monkeypatch.setattr(requests, "Session", MockSession) 113 | session = MockSession() 114 | assert fetch._get_recent_acc_num_by_cik(session, "1234567890", ["10-K"]) == ( 115 | "123456789012345678", 116 | "10-K", 117 | ) 118 | assert fetch._get_recent_acc_num_by_cik(session, "1234567890", ["S-1"]) == ( 119 | "123456789012345679", 120 | "S-1", 121 | ) 122 | assert fetch._get_recent_acc_num_by_cik(session, 
"1234567890", ["10-Q"]) == ( 123 | "123456789012345681", 124 | "10-Q", 125 | ) 126 | 127 | 128 | @pytest.mark.parametrize( 129 | "form_type, expected", 130 | [ 131 | ("10-K", "1234567890-12-345678.txt"), 132 | ("10-Q", "1234567890-12-345681.txt"), 133 | ("S-1", "1234567890-12-345679.txt"), 134 | ], 135 | ) 136 | def test_get_form_by_ticker(monkeypatch, form_type, expected): 137 | monkeypatch.setattr(requests, "Session", MockSession) 138 | assert ( 139 | fetch.get_form_by_ticker("1234567890", form_type, company="Giant", email="parker@giant.com") 140 | == expected 141 | ) 142 | 143 | 144 | @pytest.mark.parametrize( 145 | "form_type, expected", 146 | [ 147 | ( 148 | "10-K", 149 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345678/" 150 | "1234567890-12-345678-index.html", 151 | ), 152 | ( 153 | "10-Q", 154 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345681/" 155 | "1234567890-12-345681-index.html", 156 | ), 157 | ( 158 | "S-1", 159 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345679/" 160 | "1234567890-12-345679-index.html", 161 | ), 162 | ], 163 | ) 164 | @mock.patch("webbrowser.open_new_tab") 165 | @mock.patch("requests.Session", MockSession) 166 | def test_open_form_by_ticker(monkeypatch, form_type, expected): 167 | fetch.open_form_by_ticker("noice", form_type, False, company="Giant", email="parker@giant.com") 168 | webbrowser.open_new_tab.assert_called_once_with(expected) 169 | 170 | 171 | @pytest.mark.parametrize( 172 | "cik, acc_num, expected", 173 | [ 174 | ( 175 | "1234567890", 176 | "123456789012345678", 177 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345678/" 178 | "1234567890-12-345678-index.html", 179 | ), 180 | ( 181 | "1234567890", 182 | "123456789012345681", 183 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345681/" 184 | "1234567890-12-345681-index.html", 185 | ), 186 | ( 187 | "1234567890", 188 | "123456789012345679", 189 | f"{fetch.SEC_ARCHIVE_URL}/1234567890/123456789012345679/" 190 | "1234567890-12-345679-index.html", 191 | ), 192 | ], 193 | ) 194 | @mock.patch("webbrowser.open_new_tab") 195 | @mock.patch("requests.Session", MockSession) 196 | def test_open_form(monkeypatch, cik, acc_num, expected): 197 | fetch.open_form(cik, acc_num) 198 | webbrowser.open_new_tab.assert_called_once_with(expected) 199 | 200 | 201 | @pytest.mark.parametrize( 202 | "formtype, expected_cid, expected_acc_num", 203 | [ 204 | ( 205 | "10-K", 206 | "1234567890", 207 | "123456789012345678", 208 | ), 209 | ( 210 | "10-Q", 211 | "1234567890", 212 | "123456789012345681", 213 | ), 214 | ( 215 | "S-1", 216 | "1234567890", 217 | "123456789012345679", 218 | ), 219 | ], 220 | ) 221 | def test_get_recent_cik_and_acc_by_ticker(monkeypatch, formtype, expected_cid, expected_acc_num): 222 | monkeypatch.setattr(requests, "Session", MockSession) 223 | cik, acc_num, retrieved_form_type = fetch.get_recent_cik_and_acc_by_ticker( 224 | "noice", formtype, "Giant", "parker@giant.com" 225 | ) 226 | assert cik == expected_cid 227 | assert acc_num == expected_acc_num 228 | assert retrieved_form_type == formtype 229 | 230 | 231 | @pytest.mark.parametrize( 232 | "formtype, cid, expected_acc_num", 233 | [ 234 | ( 235 | "10-K", 236 | "1234567890", 237 | "123456789012345678", 238 | ), 239 | ( 240 | "10-Q", 241 | "1234567890", 242 | "123456789012345681", 243 | ), 244 | ( 245 | "S-1", 246 | "1234567890", 247 | "123456789012345679", 248 | ), 249 | ], 250 | ) 251 | def test_get_recent_acc_by_cik(monkeypatch, formtype, cid, expected_acc_num): 252 | monkeypatch.setattr(requests, "Session", MockSession) 
253 | acc_num, recvd_formtype = fetch.get_recent_acc_by_cik( 254 | cid, formtype, "Giant", "parker@giant.com" 255 | ) 256 | assert acc_num == expected_acc_num 257 | assert recvd_formtype == formtype 258 | -------------------------------------------------------------------------------- /prepline_sec_filings/fetch.py: -------------------------------------------------------------------------------- 1 | """Module for fetching data from the SEC EDGAR Archives""" 2 | import json 3 | import os 4 | import re 5 | import requests 6 | from typing import List, Optional, Tuple, Union 7 | import sys 8 | 9 | if sys.version_info < (3, 8): 10 | from typing_extensions import Final 11 | else: 12 | from typing import Final 13 | 14 | import webbrowser 15 | 16 | from ratelimit import limits, sleep_and_retry 17 | 18 | from prepline_sec_filings.sec_document import VALID_FILING_TYPES 19 | 20 | SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data" 21 | SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar" 22 | SEC_SUBMISSIONS_URL: Final[str] = "https://data.sec.gov/submissions" 23 | 24 | 25 | def get_filing( 26 | cik: Union[str, int], accession_number: Union[str, int], company: str, email: str 27 | ) -> str: 28 | """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate 29 | limits specified on the SEC website. 30 | ref: https://www.sec.gov/os/accessing-edgar-data""" 31 | session = _get_session(company, email) 32 | return _get_filing(session, cik, accession_number) 33 | 34 | 35 | @sleep_and_retry 36 | @limits(calls=10, period=1) 37 | def _get_filing( 38 | session: requests.Session, cik: Union[str, int], accession_number: Union[str, int] 39 | ) -> str: 40 | """Wrapped so filings can be retrieved with an existing session.""" 41 | url = archive_url(cik, accession_number) 42 | response = session.get(url) 43 | response.raise_for_status() 44 | return response.text 45 | 46 | 47 | @sleep_and_retry 48 | @limits(calls=10, period=1) 49 | def get_cik_by_ticker(session: requests.Session, ticker: str) -> str: 50 | """Gets a CIK number from a stock ticker by running a search on the SEC website.""" 51 | cik_re = re.compile(r".*CIK=(\d{10}).*") 52 | url = _search_url(ticker) 53 | response = session.get(url, stream=True) 54 | response.raise_for_status() 55 | results = cik_re.findall(response.text) 56 | return str(results[0]) 57 | 58 | 59 | @sleep_and_retry 60 | @limits(calls=10, period=1) 61 | def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict: 62 | """Retrieves a dict of recent SEC form filings for a given cik number.""" 63 | json_name = f"CIK{cik}.json" 64 | response = session.get(f"{SEC_SUBMISSIONS_URL}/{json_name}") 65 | response.raise_for_status() 66 | content = json.loads(response.content) 67 | recent_forms = content["filings"]["recent"] 68 | form_types = {k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"])} 69 | return form_types 70 | 71 | 72 | def _get_recent_acc_num_by_cik( 73 | session: requests.Session, cik: Union[str, int], form_types: List[str] 74 | ) -> Tuple[str, str]: 75 | """Returns accession number and form type for the most recent filing for one of the 76 | given form_types (AKA filing types) for a given cik.""" 77 | retrieved_form_types = get_forms_by_cik(session, cik) 78 | for acc_num, form_type_ in retrieved_form_types.items(): 79 | if form_type_ in form_types: 80 | return _drop_dashes(acc_num), form_type_ 81 | raise ValueError(f"No filings found for {cik}, looking for any of: {form_types}") 82 | 83 | 84 | def
get_recent_acc_by_cik( 85 | cik: str, 86 | form_type: str, 87 | company: Optional[str] = None, 88 | email: Optional[str] = None, 89 | ) -> Tuple[str, str]: 90 | """Returns (accession_number, retrieved_form_type) for the given cik and form_type. 91 | The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q. 92 | """ 93 | session = _get_session(company, email) 94 | return _get_recent_acc_num_by_cik(session, cik, _form_types(form_type)) 95 | 96 | 97 | def get_recent_cik_and_acc_by_ticker( 98 | ticker: str, 99 | form_type: str, 100 | company: Optional[str] = None, 101 | email: Optional[str] = None, 102 | ) -> Tuple[str, str, str]: 103 | """Returns (cik, accession_number, retrieved_form_type) for the given ticker and form_type. 104 | The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q. 105 | """ 106 | session = _get_session(company, email) 107 | cik = get_cik_by_ticker(session, ticker) 108 | acc_num, retrieved_form_type = _get_recent_acc_num_by_cik(session, cik, _form_types(form_type)) 109 | return cik, acc_num, retrieved_form_type 110 | 111 | 112 | def get_form_by_ticker( 113 | ticker: str, 114 | form_type: str, 115 | allow_amended_filing: Optional[bool] = True, 116 | company: Optional[str] = None, 117 | email: Optional[str] = None, 118 | ) -> str: 119 | """For a given ticker, gets the most recent form of a given form_type.""" 120 | session = _get_session(company, email) 121 | cik = get_cik_by_ticker(session, ticker) 122 | return get_form_by_cik( 123 | cik, form_type, allow_amended_filing=allow_amended_filing, company=company, email=email 124 | ) 125 | 126 | 127 | def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True): 128 | """Potentially expand to include amended filing, e.g.: 129 | "10-Q" -> "10-Q/A" 130 | """ 131 | assert form_type in VALID_FILING_TYPES 132 | if allow_amended_filing and not form_type.endswith("/A"): 133 | return [form_type, f"{form_type}/A"] 134 | else: 135 | return [form_type] 136 | 137 | 138 | def get_form_by_cik( 139 | cik: str, 140 | form_type: str, 141 | allow_amended_filing: Optional[bool] = True, 142 | company: Optional[str] = None, 143 | email: Optional[str] = None, 144 | ) -> str: 145 | """For a given CIK, returns the most recent form of a given form_type. By default 146 | an amended version of the form_type may be retrieved (allow_amended_filing=True). 147 | E.g., if form_type is "10-Q", the retrieved form could be a 10-Q or 10-Q/A.
148 | """ 149 | session = _get_session(company, email) 150 | acc_num, _ = _get_recent_acc_num_by_cik( 151 | session, cik, _form_types(form_type, allow_amended_filing) 152 | ) 153 | text = _get_filing(session, cik, acc_num) 154 | return text 155 | 156 | 157 | def open_form(cik, acc_num): 158 | """For a given cik and accession number, opens the index page in default browser for the 159 | associated SEC form""" 160 | acc_num = _drop_dashes(acc_num) 161 | webbrowser.open_new_tab(f"{SEC_ARCHIVE_URL}/{cik}/{acc_num}/{_add_dashes(acc_num)}-index.html") 162 | 163 | 164 | def open_form_by_ticker( 165 | ticker: str, 166 | form_type: str, 167 | allow_amended_filing: Optional[bool] = True, 168 | company: Optional[str] = None, 169 | email: Optional[str] = None, 170 | ): 171 | """For a given ticker, opens the index page in default browser for the most recent form of a 172 | given form_type.""" 173 | session = _get_session(company, email) 174 | cik = get_cik_by_ticker(session, ticker) 175 | acc_num, _ = _get_recent_acc_num_by_cik( 176 | session, cik, _form_types(form_type, allow_amended_filing) 177 | ) 178 | open_form(cik, acc_num) 179 | 180 | 181 | def archive_url(cik: Union[str, int], accession_number: Union[str, int]) -> str: 182 | """Builds the archive URL for the SEC accession number. Looks for the .txt file for the 183 | filing, while follows a {accession_number}.txt format.""" 184 | filename = f"{_add_dashes(accession_number)}.txt" 185 | accession_number = _drop_dashes(accession_number) 186 | return f"{SEC_ARCHIVE_URL}/{cik}/{accession_number}/{filename}" 187 | 188 | 189 | def _search_url(cik: Union[str, int]) -> str: 190 | search_string = f"CIK={cik}&Find=Search&owner=exclude&action=getcompany" 191 | url = f"{SEC_SEARCH_URL}?{search_string}" 192 | return url 193 | 194 | 195 | def _add_dashes(accession_number: Union[str, int]) -> str: 196 | """Adds the dashes back into the accession number""" 197 | accession_number = str(accession_number) 198 | return f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}" 199 | 200 | 201 | def _drop_dashes(accession_number: Union[str, int]) -> str: 202 | """Converts the accession number to the no dash representation.""" 203 | accession_number = str(accession_number).replace("-", "") 204 | return accession_number.zfill(18) 205 | 206 | 207 | def _get_session(company: Optional[str] = None, email: Optional[str] = None) -> requests.Session: 208 | """Creates a requests sessions with the appropriate headers set. If these headers are not 209 | set, SEC will reject your request. 210 | ref: https://www.sec.gov/os/accessing-edgar-data""" 211 | if company is None: 212 | company = os.environ.get("SEC_API_ORGANIZATION") 213 | if email is None: 214 | email = os.environ.get("SEC_API_EMAIL") 215 | assert company 216 | assert email 217 | session = requests.Session() 218 | session.headers.update( 219 | { 220 | "User-Agent": f"{company} {email}", 221 | "Content-Type": "text/html", 222 | } 223 | ) 224 | return session 225 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-s1-risks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ae311bc4", 6 | "metadata": {}, 7 | "source": [ 8 | "## Exploration Notebooks - S1 Documents\n", 9 | "\n", 10 | "The purpose of this notebook is to demonstrate the logic for extracting narrative text from the risk factors section in S1 filings. 
\n", 11 | "\n", 12 | "#### Table of Contents\n", 13 | "\n", 14 | "1. [Palantir Filing](#palantir)\n", 15 | "2. [Tesla Filing](#tesla)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "f89372ab", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%load_ext autoreload\n", 26 | "%autoreload 2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "18f90b55", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from prepline_sec_filings.fetch import get_filing\n", 37 | "from prepline_sec_filings.sec_document import SECDocument" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "bbaf7232", 43 | "metadata": {}, 44 | "source": [ 45 | "### Palantir Filing \n", 46 | "\n", 47 | "This section pulls in the Palantir S-1 filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1321655/000119312520230013/d904406ds1.htm). The goal is to identify the [risk factors](https://www.sec.gov/Archives/edgar/data/1321655/000119312520230013/d904406ds1.htm#rom904406_3) section." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "1aef6e6d", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "text = get_filing(\"1321655\",\n", 58 | " \"000119312520230013\", \n", 59 | " \"Unstructured Technologies\", \n", 60 | " \"support@unstructured.io\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "71848be5", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "sec_document = SECDocument.from_string(text)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "3ff29c73", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "'S-1'" 83 | ] 84 | }, 85 | "execution_count": null, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "sec_document.filing_type" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "1d4ac11a", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from prepline_sec_filings.sections import SECSection\n", 102 | "risk_narrative = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "4d95c612", 108 | "metadata": {}, 109 | "source": [ 110 | "From the cells below, we can see that the `get_risk_narrative` method section successfully identified the risk section in the document." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "821c431a", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Investing in our Class A common stock involves a high degree of risk. You should carefully consider the risks and uncertainties\n", 124 | "described below, together with all of the other information in this prospectus, including the section titled “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and our consolidated financial\n", 125 | "statements and accompanying notes, before making a decision to invest in our Class A common stock. Our business, financial condition, results of operations, or prospects could also be harmed by risks and uncertainties not currently known to us\n", 126 | "or that we currently do not believe are material. 
If any of the risks actually occur, our business, financial condition, results of operations, and prospects could be adversely affected. In that event, the trading price of our Class A common\n", 127 | "stock could decline, and you could lose part or all of your investment.\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "print(risk_narrative[0])" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "8b31a840", 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "We have never declared nor paid cash dividends on our capital stock. We currently intend to retain any future earnings to finance the\n", 146 | "operation and expansion of our business, and we do not anticipate declaring or paying any dividends to holders of our capital stock in the foreseeable future. In addition, our credit facility contains restrictions on our ability to pay dividends.\n", 147 | "Any determination to pay dividends in the future will be at the discretion of our Board of Directors. Consequently, stockholders must rely on sales of their Class A common stock after price appreciation, which may never occur, as the only way\n", 148 | "to realize any future gains on their investment.\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "print(risk_narrative[-1])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "6fa2c95d", 159 | "metadata": {}, 160 | "source": [ 161 | "### Tesla Filing \n", 162 | "\n", 163 | "This section tests the risk narrative logic on the Tesla S-1 filing, which can be found [here](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm). The goal is to identify the narrative text in the Risk Factors section, which can be found [here](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm#toc188115_4)." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "d203ec3e", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "text = get_filing(\"1318605\",\n", 174 | " \"000119312511149963\", \n", 175 | " \"Unstructured Technologies\", \n", 176 | " \"support@unstructured.io\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "1a26f776", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "sec_document = SECDocument.from_string(text)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "2de728f5", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "'S-1'" 199 | ] 200 | }, 201 | "execution_count": null, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "sec_document.filing_type" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "a37d12e7", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "risk_narrative = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "5bd7e2a8", 223 | "metadata": {}, 224 | "source": [ 225 | "From the cells below, we can see that the `get_section_narrative` method successfully identified the risk section in the document." 
226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "58d5258a", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "You should carefully consider the risks described below together with the other information set forth in this prospectus, which could\n", 239 | "materially affect our business, financial condition and future results. The risks described below are not the only risks facing our company. Risks and uncertainties not currently known to us or that we currently deem to be immaterial also may\n", 240 | "materially adversely affect our business, financial condition and operating results.\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "print(risk_narrative[0])" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "f48ea22d", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "We do not anticipate declaring any cash dividends to holders of our common stock in the foreseeable future. Consequently, investors may\n", 259 | "need to rely on sales of their common stock after price appreciation, which may never occur, as the only way to realize any future gains on their investment. Investors seeking cash dividends should not purchase our common stock.\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "print(risk_narrative[-1])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "770ca0f3", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "python3", 279 | "language": "python", 280 | "name": "python3" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-TOC-action.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exploration Notebooks - TOC in action\n", 8 | "\n", 9 | "The purpose of this notebook is to demonstrate the logic for identifying the Table of Contents section for both 10-K/10-Q and S-1 filings. \n", 10 | "\n", 11 | "#### Table of Contents\n", 12 | "\n", 13 | "1. [TOC action for 10-K/10-Q filings](#10-K-10-Q)\n", 14 | "2. [TOC action for S-1 filings](#S-1)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "%load_ext autoreload\n", 24 | "%autoreload 2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from prepline_sec_filings.fetch import get_filing\n", 34 | "from prepline_sec_filings.sec_document import SECDocument" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### 10-K/10-Q Filing \n", 42 | "\n", 43 | "This section pulls in the Palantir 10-Q filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm). The goal is to identify the [table of contents](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm#toc) section." 
44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "text = get_filing(\"1321655\",\n", 53 | " \"000119312520292177\", \n", 54 | " \"Unstructured Technologies\",\n", 55 | " \"support@unstructured.io\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "sec_document = SECDocument.from_string(text)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "elements = sec_document.elements" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "toc = sec_document.get_table_of_contents()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "From the cells below, we can see that the `get_table_of_contents` method identified the table of contents section in the document. However, there is still extra junk at the end." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "PART I. FINANCIAL INFORMATION\n", 102 | "Item 1\n", 103 | "Financial Statements (unaudited)\n", 104 | "Condensed Consolidated Balance Sheets\n", 105 | "Condensed Consolidated Statements of Operations\n", 106 | "Condensed Consolidated Statements of Comprehensive Loss\n", 107 | "Condensed Consolidated Statements of Redeemable Convertible and Convertible\n", 108 | " Preferred Stock and Stockholders’ Equity (Deficit)\n", 109 | "Condensed Consolidated Statements of Cash Flows\n", 110 | "Notes to Unaudited Condensed Consolidated Financial\n", 111 | "Statements\n", 112 | "Item 2\n", 113 | "Management’s Discussion and Analysis of Financial Condition and Results\n", 114 | " of Operations\n", 115 | "Item 3\n", 116 | "Quantitative and Qualitative Disclosures About Market Risk\n", 117 | "Item 4\n", 118 | "Controls and Procedures\n", 119 | "PART II. 
OTHER INFORMATION\n", 120 | "Item 1\n", 121 | "Legal Proceedings\n", 122 | "Item 1A\n", 123 | "Risk Factors\n", 124 | "Item 2\n", 125 | "Unregistered Sales of Equity Securities\n", 126 | "Item 3\n", 127 | "Defaults Upon Senior Securities\n", 128 | "Item 4\n", 129 | "Mine Safety Disclosures\n", 130 | "Item 5\n", 131 | "Other Information\n", 132 | "Item 6\n", 133 | "Exhibits\n", 134 | "Table of Contents\n", 135 | "SPECIAL NOTE REGARDING FORWARD-LOOKING STATEMENTS\n", 136 | "our expectations regarding financial performance, including but not limited to our expectations regarding\n", 137 | "revenue, cost of revenue, operating expenses, stock-based compensation, and our ability to achieve and maintain future profitability;\n", 138 | "our ability to successfully execute our business and growth strategy;\n", 139 | "the sufficiency of our cash and cash equivalents to meet our liquidity needs;\n", 140 | "the demand for our platforms in general;\n", 141 | "our ability to increase our number of customers and revenue generated from customers;\n", 142 | "our expectations regarding the future contribution margin of our existing and future customers;\n", 143 | "our expectations regarding our ability to quickly and effectively integrate our platforms for our existing and\n", 144 | "future customers;\n", 145 | "our ability to develop new platforms, and enhancements to existing platforms, and bring them to market in a\n", 146 | "timely manner;\n", 147 | "the size of our addressable markets, market share, category positions, and market trends, including our\n", 148 | "ability to grow our business in large government and commercial organizations, including our expectations regarding the impact of FASA;\n", 149 | "our ability to compete with existing and new competitors in existing and new markets and products;\n", 150 | "our expectations regarding anticipated technology needs and developments and our ability to address those\n", 151 | "needs and developments with our platforms;\n", 152 | "our expectations regarding litigation and legal and regulatory matters;\n", 153 | "our expectations regarding our ability to meet existing performance obligations and maintain the operability\n", 154 | "of our products;\n", 155 | "our expectations regarding the effects of existing and developing laws and regulations, including with respect\n", 156 | "to taxation, privacy and data protection;\n", 157 | "our expectations regarding new and evolving markets;\n", 158 | "our ability to develop and protect our brand;\n", 159 | "our ability to maintain the security and availability of our platforms;\n", 160 | "our expectations and management of future growth;\n", 161 | "our expectations concerning relationships with third parties, including our customers, equity method\n", 162 | "investment partners, and vendors;\n", 163 | "our ability to maintain, protect, and enhance our intellectual property;\n", 164 | "our expectations regarding our multi-class stock and governance structure and the benefits thereof;\n", 165 | "Table of Contents\n", 166 | "the impact of the ongoing COVID-19 pandemic, including on our and our\n", 167 | "customers’, vendors’, and partners’ respective businesses and the markets in which we and our customers, vendors, and partners operate; and\n", 168 | "the increased expenses associated with being a public company.\n", 169 | "We caution you that the foregoing list may not contain all of the forward-looking statements made in this Quarterly Report on Form 10-Q.\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | 
"for element in toc.elements:\n", 175 | " print(element.text)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### S-1 Filing \n", 183 | "\n", 184 | "This section pulls in the Tesla S-1 filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm). The goal is to identify the [table of contents](https://www.sec.gov/Archives/edgar/data/1318605/000119312511149963/ds1.htm#toc) section." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "text = get_filing(\"1318605\",\n", 194 | " \"000119312511149963\", \n", 195 | " \"Unstructured Technologies\", \n", 196 | " \"support@unstructured.io\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "sec_document = SECDocument.from_string(text)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "elements = sec_document.elements" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "toc = sec_document.get_table_of_contents()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "From the cells below, we can see that the `get_table_of_contents` method section identified the table of contents section in the document. However, there is still extra junk at the end." 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Prospectus Summary\n", 243 | "The Offering\n", 244 | "Summary Consolidated Financial Data\n", 245 | "Risk Factors\n", 246 | "Special Note Regarding Forward Looking Statements\n", 247 | "Market, Industry and Other Data\n", 248 | "Use of Proceeds\n", 249 | "Price Range of Common Stock\n", 250 | "Dividend Policy\n", 251 | "Capitalization\n", 252 | "Dilution\n", 253 | "Selected Consolidated Financial Data\n", 254 | "Management’s Discussion and Analysis of Financial Condition and Results of\n", 255 | "Operations\n", 256 | "Business\n", 257 | "Management\n", 258 | "Executive Compensation\n", 259 | "Certain Relationships and Related Party Transactions\n", 260 | "Principal Stockholders\n", 261 | "Description of Capital Stock\n", 262 | "Shares Eligible for Future Sale\n", 263 | "Material United States Tax Considerations for Non-United States Holders\n", 264 | "Underwriting\n", 265 | "Concurrent Private Placement\n", 266 | "Legal Matters\n", 267 | "Experts\n", 268 | "Where You Can Find Additional Information\n", 269 | "Index to Consolidated Financial Statements\n", 270 | "F-1\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "for element in toc.elements:\n", 276 | " print(element.text)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "python3", 290 | "language": "python", 291 | "name": "python3" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 2 296 | } 297 | -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Unstructured Technologies, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-10k-risks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "14a418c0", 6 | "metadata": {}, 7 | "source": [ 8 | "## Exploration Notebooks - 10-K/10-Q Documents\n", 9 | "\n", 10 | "The purpose of this notebook is to demonstrate the logic for extracting narrative text from the risk factors section in 10-K and 10-Q filings. \n", 11 | "\n", 12 | "#### Table of Contents\n", 13 | "\n", 14 | "1. [WABC Filing](#wabc)\n", 15 | "2. [Palantir Filing](#palantir)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "60bfe980", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%load_ext autoreload\n", 26 | "%autoreload 2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "768fa8c6", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from prepline_sec_filings.fetch import get_filing, get_form_by_ticker\n", 37 | "from prepline_sec_filings.sec_document import SECDocument" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "62c97cdc", 43 | "metadata": {}, 44 | "source": [ 45 | "### Westamerica Bancorp Filing \n", 46 | "\n", 47 | "This section pulls in the 2022 WABC 10-K filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/311094/000117184322001403/wabc20211231_10k.htm). The goal is to identify the [risk factors](https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/wabc20201231_10k.htm#i1a) section." 
48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "a8998e87", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# This would get the most recent 10-K filing\n", 58 | "#text = get_form_by_ticker(ticker=\"WABC\",\n", 59 | "# form_type=\"10-K\",\n", 60 | "# company=\"Unstructured Technologies\",\n", 61 | "# email=\"support@unstructured.io\")\n", 62 | "\n", 63 | "# This gets the 2022 filing\n", 64 | "text = get_filing(\"311094\",\n", 65 | " \"000117184322001403\", \n", 66 | " \"Unstructured Technologies\",\n", 67 | " \"support@unstructured.io\")\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "6b9303a6", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "sec_document = SECDocument.from_string(text)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "35c0e709", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "'10-K'" 90 | ] 91 | }, 92 | "execution_count": null, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "sec_document.filing_type" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "c8e0cad6", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from prepline_sec_filings.sections import SECSection\n", 109 | "risk_sections = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "a2360e6c", 115 | "metadata": {}, 116 | "source": [ 117 | "From the cells below, we can see that the `get_section_narrative` method successfully identified the risk section in the document." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "ab4c2c4f", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Readers and prospective investors in the Company’s securities should carefully consider the following risk factors as well as the other information contained or incorporated by reference in this Report.\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "print(risk_sections[0])" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "f6aa34d0", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Management regularly reviews and updates the Company’s internal control over financial reporting, disclosure controls and procedures, and corporate governance policies and procedures. The Company maintains controls and procedures to mitigate against risks such as processing system failures and errors, and customer or employee fraud, and maintains insurance coverage for certain of these risks. Any system of controls and procedures, however well designed and operated, is based in part on certain assumptions and can provide only reasonable, not absolute, assurances that the objectives of the system are met. Events could occur which are not prevented or detected by the Company’s internal controls or are not insured against or are in excess of the Company’s insurance limits or insurance underwriters’ financial capacity. 
Any failure or circumvention of the Company’s controls and procedures or failure to comply with regulations related to controls and procedures could have a material adverse effect on the Company’s business, results of operations and financial condition.\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "print(risk_sections[-1])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "d763903a", 159 | "metadata": {}, 160 | "source": [ 161 | "### Palantir Filing \n", 162 | "\n", 163 | "This section pulls in an old version of the Palantir 10-Q filing from the SEC site, which is available [here](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm). The goal is to identify the [risk factors](https://www.sec.gov/Archives/edgar/data/1321655/000119312520292177/d31861d10q.htm#fin31861_13) section." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "8f16fa12", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "text = get_filing(\"1321655\",\n", 174 | " \"000119312520292177\", \n", 175 | " \"Unstructured Technologies\",\n", 176 | " \"support@unstructured.io\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "adaeaea9", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "sec_document = SECDocument.from_string(text)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "7f999efc", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "'10-Q'" 199 | ] 200 | }, 201 | "execution_count": null, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "sec_document.filing_type" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "31c6690a", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "risk_sections = sec_document.get_section_narrative(SECSection.RISK_FACTORS)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "a322b2c8", 223 | "metadata": {}, 224 | "source": [ 225 | "From the cells below, we can see that the `get_section_narrative` method successfully identified the risk section in the document." 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "aeda557f", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Investing in our Class A common stock involves a high degree of risk. You should carefully consider the risks and uncertainties\n", 239 | "described below, together with all of the other information in this Quarterly Report on Form 10-Q, including the section titled “Management’s Discussion and Analysis of Financial Condition and\n", 240 | "Results of Operations” and our consolidated financial statements and accompanying notes, before making a decision to invest in our Class A common stock. Our business, financial condition, results of operations, or prospects could also be\n", 241 | "harmed by risks and uncertainties not currently known to us or that we currently do not believe are material. If any of the risks actually occur, our business, financial condition, results of operations, and prospects could be adversely affected. 
In\n", 242 | "that event, the trading price of our Class A common stock could decline, and you could lose part or all of your investment.\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "print(risk_sections[0])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "b6fb276c", 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "Future issuances of our Class A common stock will dilute the voting power of our Class A common stockholders and future issuances to\n", 261 | "stockholders other than our Founders who are then party to the Founder Voting Agreement will dilute the economic interests of our Founders. However, because the shares of Class F common stock have variable voting rights, in the event that our\n", 262 | "Founders have less than 49.999999% of the voting power of our capital stock prior to giving effect to the voting power of the Class F common stock, future issuances of Class A common stock to stockholders other than our Founders will not\n", 263 | "result in dilution of the voting power of our Founders who are then party to the Founder Voting Agreement, but rather, will correspondingly increase the voting power of the Class F common stock. For instance, if the Founders who are party to\n", 264 | "the Founder Voting Agreement have 30% of the voting power of our outstanding capital stock in aggregate prior to giving effect to the voting power of the Class F common stock, the Class F common stock would have up to 19.999999% of our\n", 265 | "voting power resulting in such Founders having up to 49.999999% of our voting power. If we were to issue additional shares of our capital stock entitled to 10% of our voting power in aggregate to stockholders other than our Founders, then our\n", 266 | "Founders who are party to the Founder Voting Agreement would have approximately 27% of our voting power, and the Class F common stock would have up to approximately 22.999999% of our voting power, resulting in such Founders having up to\n", 267 | "49.999999% of our voting power. Any future issuances of additional shares of Class A common stock will not be subject to approval by our stockholders except as required by the listing standards of the NYSE. In addition, it may be very difficult\n", 268 | "for our Class A common stockholders to determine from time to time, including in advance of a meeting of stockholders, their individual or aggregate voting power due to the unique features of our multi-class capital structure, such as the\n", 269 | "variable number of votes per share of our Class F common stock.\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "print(risk_sections[-1])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "69a6681e", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "python3", 289 | "language": "python", 290 | "name": "python3" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 5 295 | } 296 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | 6 | Pre-Processing Pipeline for SEC Filings 7 |
8 | 9 | 10 | This repo implements a document pre-processing pipeline for SEC filings. Currently, the pipeline is capable of extracting narrative text from user-specified sections in 10-K, 10-Q, and S-1 filings. 11 | 12 | ## Developer Quick Start 13 | 14 | * Using `pyenv` to manage virtualenvs is recommended 15 | * Mac install instructions. See [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detailed instructions. 16 | * `brew install pyenv-virtualenv` 17 | * `pyenv install 3.8.15` 18 | * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux). 19 | 20 | * Create a virtualenv to work in and activate it, e.g. for one named `sec-filings`: 21 | 22 | `pyenv virtualenv 3.8.15 sec-filings`
23 | `pyenv activate sec-filings` 24 | 25 | * Run `make install` 26 | * Start a local jupyter notebook server with `make run-jupyter`
27 | **OR**
28 | just start the FastAPI app locally with `make run-web-app` 29 | 30 | ## Quick Tour 31 | 32 | You can run this [Colab notebook](https://colab.research.google.com/drive/1W9jCOGbIrE43f7fHMUSn3g3xXhOIjx_v) to see how [pipeline-section.ipynb](/pipeline-notebooks/pipeline-section.ipynb) extracts the narrative text sections from an SEC filing and defines an API. 33 | 34 | ## Extracting Narrative Text from an SEC Filing 35 | 36 | To retrieve narrative text section(s) from an iXBRL S-1, 10-K, or 10-Q document (or amended versions S-1/A, 10-K/A, or 10-Q/A), post the document to the `/section` API. You can try this out by downloading the sample documents using `make dl-test-artifacts`. Then, from 37 | the `sample-docs` folder, run: 38 | 39 | ``` 40 | curl -X 'POST' \ 41 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 42 | -H 'accept: application/json' \ 43 | -H 'Content-Type: multipart/form-data' \ 44 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 45 | -F section=RISK_FACTORS | jq -C . | less -R 46 | ``` 47 | 48 | Note that additional `-F section` parameters may be included in the curl request to fetch 49 | multiple sections at once. Valid sections for [10-Ks](https://www.sec.gov/files/form10-k.pdf), 50 | [10-Qs](https://www.sec.gov/files/form10-q.pdf), and [S-1s](https://www.sec.gov/files/forms-1.pdf) 51 | are available on the SEC website. You can also reference 52 | [this file](https://github.com/Unstructured-IO/pipeline-sec-filings/blob/main/prepline_sec_filings/sections.py) 53 | for a list of valid `section` parameters, e.g. `RISK_FACTORS` or `MANAGEMENT_DISCUSSION`. 54 | 55 | 56 | You'll get back a response that looks like the following. Piping through `jq` and `less` 57 | formats/colors the outputs and lets you scroll through the results. 58 | 59 | ``` 60 | { 61 | "RISK_FACTORS": [ 62 | { 63 | "text": "You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.", 64 | "type": "NarrativeText" 65 | }, 66 | { 67 | "text": "Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.", 68 | "type": "NarrativeText" 69 | }, 70 | { 71 | "text": "Market prices for gold, silver, copper, nickel, and other metals may fluctuate widely over time and are affected by numerous factors beyond our control. These factors include metal supply and demand, industrial and jewelry fabrication, investment demand, central banking actions, inflation expectations, currency values, interest rates, forward sales by metal producers, and political, trade, economic, or banking conditions.", 72 | "type": "NarrativeText" 73 | }, 74 | ... 75 | ] 76 | } 77 | ``` 78 | 79 | 80 | You can also pass in custom section regex patterns using the `section_regex` parameter. For 81 | example, you can run the following command to request the risk factors section: 82 | 83 | ``` 84 | curl -X 'POST' \ 85 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 86 | -H 'accept: application/json' \ 87 | -H 'Content-Type: multipart/form-data' \ 88 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 89 | -F 'section_regex=risk factors' | jq -C . 
| less -R 90 | ``` 91 | 92 | The result will be: 93 | 94 | ``` 95 | { 96 | "REGEX_0": [ 97 | { 98 | "text": "You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.", 99 | "type": "NarrativeText" 100 | }, 101 | { 102 | "text": "Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.", 103 | "type": "NarrativeText" 104 | }, 105 | { 106 | "text": "Market prices for gold, silver, copper, nickel, and other metals may fluctuate widely over time and are affected by numerous factors beyond our control. These factors include metal supply and demand, industrial and jewelry fabrication, investment demand, central banking actions, inflation expectations, currency values, interest rates, forward sales by metal producers, and political, trade, economic, or banking conditions.", 107 | "type": "NarrativeText" 108 | }, 109 | ... 110 | ] 111 | } 112 | ``` 113 | 114 | As with the `section` parameter, you can request multiple regexes by passing in multiple values 115 | for the `section_regex` parameter. The requested pattern will be treated as a raw string. 116 | 117 | You can also use special regex characters in your pattern, as shown in the example below: 118 | 119 | ``` 120 | curl -X 'POST' \ 121 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 122 | -H 'accept: application/json' \ 123 | -H 'Content-Type: multipart/form-data' \ 124 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 125 | -F "section_regex=^(\S+\W?)+$" 126 | ``` 127 | 128 | You can always replace the header `-H 'accept: application/json'` with `-H 'accept: text/csv'` depending on the format you want to fetch from the API as follows: 129 | 130 | ``` 131 | curl -X 'POST' \ 132 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 133 | -H 'accept: text/csv' \ 134 | -H 'Content-Type: multipart/form-data' \ 135 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 136 | -F section=RISK_FACTORS | jq -C . | less -R 137 | ``` 138 | The result will be: 139 | ``` 140 | "section,element_type,text\r\nRISK_FACTORS,NarrativeText,\"You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.\"\r\nRISK_FACTORS,NarrativeText,\"Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.\"\r\nRISK_FACTORS,NarrativeText,\"Market prices for gold, silver, copper, nickel, and other metals may fluctuate widely over time and are affected by numerous factors beyond our control. 
These factors include metal supply and demand, industrial and jewelry fabrication, investment demand, central banking actions, inflation expectations, currency values, interest rates, forward sales by metal producers, and political, trade, economic, or banking conditions.\"\r\n 141 | ``` 142 | 143 | In addition, you can add the form field `-F 'output_schema=labelstudio'` if you want an output to be compatible with [labelstudio](https://labelstud.io) as follows: 144 | 145 | ``` 146 | curl -X 'POST' \ 147 | 'http://localhost:8000/sec-filings/v0.2.1/section' \ 148 | -H 'accept: application/json' \ 149 | -H 'Content-Type: multipart/form-data' \ 150 | -F 'text_files=@rgld-10-K-85535-000155837021011343.xbrl' \ 151 | -F 'output_schema=labelstudio' \ 152 | -F section=RISK_FACTORS | jq -C . | less -R 153 | 154 | ``` 155 | The result will be: 156 | ``` 157 | { 158 | "RISK_FACTORS": [ 159 | { 160 | "data": { 161 | "text": "You should carefully consider the risks described in this section. Our future performance is subject to risks and uncertainties that could have a material adverse effect on our business, results of operations, and financial condition and the trading price of our common stock. We may be subject to other risks and uncertainties not presently known to us. In addition, please see our note about forward-looking statements included in the MD&A.", 162 | "ref_id": "7a912bb639b547404be4ceaf5d9083a9" 163 | } 164 | }, 165 | { 166 | "data": { 167 | "text": "Our revenue is subject to volatility in metal prices, which could negatively affect our results of operations or cash flow.", 168 | "ref_id": "d4cc8e0e0c2b68ef69282c5250b721c9" 169 | } 170 | }, 171 | ... 172 | ] 173 | } 174 | ``` 175 | 176 | ### Helper functions for SEC EDGAR API 177 | 178 | You can use some of the functions provided in `prepline_sec_filings.fetch` to directly view or manipulate the filings available from the SEC's [EDGAR API](https://www.sec.gov/edgar/searchedgar/companysearch.html). 179 | For example, `get_filing(cik, accession_number, your_organization_name, your_email)` will return the text of the filing with accession number `accession_number` for the organization with CIK number `cik`. 180 | `your_organization_name` and `your_email` should be your information. 181 | The parameters `your_organization_name` and `your_email` are passed along to EDGAR's API to identify the caller and are required by EDGAR. 182 | Alternatively, the parameters may be omitted if the environment variables `SEC_API_ORGANIZATION` and `SEC_API_EMAIL` are defined. 183 | 184 | 185 | Helper functions are also provided for cases where the CIK and/or accession numbers are not known. For example, 186 | `get_form_by_ticker('mmm', '10-K', your_organization_name, your_email)` returns the text of the latest 10-K filing from 3M, 187 | and `open_form_by_ticker('mmm', '10-K', your_organization_name, your_email)` opens the SEC index page for the same filing in a web browser. A short end-to-end sketch is included after the next section. 188 | 189 | ### Generating Python files from the pipeline notebooks 190 | 191 | The Python module [section.py](/prepline_sec_filings/api/section.py) contains the FastAPI code needed to serve the API. It's created with `make generate-api`, which derives the API from the notebook [pipeline-section.ipynb](/pipeline-notebooks/pipeline-section.ipynb). 192 | 193 | You can generate the FastAPI APIs from all [pipeline-notebooks/](/pipeline-notebooks) by running `make generate-api`. 
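### Putting the helper functions together

Here is a minimal, illustrative sketch of fetching a filing with the helper functions described above. The ticker, organization name, and email below are placeholder values; substitute your own identification, or set the `SEC_API_ORGANIZATION` and `SEC_API_EMAIL` environment variables instead:

```
from prepline_sec_filings.fetch import (
    get_filing,
    get_form_by_ticker,
    get_recent_cik_and_acc_by_ticker,
)

# NOTE: placeholder identification values. EDGAR requires a real
# organization name and email to identify the caller.
ORG, EMAIL = "Your Organization", "you@example.com"

# Look up the CIK and the most recent 10-K accession number for a ticker,
# then fetch that filing's text
cik, acc_num, retrieved_form_type = get_recent_cik_and_acc_by_ticker(
    "mmm", "10-K", company=ORG, email=EMAIL
)
text = get_filing(cik, acc_num, ORG, EMAIL)

# Or do both steps in one call
text = get_form_by_ticker("mmm", "10-K", company=ORG, email=EMAIL)
```

As with the curl examples above, the returned text can then be posted to the `/section` API or parsed directly with `SECDocument.from_string`.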
194 | 195 | ## Docker 196 | 197 | It is not necessary to run Docker in a local development environment; however, a Dockerfile and 198 | make targets of `docker-build`, `docker-start-api`, and `docker-start-jupyter` are provided for convenience. 199 | 200 | You can also launch a Jupyter instance to try out the notebooks with [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Unstructured-IO/pipeline-sec-filings/HEAD). 201 | 202 | ## Security Policy 203 | 204 | See our [security policy](https://github.com/Unstructured-IO/pipeline-sec-filings/security/policy) for 205 | information on how to report security vulnerabilities. 206 | 207 | ## Learn more 208 | 209 | | Section | Description | 210 | |-|-| 211 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | 212 | | [EDGAR API](https://www.sec.gov/edgar/searchedgar/companysearch.html) | Documentation for the SEC | 213 | | [10-K Filings](https://www.sec.gov/files/form10-k.pdf) | Detailed documentation on 10-K filings | 214 | | [10-Q Filings](https://www.sec.gov/files/form10-q.pdf) | Detailed documentation on 10-Q filings | 215 | | [S-1 Filings](https://www.sec.gov/files/forms-1.pdf) | Detailed documentation on S-1 filings | 216 | -------------------------------------------------------------------------------- /test_sec_filings/sec_filings/test_section_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import csv 4 | from io import StringIO 5 | 6 | from fastapi.testclient import TestClient 7 | 8 | from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path 9 | 10 | from prepline_sec_filings.api.app import app as core_app 11 | from prepline_sec_filings.api.section import app 12 | 13 | SECTION_ROUTE = get_pipeline_path("section") 14 | 15 | 16 | def generate_sample_document(form_type): 17 | is_s1 = form_type == "S-1" 18 | return f""" 19 | {form_type} 20 | Proctor & Gamble 21 | 22 |

SECURITIES AND EXCHANGE COMMISSION FILING

23 |

ITEM 1. BUSINESS

24 |

This is a section and great and wonderful business dealings.

25 |

{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS

26 |

Wolverines

27 |

The business could be attacked by wolverines.

28 |

Bears

29 |

The business could be attacked by bears.

30 |

{'ITEM 1B. ' if not is_s1 else ''}UNRESOLVED STAFF COMMENTS

31 |

None

32 |

PROSPECTUS SUMMARY

33 |

Here is a summary of the prospectus

34 | 35 |
""" 36 | 37 | 38 | @pytest.mark.parametrize( 39 | "form_type, section", 40 | [ 41 | ("10-K", "RISK_FACTORS"), 42 | ("10-Q", "RISK_FACTORS"), 43 | ("S-1", "RISK_FACTORS"), 44 | ("10-K", "_ALL"), 45 | ("10-Q", "_ALL"), 46 | ("S-1", "_ALL"), 47 | ], 48 | ) 49 | def test_section_narrative_api(form_type, section, tmpdir): 50 | sample_document = generate_sample_document(form_type) 51 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 52 | with open(filename, "w") as f: 53 | f.write(sample_document) 54 | 55 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 56 | client = TestClient(app) 57 | response = client.post( 58 | SECTION_ROUTE, 59 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 60 | data={"section": [section]}, 61 | ) 62 | 63 | assert response.status_code == 200 64 | response_dict = response.json() 65 | 66 | assert response_dict["RISK_FACTORS"] == [ 67 | { 68 | "text": "The business could be attacked by wolverines.", 69 | "type": "NarrativeText", 70 | }, 71 | { 72 | "text": "The business could be attacked by bears.", 73 | "type": "NarrativeText", 74 | }, 75 | ] 76 | 77 | 78 | @pytest.mark.parametrize( 79 | "form_type, section", 80 | [ 81 | ("10-K", "RISK_FACTORS"), 82 | ("10-Q", "RISK_FACTORS"), 83 | ("S-1", "RISK_FACTORS"), 84 | ("10-K", "_ALL"), 85 | ("10-Q", "_ALL"), 86 | ("S-1", "_ALL"), 87 | ], 88 | ) 89 | def test_section_narrative_api_labelstudio(form_type, section, tmpdir): 90 | sample_document = generate_sample_document(form_type) 91 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 92 | with open(filename, "w") as f: 93 | f.write(sample_document) 94 | 95 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 96 | client = TestClient(app) 97 | response = client.post( 98 | SECTION_ROUTE, 99 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 100 | data={"output_schema": "labelstudio", "section": [section]}, 101 | ) 102 | 103 | assert response.status_code == 200 104 | response_dict = response.json() 105 | 106 | assert response_dict["RISK_FACTORS"][0] == { 107 | "data": { 108 | "text": "The business could be attacked by wolverines.", 109 | "ref_id": "bd91f9f2e43cf85a8ce9b7a19c2e63e5", 110 | } 111 | } 112 | 113 | assert response_dict["RISK_FACTORS"][1] == { 114 | "data": { 115 | "text": "The business could be attacked by bears.", 116 | "ref_id": "e731c6ec715fedfe8d07fe84a7e02efb", 117 | } 118 | } 119 | 120 | 121 | @pytest.mark.parametrize( 122 | "form_type, section", 123 | [ 124 | ("10-K", "RISK_FACTORS"), 125 | ("10-Q", "RISK_FACTORS"), 126 | ("S-1", "RISK_FACTORS"), 127 | ("10-K", "_ALL"), 128 | ("10-Q", "_ALL"), 129 | ("S-1", "_ALL"), 130 | ], 131 | ) 132 | def test_section_narrative_api_with_unsupported_response_schema(form_type, section, tmpdir): 133 | sample_document = generate_sample_document(form_type) 134 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 135 | with open(filename, "w") as f: 136 | f.write(sample_document) 137 | 138 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 139 | client = TestClient(app) 140 | 141 | # FIXME(nyoon): need to handle ValueError in a better way in unstructured-api-tools 142 | with pytest.raises(ValueError): 143 | response = client.post( 144 | SECTION_ROUTE, 145 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 146 | data={"output_schema": "unsupported", "section": [section]}, 147 | ) 148 | assert response.status_code == 406 149 | assert response.content == "Unsupported response schema unsupported.\n" 
150 | 151 | 152 | @pytest.mark.parametrize( 153 | "form_type", 154 | [ 155 | ("10-K"), 156 | ("10-Q"), 157 | ("S-1"), 158 | ], 159 | ) 160 | def test_section_narrative_api_with_custom_regex(form_type, tmpdir): 161 | sample_document = generate_sample_document(form_type) 162 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 163 | with open(filename, "w") as f: 164 | f.write(sample_document) 165 | 166 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 167 | client = TestClient(app) 168 | response = client.post( 169 | SECTION_ROUTE, 170 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 171 | data={"section_regex": ["risk factors"]}, 172 | ) 173 | 174 | assert response.status_code == 200 175 | response_dict = response.json() 176 | 177 | assert response_dict["REGEX_0"] == [ 178 | { 179 | "text": "The business could be attacked by wolverines.", 180 | "type": "NarrativeText", 181 | }, 182 | { 183 | "text": "The business could be attacked by bears.", 184 | "type": "NarrativeText", 185 | }, 186 | ] 187 | 188 | 189 | @pytest.mark.parametrize( 190 | "form_type", 191 | [ 192 | ("10-K"), 193 | ("10-Q"), 194 | ("S-1"), 195 | ], 196 | ) 197 | def test_section_narrative_api_with_custom_regex_with_special_chars(form_type, tmpdir): 198 | sample_document = generate_sample_document(form_type) 199 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 200 | with open(filename, "w") as f: 201 | f.write(sample_document) 202 | 203 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 204 | client = TestClient(app) 205 | response = client.post( 206 | SECTION_ROUTE, 207 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 208 | data={"section_regex": ["^(?:prospectus )?summary$"]}, 209 | ) 210 | 211 | assert response.status_code == 200 212 | response_dict = response.json() 213 | 214 | assert response_dict["REGEX_0"] == [ 215 | { 216 | "text": "Here is a summary of the prospectus", 217 | "type": "NarrativeText", 218 | }, 219 | ] 220 | 221 | 222 | @pytest.mark.parametrize( 223 | "form_types, section", 224 | [ 225 | (["10-K", "10-Q"], "RISK_FACTORS"), 226 | (["10-K", "10-Q"], "_ALL"), 227 | ], 228 | ) 229 | def test_section_narrative_api_with_multiple_uploads(form_types, section, tmpdir): 230 | filenames = [] 231 | for idx, form_type in enumerate(form_types): 232 | sample_document = generate_sample_document(form_type) 233 | filename = os.path.join(tmpdir.dirname, f"wilderness_{idx}.xbrl") 234 | with open(filename, "w") as f: 235 | f.write(sample_document) 236 | filenames.append(filename) 237 | 238 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 239 | client = TestClient(app) 240 | files = [ 241 | ("text_files", (filename, open(filename, "rb"), "text/plain")) for filename in filenames 242 | ] 243 | response = client.post( 244 | SECTION_ROUTE, 245 | files=files, 246 | headers={ 247 | "Accept": "multipart/mixed", 248 | }, 249 | data={"section": [section]}, 250 | ) 251 | 252 | assert response.status_code == 200 253 | 254 | if len(filenames) > 1: 255 | assert "multipart/mixed" in response.headers["content-type"] 256 | else: 257 | response_dict = response.json() 258 | 259 | assert response_dict["RISK_FACTORS"] == [ 260 | { 261 | "text": "The business could be attacked by wolverines.", 262 | "type": "NarrativeText", 263 | }, 264 | { 265 | "text": "The business could be attacked by bears.", 266 | "type": "NarrativeText", 267 | }, 268 | ] 269 | 270 | 271 | @pytest.mark.parametrize( 272 | "form_types, section, accept_header, 
response_status", 273 | [ 274 | (["10-K", "10-Q"], "RISK_FACTORS", "multipart/mixed", 200), 275 | (["10-K", "10-Q"], "_ALL", "application/json", 200), 276 | ( 277 | ["10-K", "10-Q"], 278 | "_ALL", 279 | "text/csv", # Accept header must be multipart/mixed or application/json 280 | 406, 281 | ), 282 | ([], "_ALL", "application/json", 400), 283 | ], 284 | ) 285 | def test_section_narrative_api_with_headers( 286 | form_types, section, accept_header, response_status, tmpdir 287 | ): 288 | filenames = [] 289 | for idx, form_type in enumerate(form_types): 290 | sample_document = generate_sample_document(form_type) 291 | filename = os.path.join(tmpdir.dirname, f"wilderness_{idx}.xbrl") 292 | with open(filename, "w") as f: 293 | f.write(sample_document) 294 | filenames.append(filename) 295 | 296 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 297 | client = TestClient(app) 298 | files = [ 299 | ("text_files", (filename, open(filename, "rb"), "text/plain")) for filename in filenames 300 | ] 301 | response = client.post( 302 | SECTION_ROUTE, 303 | files=files, 304 | headers={ 305 | "Accept": accept_header, 306 | }, 307 | data={"section": [section]}, 308 | ) 309 | 310 | assert response.status_code == response_status 311 | 312 | 313 | @pytest.mark.parametrize( 314 | "form_type, response_type, section", 315 | [ 316 | ("10-K", "text/csv", "RISK_FACTORS"), 317 | ("10-Q", "text/csv", "RISK_FACTORS"), 318 | ("S-1", "text/csv", "RISK_FACTORS"), 319 | ("10-K", "text/csv", "_ALL"), 320 | ("10-Q", "text/csv", "_ALL"), 321 | ("S-1", "text/csv", "_ALL"), 322 | ], 323 | ) 324 | def test_section_narrative_api_csv_response(form_type, response_type, section, tmpdir): 325 | sample_document = generate_sample_document(form_type) 326 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 327 | with open(filename, "w") as f: 328 | f.write(sample_document) 329 | 330 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 331 | client = TestClient(app) 332 | response = client.post( 333 | SECTION_ROUTE, 334 | files=[("text_files", (filename, open(filename, "rb"), "text/plain"))], 335 | data={"output_format": response_type, "section": [section]}, 336 | ) 337 | assert response.status_code == 200 338 | 339 | response_csv = csv.DictReader(StringIO(response.json()), delimiter=",") 340 | response_list = list(response_csv) 341 | 342 | assert [x["section"] for x in response_list] 343 | assert [x["element_type"] for x in response_list] 344 | assert [x["text"] for x in response_list] 345 | 346 | 347 | @pytest.mark.parametrize( 348 | "form_type, response_type, section", 349 | [ 350 | ("10-K", "text/csv", "RISK_FACTORS"), 351 | ("10-Q", "text/csv", "RISK_FACTORS"), 352 | ("S-1", "text/csv", "RISK_FACTORS"), 353 | ("10-K", "text/csv", "_ALL"), 354 | ("10-Q", "text/csv", "_ALL"), 355 | ("S-1", "text/csv", "_ALL"), 356 | ], 357 | ) 358 | def test_section_narrative_api_csv_response_with_unsupported_response_schema( 359 | form_type, response_type, section, tmpdir 360 | ): 361 | sample_document = generate_sample_document(form_type) 362 | filename = os.path.join(tmpdir.dirname, "wilderness.xbrl") 363 | with open(filename, "w") as f: 364 | f.write(sample_document) 365 | 366 | # NOTE(robinson) - Reset the rate limit to avoid 429s in tests 367 | client = TestClient(app) 368 | 369 | # FIXME(nyoon): need to handle ValueError in a better way in unstructured-api-tools 370 | with pytest.raises(ValueError): 371 | response = client.post( 372 | SECTION_ROUTE, 373 | files=[("text_files", (filename, open(filename, "rb"), 
"text/plain"))], 374 | data={ 375 | "output_format": response_type, 376 | "output_schema": "unsupported", 377 | "section": [section], 378 | }, 379 | ) 380 | assert response.status_code == 406 381 | assert response.content == "Unsupported response schema unsupported.\n" 382 | 383 | 384 | def test_core_app_health_check(): 385 | # NOTE(crag): switch all tests to core_app when rate limiting is removed 386 | client = TestClient(core_app) 387 | response = client.get("/healthcheck") 388 | 389 | assert response.status_code == 200 390 | -------------------------------------------------------------------------------- /prepline_sec_filings/api/section.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping, Iterator, Tuple 19 | import secrets 20 | from prepline_sec_filings.sections import section_string_to_enum, validate_section_names, SECSection 21 | from prepline_sec_filings.sec_document import SECDocument, REPORT_TYPES, VALID_FILING_TYPES 22 | from enum import Enum 23 | import re 24 | import signal 25 | from unstructured.staging.base import convert_to_isd 26 | from prepline_sec_filings.sections import ( 27 | ALL_SECTIONS, 28 | SECTIONS_10K, 29 | SECTIONS_10Q, 30 | SECTIONS_S1, 31 | ) 32 | import csv 33 | from typing import Dict 34 | from unstructured.documents.elements import Text, NarrativeText, Title, ListItem 35 | from unstructured.staging.label_studio import stage_for_label_studio 36 | 37 | 38 | app = FastAPI() 39 | router = APIRouter() 40 | 41 | 42 | def is_expected_response_type(media_type, response_type): 43 | if media_type == "application/json" and response_type not in [dict, list]: 44 | return True 45 | elif media_type == "text/csv" and response_type != str: 46 | return True 47 | else: 48 | return False 49 | 50 | 51 | # pipeline-api 52 | 53 | 54 | class timeout: 55 | def __init__(self, seconds=1, error_message="Timeout"): 56 | self.seconds = seconds 57 | self.error_message = error_message 58 | 59 | def handle_timeout(self, signum, frame): 60 | raise TimeoutError(self.error_message) 61 | 62 | def __enter__(self): 63 | try: 64 | signal.signal(signal.SIGALRM, self.handle_timeout) 65 | signal.alarm(self.seconds) 66 | except ValueError: 67 | pass 68 | 69 | def __exit__(self, type, value, traceback): 70 | try: 71 | signal.alarm(0) 72 | except ValueError: 73 | pass 74 | 75 | 76 | def get_regex_enum(section_regex): 77 | class CustomSECSection(Enum): 78 | CUSTOM = re.compile(section_regex) 79 | 80 | @property 81 | def pattern(self): 82 | return self.value 83 | 84 | return CustomSECSection.CUSTOM 85 | 86 | 87 | def convert_to_isd_csv(results: dict) -> str: 88 | """ 89 | Returns the representation of document elements as an Initial Structured Document (ISD) 90 | in CSV Format. 
91 | """ 92 | csv_fieldnames: List[str] = ["section", "element_type", "text"] 93 | new_rows = [] 94 | for section, section_narrative in results.items(): 95 | rows: List[Dict[str, str]] = convert_to_isd(section_narrative) 96 | for row in rows: 97 | new_row_item = dict() 98 | new_row_item["section"] = section 99 | new_row_item["element_type"] = row["type"] 100 | new_row_item["text"] = row["text"] 101 | new_rows.append(new_row_item) 102 | 103 | with io.StringIO() as buffer: 104 | csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames) 105 | csv_writer.writeheader() 106 | csv_writer.writerows(new_rows) 107 | return buffer.getvalue() 108 | 109 | 110 | # List of valid response schemas 111 | LABELSTUDIO = "labelstudio" 112 | ISD = "isd" 113 | 114 | 115 | def pipeline_api( 116 | text, response_type="application/json", response_schema="isd", m_section=[], m_section_regex=[] 117 | ): 118 | """Many supported sections including: RISK_FACTORS, MANAGEMENT_DISCUSSION, and many more""" 119 | validate_section_names(m_section) 120 | 121 | sec_document = SECDocument.from_string(text) 122 | if sec_document.filing_type not in VALID_FILING_TYPES: 123 | raise ValueError( 124 | f"SEC document filing type {sec_document.filing_type} is not supported, " 125 | f"must be one of {','.join(VALID_FILING_TYPES)}" 126 | ) 127 | results = {} 128 | if m_section == [ALL_SECTIONS]: 129 | filing_type = sec_document.filing_type 130 | if filing_type in REPORT_TYPES: 131 | if filing_type.startswith("10-K"): 132 | m_section = [enum.name for enum in SECTIONS_10K] 133 | elif filing_type.startswith("10-Q"): 134 | m_section = [enum.name for enum in SECTIONS_10Q] 135 | else: 136 | raise ValueError(f"Invalid report type: {filing_type}") 137 | 138 | else: 139 | m_section = [enum.name for enum in SECTIONS_S1] 140 | for section in m_section: 141 | results[section] = sec_document.get_section_narrative(section_string_to_enum[section]) 142 | for i, section_regex in enumerate(m_section_regex): 143 | regex_enum = get_regex_enum(section_regex) 144 | with timeout(seconds=5): 145 | section_elements = sec_document.get_section_narrative(regex_enum) 146 | results[f"REGEX_{i}"] = section_elements 147 | if response_type == "application/json": 148 | if response_schema == LABELSTUDIO: 149 | return { 150 | section: stage_for_label_studio(section_narrative) 151 | for section, section_narrative in results.items() 152 | } 153 | elif response_schema == ISD: 154 | return { 155 | section: convert_to_isd(section_narrative) 156 | for section, section_narrative in results.items() 157 | } 158 | else: 159 | raise ValueError( 160 | f"output_schema '{response_schema}' is not supported for {response_type}" 161 | ) 162 | elif response_type == "text/csv": 163 | if response_schema != ISD: 164 | raise ValueError( 165 | f"output_schema '{response_schema}' is not supported for {response_type}" 166 | ) 167 | return convert_to_isd_csv(results) 168 | else: 169 | raise ValueError(f"response_type '{response_type}' is not supported") 170 | 171 | 172 | def get_validated_mimetype(file): 173 | """ 174 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 175 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 176 | return HTTP 400 for an invalid type. 
177 | """ 178 | content_type = file.content_type 179 | if not content_type or content_type == "application/octet-stream": 180 | content_type = mimetypes.guess_type(str(file.filename))[0] 181 | 182 | # Some filetypes missing for this library, just hardcode them for now 183 | if not content_type: 184 | if file.filename.endswith(".md"): 185 | content_type = "text/markdown" 186 | elif file.filename.endswith(".msg"): 187 | content_type = "message/rfc822" 188 | 189 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 190 | if allowed_mimetypes_str is not None: 191 | allowed_mimetypes = allowed_mimetypes_str.split(",") 192 | 193 | if content_type not in allowed_mimetypes: 194 | raise HTTPException( 195 | status_code=400, 196 | detail=( 197 | f"Unable to process {file.filename}: " 198 | f"File type {content_type} is not supported." 199 | ), 200 | ) 201 | 202 | return content_type 203 | 204 | 205 | class MultipartMixedResponse(StreamingResponse): 206 | CRLF = b"\r\n" 207 | 208 | def __init__(self, *args, content_type: str = None, **kwargs): 209 | super().__init__(*args, **kwargs) 210 | self.content_type = content_type 211 | 212 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 213 | super().init_headers(headers) 214 | self.boundary_value = secrets.token_hex(16) 215 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 216 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 217 | 218 | @property 219 | def boundary(self): 220 | return b"--" + self.boundary_value.encode() 221 | 222 | def _build_part_headers(self, headers: dict) -> bytes: 223 | header_bytes = b"" 224 | for header, value in headers.items(): 225 | header_bytes += f"{header}: {value}".encode() + self.CRLF 226 | return header_bytes 227 | 228 | def build_part(self, chunk: bytes) -> bytes: 229 | part = self.boundary + self.CRLF 230 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 231 | if self.content_type is not None: 232 | part_headers["Content-Type"] = self.content_type 233 | part += self._build_part_headers(part_headers) 234 | part += self.CRLF + chunk + self.CRLF 235 | return part 236 | 237 | async def stream_response(self, send: Send) -> None: 238 | await send( 239 | { 240 | "type": "http.response.start", 241 | "status": self.status_code, 242 | "headers": self.raw_headers, 243 | } 244 | ) 245 | async for chunk in self.body_iterator: 246 | if not isinstance(chunk, bytes): 247 | chunk = chunk.encode(self.charset) 248 | chunk = b64encode(chunk) 249 | await send( 250 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 251 | ) 252 | 253 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 254 | 255 | 256 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 257 | def return_content_type(filename): 258 | if gz_uncompressed_content_type: 259 | return gz_uncompressed_content_type 260 | else: 261 | return str(mimetypes.guess_type(filename)[0]) 262 | 263 | filename = str(file.filename) if file.filename else "" 264 | if filename.endswith(".gz"): 265 | filename = filename[:-3] 266 | 267 | gzip_file = gzip.open(file.file).read() 268 | return UploadFile( 269 | file=io.BytesIO(gzip_file), 270 | size=len(gzip_file), 271 | filename=filename, 272 | headers=Headers({"content-type": return_content_type(filename)}), 273 | ) 274 | 275 | 276 | @router.post("/sec-filings/v0/section") 277 | @router.post("/sec-filings/v0.2.1/section") 278 | def 
pipeline_1( 279 | request: Request, 280 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 281 | text_files: Union[List[UploadFile], None] = File(default=None), 282 | output_format: Union[str, None] = Form(default=None), 283 | output_schema: str = Form(default=None), 284 | section: List[str] = Form(default=[]), 285 | section_regex: List[str] = Form(default=[]), 286 | ): 287 | if text_files: 288 | for file_index in range(len(text_files)): 289 | if text_files[file_index].content_type == "application/gzip": 290 | text_files[file_index] = ungz_file(text_files[file_index]) 291 | 292 | content_type = request.headers.get("Accept") 293 | 294 | default_response_type = output_format or "application/json" 295 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 296 | media_type = default_response_type 297 | else: 298 | media_type = content_type 299 | 300 | default_response_schema = output_schema or "isd" 301 | 302 | if isinstance(text_files, list) and len(text_files): 303 | if len(text_files) > 1: 304 | if content_type and content_type not in ["*/*", "multipart/mixed", "application/json"]: 305 | raise HTTPException( 306 | detail=( 307 | f"Conflict in media type {content_type}" 308 | ' with response type "multipart/mixed".\n' 309 | ), 310 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 311 | ) 312 | 313 | def response_generator(is_multipart): 314 | for file in text_files: 315 | get_validated_mimetype(file) 316 | 317 | text = file.file.read().decode("utf-8") 318 | 319 | response = pipeline_api( 320 | text, 321 | m_section=section, 322 | m_section_regex=section_regex, 323 | response_type=media_type, 324 | response_schema=default_response_schema, 325 | ) 326 | 327 | if is_expected_response_type(media_type, type(response)): 328 | raise HTTPException( 329 | detail=( 330 | f"Conflict in media type {media_type}" 331 | f" with response type {type(response)}.\n" 332 | ), 333 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 334 | ) 335 | 336 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 337 | if media_type in valid_response_types: 338 | if is_multipart: 339 | if type(response) not in [str, bytes]: 340 | response = json.dumps(response) 341 | yield response 342 | else: 343 | raise HTTPException( 344 | detail=f"Unsupported media type {media_type}.\n", 345 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 346 | ) 347 | 348 | if content_type == "multipart/mixed": 349 | return MultipartMixedResponse( 350 | response_generator(is_multipart=True), content_type=media_type 351 | ) 352 | else: 353 | return ( 354 | list(response_generator(is_multipart=False))[0] 355 | if len(text_files) == 1 356 | else response_generator(is_multipart=False) 357 | ) 358 | else: 359 | raise HTTPException( 360 | detail='Request parameter "text_files" is required.\n', 361 | status_code=status.HTTP_400_BAD_REQUEST, 362 | ) 363 | 364 | 365 | app.include_router(router) 366 | -------------------------------------------------------------------------------- /test_sec_filings/test_sec_document.py: -------------------------------------------------------------------------------- 1 | from itertools import product, combinations 2 | import pytest 3 | 4 | from unstructured.documents.base import NarrativeText 5 | from unstructured.documents.elements import Title 6 | 7 | from prepline_sec_filings.sec_document import ( 8 | SECDocument, 9 | first, 10 | get_element_by_title, 11 | is_item_title, 12 | is_risk_title, 13 | _raise_for_invalid_filing_type, 14 | is_toc_title, 15 | 
match_s1_toc_title_to_section, 16 | match_10k_toc_title_to_section, 17 | remove_item_from_section_text, 18 | get_narrative_texts, 19 | ) 20 | from prepline_sec_filings.sections import SECSection, ALL_SECTIONS, validate_section_names 21 | 22 | 23 | @pytest.fixture 24 | def table_toc(form_type): 25 | is_s1 = form_type == "S-1" 26 | return f"""
27 | <div>{'Part I. OTHER INFORMATION' if not is_s1 else 'None'}</div>
28 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
29 | <table>
30 | <tr><td>TABLE OF CONTENTS</td></tr>
31 | <tr><td>{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS</td><td>1</td></tr>
32 | <tr><td>{'ITEM 1B. ' if not is_s1 else ''}UNRESOLVED STAFF COMMENTS</td><td>1</td></tr>
33 | <tr><td>{'ITEM 2 ' if not is_s1 else ''}DIVIDEND POLICY</td><td>1</td></tr>
34 | <tr><td>{'ITEM 3 ' if not is_s1 else ''}CAPITALIZATION</td><td>1</td></tr>
35 | <tr><td>{'ITEM 4 ' if not is_s1 else ''}DILUTION</td><td>1</td></tr>
36 | <tr><td>{'ITEM 5 ' if not is_s1 else ''}WOLVERINES AND BEARS</td><td>1</td></tr>
37 | <tr><td>{'ITEM 6 ' if not is_s1 else ''}PROPERTIES</td><td>1</td></tr>
38 | </table>""" 39 | 40 | 41 | @pytest.fixture 42 | def sample_document(form_type, table_toc, use_toc): 43 | is_s1 = form_type == "S-1" 44 | return f""" 45 | {form_type} 46 | Proctor & Gamble 47 | 48 | {table_toc if use_toc else ''}
49 | <div>SECURITY AND EXCHANGE COMISSION FILING</div>
50 | <div>{'Part I.' if not is_s1 else 'None'}</div>
51 | <div>{'OTHER INFORMATION' if not is_s1 else 'None'}</div>
52 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
53 | <div>This is a section on prospectus.</div>
54 | <div>{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS</div>
55 | <div>Wolverines</div>
56 | <div>The business could be attacked by wolverines.</div>
57 | <div>Bears</div>
58 | <div>The business could be attacked by bears.</div>
59 | <div>{'ITEM 1B. ' if not is_s1 else ''}UNRESOLVED STAFF COMMENTS</div>
60 | <div>None</div>
61 | <div>{'ITEM 2 ' if not is_s1 else ''}DIVIDEND POLICY</div>
62 | <div>Dispersing Dividends</div>
63 | <div>Sometimes we disperse dividends, and everyone gets money.</div>
64 | <div>Uh Oh</div>
65 | <div>Sometimes we don't disperse dividends, and nobody gets money.</div>
66 | <div>{'ITEM 3 ' if not is_s1 else ''}CAPITALIZATION</div>
67 | <div>None</div>
68 | <div>{'ITEM 4 ' if not is_s1 else ''}DILUTION</div>
69 | <div>None</div>
70 | <div>{'ITEM 5 ' if not is_s1 else ''}WOLVERINES AND BEARS</div>
71 | <div>Just to reiterate, our business could be the victim of a wolverine attack.</div>
72 | <div>Also bears attack us literally twice a week.</div>
73 | <div>{'ITEM 6 ' if not is_s1 else ''}PROPERTIES</div>
74 | <div>One building in the middle of the woods.</div>
75 | <div>Why did we build it here?</div>
76 | <div>We really should not have done this.</div>
77 | <div>It was Steve's idea.</div>
78 | 79 | """ 80 | 81 | 82 | @pytest.fixture 83 | def sample_document_with_last_sections(form_type, has_form_summary_section, has_exhibits_section): 84 | is_s1 = form_type == "S-1" 85 | show_exhibit = (not is_s1) and has_exhibits_section 86 | show_form_summary = (not is_s1) and has_form_summary_section 87 | 88 | table_toc = f"""
89 | <div>{'Part I. OTHER INFORMATION' if not is_s1 else 'None'}</div>
90 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
91 | <table>
92 | <tr><td>TABLE OF CONTENTS</td></tr>
93 | {'<tr><td>ITEM 7 EXHIBIT</td><td>1</td></tr>' if show_exhibit else ''}
94 | {'<tr><td>ITEM 8 FORM 10-K SUMMARY</td><td>1</td></tr>' if show_form_summary else ''}
95 | </table>""" 96 | 97 | return f""" 98 | {form_type} 99 | Proctor & Gamble 100 | 101 | {table_toc}
102 | <div>SECURITY AND EXCHANGE COMISSION FILING</div>
103 | <div>{'Part I.' if not is_s1 else 'None'}</div>
104 | <div>{'OTHER INFORMATION' if not is_s1 else 'None'}</div>
105 | <div>{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY</div>
106 | <div>This is a section on prospectus.</div>
107 | {'<div>ITEM 7 EXHIBIT</div>' if show_exhibit else ''}
108 | {'<div>This is a section on exhibit.</div>' if show_exhibit else ''}
109 | {'<div>ITEM 8 FORM 10-K SUMMARY</div>' if show_form_summary else ''}
110 | {'<div>This is a section on form summary.</div>' if show_form_summary else ''}
111 | 112 |
""" 113 | 114 | 115 | class MockElement: 116 | def __init__(self, text): 117 | self.text = text 118 | 119 | 120 | @pytest.fixture 121 | def elements(): 122 | texts = ["Risk Factors:", "ITEM 1a. risk factors", "ITEM 3. Cats", "Summary"] 123 | return [MockElement(text) for text in texts] 124 | 125 | 126 | @pytest.mark.parametrize( 127 | "section_name, form_type, use_toc", 128 | product( 129 | [SECSection.DIVIDEND_POLICY], 130 | ["10-Q", "10-K", "S-1"], 131 | [True, False], 132 | ), 133 | ) 134 | def test_get_dividend_narrative(section_name, sample_document): 135 | sec_document = SECDocument.from_string(sample_document) 136 | sections = sec_document.get_section_narrative(section_name) 137 | assert sections == [ 138 | NarrativeText(text="Sometimes we disperse dividends, and everyone gets money."), 139 | NarrativeText(text="Sometimes we don't disperse dividends, and nobody gets money."), 140 | ] 141 | 142 | 143 | @pytest.mark.parametrize("form_type, use_toc", product(("10-Q", "10-K", "S-1"), (True, False))) 144 | def test_get_risk_narrative(sample_document): 145 | sec_document = SECDocument.from_string(sample_document) 146 | risk_sections = sec_document.get_risk_narrative() 147 | assert risk_sections == [ 148 | NarrativeText(text="The business could be attacked by wolverines."), 149 | NarrativeText(text="The business could be attacked by bears."), 150 | ] 151 | 152 | 153 | @pytest.mark.parametrize("form_type, use_toc", product(("10-Q", "10-K", "S-1"), (True, False))) 154 | def test_get_table_of_contents(sample_document, form_type, use_toc): 155 | is_s1 = form_type == "S-1" 156 | sec_document = SECDocument.from_string(sample_document) 157 | toc_elements = sec_document.get_table_of_contents().elements 158 | if use_toc: 159 | assert Title(text=f"{'ITEM 1A. ' if not is_s1 else ''}RISK FACTORS") in toc_elements 160 | else: 161 | assert toc_elements == [] 162 | 163 | 164 | def test_get_10k_table_of_contents_processes_empty_doc(): 165 | sec_document = SECDocument.from_string("10-K") 166 | risk_sections = sec_document.get_table_of_contents().elements 167 | assert risk_sections == list() 168 | 169 | 170 | def test_get_risk_narrative_raises_with_wrong_type(): 171 | sec_document = SECDocument.from_string("999-ZZZ") 172 | with pytest.raises(ValueError): 173 | sec_document.get_risk_narrative() 174 | 175 | 176 | @pytest.mark.parametrize("form_type, use_toc", product(["10-K", "10-Q", "S-1"], [True])) 177 | def test__get_toc_sections(sample_document, form_type): 178 | is_s1 = form_type == "S-1" 179 | sec_document = SECDocument.from_string(sample_document) 180 | toc = sec_document.get_table_of_contents() 181 | # finds the section titles 182 | section_toc, next_section_toc = sec_document._get_toc_sections( 183 | SECSection.PROSPECTUS_SUMMARY, toc 184 | ) 185 | assert ( 186 | section_toc.text == f"{'ITEM 1. ' if not is_s1 else ''}PROSPECTUS SUMMARY" 187 | and next_section_toc.text == f"{'ITEM 1A. 
' if not is_s1 else ''}RISK FACTORS" 188 | ) 189 | # fails to find the section_toc because it's not in the document 190 | section_toc, next_section_toc = sec_document._get_toc_sections(SECSection.EXHIBITS, toc) 191 | assert (section_toc, next_section_toc) == (None, None) 192 | assert sec_document.get_section_narrative(SECSection.EXHIBITS) == [] 193 | 194 | 195 | @pytest.mark.parametrize( 196 | "form_type, has_form_summary_section, has_exhibits_section, expected_last_section", 197 | [ 198 | ("10-K", True, False, SECSection.FORM_SUMMARY), 199 | ("10-K", False, True, SECSection.EXHIBITS), 200 | ("10-K", True, True, SECSection.FORM_SUMMARY), 201 | ("10-Q", False, True, SECSection.EXHIBITS), 202 | ], 203 | ) 204 | def test__is_last_section_in_report(sample_document_with_last_sections, expected_last_section): 205 | sec_document = SECDocument.from_string(sample_document_with_last_sections) 206 | toc = sec_document.get_table_of_contents() 207 | assert sec_document._is_last_section_in_report(expected_last_section, toc) 208 | assert len(sec_document.get_section_narrative(expected_last_section)) == 1 209 | 210 | 211 | @pytest.mark.parametrize( 212 | "section", [SECSection.RISK_FACTORS, SECSection.CAPITALIZATION, SECSection.DIVIDEND_POLICY] 213 | ) 214 | def test_get_10k_section_narrative_processes_empty_doc(section): 215 | sec_document = SECDocument.from_string("10-K") 216 | sections = sec_document.get_section_narrative(section) 217 | assert sections == list() 218 | 219 | 220 | @pytest.mark.parametrize("form_type, use_toc", product(["10-K", "10-Q", "S-1"], [False])) 221 | def test_get_filing_type(sample_document, form_type): 222 | sec_document = SECDocument.from_string(sample_document) 223 | assert sec_document.filing_type == form_type 224 | 225 | 226 | def test_get_filing_type_is_none_when_missing(): 227 | sec_document = SECDocument.from_string("") 228 | assert sec_document.filing_type is None 229 | 230 | 231 | def test_get_narrative_texts_up_to_next_title(): 232 | document_starts_with_narrative_text = """ 233 | 234 | 10-K 235 | Proctor & Gamble 236 | 237 |

<div>this is a narrative text.</div>
238 | <div>'NEXT TITLE'</div>
239 | 240 |
""" 241 | sec_document = SECDocument.from_string(document_starts_with_narrative_text) 242 | narrative_texts_up_to_next_title = get_narrative_texts(sec_document, up_to_next_title=True) 243 | assert narrative_texts_up_to_next_title == [NarrativeText(text="this is a narrative text.")] 244 | 245 | 246 | @pytest.mark.parametrize( 247 | "title, expected", 248 | [ 249 | ("ITEM 1A.", True), 250 | ("item 1a.", True), 251 | ("Item 1.", True), 252 | ("Item 3:", True), 253 | ("Item 3(a):", True), 254 | ("Item 3(a): ", True), 255 | ( 256 | "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND " 257 | "ISSUER PURCHASES OF EQUITY SECURITIES", 258 | True, 259 | ), 260 | ("Item 12A.", True), 261 | ("This is a paragraph about an item", False), 262 | ("RISK FACTORS", False), 263 | ("Risk Factors", False), 264 | ], 265 | ) 266 | def test_is_10k_item_title(title, expected): 267 | assert is_item_title(title, "10-K") == expected 268 | 269 | 270 | @pytest.mark.parametrize( 271 | "title, expected", 272 | [ 273 | ("ITEM 1A.", True), 274 | ("item 1a.", True), 275 | ("Item 1.", False), 276 | ("Item 12A.", False), 277 | ("This is a paragraph about an item", False), 278 | ("RISK FACTORS", True), 279 | ("Risk Factors", True), 280 | ("DISCLOSURES", False), 281 | ("Disclosures", False), 282 | ("SUMMARY OF RISK FACTORS", False), 283 | ], 284 | ) 285 | def test_is_10_k_risk_title(title, expected): 286 | assert is_risk_title(title, "10-K") == expected 287 | 288 | 289 | @pytest.mark.parametrize( 290 | "title, expected", 291 | [ 292 | ("RISK FACTORS", True), 293 | ("SPECIAL NOTE", True), 294 | ("Risk Factors Summary", False), 295 | ], 296 | ) 297 | def test_is_s1_item_title(title, expected): 298 | assert is_item_title(title, "S-1") == expected 299 | 300 | 301 | @pytest.mark.parametrize( 302 | "title, expected", 303 | [ 304 | ("RISK FACTORS", True), 305 | ("SPECIAL NOTE", False), 306 | ("Risk Factors Summary", False), 307 | ], 308 | ) 309 | def test_is_s1_risk_title(title, expected): 310 | assert is_risk_title(title, "S-1") == expected 311 | 312 | 313 | @pytest.mark.parametrize( 314 | "text, title, expected", 315 | [ 316 | ("risk factors", "risk factors", True), 317 | ("risk factors", "something else", False), 318 | ("summary of risk factors", "risk factors", False), 319 | ], 320 | ) 321 | def test_match_s1_toc_title_to_section(text, title, expected): 322 | assert match_s1_toc_title_to_section(text, title) == expected 323 | 324 | 325 | @pytest.mark.parametrize( 326 | "text, title, expected", 327 | [ 328 | ("risk factors", "risk factors", True), 329 | ("summary of risk factors", "risk factors", False), 330 | ("item 1a. risk factors", "item 1a", True), 331 | ("item 1a.", "item 1a", True), 332 | ("item 1a. risk factors", "risk factors", True), 333 | ("item 1a. summary of risk factors", "risk factors", False), 334 | ("item 1a. summary of risk factors", "something else", False), 335 | ], 336 | ) 337 | def test_match_10k_toc_title_to_section(text, title, expected): 338 | assert match_10k_toc_title_to_section(text, title) == expected 339 | 340 | 341 | @pytest.mark.parametrize( 342 | "text, expected", 343 | [("Item 1a. 
Risk Factors", "Risk Factors"), ("Risk Factors", "Risk Factors")], 344 | ) 345 | def test_remove_item_from_section_text(text, expected): 346 | assert remove_item_from_section_text(text) == expected 347 | 348 | 349 | @pytest.mark.parametrize( 350 | "title, expected", 351 | [("Table of contents", True), ("Risk Factors", False), ("Index", True)], 352 | ) 353 | def test_is_toc_title(title, expected): 354 | assert is_toc_title(title) == expected 355 | 356 | 357 | def test_invalid_item_title_returns_false(): 358 | assert is_item_title("TEST", "INVALID") is False 359 | 360 | 361 | def test_invalid_risk_title_returns_false(): 362 | assert is_risk_title("TEST", "INVALID") is False 363 | 364 | 365 | def test_empty_filing_type_raises(): 366 | with pytest.raises(ValueError): 367 | _raise_for_invalid_filing_type(None) 368 | 369 | 370 | @pytest.mark.parametrize("it, expected", [(["a"], "a"), (["b", "a"], "b"), ([], None)]) 371 | def test_first(it, expected): 372 | result = first(it) 373 | if result is None: 374 | assert expected is None 375 | else: 376 | assert result == expected 377 | 378 | 379 | @pytest.mark.parametrize( 380 | "title, filing_type, expected", 381 | [ 382 | ("risk factors", "S-1", "Risk Factors:"), 383 | ("item 1a", "10-Q", "ITEM 1a. risk factors"), 384 | ("cats", "10-Q", "ITEM 3. Cats"), 385 | ("cats", "S-1", None), 386 | ("summary", "S-1", "Summary"), 387 | ("another title", "10-K", None), 388 | ], 389 | ) 390 | def test_get_element_by_title(elements, title, filing_type, expected): 391 | result = get_element_by_title(elements, title, filing_type) 392 | if result is None: 393 | assert expected is None 394 | else: 395 | assert result.text == expected 396 | 397 | 398 | @pytest.mark.parametrize("form_type, use_toc", [("10-Q", True)]) 399 | def test_doc_after_cleaners_keeps_filing_type(form_type, sample_document): 400 | sec_document = SECDocument.from_string(sample_document).doc_after_cleaners() 401 | assert sec_document.filing_type == form_type 402 | 403 | 404 | @pytest.mark.parametrize( 405 | "section_names", 406 | [ 407 | [section.name for section in combo] 408 | for i in range(1, 3) 409 | for combo in combinations(SECSection, i) 410 | ] 411 | + [[ALL_SECTIONS]], 412 | ) 413 | def test_validate_section_names(section_names): 414 | assert validate_section_names(section_names) is None 415 | 416 | 417 | def test_validate_section_names_raises_for_nonsingleton_all(): 418 | with pytest.raises(ValueError): 419 | validate_section_names([ALL_SECTIONS, SECSection.ABOUT_PROSPECTUS]) 420 | 421 | 422 | def test_validate_section_names_raises_for_invalid_section(): 423 | with pytest.raises(ValueError): 424 | validate_section_names(["invalidsection"]) 425 | --------------------------------------------------------------------------------