├── .github ├── FUNDING.yml └── workflows │ └── lint_and_test.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── make_readme.py └── pages │ ├── examples │ ├── as_dataframe.txt │ ├── csv_loader.rst │ ├── dataframe.rst │ ├── gs_loader.rst │ ├── index.rst │ ├── load_csv.txt │ └── url_loader.rst │ ├── genindex.rst │ ├── introduction │ ├── badges.txt │ ├── feature.txt │ ├── index.rst │ ├── installation.rst │ └── summary.txt │ ├── links.rst │ ├── reference │ ├── basic_loader.rst │ ├── error.rst │ ├── format_specific_loader.rst │ ├── index.rst │ └── loader_factory.rst │ └── sponsors.rst ├── examples ├── load_table_from_csv.py ├── load_table_from_gs.py ├── load_table_from_url.py └── pytablereader.ipynb ├── invoke_pytest.py ├── pylama.ini ├── pyproject.toml ├── pytablereader ├── __init__.py ├── __version__.py ├── _acceptor.py ├── _common.py ├── _constant.py ├── _logger │ ├── __init__.py │ ├── _logger.py │ └── _null_logger.py ├── _validator.py ├── csv │ ├── __init__.py │ ├── core.py │ └── formatter.py ├── error.py ├── factory │ ├── __init__.py │ ├── _base.py │ ├── _file.py │ ├── _text.py │ └── _url.py ├── formatter.py ├── html │ ├── __init__.py │ ├── core.py │ └── formatter.py ├── interface.py ├── json │ ├── __init__.py │ ├── core.py │ └── formatter.py ├── jsonlines │ ├── __init__.py │ ├── core.py │ └── formatter.py ├── loadermanager │ ├── __init__.py │ ├── _base.py │ ├── _file.py │ ├── _text.py │ └── _url.py ├── ltsv │ ├── __init__.py │ └── core.py ├── markdown │ ├── __init__.py │ ├── core.py │ └── formatter.py ├── mediawiki │ ├── __init__.py │ ├── core.py │ └── formatter.py ├── spreadsheet │ ├── __init__.py │ ├── core.py │ ├── excelloader.py │ └── gsloader.py ├── sqlite │ ├── __init__.py │ ├── core.py │ └── formatter.py └── tsv │ ├── __init__.py │ └── core.py ├── requirements ├── docs_requirements.txt ├── requirements.txt └── test_requirements.txt ├── setup.py ├── 
test ├── __init__.py ├── _common.py ├── data │ ├── python - Wiktionary.html │ ├── valid.sqlite3 │ └── validdata.xlsx ├── factory │ ├── test_file_loader_factory.py │ └── test_text_loader_factory.py ├── loader │ ├── test_fileloader.py │ ├── test_gsloader.py │ ├── test_textloader.py │ └── test_urlloader.py ├── test_common.py ├── test_csv_reader.py ├── test_excel_reader.py ├── test_html_reader.py ├── test_html_reader_from_file.py ├── test_json_reader.py ├── test_jsonlines_reader.py ├── test_logger.py ├── test_ltsv_reader.py ├── test_markdown_reader.py ├── test_mediawiki_reader.py ├── test_pandas.py ├── test_sqlite_reader.py ├── test_tsv_reader.py └── test_validator.py └── tox.ini /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: thombashi 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/lint_and_test.yml: -------------------------------------------------------------------------------- 1 | name: Lint and Test 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '.gitignore' 7 | - '.readthedocs.yaml' 8 | - 'README.rst' 9 | pull_request: 10 | paths-ignore: 11 | - '.gitignore' 12 | - '.readthedocs.yaml' 13 | - 'README.rst' 14 | 15 | env: 16 | PYTEST_DISCORD_WEBHOOK: ${{ secrets.PYTEST_DISCORD_WEBHOOK }} 
17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | lint: 23 | runs-on: ubuntu-latest 24 | concurrency: 25 | group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref_name }}-lint 26 | cancel-in-progress: true 27 | timeout-minutes: 20 28 | container: 29 | image: ghcr.io/thombashi/python-ci:3.11 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - name: Lint 35 | run: make check 36 | 37 | unit-test: 38 | runs-on: ${{ matrix.os }} 39 | concurrency: 40 | group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref_name }}-ut-${{ matrix.os }}-${{ matrix.python-version }} 41 | cancel-in-progress: true 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', 'pypy-3.8'] 46 | os: [ubuntu-latest, macos-latest, windows-latest] 47 | timeout-minutes: 20 48 | 49 | steps: 50 | - uses: actions/checkout@v3 51 | 52 | - name: Setup Python ${{ matrix.python-version }} 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: ${{ matrix.python-version }} 56 | cache: pip 57 | cache-dependency-path: | 58 | setup.py 59 | **/*requirements.txt 60 | tox.ini 61 | 62 | - name: Install pip 63 | run: python -m pip install --upgrade --disable-pip-version-check "pip>=21.1" 64 | 65 | - name: Install dependencies 66 | run: make setup-ci 67 | 68 | - name: Run tests 69 | run: tox -e py 70 | 71 | coverage: 72 | runs-on: ubuntu-latest 73 | concurrency: 74 | group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref_name }}-coverage 75 | cancel-in-progress: true 76 | timeout-minutes: 20 77 | 78 | steps: 79 | - uses: actions/checkout@v3 80 | 81 | - name: Setup Python 82 | uses: actions/setup-python@v4 83 | with: 84 | python-version: '3.10' 85 | cache: pip 86 | cache-dependency-path: | 87 | setup.py 88 | **/*requirements.txt 89 | tox.ini 90 | 91 | - name: Install dependencies 92 | run: make setup-ci 93 | 94 | - name: Run tests 95 | run: tox -e cov 96 | 97 | - name: Upload coverage report 98 | 
env: 99 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 100 | run: | 101 | python -m pip install --upgrade --disable-pip-version-check coveralls tomli 102 | coveralls --service=github 103 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder 
project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | # User settings 119 | _sandbox/ 120 | *_profile 121 | Untitled.ipynb 122 | 123 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | formats: 12 | - pdf 13 | - epub 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs_requirements.txt 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Tsuyoshi Hombashi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include docs/pages/introduction/summary.txt 2 | include LICENSE 3 | include README.rst 4 | include setup.cfg 5 | include tox.ini 6 | 7 | recursive-include test * 8 | recursive-include requirements * 9 | 10 | global-exclude __pycache__/* 11 | global-exclude *.pyc 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | AUTHOR := thombashi 2 | PACKAGE := pytablereader 3 | BUILD_WORK_DIR := _work 4 | DOCS_DIR := docs 5 | PKG_BUILD_DIR := $(BUILD_WORK_DIR)/$(PACKAGE) 6 | PYTHON := python3 7 | 8 | 9 | .PHONY: build-remote 10 | build-remote: clean 11 | @mkdir -p $(BUILD_WORK_DIR) 12 | @cd $(BUILD_WORK_DIR) && \ 13 | git clone https://github.com/$(AUTHOR)/$(PACKAGE).git --depth 1 && \ 14 | cd $(PACKAGE) && \ 15 | $(PYTHON) -m tox -e build 16 | ls -lh $(PKG_BUILD_DIR)/dist/* 17 | 18 | .PHONY: build 19 | build: clean 20 | @$(PYTHON) -m tox -e build 21 | ls -lh dist/* 22 | 23 | .PHONY: check 24 | check: 25 | @$(PYTHON) -m tox -e lint 26 | 27 | .PHONY: clean 28 | clean: 29 | @rm -rf $(BUILD_WORK_DIR) 30 | @$(PYTHON) -m tox -e clean 31 | 32 | .PHONY: docs 33 | docs: 34 | @$(PYTHON) -m tox -e docs 35 | 36 | .PHONY: fmt 37 | fmt: 38 | @$(PYTHON) -m tox -e fmt 39 | 40 | .PHONY: readme 41 | readme: 42 | @$(PYTHON) -m tox -e readme 43 | 44 | .PHONY: release 45 | release: 46 | cd $(PKG_BUILD_DIR) && $(PYTHON) setup.py release --verbose 47 | $(MAKE) clean 48 | 49 | .PHONY: setup-ci 50 | setup-ci: 51 | 
@$(PYTHON) -m pip install --disable-pip-version-check --upgrade releasecmd tox 52 | 53 | .PHONY: setup 54 | setup: setup-ci 55 | @$(PYTHON) -m pip install -q --disable-pip-version-check --upgrade -e .[test] 56 | @$(PYTHON) -m pip check 57 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. contents:: **pytablereader** 2 | :backlinks: top 3 | :depth: 2 4 | 5 | Summary 6 | ========= 7 | `pytablereader <https://github.com/thombashi/pytablereader>`__ is a Python library to load structured table data from files/strings/URLs in various data formats: CSV / Excel / Google-Sheets / HTML / JSON / LDJSON / LTSV / Markdown / SQLite / TSV. 8 | 9 | .. image:: https://badge.fury.io/py/pytablereader.svg 10 | :target: https://badge.fury.io/py/pytablereader 11 | :alt: PyPI package version 12 | 13 | .. image:: https://img.shields.io/pypi/pyversions/pytablereader.svg 14 | :target: https://pypi.org/project/pytablereader 15 | :alt: Supported Python versions 16 | 17 | .. image:: https://img.shields.io/pypi/implementation/pytablereader.svg 18 | :target: https://pypi.org/project/pytablereader 19 | :alt: Supported Python implementations 20 | 21 | .. image:: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml/badge.svg 22 | :target: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml 23 | :alt: CI status of Linux/macOS/Windows 24 | 25 | .. image:: https://coveralls.io/repos/github/thombashi/pytablereader/badge.svg?branch=master 26 | :target: https://coveralls.io/github/thombashi/pytablereader?branch=master 27 | :alt: Test coverage 28 | 29 | .. 
image:: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql/badge.svg 30 | :target: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql 31 | :alt: CodeQL 32 | 33 | Features 34 | -------- 35 | - Extract structured tabular data from various data formats: 36 | - CSV / Tab separated values (TSV) / Space separated values (SSV) 37 | - Microsoft Excel :superscript:`TM` file 38 | - `Google Sheets `_ 39 | - HTML (``table`` tags) 40 | - JSON 41 | - `Labeled Tab-separated Values (LTSV) `__ 42 | - `Line-delimited JSON(LDJSON) `__ / NDJSON / JSON Lines 43 | - Markdown 44 | - MediaWiki 45 | - SQLite database file 46 | - Supported data sources are: 47 | - Files on a local file system 48 | - Accessible URLs 49 | - ``str`` instances 50 | - Loaded table data can be used as: 51 | - `pandas.DataFrame `__ instance 52 | - ``dict`` instance 53 | 54 | Examples 55 | ========== 56 | Load a CSV table 57 | ------------------ 58 | :Sample Code: 59 | .. code-block:: python 60 | 61 | import pytablereader as ptr 62 | import pytablewriter as ptw 63 | 64 | 65 | # prepare data --- 66 | file_path = "sample_data.csv" 67 | csv_text = "\n".join([ 68 | '"attr_a","attr_b","attr_c"', 69 | '1,4,"a"', 70 | '2,2.1,"bb"', 71 | '3,120.9,"ccc"', 72 | ]) 73 | 74 | with open(file_path, "w") as f: 75 | f.write(csv_text) 76 | 77 | # load from a csv file --- 78 | loader = ptr.CsvTableFileLoader(file_path) 79 | for table_data in loader.load(): 80 | print("\n".join([ 81 | "load from file", 82 | "==============", 83 | "{:s}".format(ptw.dumps_tabledata(table_data)), 84 | ])) 85 | 86 | # load from a csv text --- 87 | loader = ptr.CsvTableTextLoader(csv_text) 88 | for table_data in loader.load(): 89 | print("\n".join([ 90 | "load from text", 91 | "==============", 92 | "{:s}".format(ptw.dumps_tabledata(table_data)), 93 | ])) 94 | 95 | 96 | :Output: 97 | .. code-block:: 98 | 99 | load from file 100 | ============== 101 | .. 
table:: sample_data 102 | 103 | ====== ====== ====== 104 | attr_a attr_b attr_c 105 | ====== ====== ====== 106 | 1 4.0 a 107 | 2 2.1 bb 108 | 3 120.9 ccc 109 | ====== ====== ====== 110 | 111 | load from text 112 | ============== 113 | .. table:: csv2 114 | 115 | ====== ====== ====== 116 | attr_a attr_b attr_c 117 | ====== ====== ====== 118 | 1 4.0 a 119 | 2 2.1 bb 120 | 3 120.9 ccc 121 | ====== ====== ====== 122 | 123 | Get loaded table data as pandas.DataFrame instance 124 | ---------------------------------------------------- 125 | 126 | :Sample Code: 127 | .. code-block:: python 128 | 129 | import pytablereader as ptr 130 | 131 | loader = ptr.CsvTableTextLoader( 132 | "\n".join([ 133 | "a,b", 134 | "1,2", 135 | "3.3,4.4", 136 | ])) 137 | for table_data in loader.load(): 138 | print(table_data.as_dataframe()) 139 | 140 | :Output: 141 | .. code-block:: 142 | 143 | a b 144 | 0 1 2 145 | 1 3.3 4.4 146 | 147 | For more information 148 | ---------------------- 149 | More examples are available at 150 | https://pytablereader.rtfd.io/en/latest/pages/examples/index.html 151 | 152 | Installation 153 | ============ 154 | 155 | Install from PyPI 156 | ------------------------------ 157 | :: 158 | 159 | pip install pytablereader 160 | 161 | Some of the formats require additional dependency packages. You can install them as follows: 162 | 163 | - Excel 164 | - ``pip install pytablereader[excel]`` 165 | - Google Sheets 166 | - ``pip install pytablereader[gs]`` 167 | - Markdown 168 | - ``pip install pytablereader[md]`` 169 | - Mediawiki 170 | - ``pip install pytablereader[mediawiki]`` 171 | - SQLite 172 | - ``pip install pytablereader[sqlite]`` 173 | - Load from URLs 174 | - ``pip install pytablereader[url]`` 175 | - All of the extra dependencies 176 | - ``pip install pytablereader[all]`` 177 | 178 | Install from PPA (for Ubuntu) 179 | ------------------------------ 180 | :: 181 | 182 | sudo add-apt-repository ppa:thombashi/ppa 183 | sudo apt update 184 | 
sudo apt install python3-pytablereader 185 | 186 | 187 | Dependencies 188 | ============ 189 | - Python 3.7+ 190 | - `Python package dependencies (automatically installed) `__ 191 | 192 | 193 | Optional Python packages 194 | ------------------------------------------------ 195 | - ``logging`` extras 196 | - `loguru `__: Used for logging if the package is installed 197 | - ``excel`` extras 198 | - `excelrd `__ 199 | - ``md`` extras 200 | - `Markdown `__ 201 | - ``mediawiki`` extras 202 | - `pypandoc `__ 203 | - ``sqlite`` extras 204 | - `SimpleSQLite `__ 205 | - ``url`` extras 206 | - `retryrequests `__ 207 | - `pandas `__ 208 | - required to get table data as a pandas data frame 209 | - `lxml `__ 210 | 211 | Optional packages (other than Python packages) 212 | ------------------------------------------------ 213 | - ``libxml2`` (faster HTML conversion) 214 | - `pandoc `__ (required when loading MediaWiki files) 215 | 216 | Documentation 217 | =============== 218 | https://pytablereader.rtfd.io/ 219 | 220 | Related Project 221 | ================= 222 | - `pytablewriter `__ 223 | - Tabular data loaded by ``pytablereader`` can be written in another tabular data format with ``pytablewriter``. 224 | 225 | Sponsors 226 | ==================================== 227 | .. image:: https://avatars.githubusercontent.com/u/44389260?s=48&u=6da7176e51ae2654bcfd22564772ef8a3bb22318&v=4 228 | :target: https://github.com/chasbecker 229 | :alt: Charles Becker (chasbecker) 230 | .. image:: https://avatars.githubusercontent.com/u/46711571?s=48&u=57687c0e02d5d6e8eeaf9177f7b7af4c9f275eb5&v=4 231 | :target: https://github.com/Arturi0 232 | :alt: onetime: Arturi0 233 | .. 
image:: https://avatars.githubusercontent.com/u/3658062?s=48&v=4 234 | :target: https://github.com/b4tman 235 | :alt: onetime: Dmitry Belyaev (b4tman) 236 | 237 | `Become a sponsor `__ 238 | 239 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | 
@echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pytablereader.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pytablereader.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pytablereader" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pytablereader" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 
136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 
189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to pytablereader's documentation! 2 | ========================================== 3 | 4 | .. raw:: html 5 | 6 |
7 | 8 |
9 |
10 | 11 | .. toctree:: 12 | :caption: Table of Contents 13 | :maxdepth: 4 14 | :numbered: 15 | 16 | pages/introduction/index 17 | pages/examples/index 18 | pages/reference/index 19 | pages/links 20 | 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | -------------------------------------------------------------------------------- /docs/make_readme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | .. codeauthor:: Tsuyoshi Hombashi 5 | """ 6 | 7 | import sys 8 | 9 | from path import Path 10 | from readmemaker import ReadmeMaker 11 | 12 | 13 | PROJECT_NAME = "pytablereader" 14 | OUTPUT_DIR = ".." 15 | 16 | 17 | def write_examples(maker): 18 | maker.set_indent_level(0) 19 | maker.write_chapter("Examples") 20 | 21 | examples_root = Path("pages").joinpath("examples") 22 | maker.inc_indent_level() 23 | 24 | maker.write_chapter("Load a CSV table") 25 | maker.write_file(examples_root.joinpath("load_csv.txt")) 26 | 27 | maker.write_chapter("Get loaded table data as pandas.DataFrame instance") 28 | maker.write_file(examples_root.joinpath("as_dataframe.txt")) 29 | 30 | maker.write_chapter("For more information") 31 | maker.write_lines( 32 | [ 33 | "More examples are available at ", 34 | f"https://{PROJECT_NAME:s}.rtfd.io/en/latest/pages/examples/index.html", 35 | ] 36 | ) 37 | 38 | 39 | def main(): 40 | maker = ReadmeMaker( 41 | PROJECT_NAME, 42 | OUTPUT_DIR, 43 | is_make_toc=True, 44 | project_url=f"https://github.com/thombashi/{PROJECT_NAME}", 45 | ) 46 | 47 | maker.write_chapter("Summary") 48 | maker.write_introduction_file("summary.txt") 49 | maker.write_introduction_file("badges.txt") 50 | maker.write_introduction_file("feature.txt") 51 | 52 | write_examples(maker) 53 | 54 | maker.write_introduction_file("installation.rst") 55 | 56 | maker.set_indent_level(0) 57 | maker.write_chapter("Documentation") 58 | 
maker.write_lines([f"https://{PROJECT_NAME:s}.rtfd.io/"]) 59 | 60 | maker.write_chapter("Related Project") 61 | maker.write_lines( 62 | [ 63 | "- `pytablewriter `__", 64 | " - Tabular data loaded by ``pytablereader`` can be written " 65 | "another tabular data format with ``pytablewriter``.", 66 | ] 67 | ) 68 | 69 | maker.write_file(maker.doc_page_root_dir_path.joinpath("sponsors.rst")) 70 | 71 | return 0 72 | 73 | 74 | if __name__ == "__main__": 75 | sys.exit(main()) 76 | -------------------------------------------------------------------------------- /docs/pages/examples/as_dataframe.txt: -------------------------------------------------------------------------------- 1 | 2 | :Sample Code: 3 | .. code-block:: python 4 | :caption: Convert from loaded tabledata.TableData to pandas.DataFrame 5 | 6 | import pytablereader as ptr 7 | 8 | loader = ptr.CsvTableTextLoader( 9 | "\n".join([ 10 | "a,b", 11 | "1,2", 12 | "3.3,4.4", 13 | ])) 14 | for table_data in loader.load(): 15 | print(table_data.as_dataframe()) 16 | 17 | :Output: 18 | .. code-block:: none 19 | 20 | a b 21 | 0 1 2 22 | 1 3.3 4.4 23 | -------------------------------------------------------------------------------- /docs/pages/examples/csv_loader.rst: -------------------------------------------------------------------------------- 1 | .. _example-csv-table-loader: 2 | 3 | Load table data from CSV 4 | ---------------------------- 5 | 6 | Following example shows how to extract |TableData| from CSV data by using |CsvTableFileLoader| and |CsvTableTextLoader| classes. 7 | 8 | .. include:: load_csv.txt 9 | 10 | -------------------------------------------------------------------------------- /docs/pages/examples/dataframe.rst: -------------------------------------------------------------------------------- 1 | .. 
_example-as-dataframe: 2 | 3 | Get loaded table data as pandas.DataFrame 4 | -------------------------------------------------------- 5 | A |TableData| instance can be converted to a ``pandas.DataFrame`` instance 6 | by :py:meth:`~tabledata.TableData.as_dataframe`. 7 | 8 | .. include:: as_dataframe.txt 9 | -------------------------------------------------------------------------------- /docs/pages/examples/gs_loader.rst: -------------------------------------------------------------------------------- 1 | .. _example-gs-table-loader: 2 | 3 | Load table data from Google Sheets 4 | ------------------------------------- 5 | The following example shows how to extract |TableData| from Google Sheets by using the |GoogleSheetsTableLoader| class. 6 | 7 | .. code-block:: python 8 | :caption: Load table data from Google Sheets 9 | 10 | import pytablereader as ptr 11 | import pytablewriter as ptw 12 | 13 | 14 | credentials_file = "sample-xxxxxxxxxxxx.json" 15 | 16 | loader = ptr.GoogleSheetsTableLoader(credentials_file) 17 | loader.title = "testbook" 18 | 19 | for table_data in loader.load(): 20 | print(ptw.dumps_tabledata(table_data)) 21 | 22 | -------------------------------------------------------------------------------- /docs/pages/examples/index.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | csv_loader 8 | url_loader 9 | gs_loader 10 | dataframe 11 | -------------------------------------------------------------------------------- /docs/pages/examples/load_csv.txt: -------------------------------------------------------------------------------- 1 | :Sample Code: 2 | .. 
code-block:: python 3 | :caption: Load table from CSV 4 | 5 | import pytablereader as ptr 6 | import pytablewriter as ptw 7 | 8 | 9 | # prepare data --- 10 | file_path = "sample_data.csv" 11 | csv_text = "\n".join([ 12 | '"attr_a","attr_b","attr_c"', 13 | '1,4,"a"', 14 | '2,2.1,"bb"', 15 | '3,120.9,"ccc"', 16 | ]) 17 | 18 | with open(file_path, "w") as f: 19 | f.write(csv_text) 20 | 21 | # load from a csv file --- 22 | loader = ptr.CsvTableFileLoader(file_path) 23 | for table_data in loader.load(): 24 | print("\n".join([ 25 | "load from file", 26 | "==============", 27 | "{:s}".format(ptw.dumps_tabledata(table_data)), 28 | ])) 29 | 30 | # load from a csv text --- 31 | loader = ptr.CsvTableTextLoader(csv_text) 32 | for table_data in loader.load(): 33 | print("\n".join([ 34 | "load from text", 35 | "==============", 36 | "{:s}".format(ptw.dumps_tabledata(table_data)), 37 | ])) 38 | 39 | 40 | :Output: 41 | .. code-block:: none 42 | 43 | load from file 44 | ============== 45 | .. table:: sample_data 46 | 47 | ====== ====== ====== 48 | attr_a attr_b attr_c 49 | ====== ====== ====== 50 | 1 4.0 a 51 | 2 2.1 bb 52 | 3 120.9 ccc 53 | ====== ====== ====== 54 | 55 | load from text 56 | ============== 57 | .. table:: csv2 58 | 59 | ====== ====== ====== 60 | attr_a attr_b attr_c 61 | ====== ====== ====== 62 | 1 4.0 a 63 | 2 2.1 bb 64 | 3 120.9 ccc 65 | ====== ====== ====== 66 | -------------------------------------------------------------------------------- /docs/pages/examples/url_loader.rst: -------------------------------------------------------------------------------- 1 | .. _example-url-table-loader: 2 | 3 | Load table data from a web page 4 | ------------------------------------- 5 | Following example shows how to extract |TableData| from a web page by using |TableUrlLoader| class. 6 | 7 | :Sample Code: 8 | .. 
code-block:: python 9 | :caption: Load table from a web page 10 | 11 | import io 12 | 13 | import pytablereader as ptr 14 | import pytablewriter as ptw 15 | 16 | 17 | loader = ptr.TableUrlLoader( 18 | "https://en.wikipedia.org/wiki/List_of_unit_testing_frameworks", 19 | "html") 20 | 21 | writer = ptw.TableWriterFactory.create_from_format_name("rst") 22 | writer.stream = io.open("load_url_result.rst", "w", encoding=loader.encoding) 23 | for table_data in loader.load(): 24 | writer.from_tabledata(table_data) 25 | writer.write_table() 26 | 27 | :Output: 28 | .. code-block:: console 29 | 30 | $ ./load_table_from_url.py 31 | $ head load_url_result.rst -n 8 32 | .. table:: List of unit testing frameworks - Wikipedia_html1 33 | 34 | +---------+-----+------+------------------------+ 35 | | Name |xUnit|Source| Remarks | 36 | +=========+=====+======+========================+ 37 | |ABAP Unit|Yes |[1] |since SAP NetWeaver 2004| 38 | +---------+-----+------+------------------------+ 39 | -------------------------------------------------------------------------------- /docs/pages/genindex.rst: -------------------------------------------------------------------------------- 1 | Indices and tables 2 | ================== 3 | 4 | * :ref:`genindex` -------------------------------------------------------------------------------- /docs/pages/introduction/badges.txt: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/pytablereader.svg 2 | :target: https://badge.fury.io/py/pytablereader 3 | :alt: PyPI package version 4 | 5 | .. image:: https://img.shields.io/pypi/pyversions/pytablereader.svg 6 | :target: https://pypi.org/project/pytablereader 7 | :alt: Supported Python versions 8 | 9 | .. image:: https://img.shields.io/pypi/implementation/pytablereader.svg 10 | :target: https://pypi.org/project/pytablereader 11 | :alt: Supported Python implementations 12 | 13 | .. 
image:: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml/badge.svg 14 | :target: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml 15 | :alt: CI status of Linux/macOS/Windows 16 | 17 | .. image:: https://coveralls.io/repos/github/thombashi/pytablereader/badge.svg?branch=master 18 | :target: https://coveralls.io/github/thombashi/pytablereader?branch=master 19 | :alt: Test coverage 20 | 21 | .. image:: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql/badge.svg 22 | :target: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql 23 | :alt: CodeQL 24 | -------------------------------------------------------------------------------- /docs/pages/introduction/feature.txt: -------------------------------------------------------------------------------- 1 | Features 2 | -------- 3 | - Extract structured tabular data from various data format: 4 | - CSV / Tab separated values (TSV) / Space separated values (SSV) 5 | - Microsoft Excel :superscript:`TM` file 6 | - `Google Sheets `_ 7 | - HTML (``table`` tags) 8 | - JSON 9 | - `Labeled Tab-separated Values (LTSV) `__ 10 | - `Line-delimited JSON(LDJSON) `__ / NDJSON / JSON Lines 11 | - Markdown 12 | - MediaWiki 13 | - SQLite database file 14 | - Supported data sources are: 15 | - Files on a local file system 16 | - Accessible URLs 17 | - ``str`` instances 18 | - Loaded table data can be used as: 19 | - `pandas.DataFrame `__ instance 20 | - ``dict`` instance 21 | -------------------------------------------------------------------------------- /docs/pages/introduction/index.rst: -------------------------------------------------------------------------------- 1 | pytablereader 2 | =============== 3 | 4 | .. include:: badges.txt 5 | 6 | 7 | Summary 8 | ------- 9 | 10 | .. include:: summary.txt 11 | 12 | .. raw:: html 13 | 14 |
15 | 16 |
17 |
18 | 19 | 20 | .. include:: feature.txt 21 | 22 | 23 | .. include:: installation.rst 24 | -------------------------------------------------------------------------------- /docs/pages/introduction/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Install from PyPI 5 | ------------------------------ 6 | :: 7 | 8 | pip install pytablereader 9 | 10 | Some of the formats require additional dependency packages, you can install the dependency packages as follows: 11 | 12 | - Excel 13 | - ``pip install pytablereader[excel]`` 14 | - Google Sheets 15 | - ``pip install pytablereader[gs]`` 16 | - Markdown 17 | - ``pip install pytablereader[md]`` 18 | - Mediawiki 19 | - ``pip install pytablereader[mediawiki]`` 20 | - SQLite 21 | - ``pip install pytablereader[sqlite]`` 22 | - Load from URLs 23 | - ``pip install pytablereader[url]`` 24 | - All of the extra dependencies 25 | - ``pip install pytablereader[all]`` 26 | 27 | Install from PPA (for Ubuntu) 28 | ------------------------------ 29 | :: 30 | 31 | sudo add-apt-repository ppa:thombashi/ppa 32 | sudo apt update 33 | sudo apt install python3-pytablereader 34 | 35 | 36 | Dependencies 37 | ============ 38 | - Python 3.7+ 39 | - `Python package dependencies (automatically installed) `__ 40 | 41 | 42 | Optional Python packages 43 | ------------------------------------------------ 44 | - ``logging`` extras 45 | - `loguru `__: Used for logging if the package installed 46 | - ``excel`` extras 47 | - `excelrd `__ 48 | - ``md`` extras 49 | - `Markdown `__ 50 | - ``mediawiki`` extras 51 | - `pypandoc `__ 52 | - ``sqlite`` extras 53 | - `SimpleSQLite `__ 54 | - ``url`` extras 55 | - `retryrequests `__ 56 | - `pandas `__ 57 | - required to get table data as a pandas data frame 58 | - `lxml `__ 59 | 60 | Optional packages (other than Python packages) 61 | ------------------------------------------------ 62 | - ``libxml2`` (faster HTML conversion) 63 | 
- `pandoc <https://pandoc.org/>`__ (required when loading MediaWiki files) 64 | -------------------------------------------------------------------------------- /docs/pages/introduction/summary.txt: -------------------------------------------------------------------------------- 1 | pytablereader is a Python library to load structured table data from files/strings/URLs in various data formats: CSV / Excel / Google-Sheets / HTML / JSON / LDJSON / LTSV / Markdown / SQLite / TSV. 2 | -------------------------------------------------------------------------------- /docs/pages/links.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========== 3 | https://github.com/thombashi/pytablereader/releases 4 | 5 | 6 | .. include:: sponsors.rst 7 | 8 | .. include:: genindex.rst 9 | 10 | 11 | Links 12 | ===== 13 | - `GitHub repository <https://github.com/thombashi/pytablereader>`__ 14 | - `Issue tracker <https://github.com/thombashi/pytablereader/issues>`__ 15 | - `pip: A tool for installing Python packages <https://pip.pypa.io/en/stable/>`__ 16 | -------------------------------------------------------------------------------- /docs/pages/reference/basic_loader.rst: -------------------------------------------------------------------------------- 1 | Table Loader Wrapper Classes 2 | ---------------------------- 3 | 4 | File Loader Wrapper 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | .. autoclass:: pytablereader.TableFileLoader 7 | :inherited-members: 8 | 9 | Text Loader Wrapper 10 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 11 | .. autoclass:: pytablereader.TableTextLoader 12 | :inherited-members: 13 | 14 | URL Loader Wrapper 15 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 16 | .. autoclass:: pytablereader.TableUrlLoader 17 | :inherited-members: 18 | -------------------------------------------------------------------------------- /docs/pages/reference/error.rst: -------------------------------------------------------------------------------- 1 | Exceptions 2 | ---------------------------- 3 | 4 | .. autoexception:: pytablereader.ValidationError 5 | :show-inheritance: 6 | 7 | .. 
autoexception:: pytablereader.PathError 8 | :show-inheritance: 9 | 10 | .. autoexception:: pytablereader.InvalidFilePathError 11 | :show-inheritance: 12 | 13 | .. autoexception:: pytablereader.UrlError 14 | :show-inheritance: 15 | 16 | .. autoexception:: pytablereader.OpenError 17 | :show-inheritance: 18 | 19 | .. autoexception:: pytablereader.LoaderNotFoundError 20 | :show-inheritance: 21 | 22 | .. autoexception:: pytablereader.HTTPError 23 | :show-inheritance: 24 | 25 | .. autoexception:: pytablereader.ProxyError 26 | :show-inheritance: 27 | -------------------------------------------------------------------------------- /docs/pages/reference/format_specific_loader.rst: -------------------------------------------------------------------------------- 1 | Format Specific Table Loader Classes 2 | -------------------------------------------- 3 | 4 | AbstractTableReader class 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | .. autoclass:: pytablereader.interface.AbstractTableReader 7 | :inherited-members: 8 | :show-inheritance: 9 | 10 | 11 | CSV Loader Classes 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | CSV Table Loader 15 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 16 | .. autoclass:: pytablereader.csv.core.CsvTableLoader 17 | :inherited-members: 18 | 19 | CSV File Loader 20 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 21 | .. autoclass:: pytablereader.CsvTableFileLoader 22 | :inherited-members: 23 | :show-inheritance: 24 | 25 | CSV Text Loader 26 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 27 | .. autoclass:: pytablereader.CsvTableTextLoader 28 | :inherited-members: 29 | :exclude-members: source_type,get_format_key,make_table_name 30 | :show-inheritance: 31 | 32 | 33 | HTML Loader Classes 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | HTML File Loader 37 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 38 | .. autoclass:: pytablereader.HtmlTableFileLoader 39 | :inherited-members: 40 | 41 | HTML Text Loader 42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 43 | .. 
autoclass:: pytablereader.HtmlTableTextLoader 44 | :inherited-members: 45 | 46 | 47 | JSON Loader Classes 48 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 49 | 50 | Json File Loader 51 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 52 | .. autoclass:: pytablereader.JsonTableFileLoader 53 | :inherited-members: 54 | 55 | Json Text Loader 56 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 57 | .. autoclass:: pytablereader.JsonTableTextLoader 58 | :inherited-members: 59 | 60 | Line-delimited Json File Loader 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 62 | .. autoclass:: pytablereader.JsonLinesTableFileLoader 63 | :inherited-members: 64 | 65 | Line-delimited Json Text Loader 66 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 67 | .. autoclass:: pytablereader.JsonLinesTableTextLoader 68 | :inherited-members: 69 | 70 | 71 | LTSV Loader Classes 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 73 | 74 | LTSV File Loader 75 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 76 | .. autoclass:: pytablereader.LtsvTableFileLoader 77 | :inherited-members: 78 | :show-inheritance: 79 | 80 | LTSV Text Loader 81 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 82 | .. autoclass:: pytablereader.LtsvTableTextLoader 83 | :inherited-members: 84 | :exclude-members: source_type,get_format_key,make_table_name 85 | :show-inheritance: 86 | 87 | 88 | Markdown Loader Classes 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 90 | 91 | Markdown File Loader 92 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 93 | .. autoclass:: pytablereader.MarkdownTableFileLoader 94 | :inherited-members: 95 | 96 | Markdown Text Loader 97 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 98 | .. autoclass:: pytablereader.MarkdownTableTextLoader 99 | :inherited-members: 100 | 101 | 102 | MediaWiki Loader Classes 103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 104 | 105 | MediaWiki File Loader 106 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 107 | .. autoclass:: pytablereader.MediaWikiTableFileLoader 108 | :inherited-members: 109 | 110 | MediaWiki Text Loader 111 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 112 | .. 
autoclass:: pytablereader.MediaWikiTableTextLoader 113 | :inherited-members: 114 | 115 | 116 | Spread Sheet Loader Classes 117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 118 | 119 | Excel File Loader 120 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 121 | .. autoclass:: pytablereader.ExcelTableFileLoader 122 | :inherited-members: 123 | 124 | Google Sheets Loader 125 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 126 | .. autoclass:: pytablereader.GoogleSheetsTableLoader 127 | :inherited-members: 128 | 129 | 130 | Database Loader Classes 131 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 132 | SQLite File Loader 133 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 134 | .. autoclass:: pytablereader.SqliteFileLoader 135 | :inherited-members: 136 | -------------------------------------------------------------------------------- /docs/pages/reference/index.rst: -------------------------------------------------------------------------------- 1 | Reference 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | basic_loader 8 | format_specific_loader 9 | loader_factory 10 | error 11 | -------------------------------------------------------------------------------- /docs/pages/reference/loader_factory.rst: -------------------------------------------------------------------------------- 1 | Table Loader Factory Classes 2 | ---------------------------- 3 | 4 | File Loader Factory 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | .. autoclass:: pytablereader.factory.TableFileLoaderFactory 7 | :inherited-members: 8 | :undoc-members: 9 | 10 | Text Loader Factory 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12 | .. autoclass:: pytablereader.factory.TableTextLoaderFactory 13 | :inherited-members: 14 | :undoc-members: 15 | 16 | Url Loader Factory 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .. 
autoclass:: pytablereader.factory.TableUrlLoaderFactory 19 | :inherited-members: 20 | :undoc-members: 21 | -------------------------------------------------------------------------------- /docs/pages/sponsors.rst: -------------------------------------------------------------------------------- 1 | Sponsors 2 | ==================================== 3 | .. image:: https://avatars.githubusercontent.com/u/44389260?s=48&u=6da7176e51ae2654bcfd22564772ef8a3bb22318&v=4 4 | :target: https://github.com/chasbecker 5 | :alt: Charles Becker (chasbecker) 6 | .. image:: https://avatars.githubusercontent.com/u/46711571?s=48&u=57687c0e02d5d6e8eeaf9177f7b7af4c9f275eb5&v=4 7 | :target: https://github.com/Arturi0 8 | :alt: onetime: Arturi0 9 | .. image:: https://avatars.githubusercontent.com/u/3658062?s=48&v=4 10 | :target: https://github.com/b4tman 11 | :alt: onetime: Dmitry Belyaev (b4tman) 12 | 13 | `Become a sponsor `__ 14 | -------------------------------------------------------------------------------- /examples/load_table_from_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytablewriter as ptw 4 | 5 | import pytablereader as ptr 6 | 7 | 8 | # prepare data --- 9 | file_path = "sample_data.csv" 10 | csv_text = "\n".join([ 11 | '"attr_a","attr_b","attr_c"', 12 | '1,4,"a"', 13 | '2,2.1,"bb"', 14 | '3,120.9,"ccc"', 15 | ]) 16 | 17 | with open(file_path, "w") as f: 18 | f.write(csv_text) 19 | 20 | # load from a csv file --- 21 | loader = ptr.CsvTableFileLoader(file_path) 22 | for table_data in loader.load(): 23 | print("\n".join([ 24 | "load from file", 25 | "==============", 26 | f"{ptw.dumps_tabledata(table_data):s}", 27 | ])) 28 | 29 | # load from a csv text --- 30 | loader = ptr.CsvTableTextLoader(csv_text) 31 | for table_data in loader.load(): 32 | print("\n".join([ 33 | "load from text", 34 | "==============", 35 | f"{ptw.dumps_tabledata(table_data):s}", 36 | ])) 37 | 
-------------------------------------------------------------------------------- /examples/load_table_from_gs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytablewriter as ptw 4 | 5 | import pytablereader as ptr 6 | 7 | 8 | credentials_file = "sample-xxxxxxxxxxxx.json" 9 | 10 | loader = ptr.GoogleSheetsTableLoader(credentials_file) 11 | loader.title = "testbook" 12 | 13 | for table_data in loader.load(): 14 | print(ptw.dumps_tabledata(table_data)) 15 | -------------------------------------------------------------------------------- /examples/load_table_from_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pytablewriter as ptw 4 | 5 | import pytablereader as ptr 6 | 7 | 8 | loader = ptr.TableUrlLoader( 9 | "https://en.wikipedia.org/wiki/List_of_unit_testing_frameworks", 10 | "html") 11 | 12 | writer = ptw.TableWriterFactory.create_from_format_name("rst") 13 | writer.stream = open("load_url_result.rst", "w", encoding=loader.encoding) 14 | for table_data in loader.load(): 15 | writer.from_tabledata(table_data) 16 | writer.write_table() 17 | -------------------------------------------------------------------------------- /invoke_pytest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests at Windows environments required to invoke from py module, 3 | because of multiprocessing: 4 | https://py.rtfd.io/en/latest/faq.html?highlight=cmdline#issues-with-py-test-multiprocess-and-setuptools 5 | """ 6 | 7 | import multiprocessing 8 | import sys 9 | 10 | import py 11 | 12 | 13 | if __name__ == "__main__": 14 | multiprocessing.freeze_support() 15 | sys.exit(py.test.cmdline.main()) 16 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | 
[pylama] 2 | skip = .eggs/*,.tox/*,*/.env/*,build/*,_sandbox/*,build/*,docs/conf.py 3 | 4 | [pylama:pycodestyle] 5 | max_line_length = 100 6 | 7 | # E203: whitespace before ':' (for black) 8 | # W503: line break before binary operator (for black) 9 | ignore = E203,W503 10 | 11 | [pylama:pylint] 12 | max_line_length = 100 13 | 14 | [pylama:test/*] 15 | # E501: line too long [pycodestyle] 16 | ignore = E501 17 | 18 | [pylama:*/__init__.py] 19 | # W0611: imported but unused [pyflakes] 20 | ignore = W0611 21 | 22 | [pylama:test/test_logger.py] 23 | # E402: module level import not at top of file [pycodestyle] 24 | ignore = E402 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.black] 6 | line-length = 100 7 | exclude = ''' 8 | /( 9 | \.eggs 10 | | \.git 11 | | \.mypy_cache 12 | | \.tox 13 | | \.venv 14 | | \.pytype 15 | | _build 16 | | buck-out 17 | | build 18 | | dist 19 | | examples 20 | )/ 21 | | docs/conf.py 22 | ''' 23 | target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] 24 | 25 | [tool.isort] 26 | known_third_party = [ 27 | 'pytablewriter', 28 | 'pytest', 29 | 'readmemaker', 30 | 'responses', 31 | 'simplesqlite', 32 | 'sphinx_rtd_theme', 33 | 'xlsxwriter', 34 | ] 35 | include_trailing_comma = true 36 | line_length = 100 37 | lines_after_imports = 2 38 | multi_line_output = 3 39 | skip_glob = [ 40 | '*/.eggs/*', 41 | '*/.pytype/*', 42 | '*/.tox/*', 43 | ] 44 | 45 | [tool.coverage.run] 46 | source = ['pytablereader'] 47 | branch = true 48 | 49 | [tool.coverage.report] 50 | show_missing = true 51 | precision = 1 52 | exclude_lines = [ 53 | 'except ImportError', 54 | 'raise NotImplementedError', 55 | 'pass', 56 | 'ABCmeta', 57 | 'abstractmethod', 58 | 'abstractproperty', 59 | 'abstractclassmethod', 60 | 
'warnings.warn', 61 | ] 62 | -------------------------------------------------------------------------------- /pytablereader/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. codeauthor:: Tsuyoshi Hombashi 3 | """ 4 | 5 | from tabledata import DataError, InvalidHeaderNameError, InvalidTableNameError 6 | 7 | from .__version__ import __author__, __copyright__, __email__, __license__, __version__ 8 | from ._constant import PatternMatch 9 | from ._logger import set_log_level, set_logger 10 | from .csv.core import CsvTableFileLoader, CsvTableTextLoader 11 | from .error import ( 12 | APIError, 13 | HTTPError, 14 | InvalidFilePathError, 15 | LoaderNotFoundError, 16 | OpenError, 17 | PathError, 18 | ProxyError, 19 | PypandocImportError, 20 | UrlError, 21 | ValidationError, 22 | ) 23 | from .html.core import HtmlTableFileLoader, HtmlTableTextLoader 24 | from .json.core import JsonTableDictLoader, JsonTableFileLoader, JsonTableTextLoader 25 | from .jsonlines.core import JsonLinesTableFileLoader, JsonLinesTableTextLoader 26 | from .loadermanager import TableFileLoader, TableTextLoader, TableUrlLoader 27 | from .ltsv.core import LtsvTableFileLoader, LtsvTableTextLoader 28 | from .markdown.core import MarkdownTableFileLoader, MarkdownTableTextLoader 29 | from .mediawiki.core import MediaWikiTableFileLoader, MediaWikiTableTextLoader 30 | from .spreadsheet.excelloader import ExcelTableFileLoader 31 | from .spreadsheet.gsloader import GoogleSheetsTableLoader 32 | from .sqlite.core import SqliteFileLoader 33 | from .tsv.core import TsvTableFileLoader, TsvTableTextLoader 34 | -------------------------------------------------------------------------------- /pytablereader/__version__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Tsuyoshi Hombashi" 2 | __copyright__ = f"Copyright 2016, {__author__}" 3 | __license__ = "MIT License" 4 | __version__ = "0.31.4" 5 | 
"""
.. codeauthor:: Tsuyoshi Hombashi
"""

import abc


class LoaderAcceptorInterface(metaclass=abc.ABCMeta):
    """
    Interface for objects that can be handed a table loader.
    """

    @abc.abstractmethod
    def accept(self, loader):  # pragma: no cover
        """Receive ``loader``; concrete subclasses define what happens to it."""


class LoaderAcceptor(LoaderAcceptorInterface):
    """
    Base acceptor that keeps a reference to the accepted loader.
    """

    def __init__(self):
        # No loader is attached until accept() is called.
        self._loader = None

    def accept(self, loader):
        """Store ``loader`` on the instance, replacing any previous one."""
        self._loader = loader
"""
.. codeauthor:: Tsuyoshi Hombashi
"""

import os.path
import posixpath
from urllib.parse import urlparse

import pathvalidate
import typepy

from ._constant import Default
from .error import InvalidFilePathError


try:
    import simplejson as json
except ImportError:
    import json  # type: ignore # noqa


def get_file_encoding(file_path, encoding):
    """Return the encoding to use for ``file_path``.

    A truthy ``encoding`` is returned unchanged. Otherwise the value detected
    by ``mbstrdecoder.detect_file_encoding`` is used, and ``Default.ENCODING``
    is the fallback when detection returns a falsy value.
    """
    # Kept as a function-level import, as in the original implementation.
    from mbstrdecoder import detect_file_encoding

    return encoding or detect_file_encoding(file_path) or Default.ENCODING


def get_extension(file_path):
    """Return the extension of ``file_path`` without its leading dot(s).

    :raises InvalidFilePathError: If ``typepy`` reports the path as a null string.
    """
    if typepy.is_null_string(file_path):
        raise InvalidFilePathError("file path is empty")

    _root, suffix = os.path.splitext(file_path)

    return suffix.lstrip(".")


def make_temp_file_path_from_url(temp_dir_path, url):
    """Build a temporary file path under ``temp_dir_path`` named after ``url``.

    The file name is the last component of the URL path (trailing slashes
    ignored). The result is joined with ``posixpath.join``.

    :raises InvalidFilePathError:
        If the URL cannot be parsed, has a null path, yields no usable file
        name, or ``temp_dir_path`` cannot be joined.
    """
    try:
        parsed_path = urlparse(url).path
    except AttributeError:
        raise InvalidFilePathError("url must be a string")

    if typepy.is_null_string(parsed_path):
        raise InvalidFilePathError(f"invalid URL path: {parsed_path}")

    file_name = os.path.basename(parsed_path.rstrip("/"))
    if typepy.is_null_string(file_name):
        # Give symbol replacement one chance to produce a usable name
        # before rejecting the URL.
        file_name = pathvalidate.replace_symbol(file_name, replacement_text="_")

        if typepy.is_null_string(file_name):
            raise InvalidFilePathError(f"invalid URL: {url}")

    try:
        return posixpath.join(temp_dir_path, file_name)
    except (TypeError, AttributeError):
        raise InvalidFilePathError("temp_dir_path must be a string")
"""
.. codeauthor:: Tsuyoshi Hombashi
"""

import enum


class Default:
    """Package-wide default values."""

    ENCODING = "utf-8"


class SourceType:
    """String identifiers for the kinds of data source."""

    TEXT = "text"
    FILE = "file"
    URL = "url"
    OBJECT = "object"


class TableNameTemplate:
    """Placeholders, in ``%(name)s`` form, usable in table name templates."""

    # Private pattern that every placeholder below is rendered from.
    __FORMAT = "%({:s})s"

    DEFAULT = __FORMAT.format("default")
    FILENAME = __FORMAT.format("filename")
    FORMAT_NAME = __FORMAT.format("format_name")
    FORMAT_ID = __FORMAT.format("format_id")
    GLOBAL_ID = __FORMAT.format("global_id")
    KEY = __FORMAT.format("key")
    TITLE = __FORMAT.format("title")
    SHEET = __FORMAT.format("sheet")


@enum.unique
class PatternMatch(enum.Enum):
    """Enumeration with the two members ``OR`` and ``AND``."""

    OR = 0
    AND = 1
"""
.. codeauthor:: Tsuyoshi Hombashi
"""

import abc

import dataproperty

from ._null_logger import NullLogger


MODULE_NAME = "pytablereader"

try:
    from loguru import logger

    logger.disable(MODULE_NAME)
except ImportError:
    logger = NullLogger()  # type: ignore


def set_logger(is_enable, propagation_depth=2):
    """Enable or disable logging for this package.

    While ``propagation_depth`` is positive, the same setting is forwarded to
    ``dataproperty`` and, if it can be imported, to ``simplesqlite``, each
    with the depth reduced by one.
    """
    toggle = logger.enable if is_enable else logger.disable
    toggle(MODULE_NAME)

    if propagation_depth <= 0:
        return

    next_depth = propagation_depth - 1
    dataproperty.set_logger(is_enable, next_depth)

    try:
        import simplesqlite

        simplesqlite.set_logger(is_enable, next_depth)
    except (ImportError, TypeError):
        pass


def set_log_level(log_level):
    """Deprecated; does nothing and returns ``None``."""
    return None


def typehints_to_str(type_hints):
    """Join the names of ``type_hints`` with ``", "``; falsy entries become ``"none"``."""
    names = (hint.__name__ if hint else "none" for hint in type_hints)

    return ", ".join(names)


class LoggerInterface(metaclass=abc.ABCMeta):
    """Interface of the per-loader logging helpers."""

    @abc.abstractmethod
    def logging_load(self):  # pragma: no cover
        pass


class BaseLogger(LoggerInterface):
    """Common debug logging for a loader; subclasses supply the load message."""

    def __init__(self, loader):
        self._loader = loader

    def logging_load(self):
        logger.debug(self._get_load_message())

    def logging_table(self, table_data):
        logger.debug(f"loaded tabledata: {table_data}")

    @abc.abstractmethod
    def _get_load_message(self):
        pass


class NullSourceLogger(BaseLogger):
    """Logger variant that emits nothing."""

    def logging_load(self):
        pass

    def logging_table(self, table_data):
        pass

    def _get_load_message(self):
        return ""


class FileSourceLogger(BaseLogger):
    """Builds the load message for loaders reading from a file path."""

    def _get_load_message(self):
        loader = self._loader
        parts = [
            "loading {:s}: format={:s}, path={}".format(
                loader.source_type, loader.format_name, loader.source
            )
        ]

        # Not every loader exposes an encoding attribute.
        try:
            parts.append(f"encoding={loader.encoding}")
        except AttributeError:
            pass

        if loader.type_hints:
            parts.append(f"type-hints=({typehints_to_str(loader.type_hints)})")

        return ", ".join(parts)


class TextSourceLogger(BaseLogger):
    """Builds the load message for loaders reading from in-memory text."""

    def _get_load_message(self):
        loader = self._loader
        parts = ["loading {:s}: format={:s}".format(loader.source_type, loader.format_name)]

        # The source may be an object without a length.
        try:
            parts.append(f"len={len(loader.source)}")
        except TypeError:
            pass

        # Not every loader exposes an encoding attribute.
        try:
            parts.append(f"encoding={loader.encoding}")
        except AttributeError:
            pass

        if loader.type_hints:
            parts.append(f"type-hints=({typehints_to_str(loader.type_hints)})")

        return ", ".join(parts)
class ValidatorInterface(metaclass=abc.ABCMeta):
    """
    An interface class for data source validator.

    Concrete validators must provide the ``source_type`` property and
    the ``validate`` method.
    """

    # ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
    # ``property`` on top of ``abc.abstractmethod`` is the supported
    # equivalent and keeps the attribute an abstract read-only property.
    @property
    @abc.abstractmethod
    def source_type(self):
        """Kind of data source that the validator handles."""

    @abc.abstractmethod
    def validate(self):
        """Validate the data source."""
class UrlValidator(BaseValidator):
    """
    Validator class for data sources given as a URL.
    """

    @property
    def source_type(self):
        return SourceType.URL

    def validate(self):
        """
        Check that the source is a non-empty ``http``/``https`` URL.

        :raises UrlError: If the URL is empty or uses another scheme.
        """

        url = self.source
        if typepy.is_null_string(url):
            raise UrlError("url is empty")

        scheme = urlparse(url).scheme
        if scheme in ["http", "https"]:
            return

        raise UrlError(f"invalid scheme: expected=http/https, actual={scheme}")
class CsvTableLoader(AbstractTableReader):
    """
    Common base class of the CSV table loaders.

    .. py:attribute:: headers

        Column names of the table. When ``headers`` is empty,
        the first line of the CSV data is used as the column names.

    .. py:attribute:: delimiter

        A one-character string that separates fields.
        Defaults to ``","``.

    .. py:attribute:: quotechar

        A one-character string used to quote fields that contain
        special characters (the ``delimiter``, the ``quotechar``,
        or new-line characters).
        Defaults to ``'"'``.

    .. py:attribute:: encoding

        Encoding of the CSV data.
    """

    @property
    def format_name(self):
        return "csv"

    @property
    def delimiter(self):
        # csv.reader requires a plain ``str`` delimiter
        decoded = MultiByteStrDecoder(self.__delimiter)
        return str(decoded.unicode_str)

    @delimiter.setter
    def delimiter(self, value):
        self.__delimiter = value

    @property
    def quotechar(self):
        # csv.reader requires a plain ``str`` quote character
        decoded = MultiByteStrDecoder(self.__quotechar)
        return str(decoded.unicode_str)

    @quotechar.setter
    def quotechar(self, value):
        self.__quotechar = value

    @property
    def header_list(self):
        # deprecated alias of ``headers``
        warnings.warn("'header_list' has moved to 'headers'", DeprecationWarning)
        return self.headers

    @header_list.setter
    def header_list(self, value):
        # deprecated alias of ``headers``
        warnings.warn("'header_list' has moved to 'headers'", DeprecationWarning)
        self.headers = value

    def __init__(self, source, quoting_flags, type_hints, type_hint_rules):
        super().__init__(source, quoting_flags, type_hints, type_hint_rules)

        # assigned by the concrete loaders right before reading
        self._csv_reader = None

        self.headers = ()
        self.delimiter = ","
        self.quotechar = '"'
        self.encoding = None

    def _to_data_matrix(self):
        """
        Read every row from ``self._csv_reader`` into a list of lists.

        Rows that are not a non-empty sequence are skipped.

        :raises pytablereader.DataError:
            If the reader raises ``csv.Error`` or ``UnicodeDecodeError``.
        """

        matrix = []

        try:
            for row in self._csv_reader:
                if not typepy.is_not_empty_sequence(row):
                    continue

                matrix.append([self.__modify_item(cell, idx) for idx, cell in enumerate(row)])
        except (csv.Error, UnicodeDecodeError) as e:
            raise DataError(e)

        return matrix

    def __modify_item(self, data, col: int):
        """
        Convert a single cell value.

        The type hint registered for ``col`` is tried first; when there
        is none, or the conversion fails, the decoded string is returned.
        """

        hints = self.type_hints

        if hints and (col in hints):
            try:
                return hints[col](data).convert()
            except typepy.TypeConversionError:
                # fall back to the plain decoded string
                pass

        return MultiByteStrDecoder(data).unicode_str
def load(self):
    """
    Extract tabular data as |TableData| instances from a CSV file.
    |load_source_desc_file|

    The file is read completely and closed before the table data is
    built, so no file handle outlives this call.

    :return:
        Loaded table data.
        |load_table_name_desc|

        ===================  ========================================
        Format specifier     Value after the replacement
        ===================  ========================================
        ``%(filename)s``     |filename_desc|
        ``%(format_name)s``  ``"csv"``
        ``%(format_id)s``    |format_id_desc|
        ``%(global_id)s``    |global_id|
        ===================  ========================================
    :rtype: |TableData| iterator
    :raises pytablereader.DataError:
        If the CSV data is invalid.

    .. seealso::
        :py:func:`csv.reader`
    """

    self._validate()
    self._logger.logging_load()
    self.encoding = get_file_encoding(self.source, self.encoding)

    # ``_to_data_matrix`` consumes the reader eagerly into a list, so the
    # file can be closed as soon as the matrix has been built.
    with open(self.source, encoding=self.encoding) as fp:
        self._csv_reader = csv.reader(
            fp,
            delimiter=self.delimiter,
            quotechar=self.quotechar,
            strict=True,
            skipinitialspace=True,
        )
        data_matrix = self._to_data_matrix()

    formatter = CsvTableFormatter(data_matrix)
    formatter.accept(self)

    return formatter.to_table_data()
def load(self):
    """
    Extract tabular data as |TableData| instances from a CSV text object.
    |load_source_desc_text|

    Leading and trailing whitespace of the text is stripped before it is
    parsed.

    :return:
        Loaded table data.
        |load_table_name_desc|

        ===================  ========================================
        Format specifier     Value after the replacement
        ===================  ========================================
        ``%(filename)s``     ``""``
        ``%(format_name)s``  ``"csv"``
        ``%(format_id)s``    |format_id_desc|
        ``%(global_id)s``    |global_id|
        ===================  ========================================
    :rtype: |TableData| iterator
    :raises pytablereader.DataError:
        If the CSV data is invalid.

    .. seealso::
        :py:func:`csv.reader`
    """

    self._validate()
    self._logger.logging_load()

    text_buffer = io.StringIO(self.source.strip())
    reader_options = {
        "delimiter": self.delimiter,
        "quotechar": self.quotechar,
        "strict": True,
        "skipinitialspace": True,
    }
    self._csv_reader = csv.reader(text_buffer, **reader_options)

    data_matrix = self._to_data_matrix()
    formatter = CsvTableFormatter(data_matrix)
    formatter.accept(self)

    return formatter.to_table_data()
class CsvTableFormatter(TableFormatter):
    """
    Formatter that converts a CSV data matrix into |TableData|.
    """

    def to_table_data(self):
        """
        Yield a single |TableData| built from the source data matrix.

        When the loader has no ``headers``, the first row of the matrix is
        used as the header row and the remaining rows as data.

        :raises pytablereader.DataError:
            If the header row taken from the data contains an empty item,
            or if there is no data row.
        """

        if typepy.is_empty_sequence(self._loader.headers):
            headers = self._source_data[0]

            if any(typepy.is_null_string(header) for header in headers):
                # The sentences were previously concatenated without any
                # separator, which produced a run-together message.
                raise DataError(
                    "the first line includes empty string item. "
                    "all of the items should contain header name. "
                    f"actual={headers}"
                )

            data_matrix = self._source_data[1:]
        else:
            headers = self._loader.headers
            data_matrix = self._source_data

        if not data_matrix:
            raise DataError("data row must be greater or equal than one")

        self._loader.inc_table_count()

        yield TableData(
            self._loader.make_table_name(),
            headers,
            data_matrix,
            dp_extractor=self._loader.dp_extractor,
            type_hints=self._extract_type_hints(headers),
        )
# ``requests`` is treated as an optional dependency here: when it can be
# imported, the error classes derive from the corresponding ``requests``
# exceptions; otherwise plain ``Exception`` subclasses with the same names
# are defined so that the names always exist in this module.
try:
    import requests

    class HTTPError(requests.RequestException):
        """
        An HTTP error occurred.

        .. seealso::

            http://docs.python-requests.org/en/master/api/#exceptions
        """

    class ProxyError(requests.exceptions.ProxyError):
        """
        A proxy error occurred.

        .. seealso::

            http://docs.python-requests.org/en/master/_modules/requests/exceptions/
        """

except ImportError:
    # Fallback definitions used when ``requests`` is not installed.

    class HTTPError(Exception):
        """
        An HTTP error occurred.
        """

    class ProxyError(Exception):
        """
        A proxy error occurred.
        """
class BaseTableLoaderFactory(metaclass=abc.ABCMeta):
    """
    Abstract base class of the table loader factories.

    Concrete factories supply the extension / format-name to loader-class
    mappings; this class resolves a loader class from those mappings and
    instantiates it with :py:attr:`.source`.
    """

    @property
    def source(self):
        """
        :return: Data source to load.
        :rtype: str
        """

        return self._source

    def __init__(self, source, encoding=None):
        """
        :param source: Data source handed to the created loaders.
        :param str encoding:
            Encoding assigned to the created loaders.
            ``Default.ENCODING`` is used when not given.
        """

        self._encoding = encoding if encoding else Default.ENCODING

        # NOTE(review): the caller-supplied ``encoding`` (possibly ``None``),
        # not the resolved ``self._encoding``, is passed as the codec
        # candidate here -- kept as-is; confirm that this is intended.
        self._source = MultiByteStrDecoder(source, [encoding]).unicode_str

    @abc.abstractmethod
    def create_from_path(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def create_from_format_name(self, format_name):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_extension_loader_mapping(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_format_name_loader_mapping(self):  # pragma: no cover
        pass

    def get_format_names(self):
        """
        :return: Available format names.
        :rtype: list
        """

        return sorted(self._get_format_name_loader_mapping())

    def get_format_name_list(self):
        # deprecated alias of ``get_format_names``
        warnings.warn("'get_format_name_list' has moved to 'get_format_names'", DeprecationWarning)
        return self.get_format_names()

    def get_extensions(self):
        """
        :return: Available format file extensions.
        :rtype: list
        """

        return sorted(self._get_extension_loader_mapping())

    def get_extension_list(self):
        # deprecated alias of ``get_extensions``
        warnings.warn("'get_extension_list' has moved to 'get_extensions'", DeprecationWarning)
        return self.get_extensions()

    def _get_loader_class(self, loader_mapping, format_name):
        """
        Look up the loader class registered for ``format_name``.

        The lookup is case-insensitive (the name is case-folded first).

        :param dict loader_mapping: Mapping of name to loader class.
        :param str format_name: Format name or extension to look up.
        :return: Loader class found in ``loader_mapping``.
        :raises TypeError: If ``format_name`` is not a string.
        :raises pytablereader.LoaderNotFoundError:
            If ``format_name`` is not in ``loader_mapping``.
        """

        try:
            format_name = format_name.casefold()
        except AttributeError as e:
            # chain explicitly so that the original cause stays attached
            raise TypeError("format name must be a string") from e

        try:
            return loader_mapping[format_name]
        except KeyError as e:
            raise LoaderNotFoundError(
                ", ".join(
                    [
                        f"loader not found: format='{format_name}'",
                        f"source='{self.source}'",
                    ]
                )
            ) from e

    def _create_from_extension(self, extension):
        """
        Create a loader for ``extension`` and apply the post-creation setup.

        :raises pytablereader.LoaderNotFoundError:
            If no loader is registered for ``extension``; the message lists
            the acceptable extensions.
        """

        try:
            loader = self._get_loader_class(self._get_extension_loader_mapping(), extension)(
                self.source
            )

            return self._post_create(loader, extension=extension)
        except LoaderNotFoundError as e:
            raise LoaderNotFoundError(
                "\n".join(
                    [
                        f"{e.args[0]:s} (unknown extension).",
                        "",
                        "acceptable extensions are: {}.".format(", ".join(self.get_extensions())),
                        f"actual: '{extension}'",
                    ]
                )
            ) from e

    def _create_from_format_name(self, format_name):
        """
        Create a loader for ``format_name`` and apply the post-creation setup.

        :raises pytablereader.LoaderNotFoundError:
            If no loader is registered for ``format_name``; the message
            lists the acceptable format names.
        """

        try:
            loader = self._get_loader_class(self._get_format_name_loader_mapping(), format_name)(
                self.source
            )

            return self._post_create(loader, format_name=format_name)
        except LoaderNotFoundError as e:
            raise LoaderNotFoundError(
                "\n".join(
                    [
                        f"{e.args[0]:s} (unknown format name).",
                        "acceptable format names are: {}.".format(
                            ", ".join(self.get_format_names())
                        ),
                    ]
                )
            ) from e

    def _post_create(self, loader, **kwargs):
        """
        Apply the factory settings to a freshly created loader.

        The factory encoding is assigned to the loader; a CSV loader that
        was requested through the ``"ssv"`` format name gets a single space
        as its delimiter.
        """

        loader.encoding = self._encoding

        if loader.format_name == "csv" and kwargs.get("format_name") == "ssv":
            loader.delimiter = " "

        return loader
class TableFileLoaderFactory(BaseTableLoaderFactory):
    """
    :param str file_path: Path to the loading file.
    :raises pytablereader.InvalidFilePathError:
        If the ``file_path`` is an empty path.
    """

    @property
    def file_extension(self):
        """
        :return: File extension of the :py:attr:`.source` (without period).
        :rtype: str
        """

        return get_extension(self.source)

    def __init__(self, source, encoding=None):
        # Detect the encoding from the file only when the caller did not
        # specify one and a source is actually given.
        if not encoding and source:
            encoding = detect_file_encoding(source)
            logger.debug(f"detect encoding: file={source}, encoding={encoding}")

        super().__init__(source, encoding)

    def create_from_path(self):
        """
        Create a file loader from the file extension to loading file.
        Supported file extensions are as follows:

            ========================== =======================================
            Extension                  Loader
            ========================== =======================================
            ``"csv"``                  :py:class:`~.CsvTableFileLoader`
            ``"xls"``/``"xlsx"``       :py:class:`~.ExcelTableFileLoader`
            ``"htm"``/``"html"``       :py:class:`~.HtmlTableFileLoader`
            ``"json"``                 :py:class:`~.JsonTableFileLoader`
            ``"jsonl"``                :py:class:`~.JsonLinesTableFileLoader`
            ``"ldjson"``               :py:class:`~.JsonLinesTableFileLoader`
            ``"ltsv"``                 :py:class:`~.LtsvTableFileLoader`
            ``"md"``                   :py:class:`~.MarkdownTableFileLoader`
            ``"ndjson"``               :py:class:`~.JsonLinesTableFileLoader`
            ``"sqlite"``/``"sqlite3"`` :py:class:`~.SqliteFileLoader`
            ``"tsv"``                  :py:class:`~.TsvTableFileLoader`
            ========================== =======================================

        :return:
            Loader that coincides with the file extension of the
            :py:attr:`.file_extension`.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| loading the file.
        """

        loader = self._create_from_extension(self.file_extension)

        logger.debug(
            "TableFileLoaderFactory.create_from_path: "
            f"extension={self.file_extension}, loader={loader.format_name}"
        )

        return loader

    def create_from_format_name(self, format_name):
        """
        Create a file loader from a format name.
        Supported file formats are as follows:

            ================ ======================================
            Format name      Loader
            ================ ======================================
            ``"csv"``        :py:class:`~.CsvTableFileLoader`
            ``"excel"``      :py:class:`~.ExcelTableFileLoader`
            ``"html"``       :py:class:`~.HtmlTableFileLoader`
            ``"json"``       :py:class:`~.JsonTableFileLoader`
            ``"json_lines"`` :py:class:`~.JsonLinesTableFileLoader`
            ``"jsonl"``      :py:class:`~.JsonLinesTableFileLoader`
            ``"ldjson"``     :py:class:`~.JsonLinesTableFileLoader`
            ``"ltsv"``       :py:class:`~.LtsvTableFileLoader`
            ``"markdown"``   :py:class:`~.MarkdownTableFileLoader`
            ``"mediawiki"``  :py:class:`~.MediaWikiTableFileLoader`
            ``"ndjson"``     :py:class:`~.JsonLinesTableFileLoader`
            ``"sqlite"``     :py:class:`~.SqliteFileLoader`
            ``"ssv"``        :py:class:`~.CsvTableFileLoader`
            ``"tsv"``        :py:class:`~.TsvTableFileLoader`
            ================ ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincides with the ``format_name``:
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        """

        loader = self._create_from_format_name(format_name)

        logger.debug(
            "TableFileLoaderFactory.create_from_format_name: "
            f"name={format_name}, loader={loader.format_name}"
        )

        return loader

    @staticmethod
    def _get_common_loader_mapping():
        """
        :return:
            Mappings shared by the extension and the format name lookups.
            A new dictionary is returned on every call.
        :rtype: dict
        """

        return {
            "csv": CsvTableFileLoader,
            "html": HtmlTableFileLoader,
            "json": JsonTableFileLoader,
            "jsonl": JsonLinesTableFileLoader,
            "ldjson": JsonLinesTableFileLoader,
            "ltsv": LtsvTableFileLoader,
            "ndjson": JsonLinesTableFileLoader,
            "sqlite": SqliteFileLoader,
            "tsv": TsvTableFileLoader,
        }

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format extension and loader class.
        :rtype: dict
        """

        loader_table = self._get_common_loader_mapping()
        loader_table.update(
            {
                "htm": HtmlTableFileLoader,
                "md": MarkdownTableFileLoader,
                "sqlite3": SqliteFileLoader,
                "xlsx": ExcelTableFileLoader,
                "xls": ExcelTableFileLoader,
            }
        )

        return loader_table

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format name and loader class.
        :rtype: dict
        """

        loader_table = self._get_common_loader_mapping()
        loader_table.update(
            {
                "excel": ExcelTableFileLoader,
                "json_lines": JsonLinesTableFileLoader,
                "markdown": MarkdownTableFileLoader,
                "mediawiki": MediaWikiTableFileLoader,
                "ssv": CsvTableFileLoader,
            }
        )

        return loader_table
class TableTextLoaderFactory(BaseTableLoaderFactory):
    """
    Factory that creates text loaders from a format name.
    """

    def create_from_path(self):
        # a text source has no path to derive a loader from
        raise NotImplementedError()

    def create_from_format_name(self, format_name):
        """
        Create a text loader from a format name.
        Supported formats are as follows:

            ========================== ======================================
            Format name                Loader
            ========================== ======================================
            ``"csv"``                  :py:class:`~.CsvTableTextLoader`
            ``"html"``                 :py:class:`~.HtmlTableTextLoader`
            ``"json"``                 :py:class:`~.JsonTableTextLoader`
            ``"json_lines"``           :py:class:`~.JsonLinesTableTextLoader`
            ``"jsonl"``                :py:class:`~.JsonLinesTableTextLoader`
            ``"ldjson"``               :py:class:`~.JsonLinesTableTextLoader`
            ``"ltsv"``                 :py:class:`~.LtsvTableTextLoader`
            ``"markdown"``             :py:class:`~.MarkdownTableTextLoader`
            ``"mediawiki"``            :py:class:`~.MediaWikiTableTextLoader`
            ``"ndjson"``               :py:class:`~.JsonLinesTableTextLoader`
            ``"ssv"``                  :py:class:`~.CsvTableTextLoader`
            ``"tsv"``                  :py:class:`~.TsvTableTextLoader`
            ========================== ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincides with the ``format_name``:
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        :raises TypeError: If ``format_name`` is not a string.
        """

        text_loader = self._create_from_format_name(format_name)
        logger.debug(
            f"TableTextLoaderFactory: name={format_name}, loader={text_loader.format_name}"
        )

        return text_loader

    def _get_common_loader_mapping(self):
        """
        :return: Mappings shared by the extension and format-name lookups.
        :rtype: dict
        """

        return dict(
            csv=CsvTableTextLoader,
            html=HtmlTableTextLoader,
            json=JsonTableTextLoader,
            jsonl=JsonLinesTableTextLoader,
            ldjson=JsonLinesTableTextLoader,
            ltsv=LtsvTableTextLoader,
            ndjson=JsonLinesTableTextLoader,
            tsv=TsvTableTextLoader,
        )

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format-extension and loader class.
        :rtype: dict
        """

        return {
            **self._get_common_loader_mapping(),
            "htm": HtmlTableTextLoader,
            "md": MarkdownTableTextLoader,
        }

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format-name and loader class.
        :rtype: dict
        """

        return {
            **self._get_common_loader_mapping(),
            "json_lines": JsonLinesTableTextLoader,
            "markdown": MarkdownTableTextLoader,
            "mediawiki": MediaWikiTableTextLoader,
            "ssv": CsvTableTextLoader,
        }
class TableFormatter(LoaderAcceptor, TableFormatterInterface):
    """
    Abstract base of the formatters that build |TableData| from source data.
    """

    def _validate_source_data(self):
        """Raise ``DataError`` when the source data is empty."""

        if self._source_data:
            return

        raise DataError("source data is empty")

    def __init__(self, source_data):
        self._source_data = source_data

        self._validate_source_data()

    def _extract_type_hints(self, headers=None):
        """
        Determine the type hints for ``headers``.

        Type hints set on the loader are returned as they are. Otherwise
        each header is matched against the loader's type hint rules in
        order: the hint of the first rule whose pattern matches is used,
        or ``None`` when no rule matches. An empty list is returned when
        there are no rules or no headers.
        """

        explicit_hints = self._loader.type_hints
        if explicit_hints:
            return explicit_hints

        rules = self._loader.type_hint_rules
        if not rules or not headers:
            return []

        type_hints = [
            next((hint for regexp, hint in rules.items() if regexp.search(header)), None)
            for header in headers
        ]

        hint_names = OrderedDict((header, str(hint)) for header, hint in zip(headers, type_hints))
        logger.debug(
            dedent(
                """\
                extracted type hints:
                {}
                """
            ).format(json.dumps(hint_names, indent=4))
        )

        return type_hints
class HtmlTableFileLoader(HtmlTableLoader):
    """
    A file loader class to extract tabular data from HTML files.

    :param str file_path: Path to the loading HTML file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(title)s_%(key)s``.

    .. py:attribute:: encoding

        HTML file encoding. Initialised to ``None`` and resolved through
        ``get_file_encoding`` when the file is loaded.
        Documented default is ``"utf-8"`` -- TODO confirm against
        ``get_file_encoding``.
    """

    def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)

        # resolved at load time
        self.encoding = None

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from HTML table tags in
        a HTML file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    |filename_desc|
            ``%(title)s``       ``<title>`` tag value of the HTML.
            ``%(key)s``         | This replaced to:
                                | **(1)** ``id`` attribute of the table tag
                                | **(2)** ``%(format_name)s%(format_id)s``
                                | if ``id`` attribute not present in the
                                | table tag.
            ``%(format_name)s`` ``"html"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.DataError:
            If the HTML data is invalid or empty.

        .. note::

            Table tag attributes ignored with loaded |TableData|.
        """

        self._validate()
        self._logger.logging_load()
        self.encoding = get_file_encoding(self.source, self.encoding)

        # The whole file is read inside the ``with`` block, so the file is
        # closed before the formatter is used.
        with open(self.source, encoding=self.encoding) as fp:
            formatter = HtmlTableFormatter(fp.read(), self._logger)
        formatter.accept(self)

        return formatter.to_table_data()
118 | |load_table_name_desc| 119 | 120 | =================== ============================================== 121 | Format specifier Value after the replacement 122 | =================== ============================================== 123 | ``%(filename)s`` ``""`` 124 | ``%(title)s`` ``<title>`` tag value of the HTML. 125 | ``%(key)s`` | This replaced to: 126 | | **(1)** ``id`` attribute of the table tag 127 | | **(2)** ``%(format_name)s%(format_id)s`` 128 | | if ``id`` attribute is not included 129 | | in the table tag. 130 | ``%(format_name)s`` ``"html"`` 131 | ``%(format_id)s`` |format_id_desc| 132 | ``%(global_id)s`` |global_id| 133 | =================== ============================================== 134 | :rtype: |TableData| iterator 135 | :raises pytablereader.DataError: 136 | If the HTML data is invalid or empty. 137 | """ 138 | 139 | self._validate() 140 | self._logger.logging_load() 141 | 142 | formatter = HtmlTableFormatter(self.source, self._logger) 143 | formatter.accept(self) 144 | 145 | return formatter.to_table_data() 146 | -------------------------------------------------------------------------------- /pytablereader/html/formatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
class HtmlTableFormatter(TableFormatter):
    """
    Formatter that turns the ``<table>`` tags of an HTML document into
    |TableData| instances.

    :param str source_data: HTML text to parse.
    :param logger:
        Source logger used to report each extracted table.
        A ``NullSourceLogger`` is used when the value is falsy.
    :raises pytablereader.DataError: If ``source_data`` is a null string.
    """

    @property
    def table_id(self):
        # Identifier of the table parsed most recently: the ``id`` attribute,
        # else the caption text, else None.
        return self.__table_id

    def __init__(self, source_data, logger=None):
        super().__init__(source_data)

        self.__logger = logger if logger else NullSourceLogger(None)
        self.__table_id = None

        if typepy.is_null_string(source_data):
            raise DataError

        # Prefer lxml; fall back to the parser bundled with Python when
        # BeautifulSoup reports that lxml is not available.
        try:
            soup = bs4.BeautifulSoup(self._source_data, "lxml")
        except bs4.FeatureNotFound:
            soup = bs4.BeautifulSoup(self._source_data, "html.parser")
        self.__soup = soup

    def to_table_data(self):
        """
        Yield a |TableData| for every ``<table>`` tag that has data rows.

        Tables for which parsing raises ``ValueError`` and tables whose
        parsed rows are empty are skipped.
        """

        for table_tag in self.__soup.find_all("table"):
            try:
                parsed_table = self.__parse_html(table_tag)
            except ValueError:
                continue

            if not parsed_table.is_empty_rows():
                self.__logger.logging_table(parsed_table)

                yield parsed_table

    def _make_table_name(self):
        # Key: table id when available, otherwise the loader's format key.
        table_key = self.table_id
        if typepy.is_null_string(table_key):
            table_key = self._loader.get_format_key()

        # A document without a <title> tag yields an empty title.
        try:
            page_title = self.__soup.title.text
        except AttributeError:
            page_title = ""

        name_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
        name_mapping[tnt.KEY] = table_key
        name_mapping[tnt.TITLE] = page_title

        return self._loader._expand_table_name_format(name_mapping)

    def __parse_tag_id(self, table):
        # Take the ``id`` attribute first; fall back to a non-empty caption.
        tag_id = table.get("id")
        self.__table_id = tag_id
        if tag_id is not None:
            return

        caption_tag = table.find("caption")
        if caption_tag is None:
            return

        caption_text = caption_tag.text.strip()
        if typepy.is_not_null_string(caption_text):
            self.__table_id = caption_text

    def __parse_html(self, table):
        self.__parse_tag_id(table)

        header_names = []
        body_rows = []
        cell_name_pattern = re.compile("td|th")

        for tr_tag in table.find_all("tr"):
            if not typepy.is_empty_sequence(tr_tag.find_all("td")):
                # Data row: collect the text of every td/th cell.
                body_rows.append(
                    [cell.get_text().strip() for cell in tr_tag.find_all(cell_name_pattern)]
                )
                continue

            # Row without td cells: only the first one carrying th cells
            # supplies the headers; later ones are ignored.
            if not typepy.is_not_empty_sequence(header_names):
                th_tags = tr_tag.find_all("th")
                if not typepy.is_empty_sequence(th_tags):
                    header_names = [th_tag.text.strip() for th_tag in th_tags]

        if typepy.is_empty_sequence(body_rows):
            raise ValueError("data matrix is empty")

        self._loader.inc_table_count()

        return TableData(
            self._make_table_name(),
            header_names,
            body_rows,
            dp_extractor=self._loader.dp_extractor,
            type_hints=self._extract_type_hints(header_names),
        )
class AbstractTableReader(TableLoaderInterface, metaclass=abc.ABCMeta):
    """
    The abstract class of table data file loader.

    Holds the data source, the table name template and the counters that
    feed the ``format_id`` / ``global_id`` parts of generated table names.

    .. py:attribute:: table_name

        Table name string.

    .. py:attribute:: source

        Table data source to load.
    """

    # Lock taken around every counter update (inc_table_count / clear_table_count).
    __table_count_lock = threading.Lock()
    # Class-level starting value of the table counter; see the review note
    # in inc_table_count about where increments are stored.
    __global_table_count = 0
    # Format name -> number of tables counted for that format. The dict is
    # mutated in place, so it is shared by every loader that sees it.
    __format_table_count = {}

    @property
    def source_type(self):
        # Delegates to the validator, which concrete loaders assign in __init__.
        return self._validator.source_type

    @property
    def quoting_flags(self):
        return self.__quoting_flags

    @property
    def dp_extractor(self):
        return self.__dp_extractor

    def __init__(self, source, quoting_flags, type_hints, type_hint_rules=None):
        """
        :param source: Table data source to load.
        :param quoting_flags: Value handed to the ``DataPropertyExtractor``.
        :param type_hints: Stored as ``self.type_hints``.
        :param type_hint_rules: Stored as ``self.type_hint_rules``.
        """
        self.table_name = tnt.DEFAULT
        self.source = source
        self.__quoting_flags = quoting_flags
        self.type_hints = type_hints
        self.type_hint_rules = type_hint_rules
        # Both are expected to be replaced by the concrete loader's __init__.
        self._validator = None
        self._logger = None

        self.__dp_extractor = DataPropertyExtractor()
        self.__dp_extractor.quoting_flags = self.quoting_flags
        self.__dp_extractor.update_strict_level_map({typepy.Typecode.BOOL: 1})

    def get_format_key(self):
        """Return ``<format_name><format table count>``, e.g. the fallback table key."""
        return f"{self.format_name:s}{self.__get_format_table_count():d}"

    def make_table_name(self):
        """Public wrapper around ``_make_table_name``."""
        return self._make_table_name()

    def inc_table_count(self):
        """Increment the table counters while holding the counter lock."""
        with self.__table_count_lock:
            # NOTE(review): ``+=`` through ``self`` reads the class attribute
            # but binds the result on the instance, so after the first call
            # each loader instance carries its own "global" count, while the
            # per-format dict below is mutated in place and stays shared.
            # Confirm whether a per-instance global count is intended.
            self.__global_table_count += 1
            self.__format_table_count[self.format_name] = self.__get_format_table_count() + 1

    @abc.abstractmethod
    def _get_default_table_name_template(self):  # pragma: no cover
        pass

    def _validate(self):
        """Validate the table name first, then the data source."""
        self._validate_table_name()
        self._validate_source()

    def _validate_table_name(self):
        """
        :raises ValueError: If ``table_name`` is a null string.
        :raises TypeError:
            If checking ``table_name`` raises ``TypeError`` or ``AttributeError``.
        """
        try:
            if typepy.is_null_string(self.table_name):
                raise ValueError("table name is empty")
        except (TypeError, AttributeError):
            raise TypeError("table_name must be a string")

    def _validate_source(self):
        self._validator.validate()

    def __get_format_table_count(self):
        # Formats that have not been counted yet report 0.
        return self.__format_table_count.get(self.format_name, 0)

    def _get_filename_tablename_mapping(self):
        """
        Return the ``(tnt.FILENAME, value)`` pair for table name expansion.

        The value is the stem of ``self.source`` when the source type is
        ``SourceType.FILE`` and the source is a non-null string, otherwise
        an empty string.
        """
        filename = ""
        if all([self.source_type == SourceType.FILE, typepy.is_not_null_string(self.source)]):
            filename = path.Path(self.source).stem

        return (tnt.FILENAME, filename)

    def _get_basic_tablename_keyvalue_mapping(self):
        """
        Build the ordered template -> value mapping used to expand table names.

        ``tnt.DEFAULT`` comes first, so templates introduced by the default
        table name template are expanded by the entries that follow it.
        """
        from collections import OrderedDict

        return OrderedDict(
            [
                (tnt.DEFAULT, self._get_default_table_name_template()),
                (tnt.FORMAT_NAME, self.format_name),
                (tnt.FORMAT_ID, str(self.__get_format_table_count())),
                (tnt.GLOBAL_ID, str(self.__global_table_count)),
                self._get_filename_tablename_mapping(),
            ]
        )

    def _expand_table_name_format(self, table_name_kv_mapping):
        """
        Replace every template of ``table_name_kv_mapping`` found in
        ``self.table_name`` with its value, in mapping order, and return
        the sanitized result.
        """
        self._validate_table_name()

        table_name = self.table_name
        for template, value in table_name_kv_mapping.items():
            table_name = table_name.replace(template, value)

        return self._sanitize_table_name(table_name)

    def _make_table_name(self):
        """Expand ``self.table_name`` using only the basic mapping."""
        self._validate_table_name()

        return self._expand_table_name_format(self._get_basic_tablename_keyvalue_mapping())

    @staticmethod
    def _sanitize_table_name(table_name):
        """
        Strip leading and trailing underscores from ``table_name``.

        :raises pytablereader.InvalidTableNameError:
            If ``table_name`` is a null string.
        """
        if typepy.is_null_string(table_name):
            raise InvalidTableNameError("table name is empty after the template replacement")

        return table_name.strip("_")

    @classmethod
    def clear_table_count(cls):
        """Reset the table counters while holding the counter lock."""
        with cls.__table_count_lock:
            # NOTE(review): these assignments bind on ``cls``. When called on
            # a subclass they create attributes on that subclass instead of
            # resetting the ones defined here, and the shared per-format dict
            # is replaced rather than emptied. Confirm that callers always
            # invoke this on AbstractTableReader itself.
            cls.__global_table_count = 0
            cls.__format_table_count = {}
21 | """ 22 | 23 | @property 24 | def format_name(self): 25 | return "json_lines" 26 | 27 | @abc.abstractmethod 28 | def load_dict(self): # pragma: no cover 29 | pass 30 | 31 | 32 | class JsonLinesTableFileLoader(JsonLinesTableLoader): 33 | """ 34 | A file loader class to extract tabular data from Line-delimited JSON files. 35 | 36 | :param str file_path: Path to the loading Line-delimited JSON file. 37 | 38 | .. py:attribute:: table_name 39 | 40 | Table name string. Defaults to ``%(filename)s_%(key)s``. 41 | """ 42 | 43 | def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None): 44 | super().__init__(file_path, quoting_flags, type_hints, type_hint_rules) 45 | 46 | self.encoding = None 47 | 48 | self._validator = FileValidator(file_path) 49 | self._logger = FileSourceLogger(self) 50 | 51 | def load(self): 52 | """ 53 | Extract tabular data as |TableData| instances from a Line-delimited JSON file. 54 | |load_source_desc_file| 55 | 56 | :return: 57 | Loaded table data iterator. 58 | |load_table_name_desc| 59 | 60 | :rtype: |TableData| iterator 61 | :raises pytablereader.DataError: 62 | If the data is invalid Line-delimited JSON. 63 | :raises pytablereader.error.ValidationError: 64 | If the data is not acceptable Line-delimited JSON format. 
class JsonLinesTableTextLoader(JsonLinesTableLoader):
    """
    A text loader class to extract tabular data from Line-delimited JSON text data.

    :param str text: Line-delimited JSON text to load.
    :param quoting_flags: Passed through to the base loader.
    :param type_hints: Passed through to the base loader.
    :param type_hint_rules: Passed through to the base loader.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        return SourceType.TEXT

    def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        # type_hint_rules was previously accepted but never forwarded, so
        # rules passed to this constructor were silently discarded.
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a Line-delimited JSON text object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

        :rtype: |TableData| iterator

        .. seealso::

            :py:meth:`.JsonLinesTableFileLoader.load()`
        """

        formatter = JsonLinesTableFormatter(self.load_dict())
        formatter.accept(self)

        return formatter.to_table_data()

    def load_dict(self):
        """
        Parse the text line by line and return the decoded records.

        Lines that are empty after stripping whitespace are skipped.

        :return: One ``OrderedDict``-decoded JSON value per non-empty line.
        :rtype: list
        :raises pytablereader.error.ValidationError:
            If a line is not valid JSON. The message carries the 1-based
            line number, the decoder message and the offending line.
        """

        self._validate()
        self._logger.logging_load()

        buffer = []
        for line_no, line in enumerate(self.source.splitlines(), start=1):
            line = line.strip()
            if not line:
                continue

            try:
                buffer.append(json.loads(line, object_pairs_hook=OrderedDict))
            except json.JSONDecodeError as e:
                # Keep the decoder error as the explicit cause.
                raise ValidationError(
                    "line {line_idx}: {msg}: {value}".format(line_idx=line_no, msg=e, value=line)
                ) from e

        return buffer

    def _get_default_table_name_template(self):
        return f"{tnt.KEY:s}"
16 | """ 17 | 18 | @property 19 | def _schema(self): 20 | return {"type": "object", "additionalProperties": self._VALUE_TYPE_SCHEMA} 21 | 22 | def _validate_source_data(self): 23 | for json_record in self._buffer: 24 | try: 25 | jsonschema.validate(json_record, self._schema) 26 | except jsonschema.ValidationError as e: 27 | raise ValidationError(e) 28 | 29 | def to_table_data(self): 30 | """ 31 | :raises ValueError: 32 | :raises pytablereader.error.ValidationError: 33 | """ 34 | 35 | self._validate_source_data() 36 | 37 | header_list = [] 38 | for json_record in self._buffer: 39 | for key in json_record: 40 | if key not in header_list: 41 | header_list.append(key) 42 | 43 | self._loader.inc_table_count() 44 | 45 | yield TableData( 46 | self._make_table_name(), 47 | header_list, 48 | self._buffer, 49 | dp_extractor=self._loader.dp_extractor, 50 | type_hints=self._extract_type_hints(header_list), 51 | ) 52 | 53 | 54 | class JsonLinesTableFormatter(TableFormatter): 55 | def to_table_data(self): 56 | converter = FlatJsonTableConverter(self._source_data) 57 | converter.accept(self._loader) 58 | 59 | return converter.to_table_data() 60 | -------------------------------------------------------------------------------- /pytablereader/loadermanager/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com> 3 | """ 4 | 5 | from ._file import TableFileLoader 6 | from ._text import TableTextLoader 7 | from ._url import TableUrlLoader 8 | -------------------------------------------------------------------------------- /pytablereader/loadermanager/_base.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
class TableLoaderManager(TableLoaderInterface):
    """
    Thin wrapper that forwards the loader interface to a wrapped loader.

    :param loader: Loader instance every property and method delegates to.
    """

    def __init__(self, loader):
        self.__loader = loader

    def load(self):
        """Return whatever the wrapped loader's ``load()`` returns."""
        return self.__loader.load()

    def inc_table_count(self):
        """Forward the table count increment to the wrapped loader."""
        self.__loader.inc_table_count()

    @property
    def loader(self):
        # The wrapped loader itself.
        return self.__loader

    @property
    def format_name(self):
        return self.__loader.format_name

    @property
    def source_type(self):
        return self.__loader.source_type

    @property
    def table_name(self):
        return self.__loader.table_name

    @table_name.setter
    def table_name(self, value):
        self.__loader.table_name = value

    @property
    def type_hints(self):
        return self.__loader.type_hints

    @type_hints.setter
    def type_hints(self, value):
        self.__loader.type_hints = value

    @property
    def encoding(self):
        # Loaders without an ``encoding`` attribute report None.
        return getattr(self.__loader, "encoding", None)

    @encoding.setter
    def encoding(self, codec_name):
        self.__loader.encoding = codec_name
19 | Supported formats are as follows: 20 | ``"csv"``, ``"excel"``, ``"html"``, ``"json"``, ``"ltsv"``, 21 | ``"markdown"``, ``"mediawiki"``, ``"sqlite"``, ``"ssv"``, ``"tsv"``. 22 | If the value is |None|, automatically detect file format from 23 | the ``file_path``. 24 | :raise pytablereader.InvalidFilePathError: 25 | If ``file_path`` is an invalid file path. 26 | :raises pytablereader.LoaderNotFoundError: 27 | |LoaderNotFoundError_desc| loading the file. 28 | 29 | .. py:method:: load 30 | 31 | Loading table data from a file as ``format_name`` format. 32 | Automatically detect file format if ``format_name`` is |None|. 33 | 34 | :return: Loaded table data iterator. 35 | :rtype: |TableData| iterator 36 | 37 | .. seealso:: 38 | * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_format_name` 39 | * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_path` 40 | """ 41 | 42 | def __init__(self, file_path, format_name=None, encoding=None, type_hint_rules=None): 43 | loader_factory = TableFileLoaderFactory(file_path, encoding=encoding) 44 | 45 | if typepy.is_not_null_string(format_name): 46 | loader = loader_factory.create_from_format_name(format_name) 47 | else: 48 | loader = loader_factory.create_from_path() 49 | 50 | loader.type_hint_rules = type_hint_rules 51 | 52 | super().__init__(loader) 53 | 54 | @classmethod 55 | def get_format_names(cls): 56 | """ 57 | :return: 58 | Available format names. These names can use by 59 | :py:class:`.TableFileLoader` class constructor. 60 | :rtype: list 61 | 62 | :Example: 63 | .. code:: python 64 | 65 | >>> from pytablereader import TableFileLoader 66 | >>> for format_name in TableFileLoader.get_format_names(): 67 | ... print(format_name) 68 | ... 
class TableTextLoader(TableLoaderManager):
    """
    Loader class to loading tables from text.

    :param str source: Text to load.
    :param str format_name: Data format name to load.
        Supported formats can be get by :py:meth:`.get_format_names`
    :param str encoding: Passed to the text loader factory.
    :param type_hint_rules: Assigned to the created loader's ``type_hint_rules``.

    :raises ValueError:
        If ``format_name`` is a null string.
    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the text.

    .. py:method:: load

        Load tables from text as ``format_name`` format.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::
            * :py:meth:`pytablereader.factory.TableTextLoaderFactory.create_from_format_name`
    """

    def __init__(
        self, source: str, format_name: str, encoding: Optional[str] = None, type_hint_rules=None
    ) -> None:
        loader_factory = TableTextLoaderFactory(source, encoding)

        # Text has no path or extension to detect the format from, so the
        # format name is mandatory here.
        if typepy.is_null_string(format_name):
            raise ValueError("require format_name")

        loader = loader_factory.create_from_format_name(format_name)
        loader.type_hint_rules = type_hint_rules

        super().__init__(loader)

    @classmethod
    def get_format_names(cls) -> Sequence[str]:
        """
        :return:
            Available format names. These names can use by
            :py:class:`.TableTextLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> from pytablereader import TableTextLoader
                >>> for format_name in TableTextLoader.get_format_names():
                ...     print(format_name)
                ...
                csv
                excel
                html
                json
                json_lines
                jsonl
                ldjson
                ltsv
                markdown
                mediawiki
                ndjson
                sqlite
                ssv
                tsv
        """

        # The factory needs some source value; the names do not depend on it.
        return TableTextLoaderFactory("dummy").get_format_names()
19 | Supported formats are: 20 | ``"csv"``, ``"excel"``, ``"html"``, ``"json"``, ``"ltsv"``, 21 | ``"markdown"``, ``"mediawiki"``, ``"sqlite"``, ``"ssv"``, ``"tsv"``. 22 | If the value is |None|, automatically detect file format from 23 | the ``url``. 24 | :param dict proxies: http/https proxy information. 25 | 26 | .. seealso:: 27 | `requests proxies <http://docs.python-requests.org/en/master/user/advanced/#proxies>`__ 28 | 29 | :raises pytablereader.LoaderNotFoundError: 30 | |LoaderNotFoundError_desc| loading the URL. 31 | :raises pytablereader.HTTPError: 32 | If loader received an HTTP error when access to the URL. 33 | 34 | :Example: 35 | :ref:`example-url-table-loader` 36 | 37 | .. py:method:: load 38 | 39 | Load tables from URL as ``format_name`` format. 40 | 41 | :return: Loaded table data iterator. 42 | :rtype: |TableData| iterator 43 | 44 | .. seealso:: 45 | * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_format_name` 46 | * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_path` 47 | """ 48 | 49 | def __init__(self, url, format_name=None, encoding=None, type_hint_rules=None, proxies=None): 50 | loader_factory = TableUrlLoaderFactory(url, encoding, proxies) 51 | 52 | if typepy.is_not_null_string(format_name): 53 | loader = loader_factory.create_from_format_name(format_name) 54 | else: 55 | loader = loader_factory.create_from_path() 56 | 57 | loader.type_hint_rules = type_hint_rules 58 | 59 | super().__init__(loader) 60 | 61 | @classmethod 62 | def get_format_names(cls): 63 | """ 64 | :return: 65 | Available format names. These names can use by 66 | :py:class:`.TableUrlLoader` class constructor. 67 | :rtype: list 68 | 69 | :Example: 70 | .. code:: python 71 | 72 | >>> from pytablereader import TableUrlLoader 73 | >>> for format_name in TableUrlLoader.get_format_names(): 74 | ... print(format_name) 75 | ... 
class LtsvTableLoader(AbstractTableReader):
    """
    Abstract class of
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format table loaders.

    .. py:attribute:: encoding

        Encoding of the LTSV data.
    """

    @property
    def format_name(self):
        return "ltsv"

    def __init__(self, source, quoting_flags, type_hints, type_hint_rules=None):
        super().__init__(source, quoting_flags, type_hints, type_hint_rules)

        # Iterable of LTSV lines; concrete loaders set it inside load().
        self._ltsv_input_stream = None

    def _to_data_matrix(self):
        """
        Parse ``self._ltsv_input_stream`` into a list of label -> value records.

        Lines that are empty after stripping whitespace are skipped, so a
        blank line is treated the same way whether the lines come from a
        file (trailing newline kept) or from ``str.splitlines()``.

        :return: Generator that yields the complete data matrix once.
        :raises pytablereader.DataError:
            If an item has no ``:`` separating label and value.
        :raises pytablereader.InvalidHeaderNameError:
            If a label contains characters not allowed in LTSV labels.
        """
        from collections import OrderedDict

        data_matrix = []

        for row_idx, row in enumerate(self._ltsv_input_stream):
            row = row.strip()
            if not row:
                continue

            ltsv_record = OrderedDict()
            for col_idx, ltsv_item in enumerate(row.split("\t")):
                # Only the first colon separates label from value; LTSV
                # values such as times or URLs may contain colons themselves.
                label, separator, value = ltsv_item.partition(":")
                if not separator:
                    raise DataError(
                        "invalid ltsv item found: line={}, col={}, item='{}'".format(
                            row_idx, col_idx, ltsv_item
                        )
                    )

                label = label.strip('"')

                try:
                    pv.validate_ltsv_label(label)
                except pv.ValidationError as e:
                    raise InvalidHeaderNameError(
                        "invalid label found (acceptable chars are [0-9A-Za-z_.-]): "
                        "line={}, col={}, label='{}'".format(row_idx, col_idx, label)
                    ) from e

                ltsv_record[label] = value

            data_matrix.append(ltsv_record)

        # using generator to prepare for future enhancement to support
        # iterative load.
        yield data_matrix


class LtsvTableFileLoader(LtsvTableLoader):
    """
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format file loader class.

    :param str file_path: Path to the loading LTSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.
    """

    def __init__(self, file_path, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)

        self.encoding = None

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

        self.__file = None

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV file.
        |load_source_desc_file|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     |filename_desc|
            ``%(format_name)s``  ``"ltsv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV file.
        :raises pytablereader.DataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()
        self.encoding = get_file_encoding(self.source, self.encoding)

        # _to_data_matrix reads every line before it yields, so the file can
        # be closed as soon as the generator is drained. Previously the
        # handle was opened and never closed.
        with open(self.source, encoding=self.encoding) as fp:
            self._ltsv_input_stream = fp
            try:
                data_matrices = list(self._to_data_matrix())
            finally:
                # Do not keep a reference to a handle that is about to close.
                self._ltsv_input_stream = None

        for data_matrix in data_matrices:
            formatter = SingleJsonTableConverterA(data_matrix)
            formatter.accept(self)

            return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FILENAME


class LtsvTableTextLoader(LtsvTableLoader):
    """
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format text loader class.

    :param str text: LTSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        # type_hint_rules was previously accepted but never forwarded, so
        # rules passed to this constructor were silently discarded.
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV text object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     ``""``
            ``%(format_name)s``  ``"ltsv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV text.
        :raises pytablereader.DataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()

        self._ltsv_input_stream = self.source.splitlines()

        for data_matrix in self._to_data_matrix():
            formatter = SingleJsonTableConverterA(data_matrix)
            formatter.accept(self)

            return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return f"{tnt.FORMAT_NAME:s}{tnt.FORMAT_ID:s}"
class MarkdownTableTextLoader(MarkdownTableLoader):
    """
    A text loader class to extract tabular data from Markdown text data.

    :param str text: Markdown text to load.
    :param quoting_flags: Passed through to the parent loader.
    :param type_hints: Passed through to the parent loader.
    :param type_hint_rules: Passed through to the parent loader.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        """Identify the source as in-memory text rather than a file."""
        return SourceType.TEXT

    def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        # type_hint_rules used to be accepted but silently dropped here;
        # forward it exactly as MarkdownTableFileLoader does.
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a Markdown text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    ``""``
            ``%(key)s``         ``%(format_name)s%(format_id)s``
            ``%(format_name)s`` ``"markdown"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.DataError:
            If the Markdown data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        formatter = MarkdownTableFormatter(self.source, self._logger)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the table name template used when none is set explicitly."""
        return f"{tnt.KEY:s}"
class MarkdownTableFormatter(HtmlTableFormatter):
    """
    Table formatter for Markdown sources.

    The Markdown text is rendered to HTML with the ``tables`` extension
    enabled and then handed to :py:class:`HtmlTableFormatter`.
    """

    def __init__(self, source_data, logger=None):
        """
        :param str source_data: Markdown text to convert.
        :param logger: Passed through to the HTML formatter.
        :raises pytablereader.DataError: If ``source_data`` is a null string.
        """

        # Imported lazily: the Markdown package is only needed by this formatter.
        import markdown

        if typepy.is_null_string(source_data):
            raise DataError

        rendered_html = markdown.markdown(source_data, extensions=["markdown.extensions.tables"])
        super().__init__(rendered_html, logger=logger)
class MediaWikiTableTextLoader(MediaWikiTableLoader):
    """
    A text loader class to extract tabular data from MediaWiki text data.

    :param str text: MediaWiki text to load.
    :param quoting_flags: Passed through to the parent loader.
    :param type_hints: Passed through to the parent loader.
    :param type_hint_rules: Passed through to the parent loader.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        """Identify the source as in-memory text rather than a file."""
        return SourceType.TEXT

    def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        # type_hint_rules used to be accepted but silently dropped here;
        # forward it exactly as MediaWikiTableFileLoader does.
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a MediaWiki text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    ``""``
            ``%(key)s``         | This is replaced with:
                                | **(1)** ``caption`` mark of the table
                                | **(2)** ``%(format_name)s%(format_id)s``
                                | if the ``caption`` mark is not included
                                | in the table.
            ``%(format_name)s`` ``"mediawiki"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.DataError:
            If the MediaWiki data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        formatter = MediaWikiTableFormatter(self.source)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the table name template used when none is set explicitly."""
        return f"{tnt.KEY:s}"
class MediaWikiTableFormatter(HtmlTableFormatter):
    """
    Table formatter for MediaWiki sources.

    The MediaWiki markup is converted to HTML with ``pypandoc`` and then
    handed to :py:class:`HtmlTableFormatter`.
    """

    def __init__(self, source_data):
        """
        :param str source_data: MediaWiki markup to convert.
        :raises pytablereader.error.PypandocImportError:
            If the ``pypandoc`` package cannot be imported.
        """

        try:
            import pypandoc
        except ImportError as e:
            # pypandoc is an optional dependency, so it may not be installed.
            # Chain the original error so the root cause stays in the traceback.
            raise PypandocImportError(e) from e

        super().__init__(pypandoc.convert_text(source_data, "html", format="mediawiki"))
19 | """ 20 | 21 | def __init__(self, source, quoting_flags, type_hints, type_hint_rules): 22 | super().__init__(source, quoting_flags, type_hints, type_hint_rules) 23 | 24 | self.start_row = 0 25 | self._worksheet = None 26 | self._start_col_idx = None 27 | self._end_col_idx = None 28 | 29 | @abc.abstractproperty 30 | def _sheet_name(self): # pragma: no cover 31 | pass 32 | 33 | @abc.abstractproperty 34 | def _row_count(self): # pragma: no cover 35 | pass 36 | 37 | @abc.abstractproperty 38 | def _col_count(self): # pragma: no cover 39 | pass 40 | 41 | @abc.abstractmethod 42 | def _is_empty_sheet(self): # pragma: no cover 43 | pass 44 | 45 | @abc.abstractmethod 46 | def _get_start_row_idx(self): # pragma: no cover 47 | pass 48 | 49 | @property 50 | def format_name(self): 51 | return "spreadsheet" 52 | 53 | def _make_table_name(self): 54 | kv_mapping = self._get_basic_tablename_keyvalue_mapping() 55 | 56 | try: 57 | kv_mapping[tnt.SHEET] = self._sheet_name 58 | except AttributeError: 59 | kv_mapping[tnt.SHEET] = "" 60 | 61 | return self._expand_table_name_format(kv_mapping) 62 | 63 | def _get_default_table_name_template(self): 64 | return f"{tnt.SHEET:s}" 65 | 66 | def _extract_type_hints(self, headers=None): 67 | if self.type_hints: 68 | return self.type_hints 69 | 70 | if not self.type_hint_rules or not headers: 71 | return [] 72 | 73 | type_hints = [] 74 | for header in headers: 75 | for regexp, type_hint in self.type_hint_rules.items(): 76 | if regexp.search(header): 77 | type_hints.append(type_hint) 78 | break 79 | else: 80 | type_hints.append(None) 81 | 82 | return type_hints 83 | -------------------------------------------------------------------------------- /pytablereader/spreadsheet/excelloader.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com> 3 | """ 4 | 5 | from tabledata import TableData 6 | 7 | from pytablereader import DataError 8 | 9 | from .._logger import FileSourceLogger 10 | from .._validator import FileValidator 11 | from ..error import OpenError 12 | from .core import SpreadSheetLoader 13 | 14 | 15 | class ExcelTableFileLoader(SpreadSheetLoader): 16 | """ 17 | A file loader class to extract tabular data from Microsoft Excel |TM| 18 | files. 19 | 20 | :param str file_path: Path to the loading Excel workbook file. 21 | 22 | .. py:attribute:: table_name 23 | 24 | Table name string. Defaults to ``%(sheet)s``. 25 | 26 | .. py:attribute:: start_row 27 | 28 | The first row to search header row. 29 | """ 30 | 31 | @property 32 | def format_name(self): 33 | return "excel" 34 | 35 | @property 36 | def _sheet_name(self): 37 | return self._worksheet.name 38 | 39 | @property 40 | def _row_count(self): 41 | return self._worksheet.nrows 42 | 43 | @property 44 | def _col_count(self): 45 | return self._worksheet.ncols 46 | 47 | def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None): 48 | super().__init__(file_path, quoting_flags, type_hints, type_hint_rules) 49 | 50 | self._validator = FileValidator(file_path) 51 | self._logger = FileSourceLogger(self) 52 | 53 | def load(self): 54 | """ 55 | Extract tabular data as |TableData| instances from an Excel file. 56 | |spreadsheet_load_desc| 57 | 58 | :return: 59 | Loaded |TableData| iterator. 60 | |TableData| created for each sheet in the workbook. 
61 | |load_table_name_desc| 62 | 63 | =================== ==================================== 64 | Format specifier Value after the replacement 65 | =================== ==================================== 66 | ``%(filename)s`` Filename of the workbook 67 | ``%(sheet)s`` Name of the sheet 68 | ``%(format_name)s`` ``"spreadsheet"`` 69 | ``%(format_id)s`` |format_id_desc| 70 | ``%(global_id)s`` |global_id| 71 | =================== ==================================== 72 | :rtype: |TableData| iterator 73 | :raises pytablereader.DataError: 74 | If the header row is not found. 75 | :raises pytablereader.error.OpenError: 76 | If failed to open the source file. 77 | """ 78 | 79 | try: 80 | import excelrd as xlrd 81 | except ImportError: 82 | import xlrd 83 | 84 | self._validate() 85 | self._logger.logging_load() 86 | 87 | try: 88 | workbook = xlrd.open_workbook(self.source) 89 | except xlrd.biffh.XLRDError as e: 90 | raise OpenError(e) 91 | 92 | for worksheet in workbook.sheets(): 93 | self._worksheet = worksheet 94 | 95 | if self._is_empty_sheet(): 96 | continue 97 | 98 | self.__extract_not_empty_col_idx() 99 | 100 | try: 101 | start_row_idx = self._get_start_row_idx() 102 | except DataError: 103 | continue 104 | 105 | rows = [ 106 | self.__get_row_values(row_idx) 107 | for row_idx in range(start_row_idx + 1, self._row_count) 108 | ] 109 | 110 | self.inc_table_count() 111 | headers = self.__get_row_values(start_row_idx) 112 | 113 | yield TableData( 114 | self._make_table_name(), 115 | headers, 116 | rows, 117 | dp_extractor=self.dp_extractor, 118 | type_hints=self._extract_type_hints(headers), 119 | ) 120 | 121 | def _is_empty_sheet(self): 122 | return any( 123 | [ 124 | self._col_count == 0, 125 | self._row_count <= 1, 126 | # nrows == 1 means exists header row only 127 | ] 128 | ) 129 | 130 | def _get_start_row_idx(self): 131 | for row_idx in range(self.start_row, self._row_count): 132 | if self.__is_header_row(row_idx): 133 | break 134 | else: 135 | raise 
class GoogleSheetsTableLoader(SpreadSheetLoader):
    """
    Concrete class of Google Spreadsheet loader.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(sheet)s``.

    .. py:attribute:: title

        Title of the spreadsheet to open. Must be set before calling
        :py:meth:`load`.

    :param str file_path: Path to the Google Sheets credential JSON file.

    :Dependency Packages:
        - `gspread <https://github.com/burnash/gspread>`_
        - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`_
        - `oauth2client <https://pypi.org/project/oauth2client>`_
        - `pyOpenSSL <https://pypi.org/project/pyOpenSSL>`_

    :Examples:
        :ref:`example-gs-table-loader`
    """

    @property
    def _sheet_name(self):
        return self._worksheet.title

    @property
    def _row_count(self):
        return self._worksheet.row_count

    @property
    def _col_count(self):
        return self._worksheet.col_count

    def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)

        self.title = None
        self.start_row = 0

        self._validator = TextValidator(file_path)

        # Cell values of the worksheet currently being processed.
        self.__all_values = None

    def load(self):
        """
        Load table data from a Google Spreadsheet.

        This method considers :py:attr:`.source` as a path to the
        credential JSON file to access the Google Sheets API.

        The method automatically searches for the header row, starting
        from :py:attr:`.start_row`. A header row is a row in which every
        column has a value (leading empty columns are stripped first).
        Sheets that are empty or have no header row are skipped.

        :return:
            Loaded table data. Return one |TableData| for each sheet in
            the workbook. The table name for data will be determined by
            :py:meth:`~.GoogleSheetsTableLoader.make_table_name`.
        :rtype: iterator of |TableData|
        :raises ValueError:
            If the spreadsheet title is empty.
        :raises pytablereader.OpenError:
            If the spreadsheet is not found.
        :raises pytablereader.error.APIError:
            If the Google Sheets API reports an error.
        """

        import gspread
        from oauth2client.service_account import ServiceAccountCredentials

        self._validate_table_name()
        self._validate_title()

        scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
        credentials = ServiceAccountCredentials.from_json_keyfile_name(self.source, scope)

        gc = gspread.authorize(credentials)
        try:
            for worksheet in gc.open(self.title).worksheets():
                self._worksheet = worksheet
                self.__all_values = list(worksheet.get_all_values())

                if self._is_empty_sheet():
                    continue

                try:
                    self.__strip_empty_col()
                except ValueError:
                    # The sheet has no column holding any value.
                    continue

                value_matrix = self.__all_values[self._get_start_row_idx() :]
                try:
                    headers = value_matrix[0]
                    rows = value_matrix[1:]
                except IndexError:
                    # No header row was found, so the slice is empty.
                    continue

                self.inc_table_count()

                yield TableData(
                    self.make_table_name(),
                    headers,
                    rows,
                    dp_extractor=self.dp_extractor,
                    type_hints=self._extract_type_hints(headers),
                )
        except gspread.exceptions.SpreadsheetNotFound as e:
            raise OpenError(f"spreadsheet '{self.title}' not found") from e
        except gspread.exceptions.APIError as e:
            raise APIError(e) from e

    def _is_empty_sheet(self):
        # A single row means at most a header row, with no data rows.
        return len(self.__all_values) <= 1

    def _get_start_row_idx(self):
        """
        Return the index of the header row, searching from ``start_row``.

        The header row is the first row in which every value is a
        non-null string. When no such row exists, an index one past the
        last row is returned so that slicing yields an empty matrix.
        """

        # Start the scan at start_row itself. Scanning from row 0 and then
        # adding start_row produced a wrong index for any non-zero start_row.
        candidates = enumerate(self.__all_values[self.start_row :], start=self.start_row)
        for row_idx, row_values in candidates:
            if all(typepy.is_not_null_string(value) for value in row_values):
                return row_idx

        return len(self.__all_values)

    def _validate_title(self):
        if typepy.is_null_string(self.title):
            raise ValueError("spreadsheet title is empty")

    def _make_table_name(self):
        self._validate_title()

        kv_mapping = self._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.TITLE] = self.title
        try:
            kv_mapping[tnt.SHEET] = self._sheet_name
        except AttributeError:
            # No worksheet has been selected yet.
            kv_mapping[tnt.SHEET] = ""

        return self._expand_table_name_format(kv_mapping)

    def __strip_empty_col(self):
        """
        Drop the leading columns that hold no value in any row.

        The values are round-tripped through an in-memory SQLite table,
        and ``self.__all_values`` is replaced by the rows fetched back.

        :raises ValueError: If no column contains any non-null value.
        """

        from simplesqlite import connect_memdb
        from simplesqlite.query import Attr, AttrList

        con = connect_memdb()

        tmp_table_name = "tmp"
        headers = [f"a{i:d}" for i in range(len(self.__all_values[0]))]
        con.create_table_from_data_matrix(tmp_table_name, headers, self.__all_values)
        for col_idx, header in enumerate(headers):
            result = con.select(select=Attr(header), table_name=tmp_table_name)
            if any(typepy.is_not_null_string(record[0]) for record in result.fetchall()):
                break
        else:
            # Covers both "no columns at all" (col_idx would be unbound)
            # and "every column is empty".
            raise ValueError("no non-empty column found")

        strip_headers = headers[col_idx:]

        result = con.select(select=AttrList(strip_headers), table_name=tmp_table_name)
        self.__all_values = result.fetchall()
14 | 15 | :param str file_path: Path to the loading SQLite database file. 16 | 17 | .. py:attribute:: table_name 18 | 19 | Table name string. Defaults to ``%(filename)s_%(key)s``. 20 | 21 | :Dependency Packages: 22 | - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`__ 23 | """ 24 | 25 | @property 26 | def format_name(self): 27 | return "sqlite" 28 | 29 | def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None): 30 | super().__init__(file_path, quoting_flags, type_hints, type_hint_rules) 31 | 32 | self._validator = FileValidator(file_path) 33 | 34 | def load(self): 35 | """ 36 | Extract tabular data as |TableData| instances from a SQLite database 37 | file. |load_source_desc_file| 38 | 39 | :return: 40 | Loaded table data iterator. 41 | |load_table_name_desc| 42 | 43 | =================== ============================================== 44 | Format specifier Value after the replacement 45 | =================== ============================================== 46 | ``%(filename)s`` |filename_desc| 47 | ``%(key)s`` ``%(format_name)s%(format_id)s`` 48 | ``%(format_name)s`` ``"sqlite"`` 49 | ``%(format_id)s`` |format_id_desc| 50 | ``%(global_id)s`` |global_id| 51 | =================== ============================================== 52 | :rtype: |TableData| iterator 53 | :raises pytablereader.DataError: 54 | If the SQLite database file data is invalid or empty. 55 | """ 56 | 57 | self._validate() 58 | 59 | formatter = SqliteTableFormatter(self.source) 60 | formatter.accept(self) 61 | 62 | return formatter.to_table_data() 63 | 64 | def _get_default_table_name_template(self): 65 | return f"{tnt.FORMAT_NAME:s}{tnt.FORMAT_ID:s}" 66 | -------------------------------------------------------------------------------- /pytablereader/sqlite/formatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
class SqliteTableFormatter(TableFormatter):
    """
    Table formatter that yields one |TableData| per table of a SQLite
    database file.
    """

    def __init__(self, source_data):
        """
        :param str source_data: Path to the SQLite database file.
        :raises pytablereader.DataError: If ``source_data`` is a null string.
        """

        super().__init__(source_data)

        # Name of the table currently being converted by to_table_data().
        self.__table_name = None

        if typepy.is_null_string(source_data):
            raise DataError("SQLite database path is empty")

    def to_table_data(self):
        """
        Yield a |TableData| for every table in the database.

        The database is opened in read mode. Each table's name is used as
        the name of the resulting |TableData|.
        """

        from simplesqlite import SimpleSQLite
        from simplesqlite.query import AttrList

        con = SimpleSQLite(self._source_data, "r")

        for table in con.fetch_table_names():
            self.__table_name = table

            attr_names = con.fetch_attr_names(table)
            data_matrix = con.select(select=AttrList(attr_names), table_name=table).fetchall()

            yield TableData(
                table,
                attr_names,
                data_matrix,
                dp_extractor=self._loader.dp_extractor,
                type_hints=self._extract_type_hints(attr_names),
            )

    def _make_table_name(self):
        """Expand the loader's table name template with the current table as the key."""

        # The basic mapping is filled by item assignment in the spreadsheet
        # loaders, so it is a mapping; "mapping + [(key, value)]" raises
        # TypeError. Add the key the same way those loaders do.
        kv_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.KEY] = self.__table_name

        return self._loader._expand_table_name_format(kv_mapping)
class TsvTableFileLoader(CsvTableFileLoader):
    """
    Tab separated values (TSV) format file loader class.

    Behaves like the CSV file loader with the delimiter fixed to a tab.

    :param str file_path: Path to the loading TSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.
    """

    def __init__(self, file_path, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)

        # Only the delimiter differs from the CSV loader.
        self.delimiter = "\t"

        self._validator = FileValidator(file_path)

    @property
    def format_name(self):
        """Format identifier used in table name templates."""
        return "tsv"
41 | """ 42 | 43 | @property 44 | def format_name(self): 45 | return "tsv" 46 | 47 | def __init__(self, text, quoting_flags=None, type_hints=None, type_hint_rules=None): 48 | super().__init__(text, quoting_flags, type_hints, type_hint_rules) 49 | 50 | self.delimiter = "\t" 51 | 52 | self._validator = TextValidator(text) 53 | -------------------------------------------------------------------------------- /requirements/docs_requirements.txt: -------------------------------------------------------------------------------- 1 | path>=13 2 | pytablereader 3 | sphinx_rtd_theme>=1.2.2 4 | Sphinx>=2.4.1 5 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.5.3,<5 2 | DataProperty>=0.54.2,<2 3 | jsonschema>=2.5.1,<5 4 | mbstrdecoder>=1.0.0,<2 5 | pathvalidate>=2.5.2,<4 6 | path>=13,<17 7 | tabledata>=1.1.1,<2 8 | typepy>=1.2.0,<2 9 | -------------------------------------------------------------------------------- /requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | pypandoc 2 | pytablewriter[excel]>=0.50 3 | pytest>=5 4 | responses 5 | subprocrunner 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
def get_release_command_class():
    """
    Build the ``cmdclass`` mapping for the optional ``release`` command.

    :return:
        ``{"release": ReleaseCommand}`` when the ``releasecmd`` package can
        be imported, otherwise an empty dict.
    :rtype: dict
    """
    command_classes = {}

    try:
        from releasecmd import ReleaseCommand
    except ImportError:
        # releasecmd is optional: without it no extra command is registered.
        pass
    else:
        command_classes["release"] = ReleaseCommand

    return command_classes
# Register the package. Values that are not literals here (version, author,
# summary, requirement lists, ...) are read from files earlier in this script.
setuptools.setup(
    name=MODULE_NAME,
    version=pkg_info["__version__"],
    url=REPOSITORY_URL,
    author=pkg_info["__author__"],
    author_email=pkg_info["__email__"],
    description=summary,
    include_package_data=True,
    keywords=[
        "table",
        "reader",
        "pandas",
        "CSV",
        "Excel",
        "HTML",
        "JSON",
        "LTSV",
        "Markdown",
        "MediaWiki",
        "TSV",
        "SQLite",
    ],
    license=pkg_info["__license__"],
    long_description=long_description,
    long_description_content_type="text/x-rst",
    packages=setuptools.find_packages(exclude=["test*"]),
    project_urls={
        "Documentation": f"https://{MODULE_NAME:s}.rtfd.io/",
        "Source": REPOSITORY_URL,
        "Tracker": f"{REPOSITORY_URL:s}/issues",
        # The label is shown verbatim on the package index page.
        "Changelog": f"{REPOSITORY_URL:s}/releases",
    },
    python_requires=">=3.7",
    install_requires=setuptools_require + install_requires,
    setup_requires=setuptools_require,
    extras_require={
        # "all" is the union of every optional feature; the set removes the
        # sqlite requirement that is also part of gs_requires.
        "all": set(
            excel_requires
            + gs_requires
            + logging_requires
            + markdown_requires
            + mediawiki_requires
            + sqlite_requires
            + url_requires
        ),
        "excel": excel_requires,
        "gs": gs_requires,
        "logging": logging_requires,
        "md": markdown_requires,
        "mediawiki": mediawiki_requires,
        "url": url_requires,
        "sqlite": sqlite_requires,
        "test": tests_requires,
    },
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Intended Audience :: Information Technology",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
        "Topic :: Database",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Text Processing",
    ],
    cmdclass=get_release_command_class(),
)
# Header-name patterns mapped to the typepy class used as the type hint for
# the matching column.
# The separator class lists its hyphen last ("[ _-]") so that it is a literal
# character: "[ -_]" would be the *range* from space to underscore, which
# also matches digits, punctuation and (with IGNORECASE) every ASCII letter.
TYPE_HINT_RULES = {
    re.compile("[ _-]text$", re.IGNORECASE): String,
    re.compile("[ _-]integer$", re.IGNORECASE): Integer,
    re.compile("[ _-]real$", re.IGNORECASE): RealNumber,
}


def fifo_writer(fifo_name, text):
    """
    Write ``text`` to the file opened from ``fifo_name``.

    :param fifo_name: Path passed to :py:func:`open` in write mode.
    :param str text: Text to write.
    """
    with open(fifo_name, "w") as p:
        p.write(text)


def print_test_result(expected, actual, error=None):
    """
    Print the expected and the actual value of a test to stdout.

    :param expected: Value printed under the ``[expected]`` heading.
    :param actual: Value printed under the ``[actual]`` heading.
    :param error: Optional value printed to stderr when it is truthy.
    """
    print(f"[expected]\n{expected}\n")
    print(f"[actual]\n{actual}\n")

    if error:
        print(error, file=sys.stderr)
class Test_TableFileLoaderFactory:
    """Constructor validation of ``TableFileLoaderFactory``."""

    @pytest.mark.parametrize(["value", "expected"], [(None, ValueError)])
    def test_exception(self, value, expected):
        # The error is expected while the factory is being constructed.
        with pytest.raises(expected):
            ptr.factory.TableFileLoaderFactory(value)
class Test_TableFileLoaderFactory_create_from_format_name:
    """Loader creation from an explicit format name."""

    @pytest.mark.parametrize(
        ["file_path", "format_name", "expected"],
        [
            # The format name decides the loader class, whatever the file
            # extension is; format names are given in mixed case as well.
            ("valid_ext.html", "csv", ptr.CsvTableFileLoader),
            ("invalid_ext.txt", "CSV", ptr.CsvTableFileLoader),
            ("valid_ext.html", "excel", ptr.ExcelTableFileLoader),
            ("invalid_ext.txt", "Excel", ptr.ExcelTableFileLoader),
            ("valid_ext.json", "html", ptr.HtmlTableFileLoader),
            ("invalid_ext.txt", "HTML", ptr.HtmlTableFileLoader),
            ("valid_ext.html", "json", ptr.JsonTableFileLoader),
            ("invalid_ext.txt", "JSON", ptr.JsonTableFileLoader),
            ("valid_ext.html", "markdown", ptr.MarkdownTableFileLoader),
            ("invalid_ext.txt", "Markdown", ptr.MarkdownTableFileLoader),
            ("valid_ext.html", "mediawiki", ptr.MediaWikiTableFileLoader),
            ("invalid_ext.txt", "MediaWiki", ptr.MediaWikiTableFileLoader),
            ("valid_ext.db", "sqlite", ptr.SqliteFileLoader),
            ("valid_ext.html", "tsv", ptr.TsvTableFileLoader),
            ("invalid_ext.txt", "TSV", ptr.TsvTableFileLoader),
        ],
    )
    def test_normal(self, file_path, format_name, expected):
        factory = ptr.factory.TableFileLoaderFactory(file_path)
        created = factory.create_from_format_name(format_name)

        assert created.source == file_path
        assert isinstance(created, expected)

    @pytest.mark.parametrize(
        ["file_path", "format_name", "expected"],
        [
            ("valid_ext.csv", "not_exist_format", ptr.LoaderNotFoundError),
            ("valid_ext.csv", "", ptr.LoaderNotFoundError),
            ("valid_ext.csv", None, TypeError),
            ("valid_ext.csv", 0, TypeError),
            ("valid_ext.csv", "auto", ptr.LoaderNotFoundError),
        ],
    )
    def test_exception(self, file_path, format_name, expected):
        factory = ptr.factory.TableFileLoaderFactory(file_path)

        # Only the creation call is expected to raise, not the constructor.
        with pytest.raises(expected):
            factory.create_from_format_name(format_name)
class Test_GoogleSheetsTableLoader_make_table_name:
    """Table name template expansion of ``GoogleSheetsTableLoader``."""

    @property
    def monkey_property(self):
        # Sheet name patched into the loader class by test_normal.
        return "testsheet"

    @pytest.mark.parametrize(
        ["value", "title", "expected"],
        [
            ("%(sheet)s", "titlename", "testsheet"),
            ("%(title)s", "titlename", "titlename"),
            ("%(title)s", "table", "table"),
            ("prefix_%(title)s_%(sheet)s", "titlename", "prefix_titlename_testsheet"),
            ("%(format_name)s%(format_id)s", "titlename", "spreadsheet0"),
        ],
    )
    def test_normal(self, monkeypatch, value, title, expected):
        gs_loader = GoogleSheetsTableLoader("dummy")
        gs_loader.table_name = value
        gs_loader.title = title

        # Replace the sheet name on the class so that no sheet has to be
        # loaded to expand "%(sheet)s".
        monkeypatch.setattr(GoogleSheetsTableLoader, "_sheet_name", self.monkey_property)

        assert gs_loader.make_table_name() == expected

    @pytest.mark.parametrize(
        ["value", "title", "expected"],
        [
            (None, "titlename", ValueError),
            ("", "titlename", ValueError),
            ("%(sheet)s", None, ValueError),
            ("%(sheet)s", "", ValueError),
        ],
    )
    def test_exception(self, value, title, expected):
        gs_loader = GoogleSheetsTableLoader("dummy")
        gs_loader.table_name = value
        gs_loader.title = title

        with pytest.raises(expected):
            gs_loader.make_table_name()
class Test_TableTextLoader_constructor:
    """Sources and format names rejected by the ``TableTextLoader`` constructor."""

    @pytest.mark.parametrize(
        ["value", "format_name", "expected"],
        [
            (None, None, ValueError),
            ("", None, ValueError),
            ("https://github.com/", None, ValueError),
            ("/tmp/valid/test/data/validext.csv/", None, ValueError),
            ("/tmp/invalid/test/data/invalidext.txt", "invalidformat", ptr.LoaderNotFoundError),
        ],
    )
    def test_exception(self, value, format_name, expected):
        # The error is expected while the loader is being constructed.
        with pytest.raises(expected):
            ptr.TableTextLoader(value, format_name=format_name)
"a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]], 68 | ) 69 | ] 70 | loader = ptr.TableTextLoader(text, format_name="csv") 71 | 72 | assert loader.format_name == "csv" 73 | 74 | for tabledata, expected in zip(loader.load(), expected_list): 75 | print(dumps_tabledata(expected)) 76 | print(dumps_tabledata(tabledata)) 77 | 78 | assert tabledata.equals(expected) 79 | 80 | def test_normal_ssv(self): 81 | text = dedent( 82 | """\ 83 | USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND 84 | root 1 0.0 0.4 77664 8784 ? Ss May11 0:02 /sbin/init 85 | root 2 0.0 0.0 0 0 ? S May11 0:00 [kthreadd] 86 | root 4 0.0 0.0 0 0 ? I< May11 0:00 [kworker/0:0H] 87 | root 6 0.0 0.0 0 0 ? I< May11 0:00 [mm_percpu_wq] 88 | root 7 0.0 0.0 0 0 ? S May11 0:01 [ksoftirqd/0] 89 | """ 90 | ) 91 | 92 | expected_list = [ 93 | TableData( 94 | "csv1", 95 | [ 96 | "USER", 97 | "PID", 98 | "%CPU", 99 | "%MEM", 100 | "VSZ", 101 | "RSS", 102 | "TTY", 103 | "STAT", 104 | "START", 105 | "TIME", 106 | "COMMAND", 107 | ], 108 | [ 109 | ["root", 1, 0, 0.4, 77664, 8784, "?", "Ss", "May11", "0:02", "/sbin/init"], 110 | ["root", 2, 0, 0, 0, 0, "?", "S", "May11", "0:00", "[kthreadd]"], 111 | ["root", 4, 0, 0, 0, 0, "?", "I<", "May11", "0:00", "[kworker/0:0H]"], 112 | ["root", 6, 0, 0, 0, 0, "?", "I<", "May11", "0:00", "[mm_percpu_wq]"], 113 | ["root", 7, 0, 0, 0, 0, "?", "S", "May11", "0:01", "[ksoftirqd/0]"], 114 | ], 115 | ) 116 | ] 117 | loader = ptr.TableTextLoader(text, format_name="ssv") 118 | 119 | assert loader.format_name == "csv" 120 | 121 | for tabledata, expected in zip(loader.load(), expected_list): 122 | print(dumps_tabledata(expected)) 123 | print(dumps_tabledata(tabledata)) 124 | 125 | assert tabledata.equals(expected) 126 | 127 | def test_normal_json(self): 128 | text = dedent( 129 | """\ 130 | [ 131 | {"attr_a": 1}, 132 | {"attr_b": 2.1, "attr_c": "bb"} 133 | ]""" 134 | ) 135 | 136 | expected_list = [ 137 | TableData( 138 | "json1", 139 | ["attr_a", "attr_b", "attr_c"], 140 | [{"attr_a": 1}, 
class Test_get_extension:
    """Behavior of ``get_extension`` for regular and null inputs."""

    @pytest.mark.parametrize(
        ["value", "expected"],
        [
            ("test.txt", "txt"),
            # A leading dot alone and a bare name both yield no extension.
            (".csv", ""),
            ("html", ""),
        ],
    )
    def test_normal(self, value, expected):
        assert get_extension(value) == expected

    @pytest.mark.parametrize(
        ["value", "expected"],
        [
            ("", InvalidFilePathError),
            (None, InvalidFilePathError),
        ],
    )
    def test_null_table_name(self, value, expected):
        with pytest.raises(expected):
            get_extension(value)
def write_worksheet(worksheet, table):
    """
    Write every item of ``table`` into ``worksheet``.

    :param worksheet:
        Object providing ``write(row, col, value)``; called once per item
        with zero-based row and column indexes.
    :param table: Iterable of rows, each row being an iterable of items.
    """
    cells = (
        (row_idx, col_idx, item)
        for row_idx, row in enumerate(table)
        for col_idx, item in enumerate(row)
    )

    for row_idx, col_idx, item in cells:
        worksheet.write(row_idx, col_idx, item)
@pytest.fixture
def invalid_excel_file_path(tmpdir):
    """
    Create ``invalid.xlsx`` under ``tmpdir`` and return its path as a string.

    The workbook holds ``testsheet1``, filled with a table that has empty
    cells in its second row, and ``testsheet2``, to which nothing is written.
    """
    test_file_path = tmpdir.join("invalid.xlsx")
    workbook = xlsxwriter.Workbook(str(test_file_path))

    write_worksheet(
        workbook.add_worksheet("testsheet1"),
        table=[["", "", "", ""], ["", "a", "", "c"], ["", "aa", "ab", ""], ["", "", 1.1, "a"]],
    )

    # Only the sheet itself is needed; no cells are written to it, so the
    # returned worksheet object is not kept.
    workbook.add_worksheet("testsheet2")

    workbook.close()

    return str(test_file_path)
@pytest.mark.xfail(run=False)
class Test_ExcelTableFileLoader_load:
    """Loading tables from Excel workbooks created by the fixtures."""

    def setup_method(self, method):
        # Reset the table counter before each test.
        AbstractTableReader.clear_table_count()

    @pytest.mark.parametrize(
        ["table_name", "start_row", "expected_list"],
        [
            (
                "%(sheet)s",
                0,
                [
                    TableData(
                        "boolsheet",
                        ["true", "false", "tf", "lost"],
                        [
                            [True, False, True, True],
                            [True, False, False, ""],
                            [True, False, False, False],
                        ],
                    ),
                    TableData(
                        "testsheet1",
                        ["a1", "b1", "c1"],
                        [
                            ["aa1", "ab1", "ac1"],
                            [1.0, 1.1, "a"],
                            [2.0, 2.2, "bb"],
                            [3.0, 3.3, "cc"],
                        ],
                    ),
                    TableData(
                        "testsheet3",
                        ["a3", "b3", "c3"],
                        [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]],
                    ),
                ],
            ),
            (
                "%(filename)s_%(sheet)s",
                2,
                [
                    TableData("tmp_boolsheet", ["TRUE", "FALSE", "False", "False"], []),
                    TableData(
                        "tmp_testsheet1",
                        ["aa1", "ab1", "ac1"],
                        [[1.0, 1.1, "a"], [2.0, 2.2, "bb"], [3.0, 3.3, "cc"]],
                    ),
                    TableData(
                        "tmp_testsheet3",
                        ["a3", "b3", "c3"],
                        [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]],
                    ),
                ],
            ),
        ],
    )
    def test_normal(self, valid_excel_file_path, table_name, start_row, expected_list):
        excel_loader = ptr.ExcelTableFileLoader(valid_excel_file_path)
        excel_loader.table_name = table_name
        excel_loader.start_row = start_row

        # Every loaded table has to be one of the expected tables.
        for loaded in excel_loader.load():
            print(f"[actual]\n{dumps_tabledata(loaded)}")
            assert loaded.in_tabledata_list(expected_list)

    @pytest.mark.parametrize(
        ["table_name", "start_row", "expected"], [("%(sheet)s", 0, ptr.DataError)]
    )
    def test_abnormal(self, invalid_excel_file_path, table_name, start_row, expected):
        excel_loader = ptr.ExcelTableFileLoader(invalid_excel_file_path)
        excel_loader.table_name = table_name
        excel_loader.start_row = start_row

        # NOTE(review): "expected" is parametrized but never checked here.
        for loaded in excel_loader.load():
            assert loaded == []

    @pytest.mark.parametrize(
        ["source", "expected"],
        [
            ("", ptr.InvalidFilePathError),
            (None, ptr.InvalidFilePathError),
        ],
    )
    def test_null_file_path(self, source, expected):
        excel_loader = ptr.ExcelTableFileLoader(source)

        # The loop only drives the iteration; the error is expected from it.
        with pytest.raises(expected):
            for _loaded in excel_loader.load():
                pass

    @pytest.mark.parametrize(["table_name", "expected"], [("", ValueError), (None, ValueError)])
    def test_null_table_name(self, valid_excel_file_path, table_name, expected):
        excel_loader = ptr.ExcelTableFileLoader(valid_excel_file_path)
        excel_loader.table_name = table_name

        with pytest.raises(expected):
            for _loaded in excel_loader.load():
                pass
class Test_HtmlTableTextLoader_load:
    """Smoke test that loads tables from an HTML file in the ``data`` directory."""

    @pytest.mark.parametrize(["filename"], [("python - Wiktionary.html",)])
    def test_smoke(self, tmpdir, filename):
        data_dir = os.path.join(os.path.dirname(__file__), "data")
        loader = ptr.TableFileLoader(os.path.join(data_dir, filename))

        dumped_tables = 0

        for table in loader.load():
            # Empty tables are skipped and do not count as a success.
            if not table.is_empty():
                assert len(dumps_tabledata(table)) > 10
                dumped_tables += 1

        # At least one non-empty table has to be found in the file.
        assert dumped_tables > 0
class Test_LtsvTableFileLoader_make_table_name:
    """Table name template expansion of ``LtsvTableFileLoader``."""

    def setup_method(self, method):
        # Reset the table counter before each test.
        AbstractTableReader.clear_table_count()

    @pytest.mark.parametrize(
        ["value", "source", "expected"],
        [
            ("%(default)s", "/path/to/data.ltsv", "data"),
            ("%(filename)s", "/path/to/data.ltsv", "data"),
            ("prefix_%(filename)s", "/path/to/data.ltsv", "prefix_data"),
            ("%(filename)s_suffix", "/path/to/data.ltsv", "data_suffix"),
            ("prefix_%(filename)s_suffix", "/path/to/data.ltsv", "prefix_data_suffix"),
            ("%(filename)s%(filename)s", "/path/to/data.ltsv", "datadata"),
            ("%(format_name)s%(format_id)s_%(filename)s", "/path/to/data.ltsv", "ltsv0_data"),
            ("%(%(filename)s)", "/path/to/data.ltsv", "%(data)"),
        ],
    )
    def test_normal(self, value, source, expected):
        ltsv_loader = ptr.LtsvTableFileLoader(source)
        ltsv_loader.table_name = value

        assert ltsv_loader.make_table_name() == expected

    @pytest.mark.parametrize(
        ["value", "source", "expected"],
        [
            (None, "/path/to/data.ltsv", ValueError),
            ("", "/path/to/data.ltsv", ValueError),
            ("%(filename)s", None, InvalidTableNameError),
            ("%(filename)s", "", InvalidTableNameError),
        ],
    )
    def test_exception(self, value, source, expected):
        ltsv_loader = ptr.LtsvTableFileLoader(source)
        ltsv_loader.table_name = value

        with pytest.raises(expected):
            ltsv_loader.make_table_name()
["filename", "headers", "expected"], 125 | [["", [], ptr.InvalidFilePathError], [None, [], ptr.InvalidFilePathError]], 126 | ) 127 | def test_null(self, tmpdir, filename, headers, expected): 128 | loader = ptr.LtsvTableFileLoader(filename) 129 | loader.headers = headers 130 | 131 | with pytest.raises(expected): 132 | for _tabletuple in loader.load(): 133 | pass 134 | 135 | 136 | class Test_LtsvTableTextLoader_make_table_name: 137 | def setup_method(self, method): 138 | AbstractTableReader.clear_table_count() 139 | 140 | @pytest.mark.parametrize( 141 | ["value", "expected"], 142 | [["%(format_name)s%(format_id)s", "ltsv0"], ["tablename", "tablename"]], 143 | ) 144 | def test_normal(self, value, expected): 145 | loader = ptr.LtsvTableTextLoader("dummy") 146 | loader.table_name = value 147 | 148 | assert loader.make_table_name() == expected 149 | 150 | @pytest.mark.parametrize( 151 | ["value", "source", "expected"], 152 | [[None, "tablename", ValueError], ["", "tablename", ValueError]], 153 | ) 154 | def test_exception(self, value, source, expected): 155 | loader = ptr.LtsvTableFileLoader(source) 156 | loader.table_name = value 157 | 158 | with pytest.raises(expected): 159 | loader.make_table_name() 160 | 161 | 162 | class Test_LtsvTableTextLoader_load: 163 | def setup_method(self, method): 164 | AbstractTableReader.clear_table_count() 165 | 166 | @pytest.mark.parametrize( 167 | ["table_text", "table_name", "expected"], 168 | [[test_data_00.value, "tmp", test_data_00.expected]], 169 | ) 170 | def test_normal(self, table_text, table_name, expected): 171 | loader = ptr.LtsvTableTextLoader(table_text) 172 | loader.table_name = table_name 173 | 174 | for tabledata in loader.load(): 175 | print(f"[expected]: {dumps_tabledata(expected)}") 176 | print(f"[actual]: {dumps_tabledata(tabledata)}") 177 | 178 | assert tabledata.equals(expected) 179 | 180 | def test_normal_type_hint_rules(self): 181 | table_text = dedent( 182 | """\ 183 | a_text:1\tb_integer:1\tc_integer:1.1 184 | 
a_text:2\tb_integer:2\tc_integer:1.2 185 | a_text:3\tb_integer:3\tc_integer:1.3 186 | """ 187 | ) 188 | 189 | loader = ptr.LtsvTableTextLoader(table_text) 190 | loader.table_name = "type hint rules" 191 | loader.type_hint_rules = TYPE_HINT_RULES 192 | 193 | for tbldata in loader.load(): 194 | assert tbldata.headers == ["a_text", "b_integer", "c_integer"] 195 | assert tbldata.value_matrix == [["1", 1, 1], ["2", 2, 1], ["3", 3, 1]] 196 | 197 | @pytest.mark.parametrize( 198 | ["table_text", "table_name", "expected"], 199 | [ 200 | ['"":"invalid"\ta:1', "dummy", InvalidHeaderNameError], 201 | ["", "dummy", DataError], 202 | ["a!:1\tb:2", "dummy", InvalidHeaderNameError], 203 | ["a:1\tb$c:2", "dummy", InvalidHeaderNameError], 204 | ], 205 | ) 206 | def test_exception_insufficient_data(self, table_text, table_name, expected): 207 | loader = ptr.LtsvTableTextLoader(table_text) 208 | loader.table_name = table_name 209 | 210 | with pytest.raises(expected): 211 | for _tabledata in loader.load(): 212 | print(_tabledata) 213 | 214 | @pytest.mark.parametrize(["table_name", "expected"], [["", ValueError], [None, ValueError]]) 215 | def test_null(self, table_name, expected): 216 | loader = ptr.LtsvTableTextLoader("dummy") 217 | loader.table_name = table_name 218 | 219 | with pytest.raises(expected): 220 | for _tabletuple in loader.load(): 221 | pass 222 | -------------------------------------------------------------------------------- /test/test_pandas.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com> 3 | """ 4 | 5 | from decimal import Decimal 6 | 7 | import pytest 8 | import typepy 9 | from tabledata import TableData 10 | 11 | 12 | try: 13 | import pandas 14 | 15 | PANDAS_IMPORT = True 16 | except ImportError: 17 | PANDAS_IMPORT = False 18 | 19 | 20 | @pytest.mark.skipif(not PANDAS_IMPORT, reason="required package not found") 21 | class Test_TableData_as_dataframe: 22 | @pytest.mark.parametrize( 23 | ["table_name", "headers", "rows"], 24 | [ 25 | ["normal", ["a", "b"], [[10, 11], [20, 21]]], 26 | ["normal", None, [[10, 11], [20, 21]]], 27 | ["normal", None, None], 28 | ], 29 | ) 30 | def test_normal(self, table_name, headers, rows): 31 | tabledata = TableData(table_name, headers, rows) 32 | dataframe = pandas.DataFrame(rows) 33 | if typepy.is_not_empty_sequence(headers): 34 | dataframe.columns = headers 35 | 36 | print(f"lhs: {tabledata.as_dataframe()}") 37 | print(f"rhs: {dataframe}") 38 | 39 | assert tabledata.as_dataframe().equals(dataframe) 40 | 41 | 42 | @pytest.mark.skipif(not PANDAS_IMPORT, reason="required package not found") 43 | class Test_TableData_from_dataframe: 44 | def test_normal(self): 45 | dataframe = pandas.DataFrame( 46 | [[0, 0.1, "a"], [1, 1.1, "bb"], [2, 2.2, "ccc"]], columns=["id", "value", "name"] 47 | ) 48 | expected = TableData( 49 | "tablename", 50 | ["id", "value", "name"], 51 | [[0, Decimal("0.1"), "a"], [1, Decimal("1.1"), "bb"], [2, Decimal("2.2"), "ccc"]], 52 | ) 53 | 54 | assert TableData.from_dataframe(dataframe, "tablename").equals(expected) 55 | -------------------------------------------------------------------------------- /test/test_sqlite_reader.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com> 3 | """ 4 | 5 | import collections 6 | from decimal import Decimal 7 | 8 | import pytest 9 | from path import Path 10 | from pytablewriter import dumps_tabledata 11 | from simplesqlite import SimpleSQLite 12 | from tabledata import TableData 13 | 14 | import pytablereader as ptr 15 | from pytablereader.interface import AbstractTableReader 16 | 17 | 18 | Data = collections.namedtuple("Data", "value expected") 19 | 20 | test_data_00 = Data( 21 | TableData( 22 | "tmp", 23 | ["attr_a", "attr_b", "attr_c"], 24 | [[1, 4, "a"], [2, Decimal("2.1"), "bb"], [3, Decimal("120.9"), "ccc"]], 25 | ), 26 | [ 27 | TableData( 28 | "tmp", 29 | ["attr_a", "attr_b", "attr_c"], 30 | [[1, 4, "a"], [2, Decimal("2.1"), "bb"], [3, Decimal("120.9"), "ccc"]], 31 | ) 32 | ], 33 | ) 34 | test_data_01 = Data( 35 | TableData( 36 | "foo_bar", 37 | ["attr_a", "attr_b", "attr_c"], 38 | [["aaaa", "bbbb", "cccc"], [1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]], 39 | ), 40 | [ 41 | TableData( 42 | "foo_bar", 43 | ["attr_a", "attr_b", "attr_c"], 44 | [["aaaa", "bbbb", "cccc"], ["1", "4", "a"], ["2", "2.1", "bb"], ["3", "120.9", "ccc"]], 45 | ) 46 | ], 47 | ) 48 | test_data_02 = Data( 49 | TableData("foo_bar", ["attr_a", "attr_b", "attr_c"], [[3, "120.9", "ccc"]]), 50 | [TableData("foo_bar", ["attr_a", "attr_b", "attr_c"], [[3, "120.9", "ccc"]])], 51 | ) 52 | test_data_03 = Data( 53 | TableData( 54 | "tmp", ["attr_a", "attr_b", "attr_c"], [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]] 55 | ), 56 | [ 57 | TableData( 58 | "tmp", 59 | ["attr_a", "attr_b", "attr_c"], 60 | [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]], 61 | ) 62 | ], 63 | ) 64 | test_data_04 = Data( 65 | TableData( 66 | "tmp", ["attr_a", "attr_b", "attr_c"], [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]] 67 | ), 68 | [ 69 | TableData( 70 | "tmp", 71 | ["attr_a", "attr_b", "attr_c"], 72 | [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]], 73 | ) 74 | ], 75 
| ) 76 | test_data_05 = Data( 77 | TableData( 78 | "tmp", 79 | ["姓", "名", "生年月日", "郵便番号", "住所", "電話番号"], 80 | [ 81 | ["山田", "太郎", "2001/1/1", "100-0002", "東京都千代田区皇居外苑", "03-1234-5678"], 82 | ["山田", "次郎", "2001/1/2", "251-0036", "神奈川県藤沢市江の島1丁目", "03-9999-9999"], 83 | ], 84 | ), 85 | [ 86 | TableData( 87 | "tmp", 88 | ["姓", "名", "生年月日", "郵便番号", "住所", "電話番号"], 89 | [ 90 | ["山田", "太郎", "2001/1/1", "100-0002", "東京都千代田区皇居外苑", "03-1234-5678"], 91 | ["山田", "次郎", "2001/1/2", "251-0036", "神奈川県藤沢市江の島1丁目", "03-9999-9999"], 92 | ], 93 | ) 94 | ], 95 | ) 96 | 97 | 98 | class Test_SqliteFileLoader_load: 99 | def setup_method(self, method): 100 | AbstractTableReader.clear_table_count() 101 | 102 | @pytest.mark.parametrize( 103 | ["test_id", "tabledata", "filename", "headers", "expected"], 104 | [ 105 | [0, test_data_00.value, "tmp.sqlite", [], test_data_00.expected], 106 | [ 107 | 1, 108 | test_data_01.value, 109 | "foo_bar.sqlite", 110 | ["attr_a", "attr_b", "attr_c"], 111 | test_data_01.expected, 112 | ], 113 | [ 114 | 2, 115 | test_data_02.value, 116 | "foo_bar.sqlite", 117 | ["attr_a", "attr_b", "attr_c"], 118 | test_data_02.expected, 119 | ], 120 | [3, test_data_03.value, "tmp.sqlite", [], test_data_03.expected], 121 | [4, test_data_04.value, "tmp.sqlite", [], test_data_04.expected], 122 | [5, test_data_05.value, "tmp.sqlite", [], test_data_05.expected], 123 | ], 124 | ) 125 | def test_normal(self, tmpdir, test_id, tabledata, filename, headers, expected): 126 | file_path = Path(str(tmpdir.join(filename))) 127 | file_path.parent.makedirs_p() 128 | 129 | con = SimpleSQLite(file_path, "w") 130 | 131 | con.create_table_from_tabledata(tabledata) 132 | 133 | loader = ptr.SqliteFileLoader(file_path) 134 | loader.headers = headers 135 | 136 | for tabledata in loader.load(): 137 | print(f"test-id={test_id}") 138 | print(dumps_tabledata(tabledata)) 139 | 140 | assert tabledata.in_tabledata_list(expected) 141 | 142 | @pytest.mark.parametrize( 143 | ["filename", "headers", "expected"], 
144 | [["", [], ptr.InvalidFilePathError], [None, [], ptr.InvalidFilePathError]], 145 | ) 146 | def test_null(self, tmpdir, filename, headers, expected): 147 | loader = ptr.SqliteFileLoader(filename) 148 | loader.headers = headers 149 | 150 | with pytest.raises(expected): 151 | for _tabletuple in loader.load(): 152 | pass 153 | -------------------------------------------------------------------------------- /test/test_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com> 3 | """ 4 | 5 | import platform 6 | 7 | import pytest 8 | 9 | import pytablereader as ptr 10 | from pytablereader._constant import SourceType 11 | from pytablereader._validator import FileValidator, TextValidator, UrlValidator, is_fifo 12 | 13 | 14 | class Test_FileValidator_validate: 15 | @pytest.mark.parametrize(["value"], [["test"]]) 16 | def test_normal(self, tmpdir, value): 17 | p_file_path = tmpdir.join(value) 18 | 19 | with open(str(p_file_path), "w"): 20 | pass 21 | 22 | validator = FileValidator(str(p_file_path)) 23 | assert validator.source_type == SourceType.FILE 24 | validator.validate() 25 | 26 | @pytest.mark.parametrize( 27 | ["value", "expected"], [[None, ptr.InvalidFilePathError], ["", ptr.InvalidFilePathError]] 28 | ) 29 | def test_exception_null(self, value, expected): 30 | validator = FileValidator(value) 31 | 32 | with pytest.raises(expected): 33 | validator.validate() 34 | 35 | @pytest.mark.parametrize(["value", "expected"], [["te\0st", ptr.InvalidFilePathError]]) 36 | def test_exception_invalid_path(self, tmpdir, value, expected): 37 | validator = FileValidator(value) 38 | 39 | with pytest.raises(expected): 40 | validator.validate() 41 | 42 | 43 | class Test_TextValidator_validate: 44 | @pytest.mark.parametrize(["value"], [["test"]]) 45 | def test_normal(self, value): 46 | validator = TextValidator(value) 47 | assert validator.source_type == SourceType.TEXT 48 | 
validator.validate() 49 | 50 | @pytest.mark.parametrize(["value", "expected"], [[None, ptr.DataError], ["", ptr.DataError]]) 51 | def test_exception(self, value, expected): 52 | validator = TextValidator(value) 53 | 54 | with pytest.raises(expected): 55 | validator.validate() 56 | 57 | 58 | class Test_UrlValidator_validate: 59 | @pytest.mark.parametrize(["value"], [["http://www.google.com"], ["https://github.com/"]]) 60 | def test_normal(self, value): 61 | validator = UrlValidator(value) 62 | assert validator.source_type == SourceType.URL 63 | validator.validate() 64 | 65 | @pytest.mark.parametrize( 66 | ["value", "expected"], 67 | [[None, ptr.UrlError], ["", ptr.UrlError], ["www.google.com", ptr.UrlError]], 68 | ) 69 | def test_exception(self, value, expected): 70 | validator = UrlValidator(value) 71 | 72 | with pytest.raises(expected): 73 | validator.validate() 74 | 75 | 76 | class Test_is_fifo: 77 | @pytest.mark.skipif( 78 | platform.system() == "Windows", 79 | reason="platform dependent tests: only failed at GitHub Actions", 80 | ) 81 | def test_filename_too_long(self): 82 | assert not is_fifo("a" * 1000) 83 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py{37,38,39,310,311} 4 | pypy3 5 | build 6 | cov 7 | docs 8 | fmt 9 | lint 10 | readme 11 | 12 | [testenv] 13 | extras = 14 | test 15 | commands = 16 | pytest {posargs} 17 | 18 | [testenv:build] 19 | deps = 20 | build>=0.10 21 | twine 22 | wheel 23 | commands = 24 | python -m build 25 | twine check dist/*.whl dist/*.tar.gz 26 | 27 | [testenv:clean] 28 | skip_install = true 29 | deps = 30 | cleanpy>=0.4 31 | commands = 32 | cleanpy --all --exclude-envs . 
33 | 34 | [testenv:cov] 35 | extras = 36 | test 37 | deps = 38 | coverage[toml]>=5 39 | commands = 40 | coverage run -m pytest {posargs:-vv} 41 | coverage report -m 42 | 43 | [testenv:docs] 44 | deps = 45 | -r{toxinidir}/requirements/docs_requirements.txt 46 | commands = 47 | sphinx-build docs/ docs/_build 48 | 49 | [testenv:fmt] 50 | skip_install = true 51 | deps = 52 | autoflake>=2 53 | black>=23.1 54 | isort>=5 55 | commands = 56 | black setup.py test pytablereader 57 | autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports . 58 | isort . 59 | 60 | [testenv:lint] 61 | skip_install = true 62 | deps = 63 | codespell>=2 64 | #mypy>=1 65 | pylama>=8.4.1 66 | commands = 67 | python setup.py check 68 | #mypy pytablereader setup.py --ignore-missing-imports --show-error-context --show-error-codes --python-version 3.5 69 | codespell pytablereader docs/pages examples test -q 2 --check-filenames --ignore-words-list te --exclude-file "test/data/python - Wiktionary.html" 70 | pylama 71 | 72 | [testenv:readme] 73 | skip_install = true 74 | changedir = docs 75 | deps = 76 | path 77 | readmemaker>=1.1.0 78 | commands = 79 | python make_readme.py 80 | --------------------------------------------------------------------------------