├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── lint_and_test.yml
├── .gitignore
├── .readthedocs.yaml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    ├── make.bat
    ├── make_readme.py
    └── pages
    │   ├── examples
    │       ├── as_dataframe.txt
    │       ├── csv_loader.rst
    │       ├── dataframe.rst
    │       ├── gs_loader.rst
    │       ├── index.rst
    │       ├── load_csv.txt
    │       └── url_loader.rst
    │   ├── genindex.rst
    │   ├── introduction
    │       ├── badges.txt
    │       ├── feature.txt
    │       ├── index.rst
    │       ├── installation.rst
    │       └── summary.txt
    │   ├── links.rst
    │   ├── reference
    │       ├── basic_loader.rst
    │       ├── error.rst
    │       ├── format_specific_loader.rst
    │       ├── index.rst
    │       └── loader_factory.rst
    │   └── sponsors.rst
├── examples
    ├── load_table_from_csv.py
    ├── load_table_from_gs.py
    ├── load_table_from_url.py
    └── pytablereader.ipynb
├── invoke_pytest.py
├── pylama.ini
├── pyproject.toml
├── pytablereader
    ├── __init__.py
    ├── __version__.py
    ├── _acceptor.py
    ├── _common.py
    ├── _constant.py
    ├── _logger
    │   ├── __init__.py
    │   ├── _logger.py
    │   └── _null_logger.py
    ├── _validator.py
    ├── csv
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    ├── error.py
    ├── factory
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── _file.py
    │   ├── _text.py
    │   └── _url.py
    ├── formatter.py
    ├── html
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    ├── interface.py
    ├── json
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    ├── jsonlines
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    ├── loadermanager
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── _file.py
    │   ├── _text.py
    │   └── _url.py
    ├── ltsv
    │   ├── __init__.py
    │   └── core.py
    ├── markdown
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    ├── mediawiki
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    ├── spreadsheet
    │   ├── __init__.py
    │   ├── core.py
    │   ├── excelloader.py
    │   └── gsloader.py
    ├── sqlite
    │   ├── __init__.py
    │   ├── core.py
    │   └── formatter.py
    └── tsv
    │   ├── __init__.py
    │   └── core.py
├── requirements
    ├── docs_requirements.txt
    ├── requirements.txt
    └── test_requirements.txt
├── setup.py
├── test
    ├── __init__.py
    ├── _common.py
    ├── data
    │   ├── python - Wiktionary.html
    │   ├── valid.sqlite3
    │   └── validdata.xlsx
    ├── factory
    │   ├── test_file_loader_factory.py
    │   └── test_text_loader_factory.py
    ├── loader
    │   ├── test_fileloader.py
    │   ├── test_gsloader.py
    │   ├── test_textloader.py
    │   └── test_urlloader.py
    ├── test_common.py
    ├── test_csv_reader.py
    ├── test_excel_reader.py
    ├── test_html_reader.py
    ├── test_html_reader_from_file.py
    ├── test_json_reader.py
    ├── test_jsonlines_reader.py
    ├── test_logger.py
    ├── test_ltsv_reader.py
    ├── test_markdown_reader.py
    ├── test_mediawiki_reader.py
    ├── test_pandas.py
    ├── test_sqlite_reader.py
    ├── test_tsv_reader.py
    └── test_validator.py
└── tox.ini


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: thombashi
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/.github/workflows/lint_and_test.yml:
--------------------------------------------------------------------------------
  1 | name: Lint and Test
  2 | 
  3 | on:
  4 |   push:
  5 |     paths-ignore:
  6 |       - '.gitignore'
  7 |       - '.readthedocs.yaml'
  8 |       - 'README.rst'
  9 |   pull_request:
 10 |     paths-ignore:
 11 |       - '.gitignore'
 12 |       - '.readthedocs.yaml'
 13 |       - 'README.rst'
 14 | 
 15 | env:
 16 |   PYTEST_DISCORD_WEBHOOK: ${{ secrets.PYTEST_DISCORD_WEBHOOK }}
 17 | 
 18 | permissions:
 19 |   contents: read
 20 | 
 21 | jobs:
 22 |   lint:
 23 |     runs-on: ubuntu-latest
 24 |     concurrency:
 25 |       group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref_name }}-lint
 26 |       cancel-in-progress: true
 27 |     timeout-minutes: 20
 28 |     container:
 29 |       image: ghcr.io/thombashi/python-ci:3.11
 30 | 
 31 |     steps:
 32 |       - uses: actions/checkout@v3
 33 | 
 34 |       - name: Lint
 35 |         run: make check
 36 | 
 37 |   unit-test:
 38 |     runs-on: ${{ matrix.os }}
 39 |     concurrency:
 40 |       group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref_name }}-ut-${{ matrix.os }}-${{ matrix.python-version }}
 41 |       cancel-in-progress: true
 42 |     strategy:
 43 |       fail-fast: false
 44 |       matrix:
 45 |         python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', 'pypy-3.8']
 46 |         os: [ubuntu-latest, macos-latest, windows-latest]
 47 |     timeout-minutes: 20
 48 | 
 49 |     steps:
 50 |       - uses: actions/checkout@v3
 51 | 
 52 |       - name: Setup Python ${{ matrix.python-version }}
 53 |         uses: actions/setup-python@v4
 54 |         with:
 55 |           python-version: ${{ matrix.python-version }}
 56 |           cache: pip
 57 |           cache-dependency-path: |
 58 |             setup.py
 59 |             **/*requirements.txt
 60 |             tox.ini
 61 | 
 62 |       - name: Install pip
 63 |         run: python -m pip install --upgrade --disable-pip-version-check "pip>=21.1"
 64 | 
 65 |       - name: Install dependencies
 66 |         run: make setup-ci
 67 | 
 68 |       - name: Run tests
 69 |         run: tox -e py
 70 | 
 71 |   coverage:
 72 |     runs-on: ubuntu-latest
 73 |     concurrency:
 74 |       group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref_name }}-coverage
 75 |       cancel-in-progress: true
 76 |     timeout-minutes: 20
 77 | 
 78 |     steps:
 79 |       - uses: actions/checkout@v3
 80 | 
 81 |       - name: Setup Python
 82 |         uses: actions/setup-python@v4
 83 |         with:
 84 |           python-version: '3.10'
 85 |           cache: pip
 86 |           cache-dependency-path: |
 87 |             setup.py
 88 |             **/*requirements.txt
 89 |             tox.ini
 90 | 
 91 |       - name: Install dependencies
 92 |         run: make setup-ci
 93 | 
 94 |       - name: Run tests
 95 |         run: tox -e cov
 96 | 
 97 |       - name: Upload coverage report
 98 |         env:
 99 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
100 |         run: |
101 |           python -m pip install --upgrade --disable-pip-version-check coveralls tomli
102 |           coveralls --service=github
103 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | 
 72 | # PyBuilder
 73 | target/
 74 | 
 75 | # Jupyter Notebook
 76 | .ipynb_checkpoints
 77 | 
 78 | # IPython
 79 | profile_default/
 80 | ipython_config.py
 81 | 
 82 | # pyenv
 83 | .python-version
 84 | 
 85 | # celery beat schedule file
 86 | celerybeat-schedule
 87 | 
 88 | # SageMath parsed files
 89 | *.sage.py
 90 | 
 91 | # Environments
 92 | .env
 93 | .venv
 94 | env/
 95 | venv/
 96 | ENV/
 97 | env.bak/
 98 | venv.bak/
 99 | 
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 | 
104 | # Rope project settings
105 | .ropeproject
106 | 
107 | # mkdocs documentation
108 | /site
109 | 
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 | 
115 | # Pyre type checker
116 | .pyre/
117 | 
118 | # User settings
119 | _sandbox/
120 | *_profile
121 | Untitled.ipynb
122 | 
123 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: ubuntu-22.04
 5 |   tools:
 6 |     python: "3.11"
 7 | 
 8 | sphinx:
 9 |   configuration: docs/conf.py
10 | 
11 | formats:
12 |   - pdf
13 |   - epub
14 | 
15 | python:
16 |   install:
17 |     - requirements: requirements/docs_requirements.txt
18 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Tsuyoshi Hombashi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include docs/pages/introduction/summary.txt
 2 | include LICENSE
 3 | include README.rst
 4 | include setup.cfg
 5 | include tox.ini
 6 | 
 7 | recursive-include test *
 8 | recursive-include requirements *
 9 | 
10 | global-exclude __pycache__/*
11 | global-exclude *.pyc
12 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | AUTHOR := thombashi
 2 | PACKAGE := pytablereader
 3 | BUILD_WORK_DIR := _work
 4 | DOCS_DIR := docs
 5 | PKG_BUILD_DIR := $(BUILD_WORK_DIR)/$(PACKAGE)
 6 | PYTHON := python3
 7 | 
 8 | 
 9 | .PHONY: build-remote
10 | build-remote: clean
11 | 	@mkdir -p $(BUILD_WORK_DIR)
12 | 	@cd $(BUILD_WORK_DIR) && \
13 | 		git clone https://github.com/$(AUTHOR)/$(PACKAGE).git --depth 1 && \
14 | 		cd $(PACKAGE) && \
15 | 		$(PYTHON) -m tox -e build
16 | 	ls -lh $(PKG_BUILD_DIR)/dist/*
17 | 
18 | .PHONY: build
19 | build: clean
20 | 	@$(PYTHON) -m tox -e build
21 | 	ls -lh dist/*
22 | 
23 | .PHONY: check
24 | check:
25 | 	@$(PYTHON) -m tox -e lint
26 | 
27 | .PHONY: clean
28 | clean:
29 | 	@rm -rf $(BUILD_WORK_DIR)
30 | 	@$(PYTHON) -m tox -e clean
31 | 
32 | .PHONY: docs
33 | docs:
34 | 	@$(PYTHON) -m tox -e docs
35 | 
36 | .PHONY: fmt
37 | fmt:
38 | 	@$(PYTHON) -m tox -e fmt
39 | 
40 | .PHONY: readme
41 | readme:
42 | 	@$(PYTHON) -m tox -e readme
43 | 
44 | .PHONY: release
45 | release:
46 | 	cd $(PKG_BUILD_DIR) && $(PYTHON) setup.py release --verbose
47 | 	$(MAKE) clean
48 | 
49 | .PHONY: setup-ci
50 | setup-ci:
51 | 	@$(PYTHON) -m pip install --disable-pip-version-check --upgrade releasecmd tox
52 | 
53 | .PHONY: setup
54 | setup: setup-ci
55 | 	@$(PYTHON) -m pip install -q --disable-pip-version-check --upgrade -e .[test]
56 | 	@$(PYTHON) -m pip check
57 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | .. contents:: **pytablereader**
  2 |    :backlinks: top
  3 |    :depth: 2
  4 | 
  5 | Summary
  6 | =========
  7 | `pytablereader <https://github.com/thombashi/pytablereader>`__ is a Python library to load structured table data from files/strings/URLs in various data formats: CSV / Excel / Google-Sheets / HTML / JSON / LDJSON / LTSV / Markdown / SQLite / TSV.
  8 | 
  9 | .. image:: https://badge.fury.io/py/pytablereader.svg
 10 |     :target: https://badge.fury.io/py/pytablereader
 11 |     :alt: PyPI package version
 12 | 
 13 | .. image:: https://img.shields.io/pypi/pyversions/pytablereader.svg
 14 |     :target: https://pypi.org/project/pytablereader
 15 |     :alt: Supported Python versions
 16 | 
 17 | .. image:: https://img.shields.io/pypi/implementation/pytablereader.svg
 18 |     :target: https://pypi.org/project/pytablereader
 19 |     :alt: Supported Python implementations
 20 | 
 21 | .. image:: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml/badge.svg
 22 |     :target: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml
 23 |     :alt: CI status of Linux/macOS/Windows
 24 | 
 25 | .. image:: https://coveralls.io/repos/github/thombashi/pytablereader/badge.svg?branch=master
 26 |     :target: https://coveralls.io/github/thombashi/pytablereader?branch=master
 27 |     :alt: Test coverage
 28 | 
 29 | .. image:: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql/badge.svg
 30 |     :target: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql
 31 |     :alt: CodeQL
 32 | 
 33 | Features
 34 | --------
 35 | - Extract structured tabular data from various data formats:
 36 |     - CSV / Tab separated values (TSV) / Space separated values (SSV)
 37 |     - Microsoft Excel :superscript:`TM` file
 38 |     - `Google Sheets <https://www.google.com/intl/en_us/sheets/about/>`__
 39 |     - HTML (``table`` tags)
 40 |     - JSON
 41 |     - `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
 42 |     - `Line-delimited JSON(LDJSON) <https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON>`__ / NDJSON / JSON Lines
 43 |     - Markdown
 44 |     - MediaWiki
 45 |     - SQLite database file
 46 | - Supported data sources are:
 47 |     - Files on a local file system
 48 |     - Accessible URLs
 49 |     - ``str`` instances
 50 | - Loaded table data can be used as:
 51 |     - `pandas.DataFrame <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`__ instance
 52 |     - ``dict`` instance
 53 | 
 54 | Examples
 55 | ==========
 56 | Load a CSV table
 57 | ------------------
 58 | :Sample Code:
 59 |     .. code-block:: python
 60 | 
 61 |         import pytablereader as ptr
 62 |         import pytablewriter as ptw
 63 | 
 64 | 
 65 |         # prepare data ---
 66 |         file_path = "sample_data.csv"
 67 |         csv_text = "\n".join([
 68 |             '"attr_a","attr_b","attr_c"',
 69 |             '1,4,"a"',
 70 |             '2,2.1,"bb"',
 71 |             '3,120.9,"ccc"',
 72 |         ])
 73 | 
 74 |         with open(file_path, "w") as f:
 75 |             f.write(csv_text)
 76 | 
 77 |         # load from a csv file ---
 78 |         loader = ptr.CsvTableFileLoader(file_path)
 79 |         for table_data in loader.load():
 80 |             print("\n".join([
 81 |                 "load from file",
 82 |                 "==============",
 83 |                 "{:s}".format(ptw.dumps_tabledata(table_data)),
 84 |             ]))
 85 | 
 86 |         # load from a csv text ---
 87 |         loader = ptr.CsvTableTextLoader(csv_text)
 88 |         for table_data in loader.load():
 89 |             print("\n".join([
 90 |                 "load from text",
 91 |                 "==============",
 92 |                 "{:s}".format(ptw.dumps_tabledata(table_data)),
 93 |             ]))
 94 | 
 95 | 
 96 | :Output:
 97 |     .. code-block::
 98 | 
 99 |         load from file
100 |         ==============
101 |         .. table:: sample_data
102 | 
103 |             ======  ======  ======
104 |             attr_a  attr_b  attr_c
105 |             ======  ======  ======
106 |                  1     4.0  a
107 |                  2     2.1  bb
108 |                  3   120.9  ccc
109 |             ======  ======  ======
110 | 
111 |         load from text
112 |         ==============
113 |         .. table:: csv2
114 | 
115 |             ======  ======  ======
116 |             attr_a  attr_b  attr_c
117 |             ======  ======  ======
118 |                  1     4.0  a
119 |                  2     2.1  bb
120 |                  3   120.9  ccc
121 |             ======  ======  ======
122 | 
123 | Get loaded table data as pandas.DataFrame instance
124 | ----------------------------------------------------
125 | 
126 | :Sample Code:
127 |     .. code-block:: python
128 | 
129 |         import pytablereader as ptr
130 | 
131 |         loader = ptr.CsvTableTextLoader(
132 |             "\n".join([
133 |                 "a,b",
134 |                 "1,2",
135 |                 "3.3,4.4",
136 |             ]))
137 |         for table_data in loader.load():
138 |             print(table_data.as_dataframe())
139 | 
140 | :Output:
141 |     .. code-block::
142 | 
143 |              a    b
144 |         0    1    2
145 |         1  3.3  4.4
146 | 
147 | For more information
148 | ----------------------
149 | More examples are available at 
150 | https://pytablereader.rtfd.io/en/latest/pages/examples/index.html
151 | 
152 | Installation
153 | ============
154 | 
155 | Install from PyPI
156 | ------------------------------
157 | ::
158 | 
159 |     pip install pytablereader
160 | 
161 | Some of the formats require additional dependency packages, which you can install as follows:
162 | 
163 | - Excel
164 |     - ``pip install pytablereader[excel]``
165 | - Google Sheets
166 |     - ``pip install pytablereader[gs]``
167 | - Markdown
168 |     - ``pip install pytablereader[md]``
169 | - Mediawiki
170 |     - ``pip install pytablereader[mediawiki]``
171 | - SQLite
172 |     - ``pip install pytablereader[sqlite]``
173 | - Load from URLs
174 |     - ``pip install pytablereader[url]``
175 | - All of the extra dependencies
176 |     - ``pip install pytablereader[all]``
177 | 
178 | Install from PPA (for Ubuntu)
179 | ------------------------------
180 | ::
181 | 
182 |     sudo add-apt-repository ppa:thombashi/ppa
183 |     sudo apt update
184 |     sudo apt install python3-pytablereader
185 | 
186 | 
187 | Dependencies
188 | ============
189 | - Python 3.7+
190 | - `Python package dependencies (automatically installed) <https://github.com/thombashi/pytablereader/network/dependencies>`__
191 | 
192 | 
193 | Optional Python packages
194 | ------------------------------------------------
195 | - ``logging`` extras
196 |     - `loguru <https://github.com/Delgan/loguru>`__: Used for logging if the package is installed
197 | - ``excel`` extras
198 |     - `excelrd <https://github.com/thombashi/excelrd>`__
199 | - ``md`` extras
200 |     - `Markdown <https://github.com/Python-Markdown/markdown>`__
201 | - ``mediawiki`` extras
202 |     - `pypandoc <https://github.com/bebraw/pypandoc>`__
203 | - ``sqlite`` extras
204 |     - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`__
205 | - ``url`` extras
206 |     - `retryrequests <https://github.com/thombashi/retryrequests>`__
207 | - `pandas <https://pandas.pydata.org/>`__
208 |     - required to get table data as a pandas data frame
209 | - `lxml <https://lxml.de/installation.html>`__
210 | 
211 | Optional packages (other than Python packages)
212 | ------------------------------------------------
213 | - ``libxml2`` (faster HTML conversion)
214 | - `pandoc <https://pandoc.org/>`__ (required when loading MediaWiki file)
215 | 
216 | Documentation
217 | ===============
218 | https://pytablereader.rtfd.io/
219 | 
220 | Related Project
221 | =================
222 | - `pytablewriter <https://github.com/thombashi/pytablewriter>`__
223 |     - Tabular data loaded by ``pytablereader`` can be written in another tabular data format with ``pytablewriter``.
224 | 
225 | Sponsors
226 | ====================================
227 | .. image:: https://avatars.githubusercontent.com/u/44389260?s=48&u=6da7176e51ae2654bcfd22564772ef8a3bb22318&v=4
228 |    :target: https://github.com/chasbecker
229 |    :alt: Charles Becker (chasbecker)
230 | .. image:: https://avatars.githubusercontent.com/u/46711571?s=48&u=57687c0e02d5d6e8eeaf9177f7b7af4c9f275eb5&v=4
231 |    :target: https://github.com/Arturi0
232 |    :alt: onetime: Arturi0
233 | .. image:: https://avatars.githubusercontent.com/u/3658062?s=48&v=4
234 |    :target: https://github.com/b4tman
235 |    :alt: onetime: Dmitry Belyaev (b4tman)
236 | 
237 | `Become a sponsor <https://github.com/sponsors/thombashi>`__
238 | 
239 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help
 18 | help:
 19 | 	@echo "Please use \`make <target>' where <target> is one of"
 20 | 	@echo "  html       to make standalone HTML files"
 21 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 22 | 	@echo "  singlehtml to make a single large HTML file"
 23 | 	@echo "  pickle     to make pickle files"
 24 | 	@echo "  json       to make JSON files"
 25 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 26 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 27 | 	@echo "  applehelp  to make an Apple Help Book"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  epub3      to make an epub3"
 31 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 32 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 33 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 34 | 	@echo "  text       to make text files"
 35 | 	@echo "  man        to make manual pages"
 36 | 	@echo "  texinfo    to make Texinfo files"
 37 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 38 | 	@echo "  gettext    to make PO message catalogs"
 39 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 40 | 	@echo "  xml        to make Docutils-native XML files"
 41 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 42 | 	@echo "  linkcheck  to check all external links for integrity"
 43 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 44 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 45 | 	@echo "  dummy      to check syntax errors of document sources"
 46 | 
 47 | .PHONY: clean
 48 | clean:
 49 | 	rm -rf $(BUILDDIR)/*
 50 | 
 51 | .PHONY: html
 52 | html:
 53 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 54 | 	@echo
 55 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 56 | 
 57 | .PHONY: dirhtml
 58 | dirhtml:
 59 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 60 | 	@echo
 61 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 62 | 
 63 | .PHONY: singlehtml
 64 | singlehtml:
 65 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 66 | 	@echo
 67 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 68 | 
 69 | .PHONY: pickle
 70 | pickle:
 71 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 72 | 	@echo
 73 | 	@echo "Build finished; now you can process the pickle files."
 74 | 
 75 | .PHONY: json
 76 | json:
 77 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 78 | 	@echo
 79 | 	@echo "Build finished; now you can process the JSON files."
 80 | 
 81 | .PHONY: htmlhelp
 82 | htmlhelp:
 83 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 84 | 	@echo
 85 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 86 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 87 | 
 88 | .PHONY: qthelp
 89 | qthelp:  # build the qthelp project, then print follow-up instructions
 90 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 91 | 	@echo
 92 | 	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
 93 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 94 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pytablereader.qhcp"
 95 | 	@echo "To view the help file:"
 96 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pytablereader.qhc"
 97 | 
 98 | .PHONY: applehelp
 99 | applehelp:
100 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
101 | 	@echo
102 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
103 | 	@echo "N.B. You won't be able to view it unless you put it in" \
104 | 	      "~/Library/Documentation/Help or install it in your application" \
105 | 	      "bundle."
106 | 
107 | .PHONY: devhelp
108 | devhelp:
109 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
110 | 	@echo
111 | 	@echo "Build finished."
112 | 	@echo "To view the help file:"
113 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/pytablereader"
114 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pytablereader"
115 | 	@echo "# devhelp"
116 | 
117 | .PHONY: epub
118 | epub:
119 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
120 | 	@echo
121 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
122 | 
123 | .PHONY: epub3
124 | epub3:
125 | 	$(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
126 | 	@echo
127 | 	@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
128 | 
129 | .PHONY: latex
130 | latex:
131 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
132 | 	@echo
133 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
134 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
135 | 	      "(use \`make latexpdf' here to do that automatically)."
136 | 
137 | .PHONY: latexpdf
138 | latexpdf:
139 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
140 | 	@echo "Running LaTeX files through pdflatex..."
141 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
142 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
143 | 
144 | .PHONY: latexpdfja
145 | latexpdfja:
146 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
147 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
148 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
149 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
150 | 
151 | .PHONY: text
152 | text:
153 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
154 | 	@echo
155 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
156 | 
157 | .PHONY: man
158 | man:
159 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
160 | 	@echo
161 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
162 | 
163 | .PHONY: texinfo
164 | texinfo:
165 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
166 | 	@echo
167 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
168 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
169 | 	      "(use \`make info' here to do that automatically)."
170 | 
171 | .PHONY: info
172 | info:  # build Texinfo sources, then run the generated Makefile's info target
173 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
174 | 	@echo "Running Texinfo files through makeinfo..."
175 | 	$(MAKE) -C $(BUILDDIR)/texinfo info
176 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
177 | 
178 | .PHONY: gettext
179 | gettext:
180 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
181 | 	@echo
182 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
183 | 
184 | .PHONY: changes
185 | changes:
186 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
187 | 	@echo
188 | 	@echo "The overview file is in $(BUILDDIR)/changes."
189 | 
190 | .PHONY: linkcheck
191 | linkcheck:
192 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
193 | 	@echo
194 | 	@echo "Link check complete; look for any errors in the above output " \
195 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
196 | 
197 | .PHONY: doctest
198 | doctest:
199 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
200 | 	@echo "Testing of doctests in the sources finished, look at the " \
201 | 	      "results in $(BUILDDIR)/doctest/output.txt."
202 | 
203 | .PHONY: coverage
204 | coverage:
205 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
206 | 	@echo "Testing of coverage in the sources finished, look at the " \
207 | 	      "results in $(BUILDDIR)/coverage/python.txt."
208 | 
209 | .PHONY: xml
210 | xml:
211 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
212 | 	@echo
213 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
214 | 
215 | .PHONY: pseudoxml
216 | pseudoxml:
217 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
218 | 	@echo
219 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
220 | 
221 | .PHONY: dummy
222 | dummy:
223 | 	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
224 | 	@echo
225 | 	@echo "Build finished. Dummy builder generates no files."
226 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to pytablereader's documentation!
 2 | ==========================================
 3 | 
 4 | .. raw:: html
 5 | 
 6 |     <div class='ghscard' src='//raw.githubusercontent.com/thombashi/thombashi.github.io/master/data/thombashi_pytablereader.json'></div>
 7 |     <script src='//cdn.jsdelivr.net/gh/thombashi/ghscard@master/dist/ghscard.min.js'></script>
 8 |     <br />
 9 |     <br />
10 | 
11 | .. toctree::
12 |    :caption: Table of Contents
13 |    :maxdepth: 4
14 |    :numbered:
15 |    
16 |    pages/introduction/index
17 |    pages/examples/index
18 |    pages/reference/index
19 |    pages/links
20 | 
21 | 
22 | Indices and tables
23 | ==================
24 | 
25 | * :ref:`genindex`
26 | 


--------------------------------------------------------------------------------
/docs/make_readme.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """
 4 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 5 | """
 6 | 
 7 | import sys
 8 | 
 9 | from path import Path
10 | from readmemaker import ReadmeMaker
11 | 
12 | 
13 | PROJECT_NAME = "pytablereader"  # also interpolated into the project URLs below
14 | OUTPUT_DIR = ".."  # second argument to ReadmeMaker; presumably where the README is written
15 | 
16 | 
17 | def write_examples(maker):
18 |     maker.set_indent_level(0)
19 |     maker.write_chapter("Examples")
20 | 
21 |     examples_root = Path("pages").joinpath("examples")
22 |     maker.inc_indent_level()
23 | 
24 |     maker.write_chapter("Load a CSV table")
25 |     maker.write_file(examples_root.joinpath("load_csv.txt"))
26 | 
27 |     maker.write_chapter("Get loaded table data as pandas.DataFrame instance")
28 |     maker.write_file(examples_root.joinpath("as_dataframe.txt"))
29 | 
30 |     maker.write_chapter("For more information")
31 |     maker.write_lines(
32 |         [
33 |             "More examples are available at ",
34 |             f"https://{PROJECT_NAME:s}.rtfd.io/en/latest/pages/examples/index.html",
35 |         ]
36 |     )
37 | 
38 | 
39 | def main():
40 |     maker = ReadmeMaker(
41 |         PROJECT_NAME,
42 |         OUTPUT_DIR,
43 |         is_make_toc=True,
44 |         project_url=f"https://github.com/thombashi/{PROJECT_NAME}",
45 |     )
46 | 
47 |     maker.write_chapter("Summary")
48 |     maker.write_introduction_file("summary.txt")
49 |     maker.write_introduction_file("badges.txt")
50 |     maker.write_introduction_file("feature.txt")
51 | 
52 |     write_examples(maker)
53 | 
54 |     maker.write_introduction_file("installation.rst")
55 | 
56 |     maker.set_indent_level(0)
57 |     maker.write_chapter("Documentation")
58 |     maker.write_lines([f"https://{PROJECT_NAME:s}.rtfd.io/"])
59 | 
60 |     maker.write_chapter("Related Project")
61 |     maker.write_lines(
62 |         [
63 |             "- `pytablewriter <https://github.com/thombashi/pytablewriter>`__",
64 |             "    - Tabular data loaded by ``pytablereader`` can be written "
65 |             "another tabular data format with ``pytablewriter``.",
66 |         ]
67 |     )
68 | 
69 |     maker.write_file(maker.doc_page_root_dir_path.joinpath("sponsors.rst"))
70 | 
71 |     return 0
72 | 
73 | 
74 | if __name__ == "__main__":
75 |     sys.exit(main())  # main()'s return value becomes the process exit status
76 | 


--------------------------------------------------------------------------------
/docs/pages/examples/as_dataframe.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | :Sample Code:
 3 |     .. code-block:: python
 4 |         :caption: Convert from loaded tabledata.TableData to pandas.DataFrame
 5 | 
 6 |         import pytablereader as ptr
 7 |         
 8 |         loader = ptr.CsvTableTextLoader(
 9 |             "\n".join([
10 |                 "a,b",
11 |                 "1,2",
12 |                 "3.3,4.4",
13 |             ]))
14 |         for table_data in loader.load():
15 |             print(table_data.as_dataframe())
16 | 
17 | :Output:
18 |     .. code-block:: none
19 | 
20 |              a    b
21 |         0    1    2
22 |         1  3.3  4.4
23 | 


--------------------------------------------------------------------------------
/docs/pages/examples/csv_loader.rst:
--------------------------------------------------------------------------------
 1 | .. _example-csv-table-loader:
 2 | 
 3 | Load table data from CSV
 4 | ----------------------------
 5 | 
 6 | The following example shows how to extract |TableData| from CSV data by using the |CsvTableFileLoader| and |CsvTableTextLoader| classes.
 7 | 
 8 | .. include:: load_csv.txt
 9 | 
10 | 


--------------------------------------------------------------------------------
/docs/pages/examples/dataframe.rst:
--------------------------------------------------------------------------------
1 | .. _example-as-dataframe:
2 | 
3 | Get loaded table data as pandas.DataFrame
4 | --------------------------------------------------------
5 | A |TableData| instance can be converted to a ``pandas.DataFrame`` instance 
6 | by :py:meth:`~tabledata.TableData.as_dataframe`.
7 | 
8 | .. include:: as_dataframe.txt
9 | 


--------------------------------------------------------------------------------
/docs/pages/examples/gs_loader.rst:
--------------------------------------------------------------------------------
 1 | .. _example-gs-table-loader:
 2 | 
 3 | Load table data from Google Sheets
 4 | -------------------------------------
 5 | The following example shows how to extract |TableData| from Google Sheets by using the |GoogleSheetsTableLoader| class.
 6 | 
 7 | .. code-block:: python
 8 |     :caption: Load table data from Google Sheets
 9 | 
10 |     import pytablereader as ptr
11 |     import pytablewriter as ptw
12 | 
13 | 
14 |     credentials_file = "sample-xxxxxxxxxxxx.json"
15 | 
16 |     loader = ptr.GoogleSheetsTableLoader(credentials_file)
17 |     loader.title = "testbook"
18 | 
19 |     for table_data in loader.load():
20 |         print(ptw.dumps_tabledata(table_data))
21 | 
22 | 


--------------------------------------------------------------------------------
/docs/pages/examples/index.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | ========
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 3
 6 | 
 7 |    csv_loader
 8 |    url_loader
 9 |    gs_loader
10 |    dataframe
11 | 


--------------------------------------------------------------------------------
/docs/pages/examples/load_csv.txt:
--------------------------------------------------------------------------------
 1 | :Sample Code:
 2 |     .. code-block:: python
 3 |         :caption: Load table from CSV
 4 |         
 5 |         import pytablereader as ptr
 6 |         import pytablewriter as ptw
 7 |         
 8 |         
 9 |         # prepare data ---
10 |         file_path = "sample_data.csv"
11 |         csv_text = "\n".join([
12 |             '"attr_a","attr_b","attr_c"',
13 |             '1,4,"a"',
14 |             '2,2.1,"bb"',
15 |             '3,120.9,"ccc"',
16 |         ])
17 |         
18 |         with open(file_path, "w") as f:
19 |             f.write(csv_text)
20 |         
21 |         # load from a csv file ---
22 |         loader = ptr.CsvTableFileLoader(file_path)
23 |         for table_data in loader.load():
24 |             print("\n".join([
25 |                 "load from file",
26 |                 "==============",
27 |                 "{:s}".format(ptw.dumps_tabledata(table_data)),
28 |             ]))
29 |         
30 |         # load from a csv text ---
31 |         loader = ptr.CsvTableTextLoader(csv_text)
32 |         for table_data in loader.load():
33 |             print("\n".join([
34 |                 "load from text",
35 |                 "==============",
36 |                 "{:s}".format(ptw.dumps_tabledata(table_data)),
37 |             ]))
38 | 
39 | 
40 | :Output:
41 |     .. code-block:: none
42 | 
43 |         load from file
44 |         ==============
45 |         .. table:: sample_data
46 | 
47 |             ======  ======  ======
48 |             attr_a  attr_b  attr_c
49 |             ======  ======  ======
50 |                  1     4.0  a
51 |                  2     2.1  bb
52 |                  3   120.9  ccc
53 |             ======  ======  ======
54 | 
55 |         load from text
56 |         ==============
57 |         .. table:: csv2
58 | 
59 |             ======  ======  ======
60 |             attr_a  attr_b  attr_c
61 |             ======  ======  ======
62 |                  1     4.0  a
63 |                  2     2.1  bb
64 |                  3   120.9  ccc
65 |             ======  ======  ======
66 | 


--------------------------------------------------------------------------------
/docs/pages/examples/url_loader.rst:
--------------------------------------------------------------------------------
 1 | .. _example-url-table-loader:
 2 | 
 3 | Load table data from a web page
 4 | -------------------------------------
 5 | The following example shows how to extract |TableData| from a web page by using the |TableUrlLoader| class.
 6 | 
 7 | :Sample Code:
 8 |     .. code-block:: python
 9 |         :caption: Load table from a web page
10 | 
11 |         import io
12 | 
13 |         import pytablereader as ptr
14 |         import pytablewriter as ptw
15 | 
16 | 
17 |         loader = ptr.TableUrlLoader(
18 |             "https://en.wikipedia.org/wiki/List_of_unit_testing_frameworks",
19 |             "html")
20 | 
21 |         writer = ptw.TableWriterFactory.create_from_format_name("rst")
22 |         writer.stream = io.open("load_url_result.rst", "w", encoding=loader.encoding)
23 |         for table_data in loader.load():
24 |             writer.from_tabledata(table_data)
25 |             writer.write_table()
26 | 
27 | :Output:
28 |     .. code-block:: console
29 | 
30 |         $ ./load_table_from_url.py
31 |         $ head load_url_result.rst -n 8
32 |         .. table:: List of unit testing frameworks - Wikipedia_html1
33 | 
34 |             +---------+-----+------+------------------------+
35 |             |  Name   |xUnit|Source|        Remarks         |
36 |             +=========+=====+======+========================+
37 |             |ABAP Unit|Yes  |[1]   |since SAP NetWeaver 2004|
38 |             +---------+-----+------+------------------------+
39 | 


--------------------------------------------------------------------------------
/docs/pages/genindex.rst:
--------------------------------------------------------------------------------
1 | Indices and tables
2 | ==================
3 | 
4 | * :ref:`genindex`


--------------------------------------------------------------------------------
/docs/pages/introduction/badges.txt:
--------------------------------------------------------------------------------
 1 | .. image:: https://badge.fury.io/py/pytablereader.svg
 2 |     :target: https://badge.fury.io/py/pytablereader
 3 |     :alt: PyPI package version
 4 | 
 5 | .. image:: https://img.shields.io/pypi/pyversions/pytablereader.svg
 6 |     :target: https://pypi.org/project/pytablereader
 7 |     :alt: Supported Python versions
 8 | 
 9 | .. image:: https://img.shields.io/pypi/implementation/pytablereader.svg
10 |     :target: https://pypi.org/project/pytablereader
11 |     :alt: Supported Python implementations
12 | 
13 | .. image:: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml/badge.svg
14 |     :target: https://github.com/thombashi/pytablereader/actions/workflows/lint_and_test.yml
15 |     :alt: CI status of Linux/macOS/Windows
16 | 
17 | .. image:: https://coveralls.io/repos/github/thombashi/pytablereader/badge.svg?branch=master
18 |     :target: https://coveralls.io/github/thombashi/pytablereader?branch=master
19 |     :alt: Test coverage
20 | 
21 | .. image:: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql/badge.svg
22 |     :target: https://github.com/thombashi/pytablereader/actions/workflows/github-code-scanning/codeql
23 |     :alt: CodeQL
24 | 


--------------------------------------------------------------------------------
/docs/pages/introduction/feature.txt:
--------------------------------------------------------------------------------
 1 | Features
 2 | --------
 3 | - Extract structured tabular data from various data formats:
 4 |     - CSV / Tab separated values (TSV) / Space separated values (SSV)
 5 |     - Microsoft Excel :superscript:`TM` file
 6 |     - `Google Sheets <https://www.google.com/intl/en_us/sheets/about/>`_
 7 |     - HTML (``table`` tags)
 8 |     - JSON
 9 |     - `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
10 |     - `Line-delimited JSON(LDJSON) <https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON>`__ / NDJSON / JSON Lines
11 |     - Markdown
12 |     - MediaWiki
13 |     - SQLite database file
14 | - Supported data sources are:
15 |     - Files on a local file system
16 |     - Accessible URLs
17 |     - ``str`` instances
18 | - Loaded table data can be used as:
19 |     - `pandas.DataFrame <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`__ instance
20 |     - ``dict`` instance
21 | 


--------------------------------------------------------------------------------
/docs/pages/introduction/index.rst:
--------------------------------------------------------------------------------
 1 | pytablereader
 2 | ===============
 3 | 
 4 | .. include:: badges.txt
 5 | 
 6 | 
 7 | Summary
 8 | -------
 9 | 
10 | .. include:: summary.txt
11 | 
12 | .. raw:: html
13 | 
14 |     <div class='ghscard' src='//raw.githubusercontent.com/thombashi/thombashi.github.io/master/data/thombashi_pytablereader.json'></div>
15 |     <script src='//cdn.jsdelivr.net/gh/thombashi/ghscard@master/dist/ghscard.min.js'></script>
16 |     <br />
17 |     <br />
18 | 
19 | 
20 | .. include:: feature.txt
21 | 
22 | 
23 | .. include:: installation.rst
24 | 


--------------------------------------------------------------------------------
/docs/pages/introduction/installation.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ============
 3 | 
 4 | Install from PyPI
 5 | ------------------------------
 6 | ::
 7 | 
 8 |     pip install pytablereader
 9 | 
10 | Some of the formats require additional dependency packages. You can install them as follows:
11 | 
12 | - Excel
13 |     - ``pip install pytablereader[excel]``
14 | - Google Sheets
15 |     - ``pip install pytablereader[gs]``
16 | - Markdown
17 |     - ``pip install pytablereader[md]``
18 | - Mediawiki
19 |     - ``pip install pytablereader[mediawiki]``
20 | - SQLite
21 |     - ``pip install pytablereader[sqlite]``
22 | - Load from URLs
23 |     - ``pip install pytablereader[url]``
24 | - All of the extra dependencies
25 |     - ``pip install pytablereader[all]``
26 | 
27 | Install from PPA (for Ubuntu)
28 | ------------------------------
29 | ::
30 | 
31 |     sudo add-apt-repository ppa:thombashi/ppa
32 |     sudo apt update
33 |     sudo apt install python3-pytablereader
34 | 
35 | 
36 | Dependencies
37 | ============
38 | - Python 3.7+
39 | - `Python package dependencies (automatically installed) <https://github.com/thombashi/pytablereader/network/dependencies>`__
40 | 
41 | 
42 | Optional Python packages
43 | ------------------------------------------------
44 | - ``logging`` extras
45 |     - `loguru <https://github.com/Delgan/loguru>`__: Used for logging if the package is installed
46 | - ``excel`` extras
47 |     - `excelrd <https://github.com/thombashi/excelrd>`__
48 | - ``md`` extras
49 |     - `Markdown <https://github.com/Python-Markdown/markdown>`__
50 | - ``mediawiki`` extras
51 |     - `pypandoc <https://github.com/bebraw/pypandoc>`__
52 | - ``sqlite`` extras
53 |     - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`__
54 | - ``url`` extras
55 |     - `retryrequests <https://github.com/thombashi/retryrequests>`__
56 | - `pandas <https://pandas.pydata.org/>`__
57 |     - required to get table data as a pandas data frame
58 | - `lxml <https://lxml.de/installation.html>`__
59 | 
60 | Optional packages (other than Python packages)
61 | ------------------------------------------------
62 | - ``libxml2`` (faster HTML conversion)
63 | - `pandoc <https://pandoc.org/>`__ (required when loading MediaWiki files)
64 | 


--------------------------------------------------------------------------------
/docs/pages/introduction/summary.txt:
--------------------------------------------------------------------------------
1 | pytablereader is a Python library to load structured table data from files/strings/URLs in various data formats: CSV / Excel / Google-Sheets / HTML / JSON / LDJSON / LTSV / Markdown / SQLite / TSV.
2 | 


--------------------------------------------------------------------------------
/docs/pages/links.rst:
--------------------------------------------------------------------------------
 1 | Changelog
 2 | ==========
 3 | https://github.com/thombashi/pytablereader/releases
 4 | 
 5 | 
 6 | .. include:: sponsors.rst
 7 | 
 8 | .. include:: genindex.rst
 9 | 
10 | 
11 | Links
12 | =====
13 | - `GitHub repository <https://github.com/thombashi/pytablereader>`__
14 | - `Issue tracker <https://github.com/thombashi/pytablereader/issues>`__
15 | - `pip: A tool for installing Python packages <https://pip.pypa.io/en/stable/>`__
16 | 


--------------------------------------------------------------------------------
/docs/pages/reference/basic_loader.rst:
--------------------------------------------------------------------------------
 1 | Table Loader Wrapper Classes
 2 | ----------------------------
 3 | 
 4 | File Loader Wrapper
 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 6 | .. autoclass:: pytablereader.TableFileLoader
 7 |     :inherited-members:
 8 | 
 9 | Text Loader Wrapper
10 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 | .. autoclass:: pytablereader.TableTextLoader
12 |     :inherited-members:
13 | 
14 | URL Loader Wrapper
15 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 | .. autoclass:: pytablereader.TableUrlLoader
17 |     :inherited-members:
18 | 


--------------------------------------------------------------------------------
/docs/pages/reference/error.rst:
--------------------------------------------------------------------------------
 1 | Exceptions
 2 | ----------------------------
 3 | 
 4 | .. autoexception:: pytablereader.ValidationError
 5 |     :show-inheritance:
 6 | 
 7 | .. autoexception:: pytablereader.PathError
 8 |     :show-inheritance:
 9 | 
10 | .. autoexception:: pytablereader.InvalidFilePathError
11 |     :show-inheritance:
12 | 
13 | .. autoexception:: pytablereader.UrlError
14 |     :show-inheritance:
15 | 
16 | .. autoexception:: pytablereader.OpenError
17 |     :show-inheritance:
18 | 
19 | .. autoexception:: pytablereader.LoaderNotFoundError
20 |     :show-inheritance:
21 | 
22 | .. autoexception:: pytablereader.HTTPError
23 |     :show-inheritance:
24 | 
25 | .. autoexception:: pytablereader.ProxyError
26 |     :show-inheritance:
27 | 


--------------------------------------------------------------------------------
/docs/pages/reference/format_specific_loader.rst:
--------------------------------------------------------------------------------
  1 | Format Specific Table Loader Classes
  2 | --------------------------------------------
  3 | 
  4 | AbstractTableReader class
  5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  6 | .. autoclass:: pytablereader.interface.AbstractTableReader
  7 |     :inherited-members:
  8 |     :show-inheritance:
  9 | 
 10 | 
 11 | CSV Loader Classes
 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 13 | 
 14 | CSV Table Loader
 15 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 16 | .. autoclass:: pytablereader.csv.core.CsvTableLoader
 17 |     :inherited-members:
 18 | 
 19 | CSV File Loader
 20 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 21 | .. autoclass:: pytablereader.CsvTableFileLoader
 22 |     :inherited-members:
 23 |     :show-inheritance:
 24 | 
 25 | CSV Text Loader
 26 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 27 | .. autoclass:: pytablereader.CsvTableTextLoader
 28 |     :inherited-members:
 29 |     :exclude-members: source_type,get_format_key,make_table_name
 30 |     :show-inheritance:
 31 | 
 32 | 
 33 | HTML Loader Classes
 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 35 | 
 36 | HTML File Loader
 37 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 38 | .. autoclass:: pytablereader.HtmlTableFileLoader
 39 |     :inherited-members:
 40 | 
 41 | HTML Text Loader
 42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 43 | .. autoclass:: pytablereader.HtmlTableTextLoader
 44 |     :inherited-members:
 45 | 
 46 | 
 47 | JSON Loader Classes
 48 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 49 | 
 50 | JSON File Loader
 51 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 52 | .. autoclass:: pytablereader.JsonTableFileLoader
 53 |     :inherited-members:
 54 | 
 55 | JSON Text Loader
 56 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 57 | .. autoclass:: pytablereader.JsonTableTextLoader
 58 |     :inherited-members:
 59 | 
 60 | Line-delimited JSON File Loader
 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 62 | .. autoclass:: pytablereader.JsonLinesTableFileLoader
 63 |     :inherited-members:
 64 | 
 65 | Line-delimited JSON Text Loader
 66 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 67 | .. autoclass:: pytablereader.JsonLinesTableTextLoader
 68 |     :inherited-members:
 69 | 
 70 | 
 71 | LTSV Loader Classes
 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 73 | 
 74 | LTSV File Loader
 75 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 76 | .. autoclass:: pytablereader.LtsvTableFileLoader
 77 |     :inherited-members:
 78 |     :show-inheritance:
 79 | 
 80 | LTSV Text Loader
 81 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 82 | .. autoclass:: pytablereader.LtsvTableTextLoader
 83 |     :inherited-members:
 84 |     :exclude-members: source_type,get_format_key,make_table_name
 85 |     :show-inheritance:
 86 | 
 87 | 
 88 | Markdown Loader Classes
 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 90 | 
 91 | Markdown File Loader
 92 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 93 | .. autoclass:: pytablereader.MarkdownTableFileLoader
 94 |     :inherited-members:
 95 | 
 96 | Markdown Text Loader
 97 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 98 | .. autoclass:: pytablereader.MarkdownTableTextLoader
 99 |     :inherited-members:
100 | 
101 | 
102 | MediaWiki Loader Classes
103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 | 
105 | MediaWiki File Loader
106 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
107 | .. autoclass:: pytablereader.MediaWikiTableFileLoader
108 |     :inherited-members:
109 | 
110 | MediaWiki Text Loader
111 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
112 | .. autoclass:: pytablereader.MediaWikiTableTextLoader
113 |     :inherited-members:
114 | 
115 | 
116 | Spread Sheet Loader Classes
117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
118 | 
119 | Excel File Loader
120 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
121 | .. autoclass:: pytablereader.ExcelTableFileLoader
122 |     :inherited-members:
123 | 
124 | Google Sheets Loader
125 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
126 | .. autoclass:: pytablereader.GoogleSheetsTableLoader
127 |     :inherited-members:
128 | 
129 | 
130 | Database Loader Classes
131 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
132 | SQLite File Loader
133 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
134 | .. autoclass:: pytablereader.SqliteFileLoader
135 |     :inherited-members:
136 | 


--------------------------------------------------------------------------------
/docs/pages/reference/index.rst:
--------------------------------------------------------------------------------
 1 | Reference
 2 | =========
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 3
 6 | 
 7 |    basic_loader
 8 |    format_specific_loader
 9 |    loader_factory
10 |    error
11 | 


--------------------------------------------------------------------------------
/docs/pages/reference/loader_factory.rst:
--------------------------------------------------------------------------------
 1 | Table Loader Factory Classes
 2 | ----------------------------
 3 | 
 4 | File Loader Factory
 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 6 | .. autoclass:: pytablereader.factory.TableFileLoaderFactory
 7 |     :inherited-members:
 8 |     :undoc-members:
 9 | 
10 | Text Loader Factory
11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12 | .. autoclass:: pytablereader.factory.TableTextLoaderFactory
13 |     :inherited-members:
14 |     :undoc-members:
15 | 
16 | URL Loader Factory
17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | .. autoclass:: pytablereader.factory.TableUrlLoaderFactory
19 |     :inherited-members:
20 |     :undoc-members:
21 | 


--------------------------------------------------------------------------------
/docs/pages/sponsors.rst:
--------------------------------------------------------------------------------
 1 | Sponsors
 2 | ====================================
 3 | .. image:: https://avatars.githubusercontent.com/u/44389260?s=48&u=6da7176e51ae2654bcfd22564772ef8a3bb22318&v=4
 4 |    :target: https://github.com/chasbecker
 5 |    :alt: Charles Becker (chasbecker)
 6 | .. image:: https://avatars.githubusercontent.com/u/46711571?s=48&u=57687c0e02d5d6e8eeaf9177f7b7af4c9f275eb5&v=4
 7 |    :target: https://github.com/Arturi0
 8 |    :alt: onetime: Arturi0
 9 | .. image:: https://avatars.githubusercontent.com/u/3658062?s=48&v=4
10 |    :target: https://github.com/b4tman
11 |    :alt: onetime: Dmitry Belyaev (b4tman)
12 | 
13 | `Become a sponsor <https://github.com/sponsors/thombashi>`__
14 | 


--------------------------------------------------------------------------------
/examples/load_table_from_csv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import pytablewriter as ptw
 4 | 
 5 | import pytablereader as ptr
 6 | 
 7 | 
 8 | # prepare data: a small CSV sample that is also written to a file ---
 9 | file_path = "sample_data.csv"
10 | csv_text = "\n".join([
11 |     '"attr_a","attr_b","attr_c"',
12 |     '1,4,"a"',
13 |     '2,2.1,"bb"',
14 |     '3,120.9,"ccc"',
15 | ])
16 | 
17 | with open(file_path, "w") as f:
18 |     f.write(csv_text)
19 | 
20 | # load from the CSV file written above and print each table ---
21 | loader = ptr.CsvTableFileLoader(file_path)
22 | for table_data in loader.load():
23 |     print("\n".join([
24 |         "load from file",
25 |         "==============",
26 |         f"{ptw.dumps_tabledata(table_data):s}",
27 |     ]))
28 | 
29 | # load the same data directly from the CSV text and print each table ---
30 | loader = ptr.CsvTableTextLoader(csv_text)
31 | for table_data in loader.load():
32 |     print("\n".join([
33 |         "load from text",
34 |         "==============",
35 |         f"{ptw.dumps_tabledata(table_data):s}",
36 |     ]))
37 | 


--------------------------------------------------------------------------------
/examples/load_table_from_gs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import pytablewriter as ptw
 4 | 
 5 | import pytablereader as ptr
 6 | 
 7 | 
 8 | credentials_file = "sample-xxxxxxxxxxxx.json"  # placeholder name; use your own file
 9 | 
10 | loader = ptr.GoogleSheetsTableLoader(credentials_file)
11 | loader.title = "testbook"  # presumably the title of the spreadsheet to load -- confirm
12 | # print every table yielded by the loader
13 | for table_data in loader.load():
14 |     print(ptw.dumps_tabledata(table_data))
15 | 


--------------------------------------------------------------------------------
/examples/load_table_from_url.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import pytablewriter as ptw
 4 | 
 5 | import pytablereader as ptr
 6 | 
 7 | 
 8 | loader = ptr.TableUrlLoader(
 9 |     "https://en.wikipedia.org/wiki/List_of_unit_testing_frameworks",
10 |     "html")
11 | 
12 | writer = ptw.TableWriterFactory.create_from_format_name("rst")
13 | writer.stream = open("load_url_result.rst", "w", encoding=loader.encoding)
14 | for table_data in loader.load():
15 |     writer.from_tabledata(table_data)
16 |     writer.write_table()
17 | 


--------------------------------------------------------------------------------
/invoke_pytest.py:
--------------------------------------------------------------------------------
 1 | """
 2 | On Windows, the unit tests must be invoked through the ``py`` module
 3 | because of multiprocessing:
 4 | https://py.rtfd.io/en/latest/faq.html?highlight=cmdline#issues-with-py-test-multiprocess-and-setuptools
 5 | """
 6 | 
 7 | import multiprocessing
 8 | import sys
 9 | 
10 | import py
11 | 
12 | 
13 | if __name__ == "__main__":
14 |     multiprocessing.freeze_support()  # supports frozen Windows executables
15 |     sys.exit(py.test.cmdline.main())
16 | 


--------------------------------------------------------------------------------
/pylama.ini:
--------------------------------------------------------------------------------
 1 | [pylama]
 2 | skip = .eggs/*,.tox/*,*/.env/*,build/*,_sandbox/*,docs/conf.py
 3 | 
 4 | [pylama:pycodestyle]
 5 | max_line_length = 100
 6 | 
 7 | # E203: whitespace before ':' (for black)
 8 | # W503: line break before binary operator (for black)
 9 | ignore = E203,W503
10 | 
11 | [pylama:pylint]
12 | max_line_length = 100
13 | 
14 | [pylama:test/*]
15 | # E501: line too long [pycodestyle]
16 | ignore = E501
17 | 
18 | [pylama:*/__init__.py]
19 | # W0611: imported but unused [pyflakes]
20 | ignore = W0611
21 | 
22 | [pylama:test/test_logger.py]
23 | # E402: module level import not at top of file [pycodestyle]
24 | ignore = E402
25 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [tool.black]
 6 | line-length = 100
 7 | exclude = '''
 8 | /(
 9 |       \.eggs
10 |     | \.git
11 |     | \.mypy_cache
12 |     | \.tox
13 |     | \.venv
14 |     | \.pytype
15 |     | _build
16 |     | buck-out
17 |     | build
18 |     | dist
19 |     | examples
20 | )/
21 | | docs/conf.py
22 | '''
23 | target-version = ['py37', 'py38', 'py39', 'py310', 'py311']
24 | 
25 | [tool.isort]
26 | known_third_party = [
27 |     'pytablewriter',
28 |     'pytest',
29 |     'readmemaker',
30 |     'responses',
31 |     'simplesqlite',
32 |     'sphinx_rtd_theme',
33 |     'xlsxwriter',
34 | ]
35 | include_trailing_comma = true
36 | line_length = 100
37 | lines_after_imports = 2
38 | multi_line_output = 3
39 | skip_glob = [
40 |     '*/.eggs/*',
41 |     '*/.pytype/*',
42 |     '*/.tox/*',
43 | ]
44 | 
45 | [tool.coverage.run]
46 | source = ['pytablereader']
47 | branch = true
48 | 
49 | [tool.coverage.report]
50 | show_missing = true
51 | precision = 1
52 | exclude_lines = [
53 |     'except ImportError',
54 |     'raise NotImplementedError',
55 |     'pass',
56 |     'ABCMeta',  # patterns are case-sensitive regexes; matches e.g. "metaclass=abc.ABCMeta"
57 |     'abstractmethod',
58 |     'abstractproperty',
59 |     'abstractclassmethod',
60 |     'warnings.warn',
61 | ]
62 | 


--------------------------------------------------------------------------------
/pytablereader/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from tabledata import DataError, InvalidHeaderNameError, InvalidTableNameError
 6 | 
 7 | from .__version__ import __author__, __copyright__, __email__, __license__, __version__
 8 | from ._constant import PatternMatch
 9 | from ._logger import set_log_level, set_logger
10 | from .csv.core import CsvTableFileLoader, CsvTableTextLoader
11 | from .error import (
12 |     APIError,
13 |     HTTPError,
14 |     InvalidFilePathError,
15 |     LoaderNotFoundError,
16 |     OpenError,
17 |     PathError,
18 |     ProxyError,
19 |     PypandocImportError,
20 |     UrlError,
21 |     ValidationError,
22 | )
23 | from .html.core import HtmlTableFileLoader, HtmlTableTextLoader
24 | from .json.core import JsonTableDictLoader, JsonTableFileLoader, JsonTableTextLoader
25 | from .jsonlines.core import JsonLinesTableFileLoader, JsonLinesTableTextLoader
26 | from .loadermanager import TableFileLoader, TableTextLoader, TableUrlLoader
27 | from .ltsv.core import LtsvTableFileLoader, LtsvTableTextLoader
28 | from .markdown.core import MarkdownTableFileLoader, MarkdownTableTextLoader
29 | from .mediawiki.core import MediaWikiTableFileLoader, MediaWikiTableTextLoader
30 | from .spreadsheet.excelloader import ExcelTableFileLoader
31 | from .spreadsheet.gsloader import GoogleSheetsTableLoader
32 | from .sqlite.core import SqliteFileLoader
33 | from .tsv.core import TsvTableFileLoader, TsvTableTextLoader
34 | 


--------------------------------------------------------------------------------
/pytablereader/__version__.py:
--------------------------------------------------------------------------------
1 | __author__ = "Tsuyoshi Hombashi"
2 | __copyright__ = f"Copyright 2016, {__author__}"
3 | __license__ = "MIT License"
4 | __version__ = "0.31.4"
5 | __maintainer__ = __author__
6 | __email__ = "tsuyoshi.hombashi@gmail.com"
7 | 


--------------------------------------------------------------------------------
/pytablereader/_acceptor.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import abc
 6 | 
 7 | 
class LoaderAcceptorInterface(metaclass=abc.ABCMeta):
    """
    An interface class of table loader acceptor.

    Concrete acceptors must implement :py:meth:`accept`.
    """

    @abc.abstractmethod
    def accept(self, loader):  # pragma: no cover
        # Abstract hook: implementations receive the loader object here.
        pass
16 | 
17 | 
class LoaderAcceptor(LoaderAcceptorInterface):
    """
    An abstract class of table loader acceptor.

    Stores the accepted loader in ``self._loader``.
    """

    def __init__(self):
        # No loader is attached until accept() is called.
        self._loader = None

    def accept(self, loader):
        # Keep a reference to the loader; a later call replaces the earlier one.
        self._loader = loader
28 | 


--------------------------------------------------------------------------------
/pytablereader/_common.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import os.path
 6 | import posixpath
 7 | from urllib.parse import urlparse
 8 | 
 9 | import pathvalidate
10 | import typepy
11 | 
12 | from ._constant import Default
13 | from .error import InvalidFilePathError
14 | 
15 | 
16 | try:
17 |     import simplejson as json
18 | except ImportError:
19 |     import json  # type: ignore # noqa
20 | 
21 | 
def get_file_encoding(file_path, encoding):
    """
    Decide which encoding to use for a file.

    :param file_path: Path handed to ``detect_file_encoding`` when needed.
    :param encoding: Caller-supplied encoding; returned as is when truthy.
    :return:
        ``encoding`` if given, otherwise the detected encoding, otherwise
        ``Default.ENCODING`` when detection yields nothing.
    """
    from mbstrdecoder import detect_file_encoding

    if not encoding:
        # Fall back to the package default when detection returns a falsy value.
        encoding = detect_file_encoding(file_path) or Default.ENCODING

    return encoding
33 | 
34 | 
def get_extension(file_path):
    """
    Return the extension of ``file_path`` without leading periods.

    :raises InvalidFilePathError: If ``file_path`` is a null string.
    """
    if typepy.is_null_string(file_path):
        raise InvalidFilePathError("file path is empty")

    _root, suffix = os.path.splitext(file_path)

    return suffix.lstrip(".")
40 | 
41 | 
def make_temp_file_path_from_url(temp_dir_path, url):
    """
    Build a file path under ``temp_dir_path`` named after the last segment
    of the URL path.

    :param temp_dir_path: Directory part of the resulting path.
    :param url: URL whose path component supplies the file name.
    :return: ``temp_dir_path`` joined (POSIX style) with the derived name.
    :raises InvalidFilePathError:
        If the URL cannot be parsed, its path or derived file name is a
        null string, or the join with ``temp_dir_path`` fails.
    """
    try:
        parsed_path = urlparse(url).path
    except AttributeError:
        raise InvalidFilePathError("url must be a string")

    if typepy.is_null_string(parsed_path):
        raise InvalidFilePathError(f"invalid URL path: {parsed_path}")

    # Trailing slashes are dropped so that ".../name/" still yields "name".
    file_name = os.path.basename(parsed_path.rstrip("/"))

    # NOTE(review): this sanitises the name only when it is already a null
    # string, so it looks like it can rarely change anything -- confirm
    # whether another value (e.g. the URL path) was meant to be sanitised.
    if typepy.is_null_string(file_name):
        file_name = pathvalidate.replace_symbol(file_name, replacement_text="_")

    if typepy.is_null_string(file_name):
        raise InvalidFilePathError(f"invalid URL: {url}")

    try:
        joined_path = posixpath.join(temp_dir_path, file_name)
    except (TypeError, AttributeError):
        raise InvalidFilePathError("temp_dir_path must be a string")

    return joined_path
62 | 


--------------------------------------------------------------------------------
/pytablereader/_constant.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import enum
 6 | 
 7 | 
class Default:
    # Package-wide default values.

    # Text encoding used when no encoding is given or detected.
    ENCODING = "utf-8"
10 | 
11 | 
class SourceType:
    # String identifiers for the kind of data source; validators report
    # one of these through their ``source_type`` property.
    TEXT = "text"
    FILE = "file"
    URL = "url"
    OBJECT = "object"
17 | 
18 | 
class TableNameTemplate:
    """
    ``%``-style named placeholders usable in table name templates.

    Each constant has the form ``%(<key>)s``.
    """

    # Pattern every placeholder below follows (kept as a private attribute).
    __FORMAT = "%({:s})s"

    DEFAULT = "%(default)s"
    FILENAME = "%(filename)s"
    FORMAT_NAME = "%(format_name)s"
    FORMAT_ID = "%(format_id)s"
    GLOBAL_ID = "%(global_id)s"
    KEY = "%(key)s"
    TITLE = "%(title)s"
    SHEET = "%(sheet)s"
29 | 
30 | 
@enum.unique
class PatternMatch(enum.Enum):
    # Two-member enumeration with unique values.
    # NOTE(review): presumably selects whether several match patterns are
    # combined with OR or with AND -- confirm against the code that uses it.
    OR = 0
    AND = 1
35 | 


--------------------------------------------------------------------------------
/pytablereader/_logger/__init__.py:
--------------------------------------------------------------------------------
1 | from ._logger import (
2 |     FileSourceLogger,
3 |     NullSourceLogger,
4 |     TextSourceLogger,
5 |     logger,
6 |     set_log_level,
7 |     set_logger,
8 | )
9 | 


--------------------------------------------------------------------------------
/pytablereader/_logger/_logger.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import abc
  6 | 
  7 | import dataproperty
  8 | 
  9 | from ._null_logger import NullLogger
 10 | 
 11 | 
 12 | MODULE_NAME = "pytablereader"
 13 | 
 14 | try:
 15 |     from loguru import logger
 16 | 
 17 |     logger.disable(MODULE_NAME)
 18 | except ImportError:
 19 |     logger = NullLogger()  # type: ignore
 20 | 
 21 | 
 22 | def set_logger(is_enable, propagation_depth=2):
 23 |     if is_enable:
 24 |         logger.enable(MODULE_NAME)
 25 |     else:
 26 |         logger.disable(MODULE_NAME)
 27 | 
 28 |     if propagation_depth <= 0:
 29 |         return
 30 | 
 31 |     dataproperty.set_logger(is_enable, propagation_depth - 1)
 32 | 
 33 |     try:
 34 |         import simplesqlite
 35 | 
 36 |         simplesqlite.set_logger(is_enable, propagation_depth - 1)
 37 |     except (ImportError, TypeError):
 38 |         pass
 39 | 
 40 | 
 41 | def set_log_level(log_level):
 42 |     # deprecated
 43 |     return
 44 | 
 45 | 
 46 | def typehints_to_str(type_hints):
 47 |     return ", ".join([type_hint.__name__ if type_hint else "none" for type_hint in type_hints])
 48 | 
 49 | 
class LoggerInterface(metaclass=abc.ABCMeta):
    # Interface for the per-loader logging helpers.

    @abc.abstractmethod
    def logging_load(self):  # pragma: no cover
        # Abstract hook: implementations log that a load is taking place.
        pass
 54 | 
 55 | 
 56 | class BaseLogger(LoggerInterface):
 57 |     def __init__(self, loader):
 58 |         self._loader = loader
 59 | 
 60 |     def logging_load(self):
 61 |         logger.debug(self._get_load_message())
 62 | 
 63 |     def logging_table(self, table_data):
 64 |         logger.debug(f"loaded tabledata: {table_data}")
 65 | 
 66 |     @abc.abstractmethod
 67 |     def _get_load_message(self):
 68 |         pass
 69 | 
 70 | 
class NullSourceLogger(BaseLogger):
    # Logging helper that emits nothing: both logging hooks are no-ops and
    # the load message is an empty string.

    def logging_load(self):
        pass

    def logging_table(self, table_data):
        pass

    def _get_load_message(self):
        return ""
 80 | 
 81 | 
 82 | class FileSourceLogger(BaseLogger):
 83 |     def _get_load_message(self):
 84 |         message = "loading {:s}: format={:s}, path={}".format(
 85 |             self._loader.source_type, self._loader.format_name, self._loader.source
 86 |         )
 87 | 
 88 |         try:
 89 |             message += f", encoding={self._loader.encoding}"
 90 |         except AttributeError:
 91 |             pass
 92 | 
 93 |         if self._loader.type_hints:
 94 |             message += f", type-hints=({typehints_to_str(self._loader.type_hints)})"
 95 | 
 96 |         return message
 97 | 
 98 | 
 99 | class TextSourceLogger(BaseLogger):
100 |     def _get_load_message(self):
101 |         message = "loading {:s}: format={:s}".format(
102 |             self._loader.source_type, self._loader.format_name
103 |         )
104 | 
105 |         try:
106 |             message += f", len={len(self._loader.source)}"
107 |         except TypeError:
108 |             pass
109 | 
110 |         try:
111 |             message += f", encoding={self._loader.encoding}"
112 |         except AttributeError:
113 |             pass
114 | 
115 |         if self._loader.type_hints:
116 |             message += f", type-hints=({typehints_to_str(self._loader.type_hints)})"
117 | 
118 |         return message
119 | 


--------------------------------------------------------------------------------
/pytablereader/_logger/_null_logger.py:
--------------------------------------------------------------------------------
class NullLogger:
    # Stand-in logger object: every method accepts its arguments and does
    # nothing. The double-underscore parameter names are name-mangled inside
    # the class body, so those arguments are effectively positional.
    level_name = None

    def remove(self, handler_id=None):  # pragma: no cover
        pass

    def add(self, sink, **kwargs):  # pragma: no cover
        pass

    def disable(self, name):  # pragma: no cover
        pass

    def enable(self, name):  # pragma: no cover
        pass

    def critical(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def debug(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def error(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def exception(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def info(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def log(self, __level, __message, *args, **kwargs):  # pragma: no cover
        pass

    def success(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def trace(self, __message, *args, **kwargs):  # pragma: no cover
        pass

    def warning(self, __message, *args, **kwargs):  # pragma: no cover
        pass
42 | 


--------------------------------------------------------------------------------
/pytablereader/_validator.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import abc
  6 | import os
  7 | import stat
  8 | from errno import EBADF, ENAMETOOLONG, ENOENT, ENOTDIR
  9 | from urllib.parse import urlparse
 10 | 
 11 | import pathvalidate as pv
 12 | import typepy
 13 | 
 14 | from pytablereader import DataError
 15 | 
 16 | from ._constant import SourceType
 17 | from ._logger import logger
 18 | from .error import InvalidFilePathError, UrlError
 19 | 
 20 | 
 21 | def is_fifo(file_path: str) -> bool:
 22 |     try:
 23 |         return stat.S_ISFIFO(os.stat(file_path).st_mode)
 24 |     except OSError as e:
 25 |         logger.error(f"errno: {e.errno}")
 26 | 
 27 |         if e.errno not in (EBADF, ENAMETOOLONG, ENOENT, ENOTDIR):
 28 |             raise
 29 | 
 30 |         return False
 31 |     except ValueError:
 32 |         return False
 33 | 
 34 | 
 35 | class ValidatorInterface(metaclass=abc.ABCMeta):
 36 |     """
 37 |     An interface class for data source validator.
 38 |     """
 39 | 
 40 |     @abc.abstractproperty
 41 |     def source_type(self):
 42 |         pass
 43 | 
 44 |     @abc.abstractmethod
 45 |     def validate(self):
 46 |         pass
 47 | 
 48 | 
 49 | class BaseValidator(ValidatorInterface):
 50 |     """
 51 |     An abstract base class for data source validator.
 52 |     """
 53 | 
 54 |     @property
 55 |     def source(self):
 56 |         return self.__source
 57 | 
 58 |     def __init__(self, source):
 59 |         self.__source = source
 60 | 
 61 | 
class NullValidator(BaseValidator):
    # Validator that accepts any source: validate() performs no checks.

    @property
    def source_type(self):
        return "null"

    def validate(self):
        pass
 69 | 
 70 | 
 71 | class FileValidator(BaseValidator):
 72 |     """
 73 |     Validator class for file data source.
 74 |     """
 75 | 
 76 |     @property
 77 |     def source_type(self):
 78 |         return SourceType.FILE
 79 | 
 80 |     def validate(self):
 81 |         try:
 82 |             pv.validate_filepath(self.source, platform="auto")
 83 |         except pv.ValidationError as e:
 84 |             raise InvalidFilePathError(e)
 85 | 
 86 |         if os.path.isfile(self.source) or is_fifo(self.source):
 87 |             return
 88 | 
 89 |         raise OSError("file not found")
 90 | 
 91 | 
class TextValidator(BaseValidator):
    """
    Validator class for text object data source.
    """

    @property
    def source_type(self):
        return SourceType.TEXT

    def validate(self):
        # Reject a null-string source; anything else is accepted as text.
        if typepy.is_null_string(self.source):
            raise DataError("data source is empty")
104 | 
105 | 
class UrlValidator(BaseValidator):
    """
    Validator class for URL data source.
    """

    @property
    def source_type(self):
        return SourceType.URL

    def validate(self):
        """
        Check that the source is a non-empty ``http``/``https`` URL.

        :raises UrlError: If the URL is a null string or uses another scheme.
        """
        url = self.source

        if typepy.is_null_string(url):
            raise UrlError("url is empty")

        scheme = urlparse(url).scheme
        if scheme in ("http", "https"):
            return

        raise UrlError(f"invalid scheme: expected=http/https, actual={scheme}")
122 | 


--------------------------------------------------------------------------------
/pytablereader/csv/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/csv/__init__.py


--------------------------------------------------------------------------------
/pytablereader/csv/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import csv
  6 | import io
  7 | import warnings
  8 | 
  9 | import typepy
 10 | from mbstrdecoder import MultiByteStrDecoder
 11 | 
 12 | from pytablereader import DataError
 13 | 
 14 | from .._common import get_file_encoding
 15 | from .._constant import TableNameTemplate as tnt
 16 | from .._logger import FileSourceLogger, TextSourceLogger
 17 | from .._validator import FileValidator, TextValidator
 18 | from ..interface import AbstractTableReader
 19 | from .formatter import CsvTableFormatter
 20 | 
 21 | 
class CsvTableLoader(AbstractTableReader):
    """
    The abstract class of CSV table loaders.

    .. py:attribute:: headers

        Attribute names of the table. Use the first line of
        the CSV file as attribute list if ``headers`` is empty.

    .. py:attribute:: delimiter

        A one-character string used to separate fields.
        Defaults to ``","``.

    .. py:attribute:: quotechar

        A one-character string used to quote fields containing
        special characters, such as the ``delimiter`` or ``quotechar``,
        or which contain new-line characters.
        Defaults to ``'"'``.

    .. py:attribute:: encoding

        Encoding of the CSV data.
    """

    @property
    def format_name(self):
        # Format identifier shared by every CSV loader.
        return "csv"

    @property
    def delimiter(self):
        # The stored value is decoded and coerced so that a ``str`` is
        # always returned.
        return str(MultiByteStrDecoder(self.__delimiter).unicode_str)

    @delimiter.setter
    def delimiter(self, value):
        self.__delimiter = value

    @property
    def quotechar(self):
        # The stored value is decoded and coerced so that a ``str`` is
        # always returned.
        return str(MultiByteStrDecoder(self.__quotechar).unicode_str)

    @quotechar.setter
    def quotechar(self, value):
        self.__quotechar = value

    @property
    def header_list(self):
        # Deprecated alias of ``headers``.
        warnings.warn("'header_list' has moved to 'headers'", DeprecationWarning)
        return self.headers

    @header_list.setter
    def header_list(self, value):
        # Deprecated alias of ``headers``.
        warnings.warn("'header_list' has moved to 'headers'", DeprecationWarning)
        self.headers = value

    def __init__(self, source, quoting_flags, type_hints, type_hint_rules):
        super().__init__(source, quoting_flags, type_hints, type_hint_rules)

        # Set by the concrete loader's load() before rows are read.
        self._csv_reader = None

        self.headers = ()
        self.delimiter = ","
        self.quotechar = '"'
        self.encoding = None

    def _to_data_matrix(self):
        """
        Read every row from ``self._csv_reader`` into a list of lists.

        Rows for which ``typepy.is_not_empty_sequence`` is false are skipped
        and each cell is passed through ``__modify_item``.

        :raises pytablereader.DataError:
            If reading fails with ``csv.Error`` or ``UnicodeDecodeError``.
        """
        try:
            return [
                [self.__modify_item(data, col) for col, data in enumerate(row)]
                for row in self._csv_reader
                if typepy.is_not_empty_sequence(row)
            ]
        except (csv.Error, UnicodeDecodeError) as e:
            raise DataError(e)

    def __modify_item(self, data, col: int):
        """
        Convert one cell: apply the type hint registered for column ``col``
        when there is one and the conversion succeeds, otherwise return the
        cell decoded to a unicode string.
        """
        # NOTE(review): ``col in self.type_hints`` is a key test for a
        # mapping but a value-membership test for a list/tuple of hint
        # classes -- confirm which container type is expected here.
        if self.type_hints and (col in self.type_hints):
            try:
                return self.type_hints[col](data).convert()
            except typepy.TypeConversionError:
                # Fall through to the plain string conversion below.
                pass

        return MultiByteStrDecoder(data).unicode_str
108 | 
109 | 
class CsvTableFileLoader(CsvTableLoader):
    """
    A file loader class to extract tabular data from CSV files.

    :param str file_path: Path to the loading CSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.

    :Examples:
        :ref:`example-csv-table-loader`
    """

    def __init__(self, file_path, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a CSV file.
        |load_source_desc_file|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     |filename_desc|
            ``%(format_name)s``  ``"csv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.DataError:
            If the CSV data is invalid.

        .. seealso::
            :py:func:`csv.reader`
        """

        self._validate()
        self._logger.logging_load()
        self.encoding = get_file_encoding(self.source, self.encoding)

        # Read all rows while the file is open so that the handle is closed
        # deterministically, even when parsing raises; _to_data_matrix()
        # consumes the reader completely and returns a list.
        with open(self.source, encoding=self.encoding) as fp:
            self._csv_reader = csv.reader(
                fp,
                delimiter=self.delimiter,
                quotechar=self.quotechar,
                strict=True,
                skipinitialspace=True,
            )
            data_matrix = self._to_data_matrix()

        formatter = CsvTableFormatter(data_matrix)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FILENAME
174 | 
175 | 
class CsvTableTextLoader(CsvTableLoader):
    """
    A text loader class to extract tabular data from CSV text data.

    :param str text: CSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.

    :Examples:
        :ref:`example-csv-table-loader`
    """

    def __init__(self, text, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a CSV text object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     ``""``
            ``%(format_name)s``  ``"csv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.DataError:
            If the CSV data is invalid.

        .. seealso::
            :py:func:`csv.reader`
        """

        self._validate()
        self._logger.logging_load()

        # Surrounding whitespace of the text is dropped before parsing.
        text_stream = io.StringIO(self.source.strip())
        self._csv_reader = csv.reader(
            text_stream,
            delimiter=self.delimiter,
            quotechar=self.quotechar,
            strict=True,
            skipinitialspace=True,
        )

        data_matrix = self._to_data_matrix()
        formatter = CsvTableFormatter(data_matrix)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FORMAT_NAME + tnt.FORMAT_ID
238 | 


--------------------------------------------------------------------------------
/pytablereader/csv/formatter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import typepy
 6 | from tabledata import TableData
 7 | 
 8 | from pytablereader import DataError
 9 | 
10 | from ..formatter import TableFormatter
11 | 
12 | 
class CsvTableFormatter(TableFormatter):
    """
    Formatter that turns a CSV data matrix into a |TableData| instance.
    """

    def to_table_data(self):
        """
        Yield a single |TableData| built from the source data.

        When the loader has no ``headers``, the first row of the source data
        is used as the header row and the remaining rows as data; otherwise
        every row is treated as data.

        :raises pytablereader.DataError:
            If the header row taken from the data contains a null string,
            or if there are no data rows.
        """
        if typepy.is_empty_sequence(self._loader.headers):
            headers = self._source_data[0]

            if any(typepy.is_null_string(header) for header in headers):
                # Adjacent literals are concatenated verbatim, so each
                # sentence carries its own trailing space.
                raise DataError(
                    "the first line includes empty string item. "
                    "all of the items should contain header name. "
                    "actual={}".format(headers)
                )

            data_matrix = self._source_data[1:]
        else:
            headers = self._loader.headers
            data_matrix = self._source_data

        if not data_matrix:
            raise DataError("data row must be greater or equal than one")

        self._loader.inc_table_count()

        yield TableData(
            self._loader.make_table_name(),
            headers,
            data_matrix,
            dp_extractor=self._loader.dp_extractor,
            type_hints=self._extract_type_hints(headers),
        )
42 | 


--------------------------------------------------------------------------------
/pytablereader/error.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | 
class ValidationError(Exception):
    """
    Exception raised when data is not properly formatted.
    """
10 | 
11 | 
class PathError(Exception):
    """
    Base class of the path-related exceptions
    (``InvalidFilePathError`` and ``UrlError``).
    """
16 | 
17 | 
class InvalidFilePathError(PathError):
    """
    Exception raised when an invalid file path is used.

    TODO: rename the error class
    """
24 | 
25 | 
class UrlError(PathError):
    """
    Exception raised when an invalid URL is used.
    """
30 | 
31 | 
class OpenError(IOError):
    """
    Exception raised when a file could not be opened.
    """
36 | 
37 | 
class APIError(Exception):
    """
    Exception raised when an API request could not be executed.
    """
42 | 
43 | 
class LoaderNotFoundError(Exception):
    """
    Exception raised when no loader is found for the requested
    format name or extension.
    """
48 | 
49 | 
class PypandocImportError(ImportError):
    """
    Exception raised when an import error occurs with the pypandoc package.
    """
54 | 
55 | 
# ``requests`` is optional: when it can be imported, HTTPError/ProxyError
# derive from its exception classes; otherwise plain ``Exception``
# subclasses with the same names are defined instead.
try:
    import requests

    class HTTPError(requests.RequestException):
        """
        An HTTP error occurred.

        .. seealso::

            http://docs.python-requests.org/en/master/api/#exceptions
        """

    class ProxyError(requests.exceptions.ProxyError):
        """
        A proxy error occurred.

        .. seealso::

            http://docs.python-requests.org/en/master/_modules/requests/exceptions/
        """

except ImportError:

    class HTTPError(Exception):
        """
        An HTTP error occurred.
        """

    class ProxyError(Exception):
        """
        A proxy error occurred.
        """
88 | 


--------------------------------------------------------------------------------
/pytablereader/factory/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
3 | """
4 | 
5 | from ._file import TableFileLoaderFactory
6 | from ._text import TableTextLoaderFactory
7 | from ._url import TableUrlLoaderFactory
8 | 


--------------------------------------------------------------------------------
/pytablereader/factory/_base.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import abc
  6 | import warnings
  7 | 
  8 | from mbstrdecoder import MultiByteStrDecoder
  9 | 
 10 | from .._constant import Default
 11 | from ..error import LoaderNotFoundError
 12 | 
 13 | 
class BaseTableLoaderFactory(metaclass=abc.ABCMeta):
    """
    Abstract base class of the loader factories.

    Subclasses provide the extension/format-name to loader-class mappings
    and the public ``create_from_*`` entry points; this class implements
    the shared lookup, error reporting and post-creation set-up.
    """

    @property
    def source(self):
        """
        :return: Data source to load.
        :rtype: str
        """

        return self._source

    def __init__(self, source, encoding=None):
        # Fall back to the package default when no encoding is given.
        if not encoding:
            self._encoding = Default.ENCODING
        else:
            self._encoding = encoding

        # The source is stored decoded; the caller-supplied encoding (which
        # may be None) is passed to the decoder as a codec candidate.
        self._source = MultiByteStrDecoder(source, [encoding]).unicode_str

    @abc.abstractmethod
    def create_from_path(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def create_from_format_name(self, format_name):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_extension_loader_mapping(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_format_name_loader_mapping(self):  # pragma: no cover
        pass

    def get_format_names(self):
        """
        :return: Available format names.
        :rtype: list
        """

        return sorted(self._get_format_name_loader_mapping())

    def get_format_name_list(self):
        # Deprecated alias of get_format_names().
        warnings.warn("'get_format_name_list' has moved to 'get_format_names'", DeprecationWarning)
        return self.get_format_names()

    def get_extensions(self):
        """
        :return: Available format file extensions.
        :rtype: list
        """

        return sorted(self._get_extension_loader_mapping())

    def get_extension_list(self):
        # Deprecated alias of get_extensions().
        warnings.warn("'get_extension_list' has moved to 'get_extensions'", DeprecationWarning)
        return self.get_extensions()

    def _get_loader_class(self, loader_mapping, format_name):
        """
        Look up ``format_name`` (case-folded) in ``loader_mapping``.

        :raises TypeError: If ``format_name`` has no ``casefold`` method.
        :raises LoaderNotFoundError: If the mapping has no such key.
        """
        try:
            format_name = format_name.casefold()
        except AttributeError:
            raise TypeError("format name must be a string")

        try:
            return loader_mapping[format_name]
        except KeyError:
            raise LoaderNotFoundError(
                ", ".join(
                    [
                        f"loader not found: format='{format_name}'",
                        f"source='{self.source}'",
                    ]
                )
            )

    def _create_from_extension(self, extension):
        """
        Create a loader for the source from a file extension.

        :raises LoaderNotFoundError:
            If no loader is registered for ``extension``; the message lists
            the acceptable extensions.
        """
        try:
            loader = self._get_loader_class(self._get_extension_loader_mapping(), extension)(
                self.source
            )

            return self._post_create(loader, extension=extension)
        except LoaderNotFoundError as e:
            # Re-raise with the list of acceptable extensions appended.
            raise LoaderNotFoundError(
                "\n".join(
                    [
                        f"{e.args[0]:s} (unknown extension).",
                        "",
                        "acceptable extensions are: {}.".format(", ".join(self.get_extensions())),
                        f"actual: '{extension}'",
                    ]
                )
            )

    def _create_from_format_name(self, format_name):
        """
        Create a loader for the source from a format name.

        :raises LoaderNotFoundError:
            If no loader is registered for ``format_name``; the message
            lists the acceptable format names.
        """
        try:
            loader = self._get_loader_class(self._get_format_name_loader_mapping(), format_name)(
                self.source
            )

            return self._post_create(loader, format_name=format_name)
        except LoaderNotFoundError as e:
            # Re-raise with the list of acceptable format names appended.
            raise LoaderNotFoundError(
                "\n".join(
                    [
                        f"{e.args[0]:s} (unknown format name).",
                        "acceptable format names are: {}.".format(
                            ", ".join(self.get_format_names())
                        ),
                    ]
                )
            )

    def _post_create(self, loader, **kwargs):
        """
        Finish setting up a freshly created loader and return it.

        The factory's encoding is assigned to the loader. A CSV loader that
        was requested through the ``"ssv"`` format name gets a single space
        as its delimiter.
        """
        loader.encoding = self._encoding

        if loader.format_name == "csv" and kwargs.get("format_name") == "ssv":
            loader.delimiter = " "

        return loader
135 | 


--------------------------------------------------------------------------------
/pytablereader/factory/_file.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | from mbstrdecoder import detect_file_encoding
  6 | 
  7 | from .._common import get_extension
  8 | from .._logger import logger
  9 | from ..csv.core import CsvTableFileLoader
 10 | from ..html.core import HtmlTableFileLoader
 11 | from ..json.core import JsonTableFileLoader
 12 | from ..jsonlines.core import JsonLinesTableFileLoader
 13 | from ..ltsv.core import LtsvTableFileLoader
 14 | from ..markdown.core import MarkdownTableFileLoader
 15 | from ..mediawiki.core import MediaWikiTableFileLoader
 16 | from ..spreadsheet.excelloader import ExcelTableFileLoader
 17 | from ..sqlite.core import SqliteFileLoader
 18 | from ..tsv.core import TsvTableFileLoader
 19 | from ._base import BaseTableLoaderFactory
 20 | 
 21 | 
 22 | class TableFileLoaderFactory(BaseTableLoaderFactory):
 23 |     """
 24 |     :param str file_path: Path to the loading file.
 25 |     :raises pytablereader.InvalidFilePathError:
 26 |         If the ``file_path`` is an empty path.
 27 |     """
 28 | 
 29 |     @property
 30 |     def file_extension(self):
 31 |         """
 32 |         :return: File extension of the :py:attr:`.source` (without period).
 33 |         :rtype: str
 34 |         """
 35 | 
 36 |         return get_extension(self.source)
 37 | 
 38 |     def __init__(self, source, encoding=None):
 39 |         if not encoding and source:
 40 |             encoding = detect_file_encoding(source)
 41 |             logger.debug(f"detect encoding: file={source}, encoding={encoding}")
 42 | 
 43 |         super().__init__(source, encoding)
 44 | 
 45 |     def create_from_path(self):
 46 |         """
 47 |         Create a file loader from the file extension to loading file.
 48 |         Supported file extensions are as follows:
 49 | 
 50 |             ==========================  =======================================
 51 |             Extension                   Loader
 52 |             ==========================  =======================================
 53 |             ``"csv"``                   :py:class:`~.CsvTableFileLoader`
 54 |             ``"xls"``/``"xlsx"``        :py:class:`~.ExcelTableFileLoader`
 55 |             ``"htm"``/``"html"``        :py:class:`~.HtmlTableFileLoader`
 56 |             ``"json"``                  :py:class:`~.JsonTableFileLoader`
 57 |             ``"jsonl"``                 :py:class:`~.JsonLinesTableFileLoader`
 58 |             ``"ldjson"``                :py:class:`~.JsonLinesTableFileLoader`
 59 |             ``"ltsv"``                  :py:class:`~.LtsvTableFileLoader`
 60 |             ``"md"``                    :py:class:`~.MarkdownTableFileLoader`
 61 |             ``"ndjson"``                :py:class:`~.JsonLinesTableFileLoader`
 62 |             ``"sqlite"``/``"sqlite3"``  :py:class:`~.SqliteFileLoader`
 63 |             ``"tsv"``                   :py:class:`~.TsvTableFileLoader`
 64 |             ==========================  =======================================
 65 | 
 66 |         :return:
 67 |             Loader that coincides with the file extension of the
 68 |             :py:attr:`.file_extension`.
 69 |         :raises pytablereader.LoaderNotFoundError:
 70 |             |LoaderNotFoundError_desc| loading the file.
 71 |         """
 72 | 
 73 |         loader = self._create_from_extension(self.file_extension)
 74 | 
 75 |         logger.debug(
 76 |             "TableFileLoaderFactory.create_from_path: extension={}, loader={}".format(
 77 |                 self.file_extension, loader.format_name
 78 |             )
 79 |         )
 80 | 
 81 |         return loader
 82 | 
 83 |     def create_from_format_name(self, format_name):
 84 |         """
 85 |         Create a file loader from a format name.
 86 |         Supported file formats are as follows:
 87 | 
 88 |             ================  ======================================
 89 |             Format name               Loader
 90 |             ================  ======================================
 91 |             ``"csv"``         :py:class:`~.CsvTableFileLoader`
 92 |             ``"excel"``       :py:class:`~.ExcelTableFileLoader`
 93 |             ``"html"``        :py:class:`~.HtmlTableFileLoader`
 94 |             ``"json"``        :py:class:`~.JsonTableFileLoader`
 95 |             ``"json"``        :py:class:`~.JsonTableFileLoader`
 96 |             ``"json_lines"``  :py:class:`~.JsonTableFileLoader`
 97 |             ``"jsonl"``       :py:class:`~.JsonLinesTableFileLoader`
 98 |             ``"ltsv"``        :py:class:`~.LtsvTableFileLoader`
 99 |             ``"markdown"``    :py:class:`~.MarkdownTableFileLoader`
100 |             ``"mediawiki"``   :py:class:`~.MediaWikiTableFileLoader`
101 |             ``"ndjson"``      :py:class:`~.JsonLinesTableFileLoader`
102 |             ``"sqlite"``      :py:class:`~.SqliteFileLoader`
103 |             ``"ssv"``         :py:class:`~.CsvTableFileLoader`
104 |             ``"tsv"``         :py:class:`~.TsvTableFileLoader`
105 |             ================  ======================================
106 | 
107 |         :param str format_name: Format name string (case insensitive).
108 |         :return: Loader that coincides with the ``format_name``:
109 |         :raises pytablereader.LoaderNotFoundError:
110 |             |LoaderNotFoundError_desc| the format.
111 |         """
112 | 
113 |         loader = self._create_from_format_name(format_name)
114 | 
115 |         logger.debug(
116 |             "TableFileLoaderFactory.create_from_format_name: name={}, loader={}".format(
117 |                 format_name, loader.format_name
118 |             )
119 |         )
120 | 
121 |         return loader
122 | 
123 |     @staticmethod
124 |     def _get_common_loader_mapping():
125 |         return {
126 |             "csv": CsvTableFileLoader,
127 |             "html": HtmlTableFileLoader,
128 |             "json": JsonTableFileLoader,
129 |             "jsonl": JsonLinesTableFileLoader,
130 |             "ldjson": JsonLinesTableFileLoader,
131 |             "ltsv": LtsvTableFileLoader,
132 |             "ndjson": JsonLinesTableFileLoader,
133 |             "sqlite": SqliteFileLoader,
134 |             "tsv": TsvTableFileLoader,
135 |         }
136 | 
137 |     def _get_extension_loader_mapping(self):
138 |         """
139 |         :return: Mappings of format extension and loader class.
140 |         :rtype: dict
141 |         """
142 | 
143 |         loader_table = self._get_common_loader_mapping()
144 |         loader_table.update(
145 |             {
146 |                 "htm": HtmlTableFileLoader,
147 |                 "md": MarkdownTableFileLoader,
148 |                 "sqlite3": SqliteFileLoader,
149 |                 "xlsx": ExcelTableFileLoader,
150 |                 "xls": ExcelTableFileLoader,
151 |             }
152 |         )
153 | 
154 |         return loader_table
155 | 
156 |     def _get_format_name_loader_mapping(self):
157 |         """
158 |         :return: Mappings of format name and loader class.
159 |         :rtype: dict
160 |         """
161 | 
162 |         loader_table = self._get_common_loader_mapping()
163 |         loader_table.update(
164 |             {
165 |                 "excel": ExcelTableFileLoader,
166 |                 "json_lines": JsonLinesTableFileLoader,
167 |                 "markdown": MarkdownTableFileLoader,
168 |                 "mediawiki": MediaWikiTableFileLoader,
169 |                 "ssv": CsvTableFileLoader,
170 |             }
171 |         )
172 | 
173 |         return loader_table
174 | 


--------------------------------------------------------------------------------
/pytablereader/factory/_text.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from .._logger import logger
 6 | from ..csv.core import CsvTableTextLoader
 7 | from ..html.core import HtmlTableTextLoader
 8 | from ..json.core import JsonTableTextLoader
 9 | from ..jsonlines.core import JsonLinesTableTextLoader
10 | from ..ltsv.core import LtsvTableTextLoader
11 | from ..markdown.core import MarkdownTableTextLoader
12 | from ..mediawiki.core import MediaWikiTableTextLoader
13 | from ..tsv.core import TsvTableTextLoader
14 | from ._base import BaseTableLoaderFactory
15 | 
16 | 
class TableTextLoaderFactory(BaseTableLoaderFactory):
    """
    Factory that creates table loaders for a text source.
    """

    def create_from_path(self):
        """
        Not supported by this factory.

        :raises NotImplementedError: Always.
        """

        raise NotImplementedError()

    def create_from_format_name(self, format_name):
        """
        Create a text loader from a format name.
        Supported formats are as follows:

            ==========================  ======================================
            Format name                 Loader
            ==========================  ======================================
            ``"csv"``                   :py:class:`~.CsvTableTextLoader`
            ``"html"``                  :py:class:`~.HtmlTableTextLoader`
            ``"json"``                  :py:class:`~.JsonTableTextLoader`
            ``"json_lines"``            :py:class:`~.JsonLinesTableTextLoader`
            ``"jsonl"``                 :py:class:`~.JsonLinesTableTextLoader`
            ``"ldjson"``                :py:class:`~.JsonLinesTableTextLoader`
            ``"ltsv"``                  :py:class:`~.LtsvTableTextLoader`
            ``"markdown"``              :py:class:`~.MarkdownTableTextLoader`
            ``"mediawiki"``             :py:class:`~.MediaWikiTableTextLoader`
            ``"ndjson"``                :py:class:`~.JsonLinesTableTextLoader`
            ``"ssv"``                   :py:class:`~.CsvTableTextLoader`
            ``"tsv"``                   :py:class:`~.TsvTableTextLoader`
            ==========================  ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincides with the ``format_name``.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        :raises TypeError: If ``format_name`` is not a string.
        """

        text_loader = self._create_from_format_name(format_name)
        logger.debug(
            f"TableTextLoaderFactory: name={format_name}, loader={text_loader.format_name}"
        )

        return text_loader

    def _get_common_loader_mapping(self):
        """
        :return:
            Mappings shared by the extension lookup and the format-name
            lookup. A new dict is built on every call.
        :rtype: dict
        """

        return dict(
            csv=CsvTableTextLoader,
            html=HtmlTableTextLoader,
            json=JsonTableTextLoader,
            jsonl=JsonLinesTableTextLoader,
            ldjson=JsonLinesTableTextLoader,
            ltsv=LtsvTableTextLoader,
            ndjson=JsonLinesTableTextLoader,
            tsv=TsvTableTextLoader,
        )

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format-extension and loader class.
        :rtype: dict
        """

        return dict(
            self._get_common_loader_mapping(),
            htm=HtmlTableTextLoader,
            md=MarkdownTableTextLoader,
        )

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format-name and loader class.
        :rtype: dict
        """

        return dict(
            self._get_common_loader_mapping(),
            json_lines=JsonLinesTableTextLoader,
            markdown=MarkdownTableTextLoader,
            mediawiki=MediaWikiTableTextLoader,
            ssv=CsvTableTextLoader,
        )
96 | 


--------------------------------------------------------------------------------
/pytablereader/formatter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import abc
 6 | from collections import OrderedDict
 7 | from textwrap import dedent
 8 | 
 9 | from pytablereader import DataError
10 | 
11 | from ._acceptor import LoaderAcceptor
12 | from ._common import json
13 | from ._logger import logger
14 | 
15 | 
class TableFormatterInterface(metaclass=abc.ABCMeta):
    """
    Interface of table data formatters.

    Concrete formatters must implement :py:meth:`to_table_data`.
    """

    @abc.abstractmethod
    def to_table_data(self):  # pragma: no cover
        """Convert the source data to table data (implemented by subclasses)."""
24 | 
25 | 
class TableFormatter(LoaderAcceptor, TableFormatterInterface):
    """
    Base class of the |TableData| formatters.

    Keeps the raw source data and provides the type-hint extraction shared
    by the format-specific formatters.
    """

    def _validate_source_data(self):
        """
        :raises pytablereader.DataError: If the source data is falsy.
        """

        if self._source_data:
            return

        raise DataError("source data is empty")

    def __init__(self, source_data):
        self._source_data = source_data
        self._validate_source_data()

    def _extract_type_hints(self, headers=None):
        """
        Resolve the type hints to apply to a table.

        The explicit ``type_hints`` of the accepted loader take precedence.
        Otherwise each header is matched against the loader's
        ``type_hint_rules`` and the hint of the first matching pattern is
        used (``None`` when no pattern matches).

        :param headers: Header names of the table.
        :return: Type hints; an empty list when there are no rules or headers.
        """

        loader = self._loader
        if loader.type_hints:
            return loader.type_hints

        rules = loader.type_hint_rules
        if not (rules and headers):
            return []

        def first_matching_hint(header):
            # Rules are tried in mapping order; the first hit wins.
            for pattern, hint in rules.items():
                if pattern.search(header):
                    return hint

            return None

        type_hints = [first_matching_hint(header) for header in headers]

        summary = OrderedDict(zip(headers, map(str, type_hints)))
        template = dedent(
            """\
            extracted type hints:
            {}
            """
        )
        logger.debug(template.format(json.dumps(summary, indent=4)))

        return type_hints
73 | 


--------------------------------------------------------------------------------
/pytablereader/html/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/html/__init__.py


--------------------------------------------------------------------------------
/pytablereader/html/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | from .._common import get_file_encoding
  6 | from .._constant import TableNameTemplate as tnt
  7 | from .._logger import FileSourceLogger, TextSourceLogger
  8 | from .._validator import FileValidator, TextValidator
  9 | from ..interface import AbstractTableReader
 10 | from .formatter import HtmlTableFormatter
 11 | 
 12 | 
 13 | class HtmlTableLoader(AbstractTableReader):
 14 |     """
 15 |     An abstract class of HTML table loaders.
 16 |     """
 17 | 
 18 |     @property
 19 |     def format_name(self):
 20 |         return "html"
 21 | 
 22 |     def _get_default_table_name_template(self):
 23 |         return f"{tnt.TITLE:s}_{tnt.KEY:s}"
 24 | 
 25 | 
 26 | class HtmlTableFileLoader(HtmlTableLoader):
 27 |     """
 28 |     A file loader class to extract tabular data from HTML files.
 29 | 
 30 |     :param str file_path: Path to the loading HTML file.
 31 | 
 32 |     .. py:attribute:: table_name
 33 | 
 34 |         Table name string. Defaults to ``%(title)s_%(key)s``.
 35 | 
 36 |     .. py:attribute:: encoding
 37 | 
 38 |         HTML file encoding. Defaults to ``"utf-8"``.
 39 |     """
 40 | 
 41 |     def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 42 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 43 | 
 44 |         self.encoding = None
 45 | 
 46 |         self._validator = FileValidator(file_path)
 47 |         self._logger = FileSourceLogger(self)
 48 | 
 49 |     def load(self):
 50 |         """
 51 |         Extract tabular data as |TableData| instances from HTML table tags in
 52 |         a HTML file.
 53 |         |load_source_desc_file|
 54 | 
 55 |         :return:
 56 |             Loaded table data iterator.
 57 |             |load_table_name_desc|
 58 | 
 59 |             ===================  ==============================================
 60 |             Format specifier     Value after the replacement
 61 |             ===================  ==============================================
 62 |             ``%(filename)s``     |filename_desc|
 63 |             ``%(title)s``        ``<title>`` tag value of the HTML.
 64 |             ``%(key)s``          | This replaced to:
 65 |                                  | **(1)** ``id`` attribute of the table tag
 66 |                                  | **(2)** ``%(format_name)s%(format_id)s``
 67 |                                  | if ``id`` attribute not present in the
 68 |                                  | table tag.
 69 |             ``%(format_name)s``  ``"html"``
 70 |             ``%(format_id)s``    |format_id_desc|
 71 |             ``%(global_id)s``    |global_id|
 72 |             ===================  ==============================================
 73 |         :rtype: |TableData| iterator
 74 |         :raises pytablereader.DataError:
 75 |             If the HTML data is invalid or empty.
 76 | 
 77 |         .. note::
 78 | 
 79 |             Table tag attributes ignored with loaded |TableData|.
 80 |         """
 81 | 
 82 |         self._validate()
 83 |         self._logger.logging_load()
 84 |         self.encoding = get_file_encoding(self.source, self.encoding)
 85 | 
 86 |         with open(self.source, encoding=self.encoding) as fp:
 87 |             formatter = HtmlTableFormatter(fp.read(), self._logger)
 88 |         formatter.accept(self)
 89 | 
 90 |         return formatter.to_table_data()
 91 | 
 92 | 
 93 | class HtmlTableTextLoader(HtmlTableLoader):
 94 |     """
 95 |     A text loader class to extract tabular data from HTML text data.
 96 | 
 97 |     :param str text: HTML text to load.
 98 | 
 99 |     .. py:attribute:: table_name
100 | 
101 |         Table name string. Defaults to ``%(title)s_%(key)s``.
102 |     """
103 | 
104 |     def __init__(self, text, quoting_flags=None, type_hints=None, type_hint_rules=None):
105 |         super().__init__(text, quoting_flags, type_hints, type_hint_rules)
106 | 
107 |         self._validator = TextValidator(text)
108 |         self._logger = TextSourceLogger(self)
109 | 
110 |     def load(self):
111 |         """
112 |         Extract tabular data as |TableData| instances from HTML table tags in
113 |         a HTML text object.
114 |         |load_source_desc_text|
115 | 
116 |         :return:
117 |             Loaded table data iterator.
118 |             |load_table_name_desc|
119 | 
120 |             ===================  ==============================================
121 |             Format specifier     Value after the replacement
122 |             ===================  ==============================================
123 |             ``%(filename)s``     ``""``
124 |             ``%(title)s``        ``<title>`` tag value of the HTML.
125 |             ``%(key)s``          | This replaced to:
126 |                                  | **(1)** ``id`` attribute of the table tag
127 |                                  | **(2)** ``%(format_name)s%(format_id)s``
128 |                                  | if ``id`` attribute is not included
129 |                                  | in the table tag.
130 |             ``%(format_name)s``  ``"html"``
131 |             ``%(format_id)s``    |format_id_desc|
132 |             ``%(global_id)s``    |global_id|
133 |             ===================  ==============================================
134 |         :rtype: |TableData| iterator
135 |         :raises pytablereader.DataError:
136 |             If the HTML data is invalid or empty.
137 |         """
138 | 
139 |         self._validate()
140 |         self._logger.logging_load()
141 | 
142 |         formatter = HtmlTableFormatter(self.source, self._logger)
143 |         formatter.accept(self)
144 | 
145 |         return formatter.to_table_data()
146 | 


--------------------------------------------------------------------------------
/pytablereader/html/formatter.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import re
  6 | 
  7 | import bs4
  8 | import typepy
  9 | from tabledata import TableData
 10 | 
 11 | from pytablereader import DataError
 12 | 
 13 | from .._constant import TableNameTemplate as tnt
 14 | from .._logger import NullSourceLogger
 15 | from ..formatter import TableFormatter
 16 | 
 17 | 
 18 | class HtmlTableFormatter(TableFormatter):
 19 |     @property
 20 |     def table_id(self):
 21 |         return self.__table_id
 22 | 
 23 |     def __init__(self, source_data, logger=None):
 24 |         super().__init__(source_data)
 25 | 
 26 |         if logger:
 27 |             self.__logger = logger
 28 |         else:
 29 |             self.__logger = NullSourceLogger(None)
 30 | 
 31 |         self.__table_id = None
 32 | 
 33 |         if typepy.is_null_string(source_data):
 34 |             raise DataError
 35 | 
 36 |         try:
 37 |             self.__soup = bs4.BeautifulSoup(self._source_data, "lxml")
 38 |         except bs4.FeatureNotFound:
 39 |             self.__soup = bs4.BeautifulSoup(self._source_data, "html.parser")
 40 | 
 41 |     def to_table_data(self):
 42 |         for table in self.__soup.find_all("table"):
 43 |             try:
 44 |                 table_data = self.__parse_html(table)
 45 |             except ValueError:
 46 |                 continue
 47 | 
 48 |             if table_data.is_empty_rows():
 49 |                 continue
 50 | 
 51 |             self.__logger.logging_table(table_data)
 52 | 
 53 |             yield table_data
 54 | 
 55 |     def _make_table_name(self):
 56 |         from collections import OrderedDict
 57 | 
 58 |         key = self.table_id
 59 |         if typepy.is_null_string(key):
 60 |             key = self._loader.get_format_key()
 61 | 
 62 |         try:
 63 |             title = self.__soup.title.text
 64 |         except AttributeError:
 65 |             title = ""
 66 | 
 67 |         kv_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
 68 |         kv_mapping.update(OrderedDict([(tnt.KEY, key), (tnt.TITLE, title)]))
 69 | 
 70 |         return self._loader._expand_table_name_format(kv_mapping)
 71 | 
 72 |     def __parse_tag_id(self, table):
 73 |         self.__table_id = table.get("id")
 74 | 
 75 |         if self.__table_id is None:
 76 |             caption = table.find("caption")
 77 |             if caption is not None:
 78 |                 caption = caption.text.strip()
 79 |                 if typepy.is_not_null_string(caption):
 80 |                     self.__table_id = caption
 81 | 
 82 |     def __parse_html(self, table):
 83 |         headers = []
 84 |         data_matrix = []
 85 | 
 86 |         self.__parse_tag_id(table)
 87 | 
 88 |         rows = table.find_all("tr")
 89 |         re_table_val = re.compile("td|th")
 90 |         for row in rows:
 91 |             td_list = row.find_all("td")
 92 |             if typepy.is_empty_sequence(td_list):
 93 |                 if typepy.is_not_empty_sequence(headers):
 94 |                     continue
 95 | 
 96 |                 th_list = row.find_all("th")
 97 |                 if typepy.is_empty_sequence(th_list):
 98 |                     continue
 99 | 
100 |                 headers = [row.text.strip() for row in th_list]
101 |                 continue
102 | 
103 |             data_matrix.append([value.get_text().strip() for value in row.find_all(re_table_val)])
104 | 
105 |         if typepy.is_empty_sequence(data_matrix):
106 |             raise ValueError("data matrix is empty")
107 | 
108 |         self._loader.inc_table_count()
109 | 
110 |         return TableData(
111 |             self._make_table_name(),
112 |             headers,
113 |             data_matrix,
114 |             dp_extractor=self._loader.dp_extractor,
115 |             type_hints=self._extract_type_hints(headers),
116 |         )
117 | 


--------------------------------------------------------------------------------
/pytablereader/interface.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import abc
  6 | import threading
  7 | 
  8 | import path
  9 | import typepy
 10 | from dataproperty import DataPropertyExtractor
 11 | 
 12 | from pytablereader import InvalidTableNameError
 13 | 
 14 | from ._constant import SourceType
 15 | from ._constant import TableNameTemplate as tnt
 16 | 
 17 | 
 18 | class TableLoaderInterface(metaclass=abc.ABCMeta):
 19 |     """
 20 |     Interface class of table loader class.
 21 |     """
 22 | 
 23 |     @abc.abstractproperty
 24 |     def format_name(self):  # pragma: no cover
 25 |         pass
 26 | 
 27 |     @abc.abstractproperty
 28 |     def source_type(self):  # pragma: no cover
 29 |         pass
 30 | 
 31 |     @abc.abstractmethod
 32 |     def load(self):  # pragma: no cover
 33 |         pass
 34 | 
 35 |     @abc.abstractmethod
 36 |     def inc_table_count(self):  # pragma: no cover
 37 |         pass
 38 | 
 39 | 
 40 | class AbstractTableReader(TableLoaderInterface, metaclass=abc.ABCMeta):
 41 |     """
 42 |     The abstract class of table data file loader.
 43 | 
 44 |     .. py:attribute:: table_name
 45 | 
 46 |         Table name string.
 47 | 
 48 |     .. py:attribute:: source
 49 | 
 50 |         Table data source to load.
 51 |     """
 52 | 
 53 |     __table_count_lock = threading.Lock()
 54 |     __global_table_count = 0
 55 |     __format_table_count = {}
 56 | 
 57 |     @property
 58 |     def source_type(self):
 59 |         return self._validator.source_type
 60 | 
 61 |     @property
 62 |     def quoting_flags(self):
 63 |         return self.__quoting_flags
 64 | 
 65 |     @property
 66 |     def dp_extractor(self):
 67 |         return self.__dp_extractor
 68 | 
 69 |     def __init__(self, source, quoting_flags, type_hints, type_hint_rules=None):
 70 |         self.table_name = tnt.DEFAULT
 71 |         self.source = source
 72 |         self.__quoting_flags = quoting_flags
 73 |         self.type_hints = type_hints
 74 |         self.type_hint_rules = type_hint_rules
 75 |         self._validator = None
 76 |         self._logger = None
 77 | 
 78 |         self.__dp_extractor = DataPropertyExtractor()
 79 |         self.__dp_extractor.quoting_flags = self.quoting_flags
 80 |         self.__dp_extractor.update_strict_level_map({typepy.Typecode.BOOL: 1})
 81 | 
 82 |     def get_format_key(self):
 83 |         return f"{self.format_name:s}{self.__get_format_table_count():d}"
 84 | 
 85 |     def make_table_name(self):
 86 |         return self._make_table_name()
 87 | 
 88 |     def inc_table_count(self):
 89 |         with self.__table_count_lock:
 90 |             self.__global_table_count += 1
 91 |             self.__format_table_count[self.format_name] = self.__get_format_table_count() + 1
 92 | 
 93 |     @abc.abstractmethod
 94 |     def _get_default_table_name_template(self):  # pragma: no cover
 95 |         pass
 96 | 
 97 |     def _validate(self):
 98 |         self._validate_table_name()
 99 |         self._validate_source()
100 | 
101 |     def _validate_table_name(self):
102 |         try:
103 |             if typepy.is_null_string(self.table_name):
104 |                 raise ValueError("table name is empty")
105 |         except (TypeError, AttributeError):
106 |             raise TypeError("table_name must be a string")
107 | 
108 |     def _validate_source(self):
109 |         self._validator.validate()
110 | 
111 |     def __get_format_table_count(self):
112 |         return self.__format_table_count.get(self.format_name, 0)
113 | 
114 |     def _get_filename_tablename_mapping(self):
115 |         filename = ""
116 |         if all([self.source_type == SourceType.FILE, typepy.is_not_null_string(self.source)]):
117 |             filename = path.Path(self.source).stem
118 | 
119 |         return (tnt.FILENAME, filename)
120 | 
121 |     def _get_basic_tablename_keyvalue_mapping(self):
122 |         from collections import OrderedDict
123 | 
124 |         return OrderedDict(
125 |             [
126 |                 (tnt.DEFAULT, self._get_default_table_name_template()),
127 |                 (tnt.FORMAT_NAME, self.format_name),
128 |                 (tnt.FORMAT_ID, str(self.__get_format_table_count())),
129 |                 (tnt.GLOBAL_ID, str(self.__global_table_count)),
130 |                 self._get_filename_tablename_mapping(),
131 |             ]
132 |         )
133 | 
134 |     def _expand_table_name_format(self, table_name_kv_mapping):
135 |         self._validate_table_name()
136 | 
137 |         table_name = self.table_name
138 |         for template, value in table_name_kv_mapping.items():
139 |             table_name = table_name.replace(template, value)
140 | 
141 |         return self._sanitize_table_name(table_name)
142 | 
143 |     def _make_table_name(self):
144 |         self._validate_table_name()
145 | 
146 |         return self._expand_table_name_format(self._get_basic_tablename_keyvalue_mapping())
147 | 
148 |     @staticmethod
149 |     def _sanitize_table_name(table_name):
150 |         if typepy.is_null_string(table_name):
151 |             raise InvalidTableNameError("table name is empty after the template replacement")
152 | 
153 |         return table_name.strip("_")
154 | 
155 |     @classmethod
156 |     def clear_table_count(cls):
157 |         with cls.__table_count_lock:
158 |             cls.__global_table_count = 0
159 |             cls.__format_table_count = {}
160 | 


--------------------------------------------------------------------------------
/pytablereader/json/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/json/__init__.py


--------------------------------------------------------------------------------
/pytablereader/jsonlines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/jsonlines/__init__.py


--------------------------------------------------------------------------------
/pytablereader/jsonlines/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import abc
  6 | from collections import OrderedDict
  7 | 
  8 | from .._common import get_file_encoding, json
  9 | from .._constant import SourceType
 10 | from .._constant import TableNameTemplate as tnt
 11 | from .._logger import FileSourceLogger, TextSourceLogger
 12 | from .._validator import FileValidator, TextValidator
 13 | from ..error import ValidationError
 14 | from ..interface import AbstractTableReader
 15 | from .formatter import JsonLinesTableFormatter
 16 | 
 17 | 
 18 | class JsonLinesTableLoader(AbstractTableReader, metaclass=abc.ABCMeta):
 19 |     """
 20 |     An abstract class of JSON table loaders.
 21 |     """
 22 | 
 23 |     @property
 24 |     def format_name(self):
 25 |         return "json_lines"
 26 | 
 27 |     @abc.abstractmethod
 28 |     def load_dict(self):  # pragma: no cover
 29 |         pass
 30 | 
 31 | 
 32 | class JsonLinesTableFileLoader(JsonLinesTableLoader):
 33 |     """
 34 |     A file loader class to extract tabular data from Line-delimited JSON files.
 35 | 
 36 |     :param str file_path: Path to the loading Line-delimited JSON file.
 37 | 
 38 |     .. py:attribute:: table_name
 39 | 
 40 |         Table name string. Defaults to ``%(filename)s_%(key)s``.
 41 |     """
 42 | 
 43 |     def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 44 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 45 | 
 46 |         self.encoding = None
 47 | 
 48 |         self._validator = FileValidator(file_path)
 49 |         self._logger = FileSourceLogger(self)
 50 | 
 51 |     def load(self):
 52 |         """
 53 |         Extract tabular data as |TableData| instances from a Line-delimited JSON file.
 54 |         |load_source_desc_file|
 55 | 
 56 |         :return:
 57 |             Loaded table data iterator.
 58 |             |load_table_name_desc|
 59 | 
 60 |         :rtype: |TableData| iterator
 61 |         :raises pytablereader.DataError:
 62 |             If the data is invalid Line-delimited JSON.
 63 |         :raises pytablereader.error.ValidationError:
 64 |             If the data is not acceptable Line-delimited JSON format.
 65 |         """
 66 | 
 67 |         formatter = JsonLinesTableFormatter(self.load_dict())
 68 |         formatter.accept(self)
 69 | 
 70 |         return formatter.to_table_data()
 71 | 
 72 |     def load_dict(self):
 73 |         self._validate()
 74 |         self._logger.logging_load()
 75 |         self.encoding = get_file_encoding(self.source, self.encoding)
 76 | 
 77 |         buffer = []
 78 |         with open(self.source, encoding=self.encoding) as fp:
 79 |             for line_idx, line in enumerate(fp):
 80 |                 line = line.strip()
 81 |                 if not line:
 82 |                     continue
 83 | 
 84 |                 try:
 85 |                     buffer.append(json.loads(line, object_pairs_hook=OrderedDict))
 86 |                 except json.JSONDecodeError as e:
 87 |                     raise ValidationError(
 88 |                         "line {line_idx}: {msg}: {value}".format(
 89 |                             line_idx=line_idx + 1, msg=e, value=line
 90 |                         )
 91 |                     )
 92 | 
 93 |         return buffer
 94 | 
 95 |     def _get_default_table_name_template(self):
 96 |         return f"{tnt.FILENAME:s}_{tnt.KEY:s}"
 97 | 
 98 | 
 99 | class JsonLinesTableTextLoader(JsonLinesTableLoader):
100 |     """
101 |     A text loader class to extract tabular data from Line-delimited JSON text data.
102 | 
103 |     :param str text: Line-delimited JSON text to load.
104 | 
105 |     .. py:attribute:: table_name
106 | 
107 |         Table name string. Defaults to ``%(key)s``.
108 |     """
109 | 
110 |     @property
111 |     def source_type(self):
112 |         return SourceType.TEXT
113 | 
114 |     def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
115 |         super().__init__(text, quoting_flags, type_hints)
116 | 
117 |         self._validator = TextValidator(text)
118 |         self._logger = TextSourceLogger(self)
119 | 
120 |     def load(self):
121 |         """
122 |         Extract tabular data as |TableData| instances from a Line-delimited JSON text object.
123 |         |load_source_desc_text|
124 | 
125 |         :return:
126 |             Loaded table data iterator.
127 |             |load_table_name_desc|
128 | 
129 |         :rtype: |TableData| iterator
130 | 
131 |         .. seealso::
132 | 
133 |             :py:meth:`.JsonLinesTableFileLoader.load()`
134 |         """
135 | 
136 |         formatter = JsonLinesTableFormatter(self.load_dict())
137 |         formatter.accept(self)
138 | 
139 |         return formatter.to_table_data()
140 | 
141 |     def load_dict(self):
142 |         self._validate()
143 |         self._logger.logging_load()
144 | 
145 |         buffer = []
146 |         for line_idx, line in enumerate(self.source.splitlines()):
147 |             line = line.strip()
148 |             if not line:
149 |                 continue
150 | 
151 |             try:
152 |                 buffer.append(json.loads(line, object_pairs_hook=OrderedDict))
153 |             except json.JSONDecodeError as e:
154 |                 raise ValidationError(
155 |                     "line {line_idx}: {msg}: {value}".format(
156 |                         line_idx=line_idx + 1, msg=e, value=line
157 |                     )
158 |                 )
159 | 
160 |         return buffer
161 | 
162 |     def _get_default_table_name_template(self):
163 |         return f"{tnt.KEY:s}"
164 | 


--------------------------------------------------------------------------------
/pytablereader/jsonlines/formatter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import jsonschema
 6 | from tabledata import TableData
 7 | 
 8 | from ..error import ValidationError
 9 | from ..formatter import TableFormatter
10 | from ..json.formatter import SingleJsonTableConverterBase
11 | 
12 | 
class FlatJsonTableConverter(SingleJsonTableConverterBase):
    """
    A concrete converter from a list of flat JSON records to a single table.
    """

    @property
    def _schema(self):
        """JSON schema applied to each record: an object of permitted values."""

        schema = {"type": "object", "additionalProperties": self._VALUE_TYPE_SCHEMA}

        return schema

    def _validate_source_data(self):
        """
        Validate every buffered record against ``_schema``.

        :raises pytablereader.error.ValidationError:
            If ``jsonschema`` rejects a record.
        """

        for record in self._buffer:
            try:
                jsonschema.validate(record, self._schema)
            except jsonschema.ValidationError as e:
                raise ValidationError(e)

    def to_table_data(self):
        """
        Yield one |TableData| built from all buffered records.

        The header is the list of record keys in order of first appearance.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        # dict keys keep insertion order, so this yields each key once,
        # ordered by its first appearance across the records
        header_list = list(dict.fromkeys(key for record in self._buffer for key in record))

        self._loader.inc_table_count()

        table_name = self._make_table_name()
        type_hints = self._extract_type_hints(header_list)

        yield TableData(
            table_name,
            header_list,
            self._buffer,
            dp_extractor=self._loader.dp_extractor,
            type_hints=type_hints,
        )
52 | 
53 | 
class JsonLinesTableFormatter(TableFormatter):
    """
    Table formatter for Line-delimited JSON data; delegates the conversion
    to :py:class:`FlatJsonTableConverter`.
    """

    def to_table_data(self):
        """
        :return: The table data iterator produced by the flat JSON converter.
        """

        flat_converter = FlatJsonTableConverter(self._source_data)
        flat_converter.accept(self._loader)

        table_data_iter = flat_converter.to_table_data()

        return table_data_iter
60 | 


--------------------------------------------------------------------------------
/pytablereader/loadermanager/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
3 | """
4 | 
5 | from ._file import TableFileLoader
6 | from ._text import TableTextLoader
7 | from ._url import TableUrlLoader
8 | 


--------------------------------------------------------------------------------
/pytablereader/loadermanager/_base.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from ..interface import TableLoaderInterface
 6 | 
 7 | 
 8 | class TableLoaderManager(TableLoaderInterface):
 9 |     def __init__(self, loader):
10 |         self.__loader = loader
11 | 
12 |     @property
13 |     def loader(self):
14 |         return self.__loader
15 | 
16 |     @property
17 |     def format_name(self):
18 |         return self.__loader.format_name
19 | 
20 |     @property
21 |     def source_type(self):
22 |         return self.__loader.source_type
23 | 
24 |     @property
25 |     def table_name(self):
26 |         return self.__loader.table_name
27 | 
28 |     @table_name.setter
29 |     def table_name(self, value):
30 |         self.__loader.table_name = value
31 | 
32 |     @property
33 |     def encoding(self):
34 |         try:
35 |             return self.__loader.encoding
36 |         except AttributeError:
37 |             return None
38 | 
39 |     @encoding.setter
40 |     def encoding(self, codec_name):
41 |         self.__loader.encoding = codec_name
42 | 
43 |     @property
44 |     def type_hints(self):
45 |         return self.__loader.type_hints
46 | 
47 |     @type_hints.setter
48 |     def type_hints(self, value):
49 |         self.__loader.type_hints = value
50 | 
51 |     def load(self):
52 |         return self.__loader.load()
53 | 
54 |     def inc_table_count(self):
55 |         self.__loader.inc_table_count()
56 | 


--------------------------------------------------------------------------------
/pytablereader/loadermanager/_file.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import warnings
 6 | 
 7 | import typepy
 8 | 
 9 | from ..factory import TableFileLoaderFactory
10 | from ._base import TableLoaderManager
11 | 
12 | 
class TableFileLoader(TableLoaderManager):
    """
    Loader class to loading tables from a file.

    :param str file_path: Path to the file to load.
    :param str format_name: Data format name to load.
        Supported formats can be get by :py:meth:`.get_format_names`.
        If the value is |None|, automatically detect file format from
        the ``file_path``.
    :param str encoding: Encoding passed to the loader factory.
    :param type_hint_rules: Assigned to the created loader's ``type_hint_rules``.
    :raise pytablereader.InvalidFilePathError:
        If ``file_path`` is an invalid file path.
    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the file.

    .. py:method:: load

        Loading table data from a file as ``format_name`` format.
        Automatically detect file format if ``format_name`` is |None|.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::
            * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_path`
    """

    def __init__(self, file_path, format_name=None, encoding=None, type_hint_rules=None):
        loader_factory = TableFileLoaderFactory(file_path, encoding=encoding)

        # an explicit format name wins; otherwise the factory picks a loader
        # from the file path
        if typepy.is_not_null_string(format_name):
            loader = loader_factory.create_from_format_name(format_name)
        else:
            loader = loader_factory.create_from_path()

        loader.type_hint_rules = type_hint_rules

        super().__init__(loader)

    @classmethod
    def get_format_names(cls):
        """
        :return:
            Available format names. These names can use by
            :py:class:`.TableFileLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> from pytablereader import TableFileLoader
                >>> for format_name in TableFileLoader.get_format_names():
                ...     print(format_name)
                ...
                csv
                excel
                html
                json
                json_lines
                jsonl
                ldjson
                ltsv
                markdown
                mediawiki
                ndjson
                sqlite
                ssv
                tsv
        """

        # the path is a placeholder; only the factory's format table is used
        return TableFileLoaderFactory("dummy").get_format_names()

    @classmethod
    def get_format_name_list(cls):
        """
        Deprecated alias of :py:meth:`.get_format_names`.
        """

        # stacklevel=2 attributes the warning to the caller of this method
        # rather than to this module
        warnings.warn(
            "'get_format_name_list' has moved to 'get_format_names'",
            DeprecationWarning,
            stacklevel=2,
        )
        return cls.get_format_names()
91 | 


--------------------------------------------------------------------------------
/pytablereader/loadermanager/_text.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | 
 6 | from typing import Optional, Sequence
 7 | 
 8 | import typepy
 9 | 
10 | from ..factory import TableTextLoaderFactory
11 | from ._base import TableLoaderManager
12 | 
13 | 
class TableTextLoader(TableLoaderManager):
    """
    Loader class to loading tables from text.

    :param str source: Text to load.
    :param str format_name: Data format name to load.
        Supported formats can be get by :py:meth:`.get_format_names`
    :param str encoding: Encoding passed to the loader factory.
    :param type_hint_rules: Assigned to the created loader's ``type_hint_rules``.

    :raises ValueError:
        If ``format_name`` is a null string.
    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the text.

    .. py:method:: load

        Load tables from text as ``format_name`` format.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::
            * :py:meth:`pytablereader.factory.TableTextLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableTextLoaderFactory.create_from_path`
    """

    def __init__(
        self, source: str, format_name: str, encoding: Optional[str] = None, type_hint_rules=None
    ) -> None:
        loader_factory = TableTextLoaderFactory(source, encoding)

        # text carries no file extension to detect the format from,
        # so the format name is mandatory
        if typepy.is_null_string(format_name):
            raise ValueError("require format_name")

        loader = loader_factory.create_from_format_name(format_name)
        loader.type_hint_rules = type_hint_rules

        super().__init__(loader)

    @classmethod
    def get_format_names(cls) -> Sequence[str]:
        """
        :return:
            Available format names. These names can use by
            :py:class:`.TableTextLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> from pytablereader import TableTextLoader
                >>> for format_name in TableTextLoader.get_format_names():
                ...     print(format_name)
                ...
                csv
                excel
                html
                json
                json_lines
                jsonl
                ldjson
                ltsv
                markdown
                mediawiki
                ndjson
                sqlite
                ssv
                tsv
        """

        # the source is a placeholder; only the factory's format table is used
        return TableTextLoaderFactory("dummy").get_format_names()
86 | 


--------------------------------------------------------------------------------
/pytablereader/loadermanager/_url.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import warnings
 6 | 
 7 | import typepy
 8 | 
 9 | from ..factory import TableUrlLoaderFactory
10 | from ._base import TableLoaderManager
11 | 
12 | 
class TableUrlLoader(TableLoaderManager):
    """
    Loader class to loading tables from URL.

    :param str url: URL to load.
    :param str format_name: Data format name to load.
        Supported formats can be get by :py:meth:`.get_format_names`.
        If the value is |None|, automatically detect file format from
        the ``url``.
    :param str encoding: Encoding passed to the loader factory.
    :param type_hint_rules: Assigned to the created loader's ``type_hint_rules``.
    :param dict proxies: http/https proxy information.

        .. seealso::
            `requests proxies <http://docs.python-requests.org/en/master/user/advanced/#proxies>`__

    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the URL.
    :raises pytablereader.HTTPError:
        If loader received an HTTP error when access to the URL.

    :Example:
        :ref:`example-url-table-loader`

    .. py:method:: load

        Load tables from URL as ``format_name`` format.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::
            * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_path`
    """

    def __init__(self, url, format_name=None, encoding=None, type_hint_rules=None, proxies=None):
        loader_factory = TableUrlLoaderFactory(url, encoding, proxies)

        # an explicit format name wins; otherwise the factory picks a loader
        # from the URL
        if typepy.is_not_null_string(format_name):
            loader = loader_factory.create_from_format_name(format_name)
        else:
            loader = loader_factory.create_from_path()

        loader.type_hint_rules = type_hint_rules

        super().__init__(loader)

    @classmethod
    def get_format_names(cls):
        """
        :return:
            Available format names. These names can use by
            :py:class:`.TableUrlLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> from pytablereader import TableUrlLoader
                >>> for format_name in TableUrlLoader.get_format_names():
                ...     print(format_name)
                ...
                csv
                excel
                html
                json
                json_lines
                jsonl
                ldjson
                ltsv
                markdown
                mediawiki
                ndjson
                sqlite
                ssv
                tsv
        """

        # the URL is a placeholder; only the factory's format table is used
        return TableUrlLoaderFactory("http://dummy.com/").get_format_names()

    @classmethod
    def get_format_name_list(cls):
        """
        Deprecated alias of :py:meth:`.get_format_names`.
        """

        # stacklevel=2 attributes the warning to the caller of this method
        # rather than to this module
        warnings.warn(
            "'get_format_name_list' has moved to 'get_format_names'",
            DeprecationWarning,
            stacklevel=2,
        )
        return cls.get_format_names()
98 | 


--------------------------------------------------------------------------------
/pytablereader/ltsv/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/ltsv/__init__.py


--------------------------------------------------------------------------------
/pytablereader/ltsv/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import pathvalidate as pv
  6 | import typepy
  7 | 
  8 | from pytablereader import DataError, InvalidHeaderNameError
  9 | 
 10 | from .._common import get_file_encoding
 11 | from .._constant import TableNameTemplate as tnt
 12 | from .._logger import FileSourceLogger, TextSourceLogger
 13 | from .._validator import FileValidator, TextValidator
 14 | from ..interface import AbstractTableReader
 15 | from ..json.formatter import SingleJsonTableConverterA
 16 | 
 17 | 
 18 | class LtsvTableLoader(AbstractTableReader):
 19 |     """
 20 |     Abstract class of
 21 |     `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
 22 |     format table loaders.
 23 | 
 24 |     .. py:attribute:: encoding
 25 | 
 26 |         Encoding of the LTSV data.
 27 |     """
 28 | 
 29 |     @property
 30 |     def format_name(self):
 31 |         return "ltsv"
 32 | 
 33 |     def __init__(self, source, quoting_flags, type_hints, type_hint_rules=None):
 34 |         super().__init__(source, quoting_flags, type_hints, type_hint_rules)
 35 | 
 36 |         self._ltsv_input_stream = None
 37 | 
 38 |     def _to_data_matrix(self):
 39 |         from collections import OrderedDict
 40 | 
 41 |         data_matrix = []
 42 | 
 43 |         for row_idx, row in enumerate(self._ltsv_input_stream):
 44 |             if typepy.is_empty_sequence(row):
 45 |                 continue
 46 | 
 47 |             ltsv_record = OrderedDict()
 48 |             for col_idx, ltsv_item in enumerate(row.strip().split("\t")):
 49 |                 try:
 50 |                     label, value = ltsv_item.split(":")
 51 |                 except ValueError:
 52 |                     raise DataError(
 53 |                         "invalid ltsv item found: line={}, col={}, item='{}'".format(
 54 |                             row_idx, col_idx, ltsv_item
 55 |                         )
 56 |                     )
 57 | 
 58 |                 label = label.strip('"')
 59 | 
 60 |                 try:
 61 |                     pv.validate_ltsv_label(label)
 62 |                 except pv.ValidationError:
 63 |                     raise InvalidHeaderNameError(
 64 |                         "invalid label found (acceptable chars are [0-9A-Za-z_.-]): "
 65 |                         "line={}, col={}, label='{}'".format(row_idx, col_idx, label)
 66 |                     )
 67 | 
 68 |                 ltsv_record[label] = value
 69 | 
 70 |             data_matrix.append(ltsv_record)
 71 | 
 72 |         # using generator to prepare for future enhancement to support
 73 |         # iterative load.
 74 |         yield data_matrix
 75 | 
 76 | 
 77 | class LtsvTableFileLoader(LtsvTableLoader):
 78 |     """
 79 |     `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
 80 |     format file loader class.
 81 | 
 82 |     :param str file_path: Path to the loading LTSV file.
 83 | 
 84 |     .. py:attribute:: table_name
 85 | 
 86 |         Table name string. Defaults to ``%(filename)s``.
 87 |     """
 88 | 
 89 |     def __init__(self, file_path, quoting_flags=None, type_hints=None, type_hint_rules=None):
 90 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 91 | 
 92 |         self.encoding = None
 93 | 
 94 |         self._validator = FileValidator(file_path)
 95 |         self._logger = FileSourceLogger(self)
 96 | 
 97 |         self.__file = None
 98 | 
 99 |     def load(self):
100 |         """
101 |         Extract tabular data as |TableData| instances from a LTSV file.
102 |         |load_source_desc_file|
103 | 
104 |         :return:
105 |             Loaded table data.
106 |             |load_table_name_desc|
107 | 
108 |             ===================  ========================================
109 |             Format specifier     Value after the replacement
110 |             ===================  ========================================
111 |             ``%(filename)s``     |filename_desc|
112 |             ``%(format_name)s``  ``"ltsv"``
113 |             ``%(format_id)s``    |format_id_desc|
114 |             ``%(global_id)s``    |global_id|
115 |             ===================  ========================================
116 |         :rtype: |TableData| iterator
117 |         :raises pytablereader.InvalidHeaderNameError:
118 |             If an invalid label name is included in the LTSV file.
119 |         :raises pytablereader.DataError:
120 |             If the LTSV data is invalid.
121 |         """
122 | 
123 |         self._validate()
124 |         self._logger.logging_load()
125 |         self.encoding = get_file_encoding(self.source, self.encoding)
126 | 
127 |         self._ltsv_input_stream = open(self.source, encoding=self.encoding)
128 | 
129 |         for data_matrix in self._to_data_matrix():
130 |             formatter = SingleJsonTableConverterA(data_matrix)
131 |             formatter.accept(self)
132 | 
133 |             return formatter.to_table_data()
134 | 
135 |     def _get_default_table_name_template(self):
136 |         return tnt.FILENAME
137 | 
138 | 
class LtsvTableTextLoader(LtsvTableLoader):
    """
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format text loader class.

    :param str text: LTSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        # forward type_hint_rules as well; it used to be accepted but dropped,
        # unlike in LtsvTableFileLoader
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV text object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     ``""``
            ``%(format_name)s``  ``"ltsv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV file.
        :raises pytablereader.DataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()

        self._ltsv_input_stream = self.source.splitlines()

        # _to_data_matrix() yields a single matrix, so the first iteration
        # returns the result
        for data_matrix in self._to_data_matrix():
            formatter = SingleJsonTableConverterA(data_matrix)
            formatter.accept(self)

            return formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Default table name template: format name followed by format id."""

        return f"{tnt.FORMAT_NAME:s}{tnt.FORMAT_ID:s}"
194 | 


--------------------------------------------------------------------------------
/pytablereader/markdown/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/markdown/__init__.py


--------------------------------------------------------------------------------
/pytablereader/markdown/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | from .._common import get_file_encoding
  6 | from .._constant import SourceType
  7 | from .._constant import TableNameTemplate as tnt
  8 | from .._logger import FileSourceLogger, TextSourceLogger
  9 | from .._validator import FileValidator, TextValidator
 10 | from ..interface import AbstractTableReader
 11 | from .formatter import MarkdownTableFormatter
 12 | 
 13 | 
 14 | class MarkdownTableLoader(AbstractTableReader):
 15 |     """
 16 |     The abstract class of Markdown table loaders.
 17 |     """
 18 | 
 19 |     @property
 20 |     def format_name(self):
 21 |         return "markdown"
 22 | 
 23 | 
 24 | class MarkdownTableFileLoader(MarkdownTableLoader):
 25 |     """
 26 |     A file loader class to extract tabular data from Markdown files.
 27 | 
 28 |     :param str file_path: Path to the loading Markdown file.
 29 | 
 30 |     .. py:attribute:: table_name
 31 | 
 32 |         Table name string. Defaults to ``%(filename)s_%(key)s``.
 33 |     """
 34 | 
 35 |     def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 36 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 37 | 
 38 |         self.encoding = None
 39 | 
 40 |         self._validator = FileValidator(file_path)
 41 |         self._logger = FileSourceLogger(self)
 42 | 
 43 |     def load(self):
 44 |         """
 45 |         Extract tabular data as |TableData| instances from a Markdown file.
 46 |         |load_source_desc_file|
 47 | 
 48 |         :return:
 49 |             Loaded table data iterator.
 50 |             |load_table_name_desc|
 51 | 
 52 |             ===================  ==============================================
 53 |             Format specifier     Value after the replacement
 54 |             ===================  ==============================================
 55 |             ``%(filename)s``     |filename_desc|
 56 |             ``%(key)s``          ``%(format_name)s%(format_id)s``
 57 |             ``%(format_name)s``  ``"markdown"``
 58 |             ``%(format_id)s``    |format_id_desc|
 59 |             ``%(global_id)s``    |global_id|
 60 |             ===================  ==============================================
 61 |         :rtype: |TableData| iterator
 62 |         :raises pytablereader.DataError:
 63 |             If the Markdown data is invalid or empty.
 64 |         """
 65 | 
 66 |         self._validate()
 67 |         self._logger.logging_load()
 68 |         self.encoding = get_file_encoding(self.source, self.encoding)
 69 | 
 70 |         with open(self.source, encoding=self.encoding) as fp:
 71 |             formatter = MarkdownTableFormatter(fp.read(), self._logger)
 72 |         formatter.accept(self)
 73 | 
 74 |         return formatter.to_table_data()
 75 | 
 76 |     def _get_default_table_name_template(self):
 77 |         return f"{tnt.FILENAME:s}_{tnt.KEY:s}"
 78 | 
 79 | 
 80 | class MarkdownTableTextLoader(MarkdownTableLoader):
 81 |     """
 82 |     A text loader class to extract tabular data from Markdown text data.
 83 | 
 84 |     :param str text: Markdown text to load.
 85 | 
 86 |     .. py:attribute:: table_name
 87 | 
 88 |         Table name string. Defaults to ``%(key)s``.
 89 |     """
 90 | 
 91 |     @property
 92 |     def source_type(self):
 93 |         return SourceType.TEXT
 94 | 
 95 |     def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 96 |         super().__init__(text, quoting_flags, type_hints)
 97 | 
 98 |         self._validator = TextValidator(text)
 99 |         self._logger = TextSourceLogger(self)
100 | 
101 |     def load(self):
102 |         """
103 |         Extract tabular data as |TableData| instances from a Markdown text
104 |         object.
105 |         |load_source_desc_text|
106 | 
107 |         :return:
108 |             Loaded table data iterator.
109 |             |load_table_name_desc|
110 | 
111 |             ===================  ==============================================
112 |             Format specifier     Value after the replacement
113 |             ===================  ==============================================
114 |             ``%(filename)s``     ``""``
115 |             ``%(key)s``          ``%(format_name)s%(format_id)s``
116 |             ``%(format_name)s``  ``"markdown"``
117 |             ``%(format_id)s``    |format_id_desc|
118 |             ``%(global_id)s``    |global_id|
119 |             ===================  ==============================================
120 |         :rtype: |TableData| iterator
121 |         :raises pytablereader.DataError:
122 |             If the Markdown data is invalid or empty.
123 |         """
124 | 
125 |         self._validate()
126 |         self._logger.logging_load()
127 | 
128 |         formatter = MarkdownTableFormatter(self.source, self._logger)
129 |         formatter.accept(self)
130 | 
131 |         return formatter.to_table_data()
132 | 
133 |     def _get_default_table_name_template(self):
134 |         return f"{tnt.KEY:s}"
135 | 


--------------------------------------------------------------------------------
/pytablereader/markdown/formatter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import typepy
 6 | 
 7 | from pytablereader import DataError
 8 | 
 9 | from ..html.formatter import HtmlTableFormatter
10 | 
11 | 
class MarkdownTableFormatter(HtmlTableFormatter):
    """
    Table formatter for Markdown text.

    The Markdown source is rendered to HTML with the ``markdown`` package
    (tables extension enabled) and the result is handed to
    :py:class:`HtmlTableFormatter`.

    :param str source_data: Markdown text to convert.
    :param logger: Optional source logger passed through to the HTML formatter.
    :raises pytablereader.DataError: If ``source_data`` is a null string.
    """

    def __init__(self, source_data, logger=None):
        # Imported lazily so that the package is only required when a
        # Markdown source is actually loaded.
        import markdown

        if typepy.is_null_string(source_data):
            # Include a message so that the failure is diagnosable.
            raise DataError("Markdown source data is empty")

        super().__init__(
            markdown.markdown(source_data, extensions=["markdown.extensions.tables"]), logger=logger
        )
22 | 


--------------------------------------------------------------------------------
/pytablereader/mediawiki/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/mediawiki/__init__.py


--------------------------------------------------------------------------------
/pytablereader/mediawiki/core.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | from .._common import get_file_encoding
  6 | from .._constant import SourceType
  7 | from .._constant import TableNameTemplate as tnt
  8 | from .._logger import FileSourceLogger, TextSourceLogger
  9 | from .._validator import FileValidator, TextValidator
 10 | from ..interface import AbstractTableReader
 11 | from .formatter import MediaWikiTableFormatter
 12 | 
 13 | 
 14 | class MediaWikiTableLoader(AbstractTableReader):
 15 |     """
 16 |     The abstract class of MediaWiki table loaders.
 17 |     """
 18 | 
 19 |     @property
 20 |     def format_name(self):
 21 |         return "mediawiki"
 22 | 
 23 | 
 24 | class MediaWikiTableFileLoader(MediaWikiTableLoader):
 25 |     """
 26 |     A file loader class to extract tabular data from MediaWiki files.
 27 | 
 28 |     :param str file_path: Path to the loading file.
 29 | 
 30 |     .. py:attribute:: table_name
 31 | 
 32 |         Table name string. Defaults to ``%(filename)s_%(key)s``.
 33 |     """
 34 | 
 35 |     def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 36 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 37 | 
 38 |         self.encoding = None
 39 | 
 40 |         self._validator = FileValidator(file_path)
 41 |         self._logger = FileSourceLogger(self)
 42 | 
 43 |     def load(self):
 44 |         """
 45 |         Extract tabular data as |TableData| instances from a MediaWiki file.
 46 |         |load_source_desc_file|
 47 | 
 48 |         :return:
 49 |             Loaded table data iterator.
 50 |             |load_table_name_desc|
 51 | 
 52 |             ===================  ==============================================
 53 |             Format specifier     Value after the replacement
 54 |             ===================  ==============================================
 55 |             ``%(filename)s``     |filename_desc|
 56 |             ``%(key)s``          | This replaced to:
 57 |                                  | **(1)** ``caption`` mark of the table
 58 |                                  | **(2)** ``%(format_name)s%(format_id)s``
 59 |                                  | if ``caption`` mark not included
 60 |                                  | in the table.
 61 |             ``%(format_name)s``  ``"mediawiki"``
 62 |             ``%(format_id)s``    |format_id_desc|
 63 |             ``%(global_id)s``    |global_id|
 64 |             ===================  ==============================================
 65 |         :rtype: |TableData| iterator
 66 |         :raises pytablereader.DataError:
 67 |             If the MediaWiki data is invalid or empty.
 68 |         """
 69 | 
 70 |         self._validate()
 71 |         self._logger.logging_load()
 72 |         self.encoding = get_file_encoding(self.source, self.encoding)
 73 | 
 74 |         with open(self.source, encoding=self.encoding) as fp:
 75 |             formatter = MediaWikiTableFormatter(fp.read())
 76 |         formatter.accept(self)
 77 | 
 78 |         return formatter.to_table_data()
 79 | 
 80 |     def _get_default_table_name_template(self):
 81 |         return f"{tnt.FILENAME:s}_{tnt.KEY:s}"
 82 | 
 83 | 
 84 | class MediaWikiTableTextLoader(MediaWikiTableLoader):
 85 |     """
 86 |     A text loader class to extract tabular data from MediaWiki text data.
 87 | 
 88 |     :param str text: MediaWiki text to load.
 89 | 
 90 |     .. py:attribute:: table_name
 91 | 
 92 |         Table name string. Defaults to ``%(key)s``.
 93 |     """
 94 | 
 95 |     @property
 96 |     def source_type(self):
 97 |         return SourceType.TEXT
 98 | 
 99 |     def __init__(self, text=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
100 |         super().__init__(text, quoting_flags, type_hints)
101 | 
102 |         self._validator = TextValidator(text)
103 |         self._logger = TextSourceLogger(self)
104 | 
105 |     def load(self):
106 |         """
107 |         Extract tabular data as |TableData| instances from a MediaWiki text
108 |         object.
109 |         |load_source_desc_text|
110 | 
111 |         :return:
112 |             Loaded table data iterator.
113 |             |load_table_name_desc|
114 | 
115 |             ===================  ==============================================
116 |             Format specifier     Value after the replacement
117 |             ===================  ==============================================
118 |             ``%(filename)s``     ``""``
119 |             ``%(key)s``          | This replaced to:
120 |                                  | **(1)** ``caption`` mark of the table
121 |                                  | **(2)** ``%(format_name)s%(format_id)s``
122 |                                  | if ``caption`` mark not included
123 |                                  | in the table.
124 |             ``%(format_name)s``  ``"mediawiki"``
125 |             ``%(format_id)s``    |format_id_desc|
126 |             ``%(global_id)s``    |global_id|
127 |             ===================  ==============================================
128 |         :rtype: |TableData| iterator
129 |         :raises pytablereader.DataError:
130 |             If the MediaWiki data is invalid or empty.
131 |         """
132 | 
133 |         self._validate()
134 |         self._logger.logging_load()
135 | 
136 |         formatter = MediaWikiTableFormatter(self.source)
137 |         formatter.accept(self)
138 | 
139 |         return formatter.to_table_data()
140 | 
141 |     def _get_default_table_name_template(self):
142 |         return f"{tnt.KEY:s}"
143 | 


--------------------------------------------------------------------------------
/pytablereader/mediawiki/formatter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from ..error import PypandocImportError
 6 | from ..html.formatter import HtmlTableFormatter
 7 | 
 8 | 
 9 | class MediaWikiTableFormatter(HtmlTableFormatter):
10 |     def __init__(self, source_data):
11 |         try:
12 |             import pypandoc
13 |         except ImportError as e:
14 |             # pypandoc package may do not installed in the system since the package is
15 |             # an optional dependency
16 |             raise PypandocImportError(e)
17 | 
18 |         super().__init__(pypandoc.convert_text(source_data, "html", format="mediawiki"))
19 | 


--------------------------------------------------------------------------------
/pytablereader/spreadsheet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/spreadsheet/__init__.py


--------------------------------------------------------------------------------
/pytablereader/spreadsheet/core.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import abc
 6 | 
 7 | from .._constant import TableNameTemplate as tnt
 8 | from ..interface import AbstractTableReader
 9 | 
10 | 
class SpreadSheetLoader(AbstractTableReader):
    """
    An abstract class of table data loaders for spreadsheets,
    i.e. sources that consist of one or more sheets of rows.

    .. py:attribute:: start_row

        The first row to search header row.
    """

    def __init__(self, source, quoting_flags, type_hints, type_hint_rules):
        super().__init__(source, quoting_flags, type_hints, type_hint_rules)

        self.start_row = 0
        # The sheet currently being processed; set by subclasses while loading.
        self._worksheet = None
        # Inclusive column index range that holds data; set by subclasses.
        self._start_col_idx = None
        self._end_col_idx = None

    # ``abc.abstractproperty`` is deprecated; stacking ``property`` on top of
    # ``abc.abstractmethod`` is the supported equivalent.
    @property
    @abc.abstractmethod
    def _sheet_name(self):  # pragma: no cover
        """Name of the sheet currently being processed."""

    @property
    @abc.abstractmethod
    def _row_count(self):  # pragma: no cover
        """Number of rows in the sheet currently being processed."""

    @property
    @abc.abstractmethod
    def _col_count(self):  # pragma: no cover
        """Number of columns in the sheet currently being processed."""

    @abc.abstractmethod
    def _is_empty_sheet(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_start_row_idx(self):  # pragma: no cover
        pass

    @property
    def format_name(self):
        return "spreadsheet"

    def _make_table_name(self):
        """
        Build the table name for the current sheet by expanding the table
        name template with the basic key/value mapping plus the sheet name.
        """

        kv_mapping = self._get_basic_tablename_keyvalue_mapping()

        try:
            kv_mapping[tnt.SHEET] = self._sheet_name
        except AttributeError:
            # No worksheet is selected yet (``_worksheet`` is still None).
            kv_mapping[tnt.SHEET] = ""

        return self._expand_table_name_format(kv_mapping)

    def _get_default_table_name_template(self):
        return f"{tnt.SHEET:s}"

    def _extract_type_hints(self, headers=None):
        """
        Determine per-column type hints.

        Explicit ``type_hints`` take precedence. Otherwise, for each header
        the first matching pattern in ``type_hint_rules`` decides the hint;
        headers without a match get |None|.

        :param headers: Header names of the table.
        :return: List of type hints (empty if neither hints nor rules apply).
        """

        if self.type_hints:
            return self.type_hints

        if not self.type_hint_rules or not headers:
            return []

        type_hints = []
        for header in headers:
            for regexp, type_hint in self.type_hint_rules.items():
                if regexp.search(header):
                    type_hints.append(type_hint)
                    break
            else:
                # No rule matched this header.
                type_hints.append(None)

        return type_hints
83 | 


--------------------------------------------------------------------------------
/pytablereader/spreadsheet/excelloader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | from tabledata import TableData
  6 | 
  7 | from pytablereader import DataError
  8 | 
  9 | from .._logger import FileSourceLogger
 10 | from .._validator import FileValidator
 11 | from ..error import OpenError
 12 | from .core import SpreadSheetLoader
 13 | 
 14 | 
 15 | class ExcelTableFileLoader(SpreadSheetLoader):
 16 |     """
 17 |     A file loader class to extract tabular data from Microsoft Excel |TM|
 18 |     files.
 19 | 
 20 |     :param str file_path: Path to the loading Excel workbook file.
 21 | 
 22 |     .. py:attribute:: table_name
 23 | 
 24 |         Table name string. Defaults to ``%(sheet)s``.
 25 | 
 26 |     .. py:attribute:: start_row
 27 | 
 28 |         The first row to search header row.
 29 |     """
 30 | 
 31 |     @property
 32 |     def format_name(self):
 33 |         return "excel"
 34 | 
 35 |     @property
 36 |     def _sheet_name(self):
 37 |         return self._worksheet.name
 38 | 
 39 |     @property
 40 |     def _row_count(self):
 41 |         return self._worksheet.nrows
 42 | 
 43 |     @property
 44 |     def _col_count(self):
 45 |         return self._worksheet.ncols
 46 | 
 47 |     def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 48 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 49 | 
 50 |         self._validator = FileValidator(file_path)
 51 |         self._logger = FileSourceLogger(self)
 52 | 
 53 |     def load(self):
 54 |         """
 55 |         Extract tabular data as |TableData| instances from an Excel file.
 56 |         |spreadsheet_load_desc|
 57 | 
 58 |         :return:
 59 |             Loaded |TableData| iterator.
 60 |             |TableData| created for each sheet in the workbook.
 61 |             |load_table_name_desc|
 62 | 
 63 |             ===================  ====================================
 64 |             Format specifier     Value after the replacement
 65 |             ===================  ====================================
 66 |             ``%(filename)s``     Filename of the workbook
 67 |             ``%(sheet)s``        Name of the sheet
 68 |             ``%(format_name)s``  ``"spreadsheet"``
 69 |             ``%(format_id)s``    |format_id_desc|
 70 |             ``%(global_id)s``    |global_id|
 71 |             ===================  ====================================
 72 |         :rtype: |TableData| iterator
 73 |         :raises pytablereader.DataError:
 74 |             If the header row is not found.
 75 |         :raises pytablereader.error.OpenError:
 76 |             If failed to open the source file.
 77 |         """
 78 | 
 79 |         try:
 80 |             import excelrd as xlrd
 81 |         except ImportError:
 82 |             import xlrd
 83 | 
 84 |         self._validate()
 85 |         self._logger.logging_load()
 86 | 
 87 |         try:
 88 |             workbook = xlrd.open_workbook(self.source)
 89 |         except xlrd.biffh.XLRDError as e:
 90 |             raise OpenError(e)
 91 | 
 92 |         for worksheet in workbook.sheets():
 93 |             self._worksheet = worksheet
 94 | 
 95 |             if self._is_empty_sheet():
 96 |                 continue
 97 | 
 98 |             self.__extract_not_empty_col_idx()
 99 | 
100 |             try:
101 |                 start_row_idx = self._get_start_row_idx()
102 |             except DataError:
103 |                 continue
104 | 
105 |             rows = [
106 |                 self.__get_row_values(row_idx)
107 |                 for row_idx in range(start_row_idx + 1, self._row_count)
108 |             ]
109 | 
110 |             self.inc_table_count()
111 |             headers = self.__get_row_values(start_row_idx)
112 | 
113 |             yield TableData(
114 |                 self._make_table_name(),
115 |                 headers,
116 |                 rows,
117 |                 dp_extractor=self.dp_extractor,
118 |                 type_hints=self._extract_type_hints(headers),
119 |             )
120 | 
121 |     def _is_empty_sheet(self):
122 |         return any(
123 |             [
124 |                 self._col_count == 0,
125 |                 self._row_count <= 1,
126 |                 # nrows == 1 means exists header row only
127 |             ]
128 |         )
129 | 
130 |     def _get_start_row_idx(self):
131 |         for row_idx in range(self.start_row, self._row_count):
132 |             if self.__is_header_row(row_idx):
133 |                 break
134 |         else:
135 |             raise DataError("header row not found")
136 | 
137 |         return row_idx
138 | 
139 |     def __is_header_row(self, row_idx):
140 |         try:
141 |             from excelrd import XL_CELL_EMPTY
142 |         except ImportError:
143 |             from xlrd import XL_CELL_EMPTY
144 | 
145 |         return XL_CELL_EMPTY not in self._worksheet.row_types(
146 |             row_idx, self._start_col_idx, self._end_col_idx + 1
147 |         )
148 | 
149 |     @staticmethod
150 |     def __is_empty_cell_types(cell_types):
151 |         try:
152 |             from excelrd import XL_CELL_EMPTY
153 |         except ImportError:
154 |             from xlrd import XL_CELL_EMPTY
155 | 
156 |         return all([cell_type == XL_CELL_EMPTY for cell_type in cell_types])
157 | 
158 |     def __extract_not_empty_col_idx(self):
159 |         col_idx_list = [
160 |             col_idx
161 |             for col_idx in range(self._col_count)
162 |             if not self.__is_empty_cell_types(self._worksheet.col_types(col_idx))
163 |         ]
164 | 
165 |         self._start_col_idx = min(col_idx_list)
166 |         self._end_col_idx = max(col_idx_list)
167 | 
168 |     def __get_row_values(self, row_idx):
169 |         return self._worksheet.row_values(row_idx, self._start_col_idx, self._end_col_idx + 1)
170 | 


--------------------------------------------------------------------------------
/pytablereader/spreadsheet/gsloader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import typepy
  6 | from tabledata import TableData
  7 | 
  8 | from .._constant import TableNameTemplate as tnt
  9 | from .._validator import TextValidator
 10 | from ..error import APIError, OpenError
 11 | from .core import SpreadSheetLoader
 12 | 
 13 | 
 14 | class GoogleSheetsTableLoader(SpreadSheetLoader):
 15 |     """
 16 |     Concrete class of Google Spreadsheet loader.
 17 | 
 18 |     .. py:attribute:: table_name
 19 | 
 20 |         Table name string. Defaults to ``%(sheet)s``.
 21 | 
 22 |     :param str file_path: Path to the Google Sheets credential JSON file.
 23 | 
 24 |     :Dependency Packages:
 25 |         - `gspread <https://github.com/burnash/gspread>`_
 26 |         - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`_
 27 |         - `oauth2client <https://pypi.org/project/oauth2client>`_
 28 |         - `pyOpenSSL <https://pypi.org/project/pyOpenSSL>`_
 29 | 
 30 |     :Examples:
 31 |         :ref:`example-gs-table-loader`
 32 |     """
 33 | 
 34 |     @property
 35 |     def _sheet_name(self):
 36 |         return self._worksheet.title
 37 | 
 38 |     @property
 39 |     def _row_count(self):
 40 |         return self._worksheet.row_count
 41 | 
 42 |     @property
 43 |     def _col_count(self):
 44 |         return self._worksheet.col_count
 45 | 
 46 |     def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
 47 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
 48 | 
 49 |         self.title = None
 50 |         self.start_row = 0
 51 | 
 52 |         self._validator = TextValidator(file_path)
 53 | 
 54 |         self.__all_values = None
 55 | 
 56 |     def load(self):
 57 |         """
 58 |         Load table data from a Google Spreadsheet.
 59 | 
 60 |         This method consider :py:attr:`.source` as a path to the
 61 |         credential JSON file to access Google Sheets API.
 62 | 
 63 |         The method automatically search the header row start from
 64 |         :py:attr:`.start_row`. The condition of the header row is that
 65 |         all of the columns have value (except empty columns).
 66 | 
 67 |         :return:
 68 |             Loaded table data. Return one |TableData| for each sheet in
 69 |             the workbook. The table name for data will be determined by
 70 |             :py:meth:`~.GoogleSheetsTableLoader.make_table_name`.
 71 |         :rtype: iterator of |TableData|
 72 |         :raises pytablereader.DataError:
 73 |             If the header row is not found.
 74 |         :raises pytablereader.OpenError:
 75 |             If the spread sheet not found.
 76 |         """
 77 | 
 78 |         import gspread
 79 |         from oauth2client.service_account import ServiceAccountCredentials
 80 | 
 81 |         self._validate_table_name()
 82 |         self._validate_title()
 83 | 
 84 |         scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
 85 |         credentials = ServiceAccountCredentials.from_json_keyfile_name(self.source, scope)
 86 | 
 87 |         gc = gspread.authorize(credentials)
 88 |         try:
 89 |             for worksheet in gc.open(self.title).worksheets():
 90 |                 self._worksheet = worksheet
 91 |                 self.__all_values = [row for row in worksheet.get_all_values()]
 92 | 
 93 |                 if self._is_empty_sheet():
 94 |                     continue
 95 | 
 96 |                 try:
 97 |                     self.__strip_empty_col()
 98 |                 except ValueError:
 99 |                     continue
100 | 
101 |                 value_matrix = self.__all_values[self._get_start_row_idx() :]
102 |                 try:
103 |                     headers = value_matrix[0]
104 |                     rows = value_matrix[1:]
105 |                 except IndexError:
106 |                     continue
107 | 
108 |                 self.inc_table_count()
109 | 
110 |                 yield TableData(
111 |                     self.make_table_name(),
112 |                     headers,
113 |                     rows,
114 |                     dp_extractor=self.dp_extractor,
115 |                     type_hints=self._extract_type_hints(headers),
116 |                 )
117 |         except gspread.exceptions.SpreadsheetNotFound:
118 |             raise OpenError(f"spreadsheet '{self.title}' not found")
119 |         except gspread.exceptions.APIError as e:
120 |             raise APIError(e)
121 | 
122 |     def _is_empty_sheet(self):
123 |         return len(self.__all_values) <= 1
124 | 
125 |     def _get_start_row_idx(self):
126 |         row_idx = 0
127 |         for row_values in self.__all_values:
128 |             if all([typepy.is_not_null_string(value) for value in row_values]):
129 |                 break
130 | 
131 |             row_idx += 1
132 | 
133 |         return self.start_row + row_idx
134 | 
135 |     def _validate_title(self):
136 |         if typepy.is_null_string(self.title):
137 |             raise ValueError("spreadsheet title is empty")
138 | 
139 |     def _make_table_name(self):
140 |         self._validate_title()
141 | 
142 |         kv_mapping = self._get_basic_tablename_keyvalue_mapping()
143 |         kv_mapping[tnt.TITLE] = self.title
144 |         try:
145 |             kv_mapping[tnt.SHEET] = self._sheet_name
146 |         except AttributeError:
147 |             kv_mapping[tnt.SHEET] = ""
148 | 
149 |         return self._expand_table_name_format(kv_mapping)
150 | 
151 |     def __strip_empty_col(self):
152 |         from simplesqlite import connect_memdb
153 |         from simplesqlite.query import Attr, AttrList
154 | 
155 |         con = connect_memdb()
156 | 
157 |         tmp_table_name = "tmp"
158 |         headers = [f"a{i:d}" for i in range(len(self.__all_values[0]))]
159 |         con.create_table_from_data_matrix(tmp_table_name, headers, self.__all_values)
160 |         for col_idx, header in enumerate(headers):
161 |             result = con.select(select=Attr(header), table_name=tmp_table_name)
162 |             if any([typepy.is_not_null_string(record[0]) for record in result.fetchall()]):
163 |                 break
164 | 
165 |         strip_headers = headers[col_idx:]
166 |         if typepy.is_empty_sequence(strip_headers):
167 |             raise ValueError()
168 | 
169 |         result = con.select(select=AttrList(strip_headers), table_name=tmp_table_name)
170 |         self.__all_values = result.fetchall()
171 | 


--------------------------------------------------------------------------------
/pytablereader/sqlite/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/sqlite/__init__.py


--------------------------------------------------------------------------------
/pytablereader/sqlite/core.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from .._constant import TableNameTemplate as tnt
 6 | from .._validator import FileValidator
 7 | from ..interface import AbstractTableReader
 8 | from .formatter import SqliteTableFormatter
 9 | 
10 | 
class SqliteFileLoader(AbstractTableReader):
    """
    File loader that extracts tabular data from SQLite database files.

    :param str file_path: Path to the loading SQLite database file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.

    :Dependency Packages:
        - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`__
    """

    @property
    def format_name(self):
        return "sqlite"

    def __init__(self, file_path=None, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)

        self._validator = FileValidator(file_path)

    def load(self):
        """
        Load a SQLite database file and extract its tables as |TableData|
        instances. |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(key)s``          ``%(format_name)s%(format_id)s``
            ``%(format_name)s``  ``"sqlite"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.DataError:
            If the SQLite database file data is invalid or empty.
        """

        self._validate()

        table_formatter = SqliteTableFormatter(self.source)
        table_formatter.accept(self)
        table_data_iter = table_formatter.to_table_data()

        return table_data_iter

    def _get_default_table_name_template(self):
        # Format name specifier immediately followed by the format ID specifier.
        return "{:s}{:s}".format(tnt.FORMAT_NAME, tnt.FORMAT_ID)
66 | 


--------------------------------------------------------------------------------
/pytablereader/sqlite/formatter.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import typepy
 6 | from tabledata import TableData
 7 | 
 8 | from pytablereader import DataError
 9 | 
10 | from .._constant import TableNameTemplate as tnt
11 | from ..formatter import TableFormatter
12 | 
13 | 
class SqliteTableFormatter(TableFormatter):
    """
    Table formatter that reads every table of a SQLite database.

    :param str source_data: Source passed to ``SimpleSQLite`` when loading.
    :raises pytablereader.DataError: If ``source_data`` is a null string.
    """

    def __init__(self, source_data):
        super().__init__(source_data)

        # Name of the table currently being converted.
        self.__table_name = None

        if typepy.is_null_string(source_data):
            raise DataError("source data is empty")

    def to_table_data(self):
        """
        Yield one |TableData| per table in the database.

        :rtype: |TableData| iterator
        """

        from simplesqlite import SimpleSQLite
        from simplesqlite.query import AttrList

        con = SimpleSQLite(self._source_data, "r")

        try:
            for table in con.fetch_table_names():
                self.__table_name = table

                attr_names = con.fetch_attr_names(table)
                # All records are fetched before yielding, so the yielded
                # data does not depend on the connection staying open.
                data_matrix = con.select(select=AttrList(attr_names), table_name=table).fetchall()

                yield TableData(
                    table,
                    attr_names,
                    data_matrix,
                    dp_extractor=self._loader.dp_extractor,
                    type_hints=self._extract_type_hints(attr_names),
                )
        finally:
            # Close the database once iteration finishes or is abandoned.
            con.close()

    def _make_table_name(self):
        # The basic key/value container is used as a mapping by the
        # spreadsheet loaders (item assignment), so add the key the same way
        # rather than concatenating a list to it.
        kv_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.KEY] = self.__table_name

        return self._loader._expand_table_name_format(kv_mapping)
47 | 


--------------------------------------------------------------------------------
/pytablereader/tsv/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/pytablereader/tsv/__init__.py


--------------------------------------------------------------------------------
/pytablereader/tsv/core.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from .._validator import FileValidator, TextValidator
 6 | from ..csv.core import CsvTableFileLoader, CsvTableTextLoader
 7 | 
 8 | 
 9 | class TsvTableFileLoader(CsvTableFileLoader):
10 |     """
11 |     Tab separated values (TSV) format file loader class.
12 | 
13 |     :param str file_path: Path to the loading TSV file.
14 | 
15 |     .. py:attribute:: table_name
16 | 
17 |         Table name string. Defaults to ``%(filename)s``.
18 |     """
19 | 
20 |     @property
21 |     def format_name(self):
22 |         return "tsv"
23 | 
24 |     def __init__(self, file_path, quoting_flags=None, type_hints=None, type_hint_rules=None):
25 |         super().__init__(file_path, quoting_flags, type_hints, type_hint_rules)
26 | 
27 |         self.delimiter = "\t"
28 | 
29 |         self._validator = FileValidator(file_path)
30 | 
31 | 
class TsvTableTextLoader(CsvTableTextLoader):
    """
    Loader class for text in tab separated values (TSV) format.

    :param str text: TSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    def __init__(self, text, quoting_flags=None, type_hints=None, type_hint_rules=None):
        super().__init__(text, quoting_flags, type_hints, type_hint_rules)

        # Fields are separated by a tab character.
        self.delimiter = "\t"

        self._validator = TextValidator(text)

    @property
    def format_name(self):
        # Format identifier reported by this loader.
        return "tsv"
53 | 


--------------------------------------------------------------------------------
/requirements/docs_requirements.txt:
--------------------------------------------------------------------------------
1 | path>=13
2 | pytablereader
3 | sphinx_rtd_theme>=1.2.2
4 | Sphinx>=2.4.1
5 | 


--------------------------------------------------------------------------------
/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.5.3,<5
2 | DataProperty>=0.54.2,<2
3 | jsonschema>=2.5.1,<5
4 | mbstrdecoder>=1.0.0,<2
5 | pathvalidate>=2.5.2,<4
6 | path>=13,<17
7 | tabledata>=1.1.1,<2
8 | typepy>=1.2.0,<2
9 | 


--------------------------------------------------------------------------------
/requirements/test_requirements.txt:
--------------------------------------------------------------------------------
1 | pypandoc
2 | pytablewriter[excel]>=0.50
3 | pytest>=5
4 | responses
5 | subprocrunner
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import os.path
  6 | 
  7 | import setuptools
  8 | 
  9 | 
 10 | MODULE_NAME = "pytablereader"
 11 | REPOSITORY_URL = f"https://github.com/thombashi/{MODULE_NAME:s}"
 12 | REQUIREMENT_DIR = "requirements"
 13 | ENCODING = "utf8"
 14 | 
 15 | pkg_info = {}
 16 | 
 17 | 
 18 | def get_release_command_class():
 19 |     try:
 20 |         from releasecmd import ReleaseCommand
 21 |     except ImportError:
 22 |         return {}
 23 | 
 24 |     return {"release": ReleaseCommand}
 25 | 
 26 | 
 27 | with open(os.path.join(MODULE_NAME, "__version__.py")) as f:
 28 |     exec(f.read(), pkg_info)
 29 | 
 30 | with open("README.rst", encoding=ENCODING) as fp:
 31 |     long_description = fp.read()
 32 | 
 33 | with open(os.path.join("docs", "pages", "introduction", "summary.txt"), encoding=ENCODING) as f:
 34 |     summary = f.read().strip()
 35 | 
 36 | with open(os.path.join(REQUIREMENT_DIR, "requirements.txt")) as f:
 37 |     install_requires = [line.strip() for line in f if line.strip()]
 38 | 
 39 | with open(os.path.join(REQUIREMENT_DIR, "test_requirements.txt")) as f:
 40 |     tests_requires = [line.strip() for line in f if line.strip()]
 41 | 
 42 | setuptools_require = ["setuptools>=38.3.0"]
 43 | excel_requires = ["excelrd>=2.0.2"]
 44 | 
 45 | markdown_requires = ["Markdown>=2.6.6,<4"]
 46 | mediawiki_requires = ["pypandoc"]
 47 | sqlite_requires = ["SimpleSQLite>=1.3.2,<2"]
 48 | gs_requires = ["gspread", "oauth2client", "pyOpenSSL"] + sqlite_requires
 49 | logging_requires = ["loguru>=0.4.1,<1"]
 50 | url_requires = ["retryrequests>=0.1,<1"]
 51 | optional_requires = ["simplejson>=3.8.1,<4"]
 52 | tests_requires = frozenset(
 53 |     tests_requires
 54 |     + excel_requires
 55 |     + markdown_requires
 56 |     + mediawiki_requires
 57 |     + sqlite_requires
 58 |     + url_requires
 59 | )
 60 | 
 61 | setuptools.setup(
 62 |     name=MODULE_NAME,
 63 |     version=pkg_info["__version__"],
 64 |     url=REPOSITORY_URL,
 65 |     author=pkg_info["__author__"],
 66 |     author_email=pkg_info["__email__"],
 67 |     description=summary,
 68 |     include_package_data=True,
 69 |     keywords=[
 70 |         "table",
 71 |         "reader",
 72 |         "pandas",
 73 |         "CSV",
 74 |         "Excel",
 75 |         "HTML",
 76 |         "JSON",
 77 |         "LTSV",
 78 |         "Markdown",
 79 |         "MediaWiki",
 80 |         "TSV",
 81 |         "SQLite",
 82 |     ],
 83 |     license=pkg_info["__license__"],
 84 |     long_description=long_description,
 85 |     long_description_content_type="text/x-rst",
 86 |     packages=setuptools.find_packages(exclude=["test*"]),
 87 |     project_urls={
 88 |         "Documentation": f"https://{MODULE_NAME:s}.rtfd.io/",
 89 |         "Source": REPOSITORY_URL,
 90 |         "Tracker": f"{REPOSITORY_URL:s}/issues",
 91 |         "Changlog": f"{REPOSITORY_URL:s}/releases",
 92 |     },
 93 |     python_requires=">=3.7",
 94 |     install_requires=setuptools_require + install_requires,
 95 |     setup_requires=setuptools_require,
 96 |     extras_require={
 97 |         "all": set(
 98 |             excel_requires
 99 |             + gs_requires
100 |             + logging_requires
101 |             + markdown_requires
102 |             + mediawiki_requires
103 |             + sqlite_requires
104 |             + url_requires
105 |         ),
106 |         "excel": excel_requires,
107 |         "gs": gs_requires,
108 |         "logging": logging_requires,
109 |         "md": markdown_requires,
110 |         "mediawiki": mediawiki_requires,
111 |         "url": url_requires,
112 |         "sqlite": sqlite_requires,
113 |         "test": tests_requires,
114 |     },
115 |     classifiers=[
116 |         "Development Status :: 4 - Beta",
117 |         "Intended Audience :: Developers",
118 |         "Intended Audience :: Information Technology",
119 |         "License :: OSI Approved :: MIT License",
120 |         "Operating System :: OS Independent",
121 |         "Programming Language :: Python :: 3",
122 |         "Programming Language :: Python :: 3.7",
123 |         "Programming Language :: Python :: 3.8",
124 |         "Programming Language :: Python :: 3.9",
125 |         "Programming Language :: Python :: 3.10",
126 |         "Programming Language :: Python :: 3.11",
127 |         "Programming Language :: Python :: 3 :: Only",
128 |         "Programming Language :: Python :: Implementation :: CPython",
129 |         "Programming Language :: Python :: Implementation :: PyPy",
130 |         "Topic :: Database",
131 |         "Topic :: Software Development :: Libraries",
132 |         "Topic :: Software Development :: Libraries :: Python Modules",
133 |         "Topic :: Text Processing",
134 |     ],
135 |     cmdclass=get_release_command_class(),
136 | )
137 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/test/__init__.py


--------------------------------------------------------------------------------
/test/_common.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import re
 6 | import sys
 7 | 
 8 | from typepy import Integer, RealNumber, String
 9 | 
10 | 
# Maps compiled, case-insensitive regular expressions to typepy type classes.
# Each pattern matches a string ending in "text", "integer" or "real" that is
# preceded by one character from the class ``[ -_]``.
# NOTE(review): ``[ -_]`` is a character *range* from space to underscore, so
# it also accepts digits, letters and most punctuation -- not only space,
# hyphen and underscore. Confirm this is intended before relying on it.
TYPE_HINT_RULES = {
    re.compile("[ -_]text$", re.IGNORECASE): String,
    re.compile("[ -_]integer$", re.IGNORECASE): Integer,
    re.compile("[ -_]real$", re.IGNORECASE): RealNumber,
}
16 | 
17 | 
def fifo_writer(fifo_name, text):
    """Open ``fifo_name`` for writing in text mode and write ``text`` to it."""
    with open(fifo_name, mode="w") as stream:
        stream.write(text)
21 | 
22 | 
def print_test_result(expected, actual, error=None):
    """Print the expected and actual values; print ``error`` to stderr if truthy."""
    print(f"[expected]\n{expected}\n")
    print(f"[actual]\n{actual}\n")

    if not error:
        return

    print(error, file=sys.stderr)
29 | 


--------------------------------------------------------------------------------
/test/data/valid.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/test/data/valid.sqlite3


--------------------------------------------------------------------------------
/test/data/validdata.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thombashi/pytablereader/b2a6a3db3ef52f5db942340ae75a6905df64a960/test/data/validdata.xlsx


--------------------------------------------------------------------------------
/test/factory/test_file_loader_factory.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import pytest
  6 | 
  7 | import pytablereader as ptr
  8 | 
  9 | 
 10 | class Test_TableFileLoaderFactory:
 11 |     @pytest.mark.parametrize(["value", "expected"], [[None, ValueError]])
 12 |     def test_exception(self, value, expected):
 13 |         with pytest.raises(expected):
 14 |             ptr.factory.TableFileLoaderFactory(value)
 15 | 
 16 | 
 17 | class Test_TableFileLoaderFactory_create_from_path:
 18 |     @pytest.mark.parametrize(
 19 |         ["value", "extension", "expected"],
 20 |         [
 21 |             ["valid_ext.csv", "csv", ptr.CsvTableFileLoader],
 22 |             ["valid_ext.CSV", "csv", ptr.CsvTableFileLoader],
 23 |             ["valid_ext.html", "html", ptr.HtmlTableFileLoader],
 24 |             ["valid_ext.HTML", "html", ptr.HtmlTableFileLoader],
 25 |             ["valid_ext.htm", "htm", ptr.HtmlTableFileLoader],
 26 |             ["valid_ext.HTM", "htm", ptr.HtmlTableFileLoader],
 27 |             ["valid_ext.json", "json", ptr.JsonTableFileLoader],
 28 |             ["valid_ext.JSON", "json", ptr.JsonTableFileLoader],
 29 |             ["valid_ext.md", "md", ptr.MarkdownTableFileLoader],
 30 |             ["valid_ext.MD", "md", ptr.MarkdownTableFileLoader],
 31 |             ["valid_ext.sqlite", "sqlite", ptr.SqliteFileLoader],
 32 |             ["valid_ext.sqlite3", "sqlite3", ptr.SqliteFileLoader],
 33 |             ["valid_ext.tsv", "tsv", ptr.TsvTableFileLoader],
 34 |             ["valid_ext.TSV", "tsv", ptr.TsvTableFileLoader],
 35 |             ["valid_ext.xls", "xls", ptr.ExcelTableFileLoader],
 36 |             ["valid_ext.XLS", "xls", ptr.ExcelTableFileLoader],
 37 |             ["valid_ext.xlsx", "xlsx", ptr.ExcelTableFileLoader],
 38 |             ["valid_ext.XLSX", "xlsx", ptr.ExcelTableFileLoader],
 39 |         ],
 40 |     )
 41 |     def test_normal(self, value, extension, expected):
 42 |         loader_factory = ptr.factory.TableFileLoaderFactory(value)
 43 |         loader = loader_factory.create_from_path()
 44 | 
 45 |         assert loader_factory.file_extension.lower() == extension
 46 |         assert loader.source == value
 47 |         assert isinstance(loader, expected)
 48 | 
 49 |     @pytest.mark.parametrize(
 50 |         ["value", "expected"],
 51 |         [
 52 |             ["hoge", ptr.LoaderNotFoundError],
 53 |             ["hoge.txt", ptr.LoaderNotFoundError],
 54 |             [".txt", ptr.LoaderNotFoundError],
 55 |             ["", ptr.InvalidFilePathError],
 56 |         ],
 57 |     )
 58 |     def test_exception(self, value, expected):
 59 |         loader_factory = ptr.factory.TableFileLoaderFactory(value)
 60 | 
 61 |         with pytest.raises(expected):
 62 |             loader_factory.create_from_path()
 63 | 
 64 | 
 65 | class Test_TableFileLoaderFactory_create_from_format_name:
 66 |     @pytest.mark.parametrize(
 67 |         ["file_path", "format_name", "expected"],
 68 |         [
 69 |             ["valid_ext.html", "csv", ptr.CsvTableFileLoader],
 70 |             ["invalid_ext.txt", "CSV", ptr.CsvTableFileLoader],
 71 |             ["valid_ext.html", "excel", ptr.ExcelTableFileLoader],
 72 |             ["invalid_ext.txt", "Excel", ptr.ExcelTableFileLoader],
 73 |             ["valid_ext.json", "html", ptr.HtmlTableFileLoader],
 74 |             ["invalid_ext.txt", "HTML", ptr.HtmlTableFileLoader],
 75 |             ["valid_ext.html", "json", ptr.JsonTableFileLoader],
 76 |             ["invalid_ext.txt", "JSON", ptr.JsonTableFileLoader],
 77 |             ["valid_ext.html", "markdown", ptr.MarkdownTableFileLoader],
 78 |             ["invalid_ext.txt", "Markdown", ptr.MarkdownTableFileLoader],
 79 |             ["valid_ext.html", "mediawiki", ptr.MediaWikiTableFileLoader],
 80 |             ["invalid_ext.txt", "MediaWiki", ptr.MediaWikiTableFileLoader],
 81 |             ["valid_ext.db", "sqlite", ptr.SqliteFileLoader],
 82 |             ["valid_ext.html", "tsv", ptr.TsvTableFileLoader],
 83 |             ["invalid_ext.txt", "TSV", ptr.TsvTableFileLoader],
 84 |         ],
 85 |     )
 86 |     def test_normal(self, file_path, format_name, expected):
 87 |         loader_factory = ptr.factory.TableFileLoaderFactory(file_path)
 88 |         loader = loader_factory.create_from_format_name(format_name)
 89 | 
 90 |         assert loader.source == file_path
 91 |         assert isinstance(loader, expected)
 92 | 
 93 |     @pytest.mark.parametrize(
 94 |         ["file_path", "format_name", "expected"],
 95 |         [
 96 |             ["valid_ext.csv", "not_exist_format", ptr.LoaderNotFoundError],
 97 |             ["valid_ext.csv", "", ptr.LoaderNotFoundError],
 98 |             ["valid_ext.csv", None, TypeError],
 99 |             ["valid_ext.csv", 0, TypeError],
100 |             ["valid_ext.csv", "auto", ptr.LoaderNotFoundError],
101 |         ],
102 |     )
103 |     def test_exception(self, file_path, format_name, expected):
104 |         loader_factory = ptr.factory.TableFileLoaderFactory(file_path)
105 | 
106 |         with pytest.raises(expected):
107 |             loader_factory.create_from_format_name(format_name)
108 | 


--------------------------------------------------------------------------------
/test/factory/test_text_loader_factory.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | import pytablereader as ptr
 8 | 
 9 | 
class Test_TableTextLoaderFactory:
    """Constructor checks for ``TableTextLoaderFactory``."""

    @pytest.mark.parametrize("value, expected", [(None, ValueError)])
    def test_exception(self, value, expected):
        # Constructing the factory itself must raise the expected exception.
        with pytest.raises(expected):
            ptr.factory.TableTextLoaderFactory(value)
15 | 
16 | 
class Test_TableTextLoaderFactory_create_from_format_name:
    """Loader creation from a format name for text sources."""

    @pytest.mark.parametrize(
        ["format_name", "expected"],
        [
            ["csv", ptr.CsvTableTextLoader],
            ["CSV", ptr.CsvTableTextLoader],
            ["html", ptr.HtmlTableTextLoader],
            ["HTML", ptr.HtmlTableTextLoader],
            ["json", ptr.JsonTableTextLoader],
            ["JSON", ptr.JsonTableTextLoader],
            ["markdown", ptr.MarkdownTableTextLoader],
            ["Markdown", ptr.MarkdownTableTextLoader],
            ["mediawiki", ptr.MediaWikiTableTextLoader],
            ["MediaWiki", ptr.MediaWikiTableTextLoader],
            ["tsv", ptr.TsvTableTextLoader],
            ["TSV", ptr.TsvTableTextLoader],
        ],
    )
    def test_normal(self, format_name, expected):
        # The text content is a placeholder; only the format name matters here.
        loader_factory = ptr.factory.TableTextLoaderFactory("dummy")
        loader = loader_factory.create_from_format_name(format_name)

        assert isinstance(loader, expected)

    @pytest.mark.parametrize(
        ["format_name", "expected"],
        [
            ["not_exist_format", ptr.LoaderNotFoundError],
            ["", ptr.LoaderNotFoundError],
            [None, TypeError],
            [0, TypeError],
            ["auto", ptr.LoaderNotFoundError],
        ],
    )
    def test_exception(self, format_name, expected):
        # Same placeholder text as in ``test_normal`` (was misspelled "dummyy").
        loader_factory = ptr.factory.TableTextLoaderFactory("dummy")

        with pytest.raises(expected):
            loader_factory.create_from_format_name(format_name)
56 | 


--------------------------------------------------------------------------------
/test/loader/test_gsloader.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | from pytablereader import GoogleSheetsTableLoader
 8 | 
 9 | 
class Test_GoogleSheetsTableLoader_make_table_name:
    """Table-name template expansion of ``GoogleSheetsTableLoader``."""

    @property
    def monkey_property(self):
        # Value patched onto the loader class as ``_sheet_name`` in ``test_normal``.
        return "testsheet"

    @pytest.mark.parametrize(
        "value, title, expected",
        [
            ("%(sheet)s", "titlename", "testsheet"),
            ("%(title)s", "titlename", "titlename"),
            ("%(title)s", "table", "table"),
            ("prefix_%(title)s_%(sheet)s", "titlename", "prefix_titlename_testsheet"),
            ("%(format_name)s%(format_id)s", "titlename", "spreadsheet0"),
        ],
    )
    def test_normal(self, monkeypatch, value, title, expected):
        gs_loader = GoogleSheetsTableLoader("dummy")
        gs_loader.table_name = value
        gs_loader.title = title

        sheet_name = self.monkey_property
        monkeypatch.setattr(GoogleSheetsTableLoader, "_sheet_name", sheet_name)

        assert gs_loader.make_table_name() == expected

    @pytest.mark.parametrize(
        "value, title, expected",
        [
            (None, "titlename", ValueError),
            ("", "titlename", ValueError),
            ("%(sheet)s", None, ValueError),
            ("%(sheet)s", "", ValueError),
        ],
    )
    def test_exception(self, value, title, expected):
        gs_loader = GoogleSheetsTableLoader("dummy")
        gs_loader.table_name = value
        gs_loader.title = title

        with pytest.raises(expected):
            gs_loader.make_table_name()
50 | 


--------------------------------------------------------------------------------
/test/loader/test_textloader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | from textwrap import dedent
  6 | 
  7 | import pytest
  8 | from pytablewriter import dumps_tabledata
  9 | from tabledata import TableData
 10 | 
 11 | import pytablereader as ptr
 12 | from pytablereader.interface import AbstractTableReader
 13 | 
 14 | 
 15 | class Test_TableTextLoader_get_format_names:
 16 |     def test_normal(self):
 17 |         assert ptr.TableTextLoader.get_format_names() == [
 18 |             "csv",
 19 |             "html",
 20 |             "json",
 21 |             "json_lines",
 22 |             "jsonl",
 23 |             "ldjson",
 24 |             "ltsv",
 25 |             "markdown",
 26 |             "mediawiki",
 27 |             "ndjson",
 28 |             "ssv",
 29 |             "tsv",
 30 |         ]
 31 | 
 32 | 
 33 | class Test_TableTextLoader_constructor:
 34 |     @pytest.mark.parametrize(
 35 |         ["value", "format_name", "expected"],
 36 |         [
 37 |             [None, None, ValueError],
 38 |             ["", None, ValueError],
 39 |             ["https://github.com/", None, ValueError],
 40 |             ["/tmp/valid/test/data/validext.csv/", None, ValueError],
 41 |             ["/tmp/invalid/test/data/invalidext.txt", "invalidformat", ptr.LoaderNotFoundError],
 42 |         ],
 43 |     )
 44 |     def test_exception(self, value, format_name, expected):
 45 |         with pytest.raises(expected):
 46 |             ptr.TableTextLoader(value, format_name=format_name)
 47 | 
 48 | 
 49 | class Test_TableTextLoader_load:
 50 |     def setup_method(self, method):
 51 |         AbstractTableReader.clear_table_count()
 52 | 
 53 |     def test_normal_csv(self):
 54 |         text = dedent(
 55 |             """\
 56 |             "attr_a","attr_b","attr_c"
 57 |             1,4,"a"
 58 |             2,2.1,"bb"
 59 |             3,120.9,"ccc"
 60 |             """
 61 |         )
 62 | 
 63 |         expected_list = [
 64 |             TableData(
 65 |                 "csv1",
 66 |                 ["attr_a", "attr_b", "attr_c"],
 67 |                 [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
 68 |             )
 69 |         ]
 70 |         loader = ptr.TableTextLoader(text, format_name="csv")
 71 | 
 72 |         assert loader.format_name == "csv"
 73 | 
 74 |         for tabledata, expected in zip(loader.load(), expected_list):
 75 |             print(dumps_tabledata(expected))
 76 |             print(dumps_tabledata(tabledata))
 77 | 
 78 |             assert tabledata.equals(expected)
 79 | 
 80 |     def test_normal_ssv(self):
 81 |         text = dedent(
 82 |             """\
 83 |             USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
 84 |             root         1  0.0  0.4  77664  8784 ?        Ss   May11   0:02 /sbin/init
 85 |             root         2  0.0  0.0      0     0 ?        S    May11   0:00 [kthreadd]
 86 |             root         4  0.0  0.0      0     0 ?        I<   May11   0:00 [kworker/0:0H]
 87 |             root         6  0.0  0.0      0     0 ?        I<   May11   0:00 [mm_percpu_wq]
 88 |             root         7  0.0  0.0      0     0 ?        S    May11   0:01 [ksoftirqd/0]
 89 |             """
 90 |         )
 91 | 
 92 |         expected_list = [
 93 |             TableData(
 94 |                 "csv1",
 95 |                 [
 96 |                     "USER",
 97 |                     "PID",
 98 |                     "%CPU",
 99 |                     "%MEM",
100 |                     "VSZ",
101 |                     "RSS",
102 |                     "TTY",
103 |                     "STAT",
104 |                     "START",
105 |                     "TIME",
106 |                     "COMMAND",
107 |                 ],
108 |                 [
109 |                     ["root", 1, 0, 0.4, 77664, 8784, "?", "Ss", "May11", "0:02", "/sbin/init"],
110 |                     ["root", 2, 0, 0, 0, 0, "?", "S", "May11", "0:00", "[kthreadd]"],
111 |                     ["root", 4, 0, 0, 0, 0, "?", "I<", "May11", "0:00", "[kworker/0:0H]"],
112 |                     ["root", 6, 0, 0, 0, 0, "?", "I<", "May11", "0:00", "[mm_percpu_wq]"],
113 |                     ["root", 7, 0, 0, 0, 0, "?", "S", "May11", "0:01", "[ksoftirqd/0]"],
114 |                 ],
115 |             )
116 |         ]
117 |         loader = ptr.TableTextLoader(text, format_name="ssv")
118 | 
119 |         assert loader.format_name == "csv"
120 | 
121 |         for tabledata, expected in zip(loader.load(), expected_list):
122 |             print(dumps_tabledata(expected))
123 |             print(dumps_tabledata(tabledata))
124 | 
125 |             assert tabledata.equals(expected)
126 | 
127 |     def test_normal_json(self):
128 |         text = dedent(
129 |             """\
130 |             [
131 |                 {"attr_a": 1},
132 |                 {"attr_b": 2.1, "attr_c": "bb"}
133 |             ]"""
134 |         )
135 | 
136 |         expected_list = [
137 |             TableData(
138 |                 "json1",
139 |                 ["attr_a", "attr_b", "attr_c"],
140 |                 [{"attr_a": 1}, {"attr_b": 2.1, "attr_c": "bb"}],
141 |             )
142 |         ]
143 |         loader = ptr.TableTextLoader(text, format_name="json")
144 | 
145 |         assert loader.format_name == "json"
146 | 
147 |         for table_data, expected in zip(loader.load(), expected_list):
148 |             print(dumps_tabledata(expected))
149 |             print(dumps_tabledata(table_data))
150 | 
151 |             assert table_data.equals(expected)
152 | 


--------------------------------------------------------------------------------
/test/test_common.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | from pytablereader import InvalidFilePathError
 8 | from pytablereader._common import get_extension, make_temp_file_path_from_url
 9 | 
10 | 
class Test_get_extension:
    """Behaviour of ``get_extension`` for regular and empty inputs."""

    @pytest.mark.parametrize(
        "value, expected",
        [
            ("test.txt", "txt"),
            (".csv", ""),
            ("html", ""),
        ],
    )
    def test_normal(self, value, expected):
        assert get_extension(value) == expected

    @pytest.mark.parametrize(
        "value, expected",
        [
            ("", InvalidFilePathError),
            (None, InvalidFilePathError),
        ],
    )
    def test_null_table_name(self, value, expected):
        # An empty string and None are both rejected as file paths.
        with pytest.raises(expected):
            get_extension(value)
24 | 
25 | 
class Test_make_temp_file_path_from_url:
    """Behaviour of ``make_temp_file_path_from_url`` for valid and empty inputs."""

    @pytest.mark.parametrize(
        "temp_dir_path, value, expected",
        [
            (
                "/tmp",
                "https://raw.githubusercontent.com/valid/test/data/validext.csv",
                "/tmp/validext.csv",
            ),
            (
                "/tmp",
                "https://raw.githubusercontent.com/valid/test/data/validext/",
                "/tmp/validext",
            ),
        ],
    )
    def test_normal(self, temp_dir_path, value, expected):
        actual = make_temp_file_path_from_url(temp_dir_path, value)

        assert actual == expected

    @pytest.mark.parametrize(
        "temp_dir_path, value, expected",
        [
            (None, "tmp", InvalidFilePathError),
            ("tmp", "", InvalidFilePathError),
            ("tmp", None, InvalidFilePathError),
        ],
    )
    def test_null_table_name(self, temp_dir_path, value, expected):
        # A missing directory path or a missing URL must be rejected.
        with pytest.raises(expected):
            make_temp_file_path_from_url(temp_dir_path, value)
56 | 


--------------------------------------------------------------------------------
/test/test_excel_reader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import pytest
  6 | import xlsxwriter
  7 | from pytablewriter import dumps_tabledata
  8 | from tabledata import TableData
  9 | 
 10 | import pytablereader as ptr
 11 | from pytablereader.interface import AbstractTableReader
 12 | 
 13 | 
 14 | def write_worksheet(worksheet, table):
 15 |     for row_idx, row in enumerate(table):
 16 |         for col_idx, item in enumerate(row):
 17 |             worksheet.write(row_idx, col_idx, item)
 18 | 
 19 | 
 20 | @pytest.fixture
 21 | def valid_excel_file_path(tmpdir):
 22 |     test_file_path = tmpdir.join("tmp.xlsx")
 23 |     workbook = xlsxwriter.Workbook(str(test_file_path))
 24 | 
 25 |     write_worksheet(
 26 |         workbook.add_worksheet("boolsheet"),
 27 |         table=[
 28 |             ["true", "false", "tf", "lost"],
 29 |             ["True", "False", "True", "True"],
 30 |             ["true", "false", "False", ""],
 31 |             ["TRUE", "FALSE", "False", "False"],
 32 |         ],
 33 |     )
 34 | 
 35 |     write_worksheet(
 36 |         workbook.add_worksheet("testsheet1"),
 37 |         table=[
 38 |             ["", "", "", ""],
 39 |             ["", "a1", "b1", "c1"],
 40 |             ["", "aa1", "ab1", "ac1"],
 41 |             ["", 1, 1.1, "a"],
 42 |             ["", 2, 2.2, "bb"],
 43 |             ["", 3, 3.3, "cc"],
 44 |         ],
 45 |     )
 46 | 
 47 |     worksheet = workbook.add_worksheet("testsheet2")  # noqa: W0612
 48 | 
 49 |     write_worksheet(
 50 |         workbook.add_worksheet("testsheet3"),
 51 |         table=[
 52 |             ["", "", ""],
 53 |             ["", "", ""],
 54 |             ["a3", "b3", "c3"],
 55 |             ["aa3", "ab3", "ac3"],
 56 |             [4, 1.1, "a"],
 57 |             [5, "", "bb"],
 58 |             [6, 3.3, ""],
 59 |         ],
 60 |     )
 61 | 
 62 |     write_worksheet(
 63 |         workbook.add_worksheet("invalid_sheet"),
 64 |         table=[["", "", "", ""], ["", "a", "", "c"], ["", "aa", "ab", ""], ["", "", 1.1, "a"]],
 65 |     )
 66 | 
 67 |     workbook.close()
 68 | 
 69 |     return str(test_file_path)
 70 | 
 71 | 
 72 | @pytest.fixture
 73 | def invalid_excel_file_path(tmpdir):
 74 |     test_file_path = tmpdir.join("invalid.xlsx")
 75 |     workbook = xlsxwriter.Workbook(str(test_file_path))
 76 | 
 77 |     write_worksheet(
 78 |         workbook.add_worksheet("testsheet1"),
 79 |         table=[["", "", "", ""], ["", "a", "", "c"], ["", "aa", "ab", ""], ["", "", 1.1, "a"]],
 80 |     )
 81 | 
 82 |     worksheet = workbook.add_worksheet("testsheet2")  # noqa: W0612
 83 | 
 84 |     workbook.close()
 85 | 
 86 |     return str(test_file_path)
 87 | 
 88 | 
 89 | @pytest.mark.xfail(run=False)
 90 | class Test_ExcelTableFileLoader_make_table_name:
 91 |     def setup_method(self, method):
 92 |         AbstractTableReader.clear_table_count()
 93 | 
 94 |     @property
 95 |     def monkey_property(self):
 96 |         return "testsheet"
 97 | 
 98 |     @pytest.mark.parametrize(
 99 |         ["value", "source", "expected"],
100 |         [
101 |             ["%(sheet)s", "/path/to/data.xlsx", "testsheet"],
102 |             ["%(filename)s", "/path/to/data.xlsx", "data"],
103 |             ["prefix_%(filename)s_%(sheet)s", "/path/to/data.xlsx", "prefix_data_testsheet"],
104 |             ["%(format_name)s%(format_id)s_%(filename)s", "/path/to/data.xlsx", "excel0_data"],
105 |         ],
106 |     )
107 |     def test_normal(self, monkeypatch, value, source, expected):
108 |         loader = ptr.ExcelTableFileLoader(source)
109 |         loader.table_name = value
110 | 
111 |         monkeypatch.setattr(ptr.ExcelTableFileLoader, "_sheet_name", self.monkey_property)
112 | 
113 |         assert loader.make_table_name() == expected
114 | 
115 |     @pytest.mark.parametrize(
116 |         ["value", "source", "expected"],
117 |         [
118 |             [None, "/path/to/data.xlsx", ValueError],
119 |             ["", "/path/to/data.xlsx", ValueError],
120 |             ["%(sheet)s", None, ptr.InvalidTableNameError],
121 |             ["%(sheet)s", "", ptr.InvalidTableNameError],
122 |         ],
123 |     )
124 |     def test_exception(self, value, source, expected):
125 |         loader = ptr.ExcelTableFileLoader(source)
126 |         loader.table_name = value
127 | 
128 |         with pytest.raises(expected):
129 |             loader.make_table_name()
130 | 
131 | 
132 | @pytest.mark.xfail(run=False)  # run=False: pytest reports these as xfailed without executing them
133 | class Test_ExcelTableFileLoader_load:
134 |     def setup_method(self, method):
135 |         AbstractTableReader.clear_table_count()
136 | 
137 |     @pytest.mark.parametrize(
138 |         ["table_name", "start_row", "expected_list"],
139 |         [
140 |             [
141 |                 "%(sheet)s",
142 |                 0,
143 |                 [
144 |                     TableData(
145 |                         "boolsheet",
146 |                         ["true", "false", "tf", "lost"],
147 |                         [
148 |                             [True, False, True, True],
149 |                             [True, False, False, ""],
150 |                             [True, False, False, False],
151 |                         ],
152 |                     ),
153 |                     TableData(
154 |                         "testsheet1",
155 |                         ["a1", "b1", "c1"],
156 |                         [
157 |                             ["aa1", "ab1", "ac1"],
158 |                             [1.0, 1.1, "a"],
159 |                             [2.0, 2.2, "bb"],
160 |                             [3.0, 3.3, "cc"],
161 |                         ],
162 |                     ),
163 |                     TableData(
164 |                         "testsheet3",
165 |                         ["a3", "b3", "c3"],
166 |                         [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]],
167 |                     ),
168 |                 ],
169 |             ],
170 |             [
171 |                 "%(filename)s_%(sheet)s",
172 |                 2,
173 |                 [
174 |                     TableData("tmp_boolsheet", ["TRUE", "FALSE", "False", "False"], []),
175 |                     TableData(
176 |                         "tmp_testsheet1",
177 |                         ["aa1", "ab1", "ac1"],
178 |                         [[1.0, 1.1, "a"], [2.0, 2.2, "bb"], [3.0, 3.3, "cc"]],
179 |                     ),
180 |                     TableData(
181 |                         "tmp_testsheet3",
182 |                         ["a3", "b3", "c3"],
183 |                         [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]],
184 |                     ),
185 |                 ],
186 |             ],
187 |         ],
188 |     )
189 |     def test_normal(self, valid_excel_file_path, table_name, start_row, expected_list):
190 |         loader = ptr.ExcelTableFileLoader(valid_excel_file_path)
191 |         loader.table_name = table_name
192 |         loader.start_row = start_row
193 |         # Every table yielded by the loader must match one entry of expected_list.
194 |         for table_data in loader.load():
195 |             print(f"[actual]\n{dumps_tabledata(table_data)}")
196 |             assert table_data.in_tabledata_list(expected_list)
197 | 
198 |     @pytest.mark.parametrize(
199 |         ["table_name", "start_row", "expected"], [["%(sheet)s", 0, ptr.DataError]]
200 |     )
201 |     def test_abnormal(self, invalid_excel_file_path, table_name, start_row, expected):
202 |         loader = ptr.ExcelTableFileLoader(invalid_excel_file_path)
203 |         loader.table_name = table_name
204 |         loader.start_row = start_row
205 |         # NOTE(review): `expected` (ptr.DataError) is never asserted here; only the yielded value is checked.
206 |         for tabletuple in loader.load():
207 |             assert tabletuple == []
208 | 
209 |     @pytest.mark.parametrize(
210 |         ["source", "expected"], [["", ptr.InvalidFilePathError], [None, ptr.InvalidFilePathError]]
211 |     )
212 |     def test_null_file_path(self, source, expected):
213 |         loader = ptr.ExcelTableFileLoader(source)
214 |         # Iterate load() inside the raises block so an error raised during iteration is caught.
215 |         with pytest.raises(expected):
216 |             for _tabletuple in loader.load():
217 |                 pass
218 | 
219 |     @pytest.mark.parametrize(["table_name", "expected"], [["", ValueError], [None, ValueError]])
220 |     def test_null_table_name(self, valid_excel_file_path, table_name, expected):
221 |         loader = ptr.ExcelTableFileLoader(valid_excel_file_path)
222 |         loader.table_name = table_name
223 |         # An empty or None table name is expected to make loading fail with ValueError.
224 |         with pytest.raises(expected):
225 |             for _tabletuple in loader.load():
226 |                 pass
227 | 


--------------------------------------------------------------------------------
/test/test_html_reader_from_file.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import os
 6 | 
 7 | import pytest
 8 | from pytablewriter import dumps_tabledata
 9 | 
10 | import pytablereader as ptr
11 | 
12 | 
13 | class Test_HtmlTableTextLoader_load:
14 |     @pytest.mark.parametrize(["filename"], [["python - Wiktionary.html"]])
15 |     def test_smoke(self, tmpdir, filename):
16 |         test_data_file_path = os.path.join(os.path.dirname(__file__), "data", filename)
17 |         loader = ptr.TableFileLoader(test_data_file_path)
18 | 
19 |         success_count = 0
20 | 
21 |         for tabledata in loader.load():
22 |             if tabledata.is_empty():
23 |                 continue
24 | 
25 |             assert len(dumps_tabledata(tabledata)) > 10
26 | 
27 |             success_count += 1
28 | 
29 |         assert success_count > 0
30 | 


--------------------------------------------------------------------------------
/test/test_logger.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import pytest
 6 | 
 7 | from pytablereader import set_logger
 8 | from pytablereader._logger._null_logger import NullLogger
 9 | 
10 | 
11 | class Test_set_logger:
12 |     @pytest.mark.parametrize(["value"], [[True], [False]])
13 |     def test_smoke(self, value):
14 |         set_logger(value)  # smoke check: passes as long as the call does not raise
15 | 
16 | 
17 | class Test_NullLogger:
18 |     @pytest.mark.parametrize(["value"], [[True], [False]])
19 |     def test_smoke(self, value, monkeypatch):
20 |         monkeypatch.setattr("pytablereader._logger._logger.logger", NullLogger())  # swap in the no-op logger
21 |         set_logger(value)  # must not raise while the module logger is a NullLogger
22 | 


--------------------------------------------------------------------------------
/test/test_ltsv_reader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import collections
  6 | from decimal import Decimal
  7 | from textwrap import dedent
  8 | 
  9 | import pytest
 10 | from path import Path
 11 | from pytablewriter import dumps_tabledata
 12 | from tabledata import TableData
 13 | 
 14 | import pytablereader as ptr
 15 | from pytablereader import DataError, InvalidHeaderNameError, InvalidTableNameError
 16 | from pytablereader.interface import AbstractTableReader
 17 | 
 18 | from ._common import TYPE_HINT_RULES
 19 | 
 20 | 
 21 | Data = collections.namedtuple("Data", "value expected")
 22 | # value: raw LTSV text; expected: the TableData it should load as (quoted label "dd" -> header dd).
 23 | test_data_00 = Data(
 24 |     """a.0:1\tb-1:123.1\tc_2:a\t"dd":1.0\te.f-g_4:"1"
 25 | a.0:2\tb-1:2.2\tc_2:bb\t"dd":2.2\te.f-g_4:"2.2"
 26 | a.0:3\tb-1:3.3\tc_2:ccc\t"dd":3.0\te.f-g_4:"cccc"
 27 | """,
 28 |     TableData(
 29 |         "tmp",
 30 |         ["a.0", "b-1", "c_2", "dd", "e.f-g_4"],
 31 |         [
 32 |             [1, Decimal("123.1"), "a", 1, '"1"'],
 33 |             [2, Decimal("2.2"), "bb", Decimal("2.2"), '"2.2"'],
 34 |             [3, Decimal("3.3"), "ccc", 3, '"cccc"'],
 35 |         ],
 36 |     ),
 37 | )
 38 | 
 39 | 
 40 | class Test_LtsvTableFileLoader_make_table_name:
 41 |     def setup_method(self, method):
 42 |         AbstractTableReader.clear_table_count()
 43 | 
 44 |     @pytest.mark.parametrize(
 45 |         ["value", "source", "expected"],
 46 |         [
 47 |             ["%(default)s", "/path/to/data.ltsv", "data"],
 48 |             ["%(filename)s", "/path/to/data.ltsv", "data"],
 49 |             ["prefix_%(filename)s", "/path/to/data.ltsv", "prefix_data"],
 50 |             ["%(filename)s_suffix", "/path/to/data.ltsv", "data_suffix"],
 51 |             ["prefix_%(filename)s_suffix", "/path/to/data.ltsv", "prefix_data_suffix"],
 52 |             ["%(filename)s%(filename)s", "/path/to/data.ltsv", "datadata"],
 53 |             ["%(format_name)s%(format_id)s_%(filename)s", "/path/to/data.ltsv", "ltsv0_data"],
 54 |             ["%(%(filename)s)", "/path/to/data.ltsv", "%(data)"],
 55 |         ],
 56 |     )
 57 |     def test_normal(self, value, source, expected):
 58 |         loader = ptr.LtsvTableFileLoader(source)
 59 |         loader.table_name = value
 60 |         # The template placeholders must expand to the expected table name for this source path.
 61 |         assert loader.make_table_name() == expected
 62 | 
 63 |     @pytest.mark.parametrize(
 64 |         ["value", "source", "expected"],
 65 |         [
 66 |             [None, "/path/to/data.ltsv", ValueError],
 67 |             ["", "/path/to/data.ltsv", ValueError],
 68 |             ["%(filename)s", None, InvalidTableNameError],
 69 |             ["%(filename)s", "", InvalidTableNameError],
 70 |         ],
 71 |     )
 72 |     def test_exception(self, value, source, expected):
 73 |         loader = ptr.LtsvTableFileLoader(source)
 74 |         loader.table_name = value
 75 |         # Cases: None/"" template expects ValueError; None/"" source expects InvalidTableNameError.
 76 |         with pytest.raises(expected):
 77 |             loader.make_table_name()
 78 | 
 79 | 
 80 | class Test_LtsvTableFileLoader_load:
 81 |     def setup_method(self, method):
 82 |         AbstractTableReader.clear_table_count()
 83 | 
 84 |     @pytest.mark.parametrize(
 85 |         ["test_id", "table_text", "filename", "expected"],
 86 |         [[0, test_data_00.value, "tmp.ltsv", test_data_00.expected]],
 87 |     )
 88 |     def test_normal(self, tmpdir, test_id, table_text, filename, expected):
 89 |         file_path = Path(str(tmpdir.join(filename)))
 90 |         file_path.parent.makedirs_p()
 91 |         # Write the LTSV text into the temporary file that the loader reads back.
 92 |         with open(file_path, "w", encoding="utf-8") as f:
 93 |             f.write(table_text)
 94 | 
 95 |         loader = ptr.LtsvTableFileLoader(file_path)
 96 |         # Print expected and actual dumps so a mismatch is visible in captured output.
 97 |         for tabledata in loader.load():
 98 |             print(f"test-id={test_id}")
 99 |             print(f"[expected]\n{dumps_tabledata(expected)}")
100 |             print(f"[actual]\n{dumps_tabledata(tabledata)}")
101 | 
102 |             assert tabledata.equals(expected)
103 | 
104 |     @pytest.mark.parametrize(
105 |         ["table_text", "filename", "expected"],
106 |         [
107 |             ["\n".join(['"attr_a"\t"attr_b"\t"attr_c"']), "hoge.ltsv", ptr.DataError],
108 |             ["\n".join(['"a":1"\t"attr_b"\t"attr_c"']), "hoge.ltsv", ptr.DataError],
109 |         ],
110 |     )
111 |     def test_exception(self, tmpdir, table_text, filename, expected):
112 |         p_ltsv = tmpdir.join(filename)
113 |         # Both parametrized texts contain items without a label:value separator.
114 |         with open(str(p_ltsv), "w", encoding="utf8") as f:
115 |             f.write(table_text)
116 | 
117 |         loader = ptr.LtsvTableFileLoader(str(p_ltsv))
118 |         # Iterate load() inside the raises block so an error raised during iteration is caught.
119 |         with pytest.raises(expected):
120 |             for _tabletuple in loader.load():
121 |                 pass
122 | 
123 |     @pytest.mark.parametrize(
124 |         ["filename", "headers", "expected"],
125 |         [["", [], ptr.InvalidFilePathError], [None, [], ptr.InvalidFilePathError]],
126 |     )
127 |     def test_null(self, tmpdir, filename, headers, expected):
128 |         loader = ptr.LtsvTableFileLoader(filename)
129 |         loader.headers = headers
130 |         # An empty or None file path is expected to fail with InvalidFilePathError.
131 |         with pytest.raises(expected):
132 |             for _tabletuple in loader.load():
133 |                 pass
134 | 
135 | 
136 | class Test_LtsvTableTextLoader_make_table_name:
137 |     def setup_method(self, method):
138 |         AbstractTableReader.clear_table_count()
139 | 
140 |     @pytest.mark.parametrize(
141 |         ["value", "expected"],
142 |         [["%(format_name)s%(format_id)s", "ltsv0"], ["tablename", "tablename"]],
143 |     )
144 |     def test_normal(self, value, expected):
145 |         loader = ptr.LtsvTableTextLoader("dummy")
146 |         loader.table_name = value
147 | 
148 |         assert loader.make_table_name() == expected
149 | 
150 |     @pytest.mark.parametrize(
151 |         ["value", "source", "expected"],
152 |         [[None, "tablename", ValueError], ["", "tablename", ValueError]],
153 |     )
154 |     def test_exception(self, value, source, expected):
155 |         loader = ptr.LtsvTableFileLoader(source)
156 |         loader.table_name = value
157 | 
158 |         with pytest.raises(expected):
159 |             loader.make_table_name()
160 | 
161 | 
162 | class Test_LtsvTableTextLoader_load:
163 |     def setup_method(self, method):
164 |         AbstractTableReader.clear_table_count()
165 | 
166 |     @pytest.mark.parametrize(
167 |         ["table_text", "table_name", "expected"],
168 |         [[test_data_00.value, "tmp", test_data_00.expected]],
169 |     )
170 |     def test_normal(self, table_text, table_name, expected):
171 |         loader = ptr.LtsvTableTextLoader(table_text)
172 |         loader.table_name = table_name
173 |         # Print expected and actual dumps so a mismatch is visible in captured output.
174 |         for tabledata in loader.load():
175 |             print(f"[expected]: {dumps_tabledata(expected)}")
176 |             print(f"[actual]: {dumps_tabledata(tabledata)}")
177 | 
178 |             assert tabledata.equals(expected)
179 | 
180 |     def test_normal_type_hint_rules(self):
181 |         table_text = dedent(
182 |             """\
183 |             a_text:1\tb_integer:1\tc_integer:1.1
184 |             a_text:2\tb_integer:2\tc_integer:1.2
185 |             a_text:3\tb_integer:3\tc_integer:1.3
186 |             """
187 |         )
188 | 
189 |         loader = ptr.LtsvTableTextLoader(table_text)
190 |         loader.table_name = "type hint rules"
191 |         loader.type_hint_rules = TYPE_HINT_RULES
192 |         # NOTE(review): TYPE_HINT_RULES lives in ._common (not shown); expectations imply *_text -> str, *_integer -> int.
193 |         for tbldata in loader.load():
194 |             assert tbldata.headers == ["a_text", "b_integer", "c_integer"]
195 |             assert tbldata.value_matrix == [["1", 1, 1], ["2", 2, 1], ["3", 3, 1]]
196 | 
197 |     @pytest.mark.parametrize(
198 |         ["table_text", "table_name", "expected"],
199 |         [
200 |             ['"":"invalid"\ta:1', "dummy", InvalidHeaderNameError],
201 |             ["", "dummy", DataError],
202 |             ["a!:1\tb:2", "dummy", InvalidHeaderNameError],
203 |             ["a:1\tb$c:2", "dummy", InvalidHeaderNameError],
204 |         ],
205 |     )
206 |     def test_exception_insufficient_data(self, table_text, table_name, expected):
207 |         loader = ptr.LtsvTableTextLoader(table_text)
208 |         loader.table_name = table_name
209 |         # Cases: empty text expects DataError; empty or symbol-bearing labels expect InvalidHeaderNameError.
210 |         with pytest.raises(expected):
211 |             for _tabledata in loader.load():
212 |                 print(_tabledata)
213 | 
214 |     @pytest.mark.parametrize(["table_name", "expected"], [["", ValueError], [None, ValueError]])
215 |     def test_null(self, table_name, expected):
216 |         loader = ptr.LtsvTableTextLoader("dummy")
217 |         loader.table_name = table_name
218 |         # An empty or None table name is expected to make loading fail with ValueError.
219 |         with pytest.raises(expected):
220 |             for _tabletuple in loader.load():
221 |                 pass
222 | 


--------------------------------------------------------------------------------
/test/test_pandas.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | from decimal import Decimal
 6 | 
 7 | import pytest
 8 | import typepy
 9 | from tabledata import TableData
10 | 
11 | 
12 | try:
13 |     import pandas
14 |     # pandas is optional: this flag drives the skipif markers on the test classes in this module.
15 |     PANDAS_IMPORT = True
16 | except ImportError:
17 |     PANDAS_IMPORT = False
18 | 
19 | 
20 | @pytest.mark.skipif(not PANDAS_IMPORT, reason="required package not found")
21 | class Test_TableData_as_dataframe:
22 |     @pytest.mark.parametrize(
23 |         ["table_name", "headers", "rows"],
24 |         [
25 |             ["normal", ["a", "b"], [[10, 11], [20, 21]]],
26 |             ["normal", None, [[10, 11], [20, 21]]],
27 |             ["normal", None, None],
28 |         ],
29 |     )
30 |     def test_normal(self, table_name, headers, rows):
31 |         tabledata = TableData(table_name, headers, rows)
32 |         dataframe = pandas.DataFrame(rows)
33 |         if typepy.is_not_empty_sequence(headers):
34 |             dataframe.columns = headers
35 |         # Reference frame built directly from rows; column names are set only when headers are given.
36 |         print(f"lhs: {tabledata.as_dataframe()}")
37 |         print(f"rhs: {dataframe}")
38 | 
39 |         assert tabledata.as_dataframe().equals(dataframe)
40 | 
41 | 
42 | @pytest.mark.skipif(not PANDAS_IMPORT, reason="required package not found")
43 | class Test_TableData_from_dataframe:
44 |     def test_normal(self):
45 |         dataframe = pandas.DataFrame(
46 |             [[0, 0.1, "a"], [1, 1.1, "bb"], [2, 2.2, "ccc"]], columns=["id", "value", "name"]
47 |         )
48 |         expected = TableData(
49 |             "tablename",
50 |             ["id", "value", "name"],
51 |             [[0, Decimal("0.1"), "a"], [1, Decimal("1.1"), "bb"], [2, Decimal("2.2"), "ccc"]],
52 |         )
53 |         # The float column of the frame is expected to compare equal to Decimal values.
54 |         assert TableData.from_dataframe(dataframe, "tablename").equals(expected)
55 | 


--------------------------------------------------------------------------------
/test/test_sqlite_reader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
  3 | """
  4 | 
  5 | import collections
  6 | from decimal import Decimal
  7 | 
  8 | import pytest
  9 | from path import Path
 10 | from pytablewriter import dumps_tabledata
 11 | from simplesqlite import SimpleSQLite
 12 | from tabledata import TableData
 13 | 
 14 | import pytablereader as ptr
 15 | from pytablereader.interface import AbstractTableReader
 16 | 
 17 | 
 18 | Data = collections.namedtuple("Data", "value expected")
 19 | # value: the TableData written to the SQLite file; expected: tables the loader is allowed to yield.
 20 | test_data_00 = Data(
 21 |     TableData(
 22 |         "tmp",
 23 |         ["attr_a", "attr_b", "attr_c"],
 24 |         [[1, 4, "a"], [2, Decimal("2.1"), "bb"], [3, Decimal("120.9"), "ccc"]],
 25 |     ),
 26 |     [
 27 |         TableData(
 28 |             "tmp",
 29 |             ["attr_a", "attr_b", "attr_c"],
 30 |             [[1, 4, "a"], [2, Decimal("2.1"), "bb"], [3, Decimal("120.9"), "ccc"]],
 31 |         )
 32 |     ],
 33 | )
 34 | test_data_01 = Data(  # columns mix str and int inputs; the expected rows hold every value as a string
 35 |     TableData(
 36 |         "foo_bar",
 37 |         ["attr_a", "attr_b", "attr_c"],
 38 |         [["aaaa", "bbbb", "cccc"], [1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
 39 |     ),
 40 |     [
 41 |         TableData(
 42 |             "foo_bar",
 43 |             ["attr_a", "attr_b", "attr_c"],
 44 |             [["aaaa", "bbbb", "cccc"], ["1", "4", "a"], ["2", "2.1", "bb"], ["3", "120.9", "ccc"]],
 45 |         )
 46 |     ],
 47 | )
 48 | test_data_02 = Data(
 49 |     TableData("foo_bar", ["attr_a", "attr_b", "attr_c"], [[3, "120.9", "ccc"]]),
 50 |     [TableData("foo_bar", ["attr_a", "attr_b", "attr_c"], [[3, "120.9", "ccc"]])],
 51 | )
 52 | test_data_03 = Data(
 53 |     TableData(
 54 |         "tmp", ["attr_a", "attr_b", "attr_c"], [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]]
 55 |     ),
 56 |     [
 57 |         TableData(
 58 |             "tmp",
 59 |             ["attr_a", "attr_b", "attr_c"],
 60 |             [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
 61 |         )
 62 |     ],
 63 | )
 64 | test_data_04 = Data(  # NOTE(review): identical to test_data_03 -- confirm whether a distinct case was intended
 65 |     TableData(
 66 |         "tmp", ["attr_a", "attr_b", "attr_c"], [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]]
 67 |     ),
 68 |     [
 69 |         TableData(
 70 |             "tmp",
 71 |             ["attr_a", "attr_b", "attr_c"],
 72 |             [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
 73 |         )
 74 |     ],
 75 | )
 76 | test_data_05 = Data(  # non-ASCII (Japanese) headers and values
 77 |     TableData(
 78 |         "tmp",
 79 |         ["姓", "名", "生年月日", "郵便番号", "住所", "電話番号"],
 80 |         [
 81 |             ["山田", "太郎", "2001/1/1", "100-0002", "東京都千代田区皇居外苑", "03-1234-5678"],
 82 |             ["山田", "次郎", "2001/1/2", "251-0036", "神奈川県藤沢市江の島１丁目", "03-9999-9999"],
 83 |         ],
 84 |     ),
 85 |     [
 86 |         TableData(
 87 |             "tmp",
 88 |             ["姓", "名", "生年月日", "郵便番号", "住所", "電話番号"],
 89 |             [
 90 |                 ["山田", "太郎", "2001/1/1", "100-0002", "東京都千代田区皇居外苑", "03-1234-5678"],
 91 |                 ["山田", "次郎", "2001/1/2", "251-0036", "神奈川県藤沢市江の島１丁目", "03-9999-9999"],
 92 |             ],
 93 |         )
 94 |     ],
 95 | )
 96 | 
 97 | 
 98 | class Test_SqliteFileLoader_load:
 99 |     def setup_method(self, method):
100 |         AbstractTableReader.clear_table_count()
101 | 
102 |     @pytest.mark.parametrize(
103 |         ["test_id", "tabledata", "filename", "headers", "expected"],
104 |         [
105 |             [0, test_data_00.value, "tmp.sqlite", [], test_data_00.expected],
106 |             [
107 |                 1,
108 |                 test_data_01.value,
109 |                 "foo_bar.sqlite",
110 |                 ["attr_a", "attr_b", "attr_c"],
111 |                 test_data_01.expected,
112 |             ],
113 |             [
114 |                 2,
115 |                 test_data_02.value,
116 |                 "foo_bar.sqlite",
117 |                 ["attr_a", "attr_b", "attr_c"],
118 |                 test_data_02.expected,
119 |             ],
120 |             [3, test_data_03.value, "tmp.sqlite", [], test_data_03.expected],
121 |             [4, test_data_04.value, "tmp.sqlite", [], test_data_04.expected],
122 |             [5, test_data_05.value, "tmp.sqlite", [], test_data_05.expected],
123 |         ],
124 |     )
125 |     def test_normal(self, tmpdir, test_id, tabledata, filename, headers, expected):
126 |         file_path = Path(str(tmpdir.join(filename)))
127 |         file_path.parent.makedirs_p()
128 | 
129 |         con = SimpleSQLite(file_path, "w")
130 | 
131 |         con.create_table_from_tabledata(tabledata)
132 | 
133 |         loader = ptr.SqliteFileLoader(file_path)
134 |         loader.headers = headers
135 | 
136 |         for tabledata in loader.load():
137 |             print(f"test-id={test_id}")
138 |             print(dumps_tabledata(tabledata))
139 | 
140 |             assert tabledata.in_tabledata_list(expected)
141 | 
142 |     @pytest.mark.parametrize(
143 |         ["filename", "headers", "expected"],
144 |         [["", [], ptr.InvalidFilePathError], [None, [], ptr.InvalidFilePathError]],
145 |     )
146 |     def test_null(self, tmpdir, filename, headers, expected):
147 |         loader = ptr.SqliteFileLoader(filename)
148 |         loader.headers = headers
149 | 
150 |         with pytest.raises(expected):
151 |             for _tabletuple in loader.load():
152 |                 pass
153 | 


--------------------------------------------------------------------------------
/test/test_validator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
 3 | """
 4 | 
 5 | import platform
 6 | 
 7 | import pytest
 8 | 
 9 | import pytablereader as ptr
10 | from pytablereader._constant import SourceType
11 | from pytablereader._validator import FileValidator, TextValidator, UrlValidator, is_fifo
12 | 
13 | 
14 | class Test_FileValidator_validate:
15 |     @pytest.mark.parametrize(["value"], [["test"]])
16 |     def test_normal(self, tmpdir, value):
17 |         p_file_path = tmpdir.join(value)
18 | 
19 |         with open(str(p_file_path), "w"):
20 |             pass
21 | 
22 |         validator = FileValidator(str(p_file_path))
23 |         assert validator.source_type == SourceType.FILE
24 |         validator.validate()
25 | 
26 |     @pytest.mark.parametrize(
27 |         ["value", "expected"], [[None, ptr.InvalidFilePathError], ["", ptr.InvalidFilePathError]]
28 |     )
29 |     def test_exception_null(self, value, expected):
30 |         validator = FileValidator(value)
31 | 
32 |         with pytest.raises(expected):
33 |             validator.validate()
34 | 
35 |     @pytest.mark.parametrize(["value", "expected"], [["te\0st", ptr.InvalidFilePathError]])
36 |     def test_exception_invalid_path(self, tmpdir, value, expected):
37 |         validator = FileValidator(value)
38 | 
39 |         with pytest.raises(expected):
40 |             validator.validate()
41 | 
42 | 
43 | class Test_TextValidator_validate:
44 |     @pytest.mark.parametrize(["value"], [["test"]])
45 |     def test_normal(self, value):
46 |         validator = TextValidator(value)
47 |         assert validator.source_type == SourceType.TEXT
48 |         validator.validate()
49 | 
50 |     @pytest.mark.parametrize(["value", "expected"], [[None, ptr.DataError], ["", ptr.DataError]])
51 |     def test_exception(self, value, expected):
52 |         validator = TextValidator(value)
53 |         # None and the empty string are both expected to be rejected with DataError.
54 |         with pytest.raises(expected):
55 |             validator.validate()
56 | 
57 | 
58 | class Test_UrlValidator_validate:
59 |     @pytest.mark.parametrize(["value"], [["http://www.google.com"], ["https://github.com/"]])
60 |     def test_normal(self, value):
61 |         validator = UrlValidator(value)
62 |         assert validator.source_type == SourceType.URL
63 |         validator.validate()
64 | 
65 |     @pytest.mark.parametrize(
66 |         ["value", "expected"],
67 |         [[None, ptr.UrlError], ["", ptr.UrlError], ["www.google.com", ptr.UrlError]],
68 |     )
69 |     def test_exception(self, value, expected):
70 |         validator = UrlValidator(value)
71 |         # None, the empty string and a URL without a scheme are all expected to raise UrlError.
72 |         with pytest.raises(expected):
73 |             validator.validate()
74 | 
75 | 
76 | class Test_is_fifo:
77 |     @pytest.mark.skipif(
78 |         platform.system() == "Windows",
79 |         reason="platform dependent tests: only failed at GitHub Actions",
80 |     )
81 |     def test_filename_too_long(self):
82 |         assert not is_fifo("a" * 1000)  # a 1000-character name must yield a falsy result, not an exception
83 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist =
 3 |     py{37,38,39,310,311}
 4 |     pypy3
 5 |     build
 6 |     cov
 7 |     docs
 8 |     fmt
 9 |     lint
10 |     readme
11 | 
12 | [testenv]
13 | extras =
14 |     test
15 | commands =
16 |     pytest {posargs}
17 | 
18 | [testenv:build]
19 | deps =
20 |     build>=0.10
21 |     twine
22 |     wheel
23 | commands =
24 |     python -m build
25 |     twine check dist/*.whl dist/*.tar.gz
26 | 
27 | [testenv:clean]
28 | skip_install = true
29 | deps =
30 |     cleanpy>=0.4
31 | commands =
32 |     cleanpy --all --exclude-envs .
33 | 
34 | [testenv:cov]
35 | extras =
36 |     test
37 | deps =
38 |     coverage[toml]>=5
39 | commands =
40 |     coverage run -m pytest {posargs:-vv}
41 |     coverage report -m
42 | 
43 | [testenv:docs]
44 | deps =
45 |     -r{toxinidir}/requirements/docs_requirements.txt
46 | commands =
47 |     sphinx-build docs/ docs/_build
48 | 
49 | [testenv:fmt]
50 | skip_install = true
51 | deps =
52 |     autoflake>=2
53 |     black>=23.1
54 |     isort>=5
55 | commands =
56 |     black setup.py test pytablereader
57 |     autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports .
58 |     isort .
59 | 
60 | [testenv:lint]
61 | skip_install = true
62 | deps =
63 |     codespell>=2
64 |     #mypy>=1
65 |     pylama>=8.4.1
66 | commands =
67 |     python setup.py check
68 |     #mypy pytablereader setup.py --ignore-missing-imports --show-error-context --show-error-codes --python-version 3.5
69 |     codespell pytablereader docs/pages examples test -q 2 --check-filenames --ignore-words-list te --exclude-file "test/data/python - Wiktionary.html"
70 |     pylama
71 | 
72 | [testenv:readme]
73 | skip_install = true
74 | changedir = docs
75 | deps =
76 |     path
77 |     readmemaker>=1.1.0
78 | commands =
79 |     python make_readme.py
80 | 


--------------------------------------------------------------------------------