├── .github ├── issue_template.md ├── pull_request_template.md ├── stale.yml └── workflows │ └── release.yml ├── .gitignore ├── .travis.yml ├── LEAD.md ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── data ├── 2-files.zip ├── datapackage.json ├── matrix.csv ├── special │ ├── accent.csv │ ├── adjust_floating_point_error.xlsx │ ├── bom.csv │ ├── doublequote.csv │ ├── doublequote.csv.zip │ ├── escaping.csv │ ├── issue305.csv │ ├── issue320.xlsx │ ├── latin1.csv │ ├── long.csv │ ├── merged-cells-boolean.xls │ ├── merged-cells.xls │ ├── merged-cells.xlsx │ ├── multiline-headers.xlsx │ ├── number_format_multicode.xlsx │ ├── preserve-formatting-percentage.xlsx │ ├── preserve-formatting.xlsx │ ├── sheet2.xls │ ├── sheet2.xlsx │ ├── sheets.xlsx │ ├── skip-blank-at-the-end.csv │ ├── skip-rows-before-headers.csv │ ├── skip-rows.csv │ ├── skip-rows.xlsx │ ├── table-with-booleans.ods │ ├── table-with-booleans.xls │ ├── table-with-ints-floats-dates.ods │ ├── table-with-ints-floats-dates.xls │ ├── table.bad-format │ ├── table.csv.html │ ├── table.csv.zip │ └── test_scientific_notation.xlsx ├── table-dicts.json ├── table-lists.json ├── table-reverse.csv ├── table.csv ├── table.csv.gz ├── table.csv.zip ├── table.ndjson ├── table.ods ├── table.tsv ├── table.xls ├── table.xlsx ├── table1.html ├── table2.html ├── table3.html ├── table4.html └── table_unicode_headers.csv ├── examples ├── __init__.py └── stream.py ├── pylama.ini ├── pytest.ini ├── setup.cfg ├── setup.py ├── tabulator ├── VERSION ├── __init__.py ├── __main__.py ├── cli.py ├── config.py ├── exceptions.py ├── helpers.py ├── loader.py ├── loaders │ ├── __init__.py │ ├── aws.py │ ├── local.py │ ├── remote.py │ ├── stream.py │ └── text.py ├── parser.py ├── parsers │ ├── __init__.py │ ├── csv.py │ ├── datapackage.py │ ├── gsheet.py │ ├── html.py │ ├── inline.py │ ├── json.py │ ├── ndjson.py │ ├── ods.py │ ├── sql.py │ ├── tsv.py │ ├── xls.py │ └── xlsx.py ├── stream.py ├── validate.py ├── writer.py └── writers │ 
├── __init__.py │ ├── csv.py │ ├── json.py │ ├── sql.py │ └── xlsx.py └── tests ├── __init__.py ├── conftest.py ├── formats ├── __init__.py ├── test_csv.py ├── test_datapackage.py ├── test_gsheet.py ├── test_html.py ├── test_inline.py ├── test_json.py ├── test_ndjson.py ├── test_ods.py ├── test_sql.py ├── test_tsv.py ├── test_xls.py └── test_xlsx.py ├── schemes ├── __init__.py ├── test_aws.py ├── test_local.py ├── test_remote.py ├── test_stream.py └── test_text.py ├── test_cli.py ├── test_helpers.py ├── test_stream.py └── test_validate.py /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug share as much as possible to reproduce it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request. Make sure that tests pass before publishing it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 90 3 | 4 | # Number of days of inactivity before a stale issue is closed 5 | daysUntilClose: 30 6 | 7 | # Issues with these labels will never be considered stale 8 | exemptLabels: 9 | - feature 10 | - enhancement 11 | - bug 12 | 13 | # Label to use when marking an issue as stale 14 | staleLabel: wontfix 15 | 16 | # Comment to post when marking an issue as stale. 
Set to `false` to disable 17 | markComment: > 18 | This issue has been automatically marked as stale because it has not had 19 | recent activity. It will be closed if no further activity occurs. Thank you 20 | for your contributions. 21 | 22 | # Comment to post when closing a stale issue. Set to `false` to disable 23 | closeComment: false 24 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v1 14 | - name: Release 15 | uses: softprops/action-gh-release@v1 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .pytest_cache/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | # Node 61 | node_modules/ 62 | 63 | # Virtualenv 64 | venv/ 65 | venv2/ 66 | venv3/ 67 | 68 | # Tmux 69 | .tmuxp.yml 70 | 71 | # Project 72 | tmp 73 | .projectile 74 | .~lock* 75 | 76 | # Extra 77 | datapackage 78 | .#* 79 | .idea/ 80 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: 2 | xenial 3 | 4 | sudo: 5 | required 6 | 7 | language: 8 | python 9 | 10 | python: 11 | - 2.7 12 | - 3.6 13 | - 3.7 14 | - 3.8 15 | 16 | install: 17 | - make install 18 | - pip install coveralls 19 | 20 | script: 21 | - make test 22 | 23 | after_success: 24 | - coveralls 25 | 26 | jobs: 27 | include: 28 | - stage: release 29 | if: tag IS present 30 | python: 3.8 31 | deploy: 32 | provider: pypi 33 | user: okfn 34 | distributions: sdist bdist_wheel 35 | skip_cleanup: true 36 | on: 37 | tags: true 38 | password: 39 | secure: 
Iuf7V4+XHL6wwFYt4IyEe0vWLGO/uOpMJWQnO+1eUjmcQ1qi4E9vyEJvsJRzWKm5+/Lv9uFIRGlmpNWQzUPs5VnMc3LEBh7Clv/WIlRGvi+omCeWoEPAPUueF8qjBcvpT37QNzjB5QXJY074uAihmKh/DU2xA4K0yCB8YQefBHYeNBl0pNYVnELUW8BFmz0GE0lTwHOnM681vgR01LdPjrgIHVEvnTZkKYtDXc/cwkw610fqrFS10srnTX6KjjC/pgDm4WSuaUxbPycmriIhZR29QgAx24NO/wrdGdp5H8TIsvBFnNFlC4QuHfwiXdAKpjL6cMu2uMo639Sev/484XxTorg2QQvNhNAJtiESVAaqVviAlmUItGdmsw4xhZb0JK6NC8fOuOoccL4DBD6JtCyGurwSpznuGXh1DQUYZ7fTd5qaUDnzBuhYGc8XDvcj14XU4P5OKES4NdruRVJOwFiNSMOAT6wm8b2Ue6N+FvgsghjwUr9ESKBrPj0VoouC2+FGZWT65vt/3R9PhFuBdC6SgMLWHESBuU5GW9Bc2ucS3HUi+uUV1IGjpfIsc3qifojNJiaU7hSAggJs9QlXd7goH2fKhb9ro2klzcDKmpBLXmMk3uH0QRpv1dGUYFtgGeEFN93vP3cxYsXf8OvV+MuCxYYGgrGZu3h8fvbc5hY= 40 | -------------------------------------------------------------------------------- /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylama.ini 5 | include pytest.ini 6 | include README.md 7 | include tox.ini 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list readme release templates test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | install: 12 | pip install --upgrade -e .[datapackage,develop,ods,html] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | readme: 18 | pip install md-toc 19 | pip install referencer 20 | referencer $(PACKAGE) README.md --in-place 21 | md_toc -p github --header-levels 3 README.md 22 | sed -i '/(#$(PACKAGE)-py)/,+2d' README.md 23 | 24 | release: 25 | git checkout master && git pull origin && git fetch -p 26 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20 27 | @echo "\nReleasing v$(VERSION) in 10 seconds. 
Press to abort\n" && sleep 10 28 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)' 29 | git push --follow-tags 30 | 31 | templates: 32 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 33 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 34 | 35 | test: 36 | pylama $(PACKAGE) 37 | pytest --cov ${PACKAGE} --cov-report term-missing --cov-fail-under 90 38 | 39 | version: 40 | @echo $(VERSION) 41 | -------------------------------------------------------------------------------- /data/2-files.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/2-files.zip -------------------------------------------------------------------------------- /data/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test-tabulator", 3 | "resources": [ 4 | { "name": "first-resource", 5 | "path": "table.xls", 6 | "schema": { 7 | "fields": [ 8 | { 9 | "name": "id", 10 | "type": "number" 11 | }, 12 | { 13 | "name": "name", 14 | "type": "string" 15 | } 16 | ] 17 | } 18 | }, 19 | {"name": "number-two", "path": "table-reverse.csv", 20 | "schema": { 21 | "fields": [ 22 | { 23 | "name": "id", 24 | "type": "integer" 25 | }, 26 | { 27 | "name": "name", 28 | "type": "string" 29 | } 30 | ] 31 | }} 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /data/matrix.csv: -------------------------------------------------------------------------------- 1 | f1,f2,f3,f4 2 | 11,12,13,14 3 | 21,22,23,24 4 | 31,32,33,34 5 | 41,42,43,44 6 | -------------------------------------------------------------------------------- /data/special/accent.csv: -------------------------------------------------------------------------------- 1 | 
n_amenageur;n_operateur;n_enseigne;id_station;n_station;ad_station;code_insee;Xlongitude;Ylatitude;nbre_pdc;id_pdc;puiss_max;type_prise;acces_recharge;accessibilité;observations;date_maj 2 | XXX;YYY;ZZZ;FR*A17*P*ZZZ*3*_*_*_;Parking 1;D109A;06090;6.92641;43.59413;2;FR*A17*E*ZZZ*3*1*1*_;22.0;T2 - E/F;Payant (badge, appli et QR code);24h/24;RAS;2018/03/31 3 | XXX;YYY;ZZZ;FR*A17*P*ZZZ*3*_*_*_;Parking 1;D109A;06090;6.92641;43.59413;2;FR*A17*E*ZZZ*3*1*2*_;22.0;T2 - E/F;Payant (badge, appli et QR code);24h/24;RAS;2018/3/31 4 | -------------------------------------------------------------------------------- /data/special/adjust_floating_point_error.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/adjust_floating_point_error.xlsx -------------------------------------------------------------------------------- /data/special/bom.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /data/special/doublequote.csv: -------------------------------------------------------------------------------- 1 | "INCISO","NOMBREINCISO","UE","NOMBREUE","AÑO","CODIGOAP","NOMBREAP","DESCRIPCIONAP","CODIGOPROGRAMA","NOMBREPROGRAMA","DESCRIPCIONPROGRAMA","INCISOCODIGO","UECODIGO","ue_cod","presupuestado","pfi","pbi" 2 | "A.N.E.P.","Administración Nacional de Educación Pública","CES","Consejo de Educación Secundaria",2019,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos 
relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",608,"Inversiones edilicias y equipamiento"," ",25,3,25003,74350819,"Inversión",".003465" 3 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2015,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",5,"Formación en Educación","Formación de profesionales de la educación, incluyendo la formación de educadores sociales. La Ley General de Educación Nº 18.43, en su artículo nro. 31 define la formación en educación. Ésta... 
'""se concebirá como enseñanza terciaria universitaria y abarcará la formación de maestros, maestros técnicos, profesores, profesores de educación física y educadores sociales, así como de otras formaciones que el Sistema Nacional de Educación requiera'"".",25,5,25005,117396841,"Funcionamiento",".007937" 4 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2015,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",5,"Formación en Educación","Formación de profesionales de la educación, incluyendo la formación de educadores sociales. La Ley General de Educación Nº 18.43, en su artículo nro. 31 define la formación en educación. Ésta... 
'""se concebirá como enseñanza terciaria universitaria y abarcará la formación de maestros, maestros técnicos, profesores, profesores de educación física y educadores sociales, así como de otras formaciones que el Sistema Nacional de Educación requiera'"".",25,5,25005,38107510,"Inversión",".002576" 5 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2015,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",5,"Formación en Educación","Formación de profesionales de la educación, incluyendo la formación de educadores sociales. La Ley General de Educación Nº 18.43, en su artículo nro. 31 define la formación en educación. Ésta... 
'""se concebirá como enseñanza terciaria universitaria y abarcará la formación de maestros, maestros técnicos, profesores, profesores de educación física y educadores sociales, así como de otras formaciones que el Sistema Nacional de Educación requiera'"".",25,5,25005,1616869527,"Personal",".109327" 6 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2016,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. 
Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",601,"Administración de la Educación y Gestión de Políticas Transversales"," ",25,5,25005,25856879,"Funcionamiento",".001586" 7 | -------------------------------------------------------------------------------- /data/special/doublequote.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/doublequote.csv.zip -------------------------------------------------------------------------------- /data/special/escaping.csv: -------------------------------------------------------------------------------- 1 | ID, Test 2 | 1, "Test line 1" 3 | 2, "Test "" line 2" 4 | 3, "Test \" line 3" 5 | -------------------------------------------------------------------------------- /data/special/issue320.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/issue320.xlsx -------------------------------------------------------------------------------- /data/special/latin1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/latin1.csv -------------------------------------------------------------------------------- /data/special/long.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,a 3 | 2,b 4 | 3,c 5 | 4,d 6 | 5,e 7 | 6,f 8 | -------------------------------------------------------------------------------- /data/special/merged-cells-boolean.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/merged-cells-boolean.xls -------------------------------------------------------------------------------- /data/special/merged-cells.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/merged-cells.xls -------------------------------------------------------------------------------- /data/special/merged-cells.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/merged-cells.xlsx -------------------------------------------------------------------------------- /data/special/multiline-headers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/multiline-headers.xlsx -------------------------------------------------------------------------------- /data/special/number_format_multicode.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/number_format_multicode.xlsx -------------------------------------------------------------------------------- /data/special/preserve-formatting-percentage.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/preserve-formatting-percentage.xlsx -------------------------------------------------------------------------------- /data/special/preserve-formatting.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/preserve-formatting.xlsx -------------------------------------------------------------------------------- /data/special/sheet2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/sheet2.xls -------------------------------------------------------------------------------- /data/special/sheet2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/sheet2.xlsx -------------------------------------------------------------------------------- /data/special/sheets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/sheets.xlsx -------------------------------------------------------------------------------- /data/special/skip-blank-at-the-end.csv: -------------------------------------------------------------------------------- 1 | test1,test2 2 | #testing comment 3 | 1,2 4 | 5 | -------------------------------------------------------------------------------- /data/special/skip-rows-before-headers.csv: -------------------------------------------------------------------------------- 1 | # it's a comment! 2 | id,name 3 | 1,english 4 | 2,中国人 5 | -------------------------------------------------------------------------------- /data/special/skip-rows.csv: -------------------------------------------------------------------------------- 1 | # it's a comment! 2 | id,name 3 | 1,english 4 | # it's a comment! 
5 | 2,中国人 6 | -------------------------------------------------------------------------------- /data/special/skip-rows.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/skip-rows.xlsx -------------------------------------------------------------------------------- /data/special/table-with-booleans.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-booleans.ods -------------------------------------------------------------------------------- /data/special/table-with-booleans.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-booleans.xls -------------------------------------------------------------------------------- /data/special/table-with-ints-floats-dates.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-ints-floats-dates.ods -------------------------------------------------------------------------------- /data/special/table-with-ints-floats-dates.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-ints-floats-dates.xls -------------------------------------------------------------------------------- /data/special/table.bad-format: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | 
-------------------------------------------------------------------------------- /data/special/table.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table.csv.zip -------------------------------------------------------------------------------- /data/special/test_scientific_notation.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/test_scientific_notation.xlsx -------------------------------------------------------------------------------- /data/table-dicts.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "name": "english" 5 | }, 6 | { 7 | "id": 2, 8 | "name": "中国人" 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /data/table-lists.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["id", "name"], 3 | [1, "english"], 4 | [2, "中国人"] 5 | ] 6 | -------------------------------------------------------------------------------- /data/table-reverse.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,中国人 3 | 2,english 4 | -------------------------------------------------------------------------------- /data/table.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /data/table.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.csv.gz 
-------------------------------------------------------------------------------- /data/table.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.csv.zip -------------------------------------------------------------------------------- /data/table.ndjson: -------------------------------------------------------------------------------- 1 | {"id":1,"name":"english"} 2 | {"id":2,"name":"中国人"} 3 | -------------------------------------------------------------------------------- /data/table.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.ods -------------------------------------------------------------------------------- /data/table.tsv: -------------------------------------------------------------------------------- 1 | id name 2 | 1 english 3 | 2 中国人 4 | 3 \N 5 | -------------------------------------------------------------------------------- /data/table.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.xls -------------------------------------------------------------------------------- /data/table.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.xlsx -------------------------------------------------------------------------------- /data/table1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
idname
1english
2中国人
21 | 22 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
idname
1english
2中国人
27 | 28 | 29 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
idname
3french
4עברית
21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
idname
1english
2中国人
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
idname
3french
4עברית
49 | 50 | 2 | 3 | id 4 | name 5 | 6 | 7 | 1 8 | english 9 | 10 | 11 | 2 12 | 中国人 13 | 14 | -------------------------------------------------------------------------------- /data/table_unicode_headers.csv: -------------------------------------------------------------------------------- 1 | id,国人 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/examples/__init__.py -------------------------------------------------------------------------------- /examples/stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import sys 9 | from tabulator import Stream 10 | 11 | 12 | print('Parse csv format:') 13 | source = 'data/table.csv' 14 | with Stream(source, headers='row1') as stream: 15 | print(stream.headers) 16 | for row in stream: 17 | print(row) 18 | 19 | 20 | print('\nParse linear tsv format:') 21 | source = 'data/table.tsv' 22 | with Stream(source, headers='row1') as stream: 23 | print(stream.headers) 24 | for row in stream: 25 | print(row) 26 | 27 | 28 | print('\nParse json with dicts:') 29 | source = 'file://data/table-dicts.json' 30 | with Stream(source) as stream: 31 | print(stream.headers) 32 | for row in stream: 33 | print(row) 34 | 35 | 36 | print('\nParse json with lists:') 37 | source = 'file://data/table-lists.json' 38 | with Stream(source, headers='row1') as stream: 39 | print(stream.headers) 40 | for row in stream: 41 | print(row) 42 | 43 | 44 | print('\nParse xls format:') 45 | source = 'data/table.xls' 46 | with Stream(source, headers='row1') as 
stream: 47 | print(stream.headers) 48 | for row in stream: 49 | print(row) 50 | 51 | 52 | print('\nParse xlsx format:') 53 | source = 'data/table.xlsx' 54 | with Stream(source, headers='row1') as stream: 55 | print(stream.headers) 56 | for row in stream: 57 | print(row) 58 | 59 | 60 | # print('\nLoad from stream scheme:') 61 | source = io.open('data/table.csv', mode='rb') 62 | with Stream(source, headers='row1', format='csv') as stream: 63 | print(stream.headers) 64 | for row in stream: 65 | print(row) 66 | 67 | 68 | print('\nLoad from text scheme:') 69 | source = 'text://id,name\n1,english\n2,中国人\n' 70 | with Stream(source, headers='row1', format='csv') as stream: 71 | print(stream.headers) 72 | for row in stream: 73 | print(row) 74 | 75 | 76 | print('\nLoad from http scheme:') 77 | source = 'https://raw.githubusercontent.com' 78 | source += '/okfn/tabulator-py/master/data/table.csv' 79 | with Stream(source, headers='row1') as stream: 80 | print(stream.headers) 81 | for row in stream: 82 | print(row) 83 | 84 | 85 | print('\nUsage of inline lists:') 86 | source = [['id', 'name'], ['1', 'english'], ('2', '中国人')] 87 | with Stream(source, headers='row1') as stream: 88 | print(stream.headers) 89 | for row in stream: 90 | print(row) 91 | 92 | 93 | print('\nUsage of inline lists (keyed):') 94 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 95 | with Stream(source) as stream: 96 | print(stream.headers) 97 | for row in stream: 98 | print(row) 99 | 100 | 101 | print('\nIter with keyed rows representation:') 102 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 103 | with Stream(source, headers=1) as stream: 104 | print(stream.headers) 105 | for row in stream.iter(keyed=True): 106 | print(row) 107 | 108 | 109 | print('\nTable reset and read limit:') 110 | source = 'data/table.csv' 111 | with Stream(source, headers='row1') as stream: 112 | print(stream.headers) 113 | print(stream.read(limit=1)) 114 | stream.reset() 115 | 
print(stream.read(limit=1)) 116 | 117 | 118 | print('\nLate headers (on a second row):') 119 | source = 'data/special/late_headers.csv' 120 | with Stream(source, headers='row2') as stream: 121 | print(stream.headers) 122 | for row in stream: 123 | print(row) 124 | 125 | 126 | print('\nSpaces in headers:') 127 | source = 'https://raw.githubusercontent.com/datasets/gdp/master/data/gdp.csv' 128 | with Stream(source, headers='row1') as stream: 129 | print(stream.headers) 130 | for row in stream.read(limit=5): 131 | print(row) 132 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | ignore = E128, E301,E306,E731 4 | 5 | [pylama:pep8] 6 | max_line_length = 120 7 | 8 | [pylama:mccabe] 9 | complexity = 36 10 | 11 | [pylama:*/__init__.py] 12 | ignore = W0611 13 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | markers = 4 | remote: marks tests as requiring Internet (deselect with '-m "not remote"') 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*paths): 13 | """Read a text file.""" 14 | basedir = os.path.dirname(__file__) 15 | fullpath = 
os.path.join(basedir, *paths) 16 | contents = io.open(fullpath, encoding='utf-8').read().strip() 17 | return contents 18 | 19 | 20 | # Prepare 21 | PACKAGE = 'tabulator' 22 | INSTALL_REQUIRES = [ 23 | # General 24 | 'six>=1.9', 25 | 'click>=6.0', 26 | 'requests>=2.8', 27 | 'chardet>=3.0', 28 | 'boto3>=1.9', 29 | # Format: csv 30 | 'unicodecsv>=0.14', 31 | # Format: json 32 | 'ijson>=3.0.3', 33 | # Format: ndjson 34 | 'jsonlines>=1.1', 35 | # Format: sql 36 | 'sqlalchemy>=0.9.6', 37 | # Format: tsv 38 | 'linear-tsv>=1.0', 39 | # Format: xls 40 | 'xlrd>=1.0', 41 | # Format: xlsx 42 | 'openpyxl>=2.6', 43 | ] 44 | INSTALL_FORMAT_DATAPACKAGE_REQUIRES = [ 45 | 'datapackage>=1.12', 46 | ] 47 | INSTALL_FORMAT_ODS_REQUIRES = [ 48 | 'ezodf>=0.3', 49 | 'lxml>=3.0', 50 | ] 51 | INSTALL_PARSER_HTML_REQUIRES = [ 52 | 'pyquery<1.4.2', 53 | ] 54 | INSTALL_CCHARDET_REQUIRES = [ 55 | 'cchardet>=2.0', 56 | ] 57 | TESTS_REQUIRE = [ 58 | 'mock', 59 | 'pylama', 60 | 'pytest', 61 | 'pytest-cov', 62 | 'moto[server]', 63 | ] 64 | README = read('README.md') 65 | VERSION = read(PACKAGE, 'VERSION') 66 | PACKAGES = find_packages(exclude=['examples', 'tests']) 67 | 68 | 69 | # Run 70 | setup( 71 | name=PACKAGE, 72 | version=VERSION, 73 | packages=PACKAGES, 74 | include_package_data=True, 75 | install_requires=INSTALL_REQUIRES, 76 | tests_require=TESTS_REQUIRE, 77 | extras_require={ 78 | 'datapackage': INSTALL_FORMAT_DATAPACKAGE_REQUIRES, 79 | 'develop': TESTS_REQUIRE, 80 | 'ods': INSTALL_FORMAT_ODS_REQUIRES, 81 | 'html': INSTALL_PARSER_HTML_REQUIRES, 82 | 'cchardet': INSTALL_CCHARDET_REQUIRES, 83 | }, 84 | entry_points={ 85 | 'console_scripts': [ 86 | 'tabulator = tabulator.__main__:cli', 87 | ] 88 | }, 89 | zip_safe=False, 90 | long_description=README, 91 | long_description_content_type='text/markdown', 92 | description='Consistent interface for stream reading and writing tabular data (csv/xls/json/etc)', 93 | author='Open Knowledge Foundation', 94 | author_email='info@okfn.org', 95 | 
url='https://github.com/frictionlessdata/tabulator-py', 96 | license='MIT', 97 | keywords=[ 98 | 'frictionless data', 99 | ], 100 | classifiers=[ 101 | 'Development Status :: 4 - Beta', 102 | 'Environment :: Web Environment', 103 | 'Intended Audience :: Developers', 104 | 'License :: OSI Approved :: MIT License', 105 | 'Operating System :: OS Independent', 106 | 'Programming Language :: Python :: 2', 107 | 'Programming Language :: Python :: 2.7', 108 | 'Programming Language :: Python :: 3', 109 | 'Programming Language :: Python :: 3.3', 110 | 'Programming Language :: Python :: 3.4', 111 | 'Programming Language :: Python :: 3.5', 112 | 'Programming Language :: Python :: 3.6', 113 | 'Programming Language :: Python :: 3.7', 114 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 115 | 'Topic :: Software Development :: Libraries :: Python Modules' 116 | ], 117 | ) 118 | -------------------------------------------------------------------------------- /tabulator/VERSION: -------------------------------------------------------------------------------- 1 | 1.53.5 2 | -------------------------------------------------------------------------------- /tabulator/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from . 
import config 7 | __version__ = config.VERSION 8 | 9 | 10 | # Module API 11 | 12 | from .cli import cli 13 | from .stream import Stream 14 | from .loader import Loader 15 | from .parser import Parser 16 | from .writer import Writer 17 | from .validate import validate 18 | from .exceptions import TabulatorException 19 | from .exceptions import SourceError 20 | from .exceptions import SchemeError 21 | from .exceptions import FormatError 22 | from .exceptions import EncodingError 23 | from .exceptions import CompressionError 24 | 25 | # Deprecated 26 | 27 | from . import exceptions 28 | from .exceptions import IOError 29 | from .exceptions import LoadingError 30 | from .exceptions import HTTPError 31 | -------------------------------------------------------------------------------- /tabulator/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import cli 2 | 3 | 4 | # Module API 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /tabulator/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | # from __future__ import unicode_literals 6 | 7 | import six 8 | import click 9 | import tabulator 10 | from . import config 11 | from . 
import exceptions 12 | 13 | 14 | # Module API 15 | 16 | @click.command(help='') 17 | @click.argument('source') 18 | @click.option('--headers', type=click.INT) 19 | @click.option('--scheme') 20 | @click.option('--format') 21 | @click.option('--encoding') 22 | @click.option('--limit', type=click.INT) 23 | @click.option('--sheet') 24 | @click.option('--fill-merged-cells', is_flag=True, default=None) 25 | @click.option('--preserve-formatting', is_flag=True, default=None) 26 | @click.option('--adjust-floating-point-error', is_flag=True, default=None) 27 | @click.option('--table') 28 | @click.option('--order_by') 29 | @click.option('--resource') 30 | @click.option('--property') 31 | @click.option('--keyed', is_flag=True, default=None) 32 | @click.version_option(config.VERSION, message='%(version)s') 33 | def cli(source, limit, **options): 34 | """Command-line interface 35 | 36 | ``` 37 | Usage: tabulator [OPTIONS] SOURCE 38 | 39 | Options: 40 | --headers INTEGER 41 | --scheme TEXT 42 | --format TEXT 43 | --encoding TEXT 44 | --limit INTEGER 45 | --sheet TEXT/INTEGER (excel) 46 | --fill-merged-cells BOOLEAN (excel) 47 | --preserve-formatting BOOLEAN (excel) 48 | --adjust-floating-point-error BOOLEAN (excel) 49 | --table TEXT (sql) 50 | --order_by TEXT (sql) 51 | --resource TEXT/INTEGER (datapackage) 52 | --property TEXT (json) 53 | --keyed BOOLEAN (json) 54 | --version Show the version and exit. 55 | --help Show this message and exit. 
56 | ``` 57 | 58 | """ 59 | 60 | # Normalize options 61 | options = {key: value for key, value in options.items() if value is not None} 62 | try: 63 | options['sheet'] = int(options.get('sheet')) 64 | options['resource'] = int(options.get('resource')) 65 | except Exception: 66 | pass 67 | 68 | # Read the table 69 | try: 70 | with tabulator.Stream(source, **options) as stream: 71 | cast = str 72 | if six.PY2: 73 | cast = unicode # noqa 74 | if stream.headers: 75 | click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True)) 76 | for count, row in enumerate(stream, start=1): 77 | click.echo(','.join(map(cast, row))) 78 | if count == limit: 79 | break 80 | except exceptions.TabulatorException as exception: 81 | click.echo('[error] %s' % str(exception)) 82 | exit(1) 83 | -------------------------------------------------------------------------------- /tabulator/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | 10 | 11 | # General 12 | 13 | VERSION = io.open(os.path.join(os.path.dirname(__file__), 'VERSION')).read().strip() 14 | DEFAULT_SCHEME = 'file' 15 | DEFAULT_ENCODING = 'utf-8' 16 | DEFAULT_SAMPLE_SIZE = 100 17 | DEFAULT_BYTES_SAMPLE_SIZE = 10000 18 | SUPPORTED_COMPRESSION = ['zip', 'gz'] 19 | SUPPORTED_HASHING_ALGORITHMS = ['md5', 'sha1', 'sha256', 'sha512'] 20 | ENCODING_CONFIDENCE = 0.5 21 | HTTP_HEADERS = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) ' + 23 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 24 | 'Chrome/54.0.2840.87 Safari/537.36' 25 | } 26 | CSV_SAMPLE_LINES = 100 27 | # http://docs.sqlalchemy.org/en/latest/dialects/index.html 28 | SQL_SCHEMES = ['firebird', 'mssql', 'mysql', 'oracle', 'postgresql', 'sqlite', 'sybase'] 29 | 
S3_DEFAULT_ENDPOINT_URL = 'https://s3.amazonaws.com' 30 | 31 | # Loaders 32 | 33 | LOADERS = { 34 | 's3': 'tabulator.loaders.aws.AWSLoader', 35 | 'file': 'tabulator.loaders.local.LocalLoader', 36 | 'http': 'tabulator.loaders.remote.RemoteLoader', 37 | 'https': 'tabulator.loaders.remote.RemoteLoader', 38 | 'ftp': 'tabulator.loaders.remote.RemoteLoader', 39 | 'ftps': 'tabulator.loaders.remote.RemoteLoader', 40 | 'stream': 'tabulator.loaders.stream.StreamLoader', 41 | 'text': 'tabulator.loaders.text.TextLoader', 42 | } 43 | 44 | # Parsers 45 | 46 | PARSERS = { 47 | 'csv': 'tabulator.parsers.csv.CSVParser', 48 | 'datapackage': 'tabulator.parsers.datapackage.DataPackageParser', 49 | 'gsheet': 'tabulator.parsers.gsheet.GsheetParser', 50 | 'html': 'tabulator.parsers.html.HTMLTableParser', 51 | 'inline': 'tabulator.parsers.inline.InlineParser', 52 | 'json': 'tabulator.parsers.json.JSONParser', 53 | 'jsonl': 'tabulator.parsers.ndjson.NDJSONParser', 54 | 'ndjson': 'tabulator.parsers.ndjson.NDJSONParser', 55 | 'ods': 'tabulator.parsers.ods.ODSParser', 56 | 'sql': 'tabulator.parsers.sql.SQLParser', 57 | 'tsv': 'tabulator.parsers.tsv.TSVParser', 58 | 'xls': 'tabulator.parsers.xls.XLSParser', 59 | 'xlsx': 'tabulator.parsers.xlsx.XLSXParser', 60 | } 61 | 62 | # Writers 63 | 64 | WRITERS = { 65 | 'csv': 'tabulator.writers.csv.CSVWriter', 66 | 'json': 'tabulator.writers.json.JSONWriter', 67 | 'xlsx': 'tabulator.writers.xlsx.XLSXWriter', 68 | 'sql': 'tabulator.writers.sql.SQLWriter', 69 | } 70 | -------------------------------------------------------------------------------- /tabulator/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | class TabulatorException(Exception): 11 | """Base class for all tabulator 
exceptions. 12 | """ 13 | pass 14 | 15 | 16 | class SourceError(TabulatorException): 17 | """The source file could not be parsed correctly. 18 | """ 19 | pass 20 | 21 | 22 | class SchemeError(TabulatorException): 23 | """The file scheme is not supported. 24 | """ 25 | pass 26 | 27 | 28 | class FormatError(TabulatorException): 29 | """The file format is unsupported or invalid. 30 | """ 31 | pass 32 | 33 | 34 | class EncodingError(TabulatorException): 35 | """Encoding error 36 | """ 37 | pass 38 | 39 | 40 | class CompressionError(TabulatorException): 41 | """Compression error 42 | """ 43 | pass 44 | 45 | 46 | # Deprecated 47 | 48 | OptionsError = TabulatorException 49 | ResetError = TabulatorException 50 | 51 | 52 | class IOError(SchemeError): 53 | """Local loading error 54 | """ 55 | pass 56 | 57 | 58 | class LoadingError(IOError): 59 | """Local loading error 60 | """ 61 | pass 62 | 63 | 64 | class HTTPError(LoadingError): 65 | """Remote loading error 66 | """ 67 | pass 68 | -------------------------------------------------------------------------------- /tabulator/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import re 9 | import six 10 | import codecs 11 | import hashlib 12 | from copy import copy 13 | from importlib import import_module 14 | from six.moves.urllib.parse import parse_qs, urlparse, urlunparse 15 | from . import exceptions 16 | from . import config 17 | 18 | 19 | # Module API 20 | 21 | def detect_scheme_and_format(source): 22 | """Detect scheme and format based on source and return as a tuple. 23 | 24 | Scheme is a minimum 2 letters before `://` (will be lower cased). 
25 | For example `http` from `http://example.com/table.csv` 26 | 27 | """ 28 | 29 | # Scheme: stream 30 | if hasattr(source, 'read'): 31 | return ('stream', None) 32 | 33 | # Format: inline 34 | if not isinstance(source, six.string_types): 35 | return (None, 'inline') 36 | 37 | # Format: gsheet 38 | if 'docs.google.com/spreadsheets' in source: 39 | if 'export' not in source and 'pub' not in source: 40 | return (None, 'gsheet') 41 | elif 'csv' in source: 42 | return ('https', 'csv') 43 | 44 | # Format: sql 45 | for sql_scheme in config.SQL_SCHEMES: 46 | if source.startswith('%s://' % sql_scheme): 47 | return (None, 'sql') 48 | 49 | # General 50 | parsed = urlparse(source) 51 | scheme = parsed.scheme.lower() 52 | if len(scheme) < 2: 53 | scheme = config.DEFAULT_SCHEME 54 | format = os.path.splitext(parsed.path or parsed.netloc)[1][1:].lower() or None 55 | if format is None: 56 | # Test if query string contains a "format=" parameter. 57 | query_string = parse_qs(parsed.query) 58 | query_string_format = query_string.get("format") 59 | if query_string_format is not None and len(query_string_format) == 1: 60 | format = query_string_format[0] 61 | 62 | # Format: datapackage 63 | if parsed.path.endswith('datapackage.json'): 64 | return (None, 'datapackage') 65 | 66 | return (scheme, format) 67 | 68 | 69 | # TODO: consider merging cp1252/iso8859-1 70 | def detect_encoding(sample, encoding=None): 71 | """Detect encoding of a byte string sample. 
72 | """ 73 | # To reduce tabulator import time 74 | try: 75 | from cchardet import detect 76 | except ImportError: 77 | from chardet import detect 78 | if encoding is not None: 79 | return normalize_encoding(sample, encoding) 80 | result = detect(sample) 81 | confidence = result['confidence'] or 0 82 | encoding = result['encoding'] or 'ascii' 83 | encoding = normalize_encoding(sample, encoding) 84 | if confidence < config.ENCODING_CONFIDENCE: 85 | encoding = config.DEFAULT_ENCODING 86 | if encoding == 'ascii': 87 | encoding = config.DEFAULT_ENCODING 88 | return encoding 89 | 90 | 91 | def normalize_encoding(sample, encoding): 92 | """Normalize encoding including 'utf-8-sig', 'utf-16-be', utf-16-le tweaks. 93 | """ 94 | encoding = codecs.lookup(encoding).name 95 | # Work around 'Incorrect detection of utf-8-sig encoding' 96 | # 97 | if encoding == 'utf-8': 98 | if sample.startswith(codecs.BOM_UTF8): 99 | encoding = 'utf-8-sig' 100 | # Use the BOM stripping name (without byte-order) for UTF-16 encodings 101 | elif encoding == 'utf-16-be': 102 | if sample.startswith(codecs.BOM_UTF16_BE): 103 | encoding = 'utf-16' 104 | elif encoding == 'utf-16-le': 105 | if sample.startswith(codecs.BOM_UTF16_LE): 106 | encoding = 'utf-16' 107 | return encoding 108 | 109 | 110 | def detect_html(text): 111 | """Detect if text is HTML. 112 | """ 113 | pattern = re.compile('\\s*<(!doctype|html)', re.IGNORECASE) 114 | return bool(pattern.match(text)) 115 | 116 | 117 | def reset_stream(stream): 118 | """Reset stream pointer to the first element. 119 | 120 | If stream is not seekable raise Exception. 121 | 122 | """ 123 | try: 124 | position = stream.tell() 125 | except Exception: 126 | position = True 127 | if position != 0: 128 | try: 129 | stream.seek(0) 130 | except Exception: 131 | message = 'It\'s not possible to reset this stream' 132 | raise exceptions.TabulatorException(message) 133 | 134 | 135 | def ensure_dir(path): 136 | """Ensure path directory exists. 
137 | """ 138 | dirpath = os.path.dirname(path) 139 | if dirpath and not os.path.exists(dirpath): 140 | os.makedirs(dirpath) 141 | 142 | 143 | def requote_uri(uri): 144 | """Requote uri if it contains non-ascii chars, spaces etc. 145 | """ 146 | # To reduce tabulator import time 147 | import requests.utils 148 | if six.PY2: 149 | def url_encode_non_ascii(bytes): 150 | pattern = '[\x80-\xFF]' 151 | replace = lambda c: ('%%%02x' % ord(c.group(0))).upper() 152 | return re.sub(pattern, replace, bytes) 153 | parts = urlparse(uri) 154 | uri = urlunparse( 155 | part.encode('idna') if index == 1 156 | else url_encode_non_ascii(part.encode('utf-8')) 157 | for index, part in enumerate(parts)) 158 | return requests.utils.requote_uri(uri) 159 | 160 | 161 | def import_attribute(path): 162 | """Import attribute by path like `package.module.attribute` 163 | """ 164 | module_name, attribute_name = path.rsplit('.', 1) 165 | module = import_module(module_name) 166 | attribute = getattr(module, attribute_name) 167 | return attribute 168 | 169 | 170 | def extract_options(options, names): 171 | """Return options for names and remove it from given options in-place. 172 | """ 173 | result = {} 174 | for name, value in copy(options).items(): 175 | if name in names: 176 | result[name] = value 177 | del options[name] 178 | return result 179 | 180 | 181 | def stringify_value(value): 182 | """Convert any value to string. 183 | """ 184 | if value is None: 185 | return u'' 186 | isoformat = getattr(value, 'isoformat', None) 187 | if isoformat is not None: 188 | value = isoformat() 189 | return type(u'')(value) 190 | 191 | 192 | class BytesStatsWrapper(object): 193 | """This class is intended to be used as 194 | 195 | stats = {'size': 0, 'hash': ''} 196 | bytes = BytesStatsWrapper(bytes, stats) 197 | 198 | It will be updating the stats during reading. 
199 | 200 | """ 201 | 202 | def __init__(self, bytes, stats): 203 | self.__hasher = getattr(hashlib, stats['hashing_algorithm'])() 204 | self.__bytes = bytes 205 | self.__stats = stats 206 | 207 | def __getattr__(self, name): 208 | return getattr(self.__bytes, name) 209 | 210 | @property 211 | def closed(self): 212 | return self.__bytes.closed 213 | 214 | def read1(self, size=None): 215 | chunk = self.__bytes.read1(size) 216 | self.__hasher.update(chunk) 217 | self.__stats['size'] += len(chunk) 218 | self.__stats['hash'] = self.__hasher.hexdigest() 219 | return chunk 220 | -------------------------------------------------------------------------------- /tabulator/loader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from six import add_metaclass 8 | from abc import ABCMeta, abstractmethod 9 | 10 | 11 | # Module API 12 | 13 | @add_metaclass(ABCMeta) 14 | class Loader(object): 15 | """Abstract class implemented by the data loaders 16 | 17 | The loaders inherit and implement this class' methods to add support for a 18 | new scheme (e.g. ssh). 19 | 20 | # Arguments 21 | bytes_sample_size (int): Sample size in bytes 22 | **options (dict): Loader options 23 | 24 | """ 25 | 26 | # Public 27 | 28 | options = [] 29 | 30 | def __init__(self, bytes_sample_size, **options): 31 | pass 32 | 33 | @abstractmethod 34 | def load(self, source, mode='t', encoding=None): 35 | """Load source file. 36 | 37 | # Arguments 38 | source (str): Path to tabular source file. 39 | mode (str, optional): 40 | Text stream mode, `t` (text) or `b` (binary). Defaults to `t`. 41 | encoding (str, optional): 42 | Source encoding. Auto-detect by default. 43 | 44 | # Returns 45 | Union[TextIO, BinaryIO]: I/O stream opened either as text or binary. 
46 | 47 | """ 48 | pass 49 | -------------------------------------------------------------------------------- /tabulator/loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tabulator/loaders/__init__.py -------------------------------------------------------------------------------- /tabulator/loaders/aws.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import boto3 10 | from six.moves.urllib.parse import urlparse 11 | from ..loader import Loader 12 | from .. import exceptions 13 | from .. import helpers 14 | from .. import config 15 | 16 | 17 | # Module API 18 | 19 | class AWSLoader(Loader): 20 | """Loader to load source from the AWS. 
21 | """ 22 | 23 | # Public 24 | 25 | remote = True 26 | options = [ 27 | 's3_endpoint_url', 28 | ] 29 | 30 | def __init__(self, 31 | bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE, 32 | s3_endpoint_url=None): 33 | self.__bytes_sample_size = bytes_sample_size 34 | self.__s3_endpoint_url = ( 35 | s3_endpoint_url or 36 | os.environ.get('S3_ENDPOINT_URL') or 37 | config.S3_DEFAULT_ENDPOINT_URL) 38 | self.__s3_client = boto3.client('s3', endpoint_url=self.__s3_endpoint_url) 39 | self.__stats = None 40 | 41 | def attach_stats(self, stats): 42 | self.__stats = stats 43 | 44 | def load(self, source, mode='t', encoding=None): 45 | 46 | # Prepare bytes 47 | try: 48 | parts = urlparse(source, allow_fragments=False) 49 | response = self.__s3_client.get_object(Bucket=parts.netloc, Key=parts.path[1:]) 50 | # https://github.com/frictionlessdata/tabulator-py/issues/271 51 | bytes = io.BufferedRandom(io.BytesIO()) 52 | bytes.write(response['Body'].read()) 53 | bytes.seek(0) 54 | if self.__stats: 55 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 56 | except Exception as exception: 57 | raise exceptions.LoadingError(str(exception)) 58 | 59 | # Return bytes 60 | if mode == 'b': 61 | return bytes 62 | 63 | # Detect encoding 64 | if self.__bytes_sample_size: 65 | sample = bytes.read(self.__bytes_sample_size) 66 | bytes.seek(0) 67 | encoding = helpers.detect_encoding(sample, encoding) 68 | 69 | # Prepare chars 70 | chars = io.TextIOWrapper(bytes, encoding) 71 | 72 | return chars 73 | -------------------------------------------------------------------------------- /tabulator/loaders/local.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from ..loader import Loader 9 | from .. import exceptions 10 | from .. 
import helpers 11 | from .. import config 12 | 13 | 14 | # Module API 15 | 16 | class LocalLoader(Loader): 17 | """Loader to load source from filesystem. 18 | """ 19 | 20 | # Public 21 | 22 | options = [] 23 | 24 | def __init__(self, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE): 25 | self.__bytes_sample_size = bytes_sample_size 26 | self.__stats = None 27 | 28 | def attach_stats(self, stats): 29 | self.__stats = stats 30 | 31 | def load(self, source, mode='t', encoding=None): 32 | 33 | # Prepare source 34 | scheme = 'file://' 35 | if source.startswith(scheme): 36 | source = source.replace(scheme, '', 1) 37 | 38 | # Prepare bytes 39 | try: 40 | bytes = io.open(source, 'rb') 41 | if self.__stats: 42 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 43 | except IOError as exception: 44 | raise exceptions.LoadingError(str(exception)) 45 | 46 | # Return bytes 47 | if mode == 'b': 48 | return bytes 49 | 50 | # Detect encoding 51 | if self.__bytes_sample_size: 52 | sample = bytes.read(self.__bytes_sample_size) 53 | bytes.seek(0) 54 | encoding = helpers.detect_encoding(sample, encoding) 55 | 56 | # Prepare chars 57 | chars = io.TextIOWrapper(bytes, encoding) 58 | 59 | return chars 60 | -------------------------------------------------------------------------------- /tabulator/loaders/remote.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import six 9 | import requests 10 | from ..loader import Loader 11 | from .. import exceptions 12 | from .. import helpers 13 | from .. import config 14 | 15 | 16 | # Module API 17 | 18 | class RemoteLoader(Loader): 19 | """Loader to load source from the web. 
20 | """ 21 | 22 | # Public 23 | 24 | remote = True 25 | options = [ 26 | 'http_session', 27 | 'http_stream', 28 | 'http_timeout', 29 | ] 30 | 31 | def __init__(self, 32 | bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE, 33 | http_session=None, 34 | http_stream=True, 35 | http_timeout=None): 36 | 37 | # Create default session 38 | if not http_session: 39 | http_session = requests.Session() 40 | http_session.headers.update(config.HTTP_HEADERS) 41 | 42 | # No stream support 43 | if six.PY2: 44 | http_stream = False 45 | 46 | # Set attributes 47 | self.__bytes_sample_size = bytes_sample_size 48 | self.__http_session = http_session 49 | self.__http_stream = http_stream 50 | self.__http_timeout = http_timeout 51 | self.__stats = None 52 | 53 | def attach_stats(self, stats): 54 | self.__stats = stats 55 | 56 | def load(self, source, mode='t', encoding=None): 57 | 58 | # Prepare source 59 | source = helpers.requote_uri(source) 60 | 61 | # Prepare bytes 62 | try: 63 | bytes = _RemoteStream(source, self.__http_session, self.__http_timeout).open() 64 | if not self.__http_stream: 65 | buffer = io.BufferedRandom(io.BytesIO()) 66 | buffer.write(bytes.read()) 67 | buffer.seek(0) 68 | bytes = buffer 69 | if self.__stats: 70 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 71 | except IOError as exception: 72 | raise exceptions.HTTPError(str(exception)) 73 | 74 | # Return bytes 75 | if mode == 'b': 76 | return bytes 77 | 78 | # Detect encoding 79 | if self.__bytes_sample_size: 80 | sample = bytes.read(self.__bytes_sample_size)[:self.__bytes_sample_size] 81 | bytes.seek(0) 82 | encoding = helpers.detect_encoding(sample, encoding) 83 | 84 | # Prepare chars 85 | chars = io.TextIOWrapper(bytes, encoding) 86 | 87 | return chars 88 | 89 | 90 | # Internal 91 | 92 | class _RemoteStream(object): 93 | 94 | # It's possible to implement cache for bytes sample 95 | # size to prevent additional HTTP calls used in seek 96 | 97 | # Public 98 | 99 | remote = True 100 | 101 | def 
__init__(self, source, session, timeout): 102 | self.__source = source 103 | self.__session = session 104 | self.__timeout = timeout 105 | 106 | def readable(self): 107 | return True 108 | 109 | def writable(self): 110 | return False 111 | 112 | def seekable(self): 113 | return True 114 | 115 | @property 116 | def closed(self): 117 | return self.__closed 118 | 119 | def open(self): 120 | self.__closed = False 121 | self.seek(0) 122 | return self 123 | 124 | def close(self): 125 | self.__closed = True 126 | 127 | def tell(self): 128 | return self.__response.raw.tell() 129 | 130 | def flush(self): 131 | pass 132 | 133 | def read(self, size=None): 134 | return self.__response.raw.read(size) 135 | 136 | def read1(self, size=None): 137 | return self.__response.raw.read(size) 138 | 139 | def seek(self, offset, whence=0): 140 | assert offset == 0 141 | assert whence == 0 142 | self.__response = self.__session.get(self.__source, stream=True, timeout=self.__timeout) 143 | self.__response.raise_for_status() 144 | self.__response.raw.decode_content = True 145 | -------------------------------------------------------------------------------- /tabulator/loaders/stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from ..loader import Loader 9 | from .. import exceptions 10 | from .. import helpers 11 | from .. import config 12 | 13 | 14 | # Module API 15 | 16 | class StreamLoader(Loader): 17 | """Loader to load source from file-like byte stream. 
18 | """ 19 | 20 | # Public 21 | 22 | options = [] 23 | 24 | def __init__(self, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE): 25 | self.__bytes_sample_size = bytes_sample_size 26 | self.__stats = None 27 | 28 | def attach_stats(self, stats): 29 | self.__stats = stats 30 | 31 | def load(self, source, mode='t', encoding=None): 32 | 33 | # Support only bytes 34 | if hasattr(source, 'encoding'): 35 | message = 'Only byte streams are supported.' 36 | raise exceptions.SourceError(message) 37 | 38 | # Prepare bytes 39 | bytes = source 40 | if self.__stats: 41 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 42 | 43 | # Return bytes 44 | if mode == 'b': 45 | return bytes 46 | 47 | # Detect encoding 48 | if self.__bytes_sample_size: 49 | sample = bytes.read(self.__bytes_sample_size) 50 | bytes.seek(0) 51 | encoding = helpers.detect_encoding(sample, encoding) 52 | 53 | # Prepare chars 54 | chars = io.TextIOWrapper(bytes, encoding) 55 | 56 | return chars 57 | -------------------------------------------------------------------------------- /tabulator/loaders/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from ..loader import Loader 9 | from .. import helpers 10 | from .. import config 11 | 12 | 13 | # Module API 14 | 15 | class TextLoader(Loader): 16 | """Loader to load source from text. 
17 | """ 18 | 19 | # Public 20 | 21 | options = [] 22 | 23 | def __init__(self, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE): 24 | self.__bytes_sample_size = bytes_sample_size 25 | self.__stats = None 26 | 27 | def attach_stats(self, stats): 28 | self.__stats = stats 29 | 30 | def load(self, source, mode='t', encoding=None): 31 | 32 | # Prepare source 33 | scheme = 'text://' 34 | if source.startswith(scheme): 35 | source = source.replace(scheme, '', 1) 36 | 37 | # Prepare bytes 38 | bytes = io.BufferedRandom(io.BytesIO()) 39 | bytes.write(source.encode(encoding or config.DEFAULT_ENCODING)) 40 | bytes.seek(0) 41 | if self.__stats: 42 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 43 | 44 | # Return bytes 45 | if mode == 'b': 46 | return bytes 47 | 48 | # Prepare chars 49 | chars = io.TextIOWrapper(bytes, encoding) 50 | 51 | return chars 52 | -------------------------------------------------------------------------------- /tabulator/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from six import add_metaclass 8 | from abc import ABCMeta, abstractmethod 9 | 10 | 11 | # Module API 12 | 13 | @add_metaclass(ABCMeta) 14 | class Parser(object): 15 | """Abstract class implemented by the data parsers. 16 | 17 | The parsers inherit and implement this class' methods to add support for a 18 | new file type. 19 | 20 | # Arguments 21 | loader (tabulator.Loader): Loader instance to read the file. 22 | force_parse (bool): 23 | When `True`, the parser yields an empty extended 24 | row tuple `(row_number, None, [])` when there is an error parsing a 25 | row. Otherwise, it stops the iteration by raising the exception 26 | `tabulator.exceptions.SourceError`. 
27 | **options (dict): Loader options 28 | 29 | """ 30 | 31 | # Public 32 | 33 | options = [] 34 | 35 | def __init__(self, loader, force_parse, **options): 36 | pass 37 | 38 | @property 39 | @abstractmethod 40 | def closed(self): 41 | """Flag telling if the parser is closed. 42 | 43 | # Returns 44 | bool: whether closed 45 | 46 | """ 47 | pass # pragma: no cover 48 | 49 | @abstractmethod 50 | def open(self, source, encoding=None): 51 | """Open underlying file stream in the beginning of the file. 52 | 53 | The parser gets a byte or text stream from the `tabulator.Loader` 54 | instance and start emitting items. 55 | 56 | # Arguments 57 | source (str): Path to source table. 58 | encoding (str, optional): Source encoding. Auto-detect by default. 59 | 60 | # Returns 61 | None 62 | 63 | """ 64 | pass # pragma: no cover 65 | 66 | @abstractmethod 67 | def close(self): 68 | """Closes underlying file stream. 69 | """ 70 | pass # pragma: no cover 71 | 72 | @abstractmethod 73 | def reset(self): 74 | """Resets underlying stream and current items list. 75 | 76 | After `reset()` is called, iterating over the items will start from the beginning. 77 | """ 78 | pass # pragma: no cover 79 | 80 | @property 81 | @abstractmethod 82 | def encoding(self): 83 | """Encoding 84 | 85 | # Returns 86 | str: encoding 87 | 88 | """ 89 | pass # pragma: no cover 90 | 91 | @property 92 | @abstractmethod 93 | def extended_rows(self): 94 | """Returns extended rows iterator. 95 | 96 | The extended rows are tuples containing `(row_number, headers, row)`, 97 | 98 | # Raises 99 | SourceError: 100 | If `force_parse` is `False` and 101 | a row can't be parsed, this exception will be raised. 102 | Otherwise, an empty extended row is returned (i.e. 103 | `(row_number, None, [])`). 
104 | 105 | Returns: 106 | Iterator[Tuple[int, List[str], List[Any]]]: 107 | Extended rows containing 108 | `(row_number, headers, row)`, where `headers` is a list of the 109 | header names (can be `None`), and `row` is a list of row 110 | values. 111 | 112 | """ 113 | pass # pragma: no cover 114 | -------------------------------------------------------------------------------- /tabulator/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tabulator/parsers/__init__.py -------------------------------------------------------------------------------- /tabulator/parsers/csv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import csv 8 | import six 9 | from itertools import chain 10 | from codecs import iterencode 11 | from ..parser import Parser 12 | from .. import helpers 13 | from .. import config 14 | 15 | 16 | # Module API 17 | 18 | class CSVParser(Parser): 19 | """Parser to parse CSV data format. 
20 | """ 21 | 22 | # Public 23 | 24 | options = [ 25 | 'delimiter', 26 | 'doublequote', 27 | 'escapechar', 28 | 'quotechar', 29 | 'quoting', 30 | 'skipinitialspace', 31 | 'lineterminator' 32 | ] 33 | 34 | def __init__(self, loader, force_parse=False, **options): 35 | 36 | # Make bytes 37 | if six.PY2: 38 | for key, value in options.items(): 39 | if isinstance(value, six.string_types): 40 | options[key] = str(value) 41 | 42 | # Set attributes 43 | self.__loader = loader 44 | self.__options = options 45 | self.__force_parse = force_parse 46 | self.__extended_rows = None 47 | self.__encoding = None 48 | self.__dialect = None 49 | self.__chars = None 50 | 51 | @property 52 | def closed(self): 53 | return self.__chars is None or self.__chars.closed 54 | 55 | def open(self, source, encoding=None): 56 | self.close() 57 | self.__chars = self.__loader.load(source, encoding=encoding) 58 | self.__encoding = getattr(self.__chars, 'encoding', encoding) 59 | if self.__encoding: 60 | self.__encoding.lower() 61 | self.reset() 62 | 63 | def close(self): 64 | if not self.closed: 65 | self.__chars.close() 66 | 67 | def reset(self): 68 | helpers.reset_stream(self.__chars) 69 | self.__extended_rows = self.__iter_extended_rows() 70 | 71 | @property 72 | def encoding(self): 73 | return self.__encoding 74 | 75 | @property 76 | def dialect(self): 77 | if self.__dialect: 78 | dialect = { 79 | 'delimiter': self.__dialect.delimiter, 80 | 'doubleQuote': self.__dialect.doublequote, 81 | 'lineTerminator': self.__dialect.lineterminator, 82 | 'quoteChar': self.__dialect.quotechar, 83 | 'skipInitialSpace': self.__dialect.skipinitialspace, 84 | } 85 | if self.__dialect.escapechar is not None: 86 | dialect['escapeChar'] = self.__dialect.escapechar 87 | return dialect 88 | 89 | @property 90 | def extended_rows(self): 91 | return self.__extended_rows 92 | 93 | # Private 94 | 95 | def __iter_extended_rows(self): 96 | 97 | # For PY2 encode/decode 98 | if six.PY2: 99 | # Reader requires utf-8 encoded 
stream 100 | bytes = iterencode(self.__chars, 'utf-8') 101 | sample, dialect = self.__prepare_dialect(bytes) 102 | items = csv.reader(chain(sample, bytes), dialect=dialect) 103 | for row_number, item in enumerate(items, start=1): 104 | values = [] 105 | for value in item: 106 | value = value.decode('utf-8') 107 | values.append(value) 108 | yield (row_number, None, list(values)) 109 | 110 | # For PY3 use chars 111 | else: 112 | sample, dialect = self.__prepare_dialect(self.__chars) 113 | items = csv.reader(chain(sample, self.__chars), dialect=dialect) 114 | for row_number, item in enumerate(items, start=1): 115 | yield (row_number, None, list(item)) 116 | 117 | def __prepare_dialect(self, stream): 118 | 119 | # Get sample 120 | sample = [] 121 | while True: 122 | try: 123 | sample.append(next(stream)) 124 | except StopIteration: 125 | break 126 | if len(sample) >= config.CSV_SAMPLE_LINES: 127 | break 128 | 129 | # Get dialect 130 | try: 131 | separator = b'' if six.PY2 else '' 132 | delimiter = self.__options.get('delimiter', ',\t;|') 133 | dialect = csv.Sniffer().sniff(separator.join(sample), delimiter) 134 | if not dialect.escapechar: 135 | dialect.doublequote = True 136 | except csv.Error: 137 | class dialect(csv.excel): 138 | pass 139 | for key, value in self.__options.items(): 140 | setattr(dialect, key, value) 141 | # https://github.com/frictionlessdata/FrictionlessDarwinCore/issues/1 142 | if getattr(dialect, 'quotechar', None) == '': 143 | setattr(dialect, 'quoting', csv.QUOTE_NONE) 144 | 145 | self.__dialect = dialect 146 | return sample, dialect 147 | -------------------------------------------------------------------------------- /tabulator/parsers/datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | 
import datapackage 9 | from ..parser import Parser 10 | from .. import exceptions 11 | 12 | 13 | # Module API 14 | 15 | class DataPackageParser(Parser): 16 | """Parser to extract data from Tabular Data Packages. 17 | """ 18 | 19 | # Public 20 | 21 | options = [ 22 | 'resource', 23 | ] 24 | 25 | def __init__(self, loader, force_parse=False, resource=0): 26 | self.__force_parse = force_parse 27 | self.__resource_pointer = resource 28 | self.__extended_rows = None 29 | self.__encoding = None 30 | self.__fragment = None 31 | self.__resource = None 32 | 33 | @property 34 | def closed(self): 35 | return self.__extended_rows is None 36 | 37 | def open(self, source, encoding=None): 38 | self.close() 39 | package = datapackage.DataPackage(source) 40 | if isinstance(self.__resource_pointer, six.string_types): 41 | self.__resource = package.get_resource(self.__resource_pointer) 42 | else: 43 | try: 44 | self.__resource = package.resources[self.__resource_pointer] 45 | except (TypeError, IndexError): 46 | pass 47 | if not self.__resource: 48 | message = 'Data package "%s" doesn\'t have resource "%s"' 49 | raise exceptions.SourceError(message % (source, self.__resource_pointer)) 50 | self.__resource.infer() 51 | self.__encoding = self.__resource.descriptor.get('encoding') 52 | self.__fragment = self.__resource.name 53 | self.reset() 54 | 55 | def close(self): 56 | if not self.closed: 57 | self.__extended_rows = None 58 | 59 | def reset(self): 60 | self.__extended_rows = self.__iter_extended_rows() 61 | 62 | @property 63 | def encoding(self): 64 | return self.__encoding 65 | 66 | @property 67 | def fragment(self): 68 | return self.__fragment 69 | 70 | @property 71 | def extended_rows(self): 72 | return self.__extended_rows 73 | 74 | # Private 75 | 76 | def __iter_extended_rows(self): 77 | for row_number, headers, row in self.__resource.iter(extended=True): 78 | yield (row_number - 1, headers, row) 79 | 
class GsheetParser(Parser):
    """Parser to parse Google Spreadsheets.

    Translates a Google Sheets URL into its CSV export URL and delegates
    the actual parsing to an internal CSV `Stream`.
    """

    # Public

    options = []

    def __init__(self, loader, force_parse=False):
        self.__loader = loader
        self.__force_parse = force_parse
        self.__stream = None
        self.__encoding = None

    @property
    def closed(self):
        return self.__stream is None or self.__stream.closed

    def open(self, source, encoding=None):
        self.close()
        export_url = 'https://docs.google.com/spreadsheets/d/%s/export?format=csv&id=%s'
        # Extract the document key and optional sheet gid from the URL
        match = re.search(r'.*/d/(?P<key>[^/]+)/.*?(?:gid=(?P<gid>\d+))?$', source)
        key, gid = '', ''
        if match:
            key = match.group('key')
            gid = match.group('gid')
        url = export_url % (key, key)
        if gid:
            url = '%s&gid=%s' % (url, gid)
        self.__stream = Stream(
            url, format='csv', encoding=encoding, force_parse=self.__force_parse).open()
        self.__extended_rows = self.__stream.iter(extended=True)
        self.__encoding = encoding

    def close(self):
        if not self.closed:
            self.__stream.close()

    def reset(self):
        self.__stream.reset()
        self.__extended_rows = self.__stream.iter(extended=True)

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows
class HTMLTableParser(Parser):
    """Parser to extract data out of HTML tables."""

    # Public

    options = [
        'selector',
        'raw_html'
    ]

    def __init__(self, loader, force_parse=False, selector='table', raw_html=False):
        self.__loader = loader
        self.__selector = selector
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None
        # Cell extractor: inner HTML when raw_html is requested, text otherwise
        self.__extractor = (lambda x: x.html()) if raw_html else (lambda x: x.text())

    @property
    def closed(self):
        return self.__chars is None or self.__chars.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__chars = self.__loader.load(source, encoding=encoding)
        # Normalize the encoding name. The previous code called
        # `self.__encoding.lower()` without assigning the result (a no-op).
        if self.__encoding:
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        if not self.closed:
            self.__chars.close()

    def reset(self):
        helpers.reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):

        # Get page content
        page = pq(self.__chars.read(), parser='html')

        # Find required table
        if self.__selector:
            table = pq(page.find(self.__selector)[0])
        else:
            table = page

        # Collect rows from thead/tbody and direct tr children
        rows = (
            table.children('thead').children('tr') +
            table.children('thead') +
            table.children('tr') +
            table.children('tbody').children('tr')
        )
        rows = [pq(r) for r in rows if len(r) > 0]
        # Extract cells from each non-empty row
        rows = [pq(tr).children('td,th') for tr in rows]
        rows = [[self.__extractor(pq(td)) for td in tr]
                for tr in rows if len(tr) > 0]

        # Yield rows
        for row_number, row in enumerate(rows, start=1):
            yield (row_number, None, row)
17 | """ 18 | 19 | # Public 20 | 21 | options = [] 22 | 23 | def __init__(self, loader, force_parse=False): 24 | self.__loader = loader 25 | self.__force_parse = force_parse 26 | self.__extended_rows = None 27 | self.__encoding = None 28 | self.__source = None 29 | 30 | @property 31 | def closed(self): 32 | return False 33 | 34 | def open(self, source, encoding=None): 35 | if hasattr(source, '__next__' if six.PY3 else 'next'): 36 | message = 'Only callable returning an iterator is supported' 37 | raise exceptions.SourceError(message) 38 | self.close() 39 | self.__source = source 40 | self.__encoding = encoding 41 | self.reset() 42 | 43 | def close(self): 44 | pass 45 | 46 | def reset(self): 47 | self.__extended_rows = self.__iter_extended_rows() 48 | 49 | @property 50 | def encoding(self): 51 | return self.__encoding 52 | 53 | @property 54 | def extended_rows(self): 55 | return self.__extended_rows 56 | 57 | # Private 58 | 59 | def __iter_extended_rows(self): 60 | items = self.__source 61 | if not hasattr(items, '__iter__'): 62 | items = items() 63 | for row_number, item in enumerate(items, start=1): 64 | if isinstance(item, (tuple, list)): 65 | yield (row_number, None, list(item)) 66 | elif isinstance(item, dict): 67 | keys = [] 68 | values = [] 69 | iterator = item.keys() 70 | if not isinstance(item, OrderedDict): 71 | iterator = sorted(iterator) 72 | for key in iterator: 73 | keys.append(key) 74 | values.append(item[key]) 75 | yield (row_number, list(keys), list(values)) 76 | else: 77 | if not self.__force_parse: 78 | message = 'Inline data item has to be tuple, list or dict' 79 | raise exceptions.SourceError(message) 80 | yield (row_number, None, []) 81 | -------------------------------------------------------------------------------- /tabulator/parsers/json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from 
class JSONParser(Parser):
    """Parser to parse JSON data format.

    Streams items with ijson, optionally from a nested `property`
    of the top-level document.
    """

    # Public

    options = [
        'property',
    ]

    def __init__(self, loader, force_parse=False, property=None):
        self.__loader = loader
        self.__property = property
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__bytes = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)
        # Normalize the encoding name. The previous code called
        # `self.__encoding.lower()` without assigning the result (a no-op).
        if self.__encoding:
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        # ijson path: top-level array items, or items of the named property
        path = 'item'
        if self.__property is not None:
            path = '%s.item' % self.__property
        items = ijson.items(self.__bytes, path)
        for row_number, item in enumerate(items, start=1):
            if isinstance(item, (tuple, list)):
                yield (row_number, None, list(item))
            elif isinstance(item, dict):
                # Dict items become keyed rows with deterministic key order
                keys = []
                values = []
                for key in sorted(item.keys()):
                    keys.append(key)
                    values.append(item[key])
                yield (row_number, list(keys), list(values))
            else:
                if not self.__force_parse:
                    message = 'JSON item has to be list or dict'
                    raise exceptions.SourceError(message)
                yield (row_number, None, [])
class ODSParser(Parser):
    """Parser to parse ODF Spreadsheets."""

    # Public

    options = [
        'sheet',
    ]

    def __init__(self, loader, force_parse=False, sheet=1):
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__bytes = None
        self.__book = None
        self.__sheet = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)

        # Load the whole document into memory for ezodf
        self.__book = ezodf.opendoc(BytesIO(self.__bytes.read()))

        # Resolve the sheet by name (string) or 1-based index (integer)
        pointer = self.__sheet_pointer
        try:
            if isinstance(pointer, six.string_types):
                self.__sheet = self.__book.sheets[pointer]
            else:
                self.__sheet = self.__book.sheets[pointer - 1]
        except (KeyError, IndexError):
            message = 'OpenOffice document "%s" doesn\'t have a sheet "%s"'
            raise exceptions.SourceError(message % (source, pointer))

        # Reset parser state
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):

        def type_value(cell):
            """Detects int value, date and datetime"""
            ctype = cell.value_type
            value = cell.value

            # ods numbers are float only; a float with no fractional
            # part is cast back into int
            if isinstance(value, float) and value == value // 1:
                return int(value)

            # Date-only values are 10 chars (YYYY-MM-DD); longer ones
            # carry a time component
            if ctype == 'date':
                if len(value) == 10:
                    return datetime.strptime(value, '%Y-%m-%d').date()
                return datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')

            return value

        for number, row in enumerate(self.__sheet.rows(), start=1):
            yield number, None, [type_value(cell) for cell in row]
16 | """ 17 | 18 | # Public 19 | 20 | options = [ 21 | 'table', 22 | 'order_by', 23 | ] 24 | 25 | def __init__(self, loader, force_parse=False, table=None, order_by=None): 26 | 27 | # Ensure table 28 | if table is None: 29 | raise exceptions.TabulatorException('Format `sql` requires `table` option.') 30 | 31 | # Set attributes 32 | self.__loader = loader 33 | self.__table = table 34 | self.__order_by = order_by 35 | self.__force_parse = force_parse 36 | self.__engine = None 37 | self.__extended_rows = None 38 | self.__encoding = None 39 | 40 | @property 41 | def closed(self): 42 | return self.__engine is None 43 | 44 | def open(self, source, encoding=None): 45 | self.close() 46 | self.__engine = create_engine(source) 47 | self.__engine.update_execution_options(stream_results=True) 48 | self.__encoding = encoding 49 | self.reset() 50 | 51 | def close(self): 52 | if not self.closed: 53 | self.__engine.dispose() 54 | self.__engine = None 55 | 56 | def reset(self): 57 | self.__extended_rows = self.__iter_extended_rows() 58 | 59 | @property 60 | def encoding(self): 61 | return self.__encoding 62 | 63 | @property 64 | def extended_rows(self): 65 | return self.__extended_rows 66 | 67 | # Private 68 | 69 | def __iter_extended_rows(self): 70 | table = sql.table(self.__table) 71 | order = sql.text(self.__order_by) if self.__order_by else None 72 | query = sql.select(['*']).select_from(table).order_by(order) 73 | result = self.__engine.execute(query) 74 | for row_number, row in enumerate(iter(result), start=1): 75 | yield (row_number, list(row.keys()), list(row)) 76 | -------------------------------------------------------------------------------- /tabulator/parsers/tsv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import tsv 8 | from 
class TSVParser(Parser):
    """Parser to parse linear TSV data format.

    See: http://dataprotocols.org/linear-tsv/

    """

    # Public

    options = []

    def __init__(self, loader, force_parse=False):
        self.__loader = loader
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None

    @property
    def closed(self):
        # Parser counts as closed until open() has attached a char stream
        return self.__chars is None or self.__chars.closed

    def open(self, source, encoding=None):
        """Open `source` and prepare the row iterator."""
        self.close()
        self.__chars = self.__loader.load(source, encoding=encoding)
        self.__encoding = getattr(self.__chars, 'encoding', encoding)
        if self.__encoding:
            # BUGFIX: previously `self.__encoding.lower()` was called
            # without assigning the result, so the normalization was a
            # no-op and the encoding could be reported in mixed case.
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        if not self.closed:
            self.__chars.close()

    def reset(self):
        # Rewind the character stream and recreate the row generator
        helpers.reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        # `tsv.un` yields one tuple of unescaped fields per input line
        items = tsv.un(self.__chars)
        for row_number, item in enumerate(items, start=1):
            yield (row_number, None, list(item))
class XLSParser(Parser):
    """Parser to parse Excel data format.
    """

    # Public

    options = [
        'sheet',
        'fill_merged_cells',
    ]

    def __init__(self, loader, force_parse=False, sheet=1, fill_merged_cells=False):
        # `sheet` may be a sheet name (str) or a 1-based index (int);
        # `fill_merged_cells` copies a merged range's top-left value into
        # every cell of the range while iterating rows.
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__fill_merged_cells = fill_merged_cells
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__fragment = None
        self.__bytes = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        """Open `source`, load the workbook and select the target sheet.

        Raises `exceptions.SourceError` if the requested sheet is missing.
        """
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)

        # Get book
        file_contents = self.__bytes.read()
        try:
            # formatting_info exposes merged-cell metadata ...
            self.__book = xlrd.open_workbook(
                file_contents=file_contents,
                encoding_override=encoding,
                formatting_info=True,
                logfile=sys.stderr
            )
        except NotImplementedError:
            # ... but xlrd raises NotImplementedError for inputs where it
            # is unsupported, so retry without formatting info
            self.__book = xlrd.open_workbook(
                file_contents=file_contents,
                encoding_override=encoding,
                formatting_info=False,
                logfile=sys.stderr
            )

        # Get sheet
        try:
            if isinstance(self.__sheet_pointer, six.string_types):
                # Select by sheet name
                self.__sheet = self.__book.sheet_by_name(self.__sheet_pointer)
            else:
                # Select by 1-based pointer (xlrd indexing is 0-based)
                self.__sheet = self.__book.sheet_by_index(self.__sheet_pointer - 1)
        except (xlrd.XLRDError, IndexError):
            message = 'Excel document "%s" doesn\'t have a sheet "%s"'
            raise exceptions.SourceError(message % (source, self.__sheet_pointer))
        self.__fragment = self.__sheet.name

        # Reset parser
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        # Rewind the underlying stream and recreate the row generator
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def fragment(self):
        # Name of the selected sheet (available after open())
        return self.__fragment

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):

        def type_value(ctype, value):
            """ Detects boolean value, int value, datetime """

            # Boolean
            if ctype == xlrd.XL_CELL_BOOLEAN:
                return bool(value)

            # Excel numbers are only float
            # Float with no decimals can be cast into int
            if ctype == xlrd.XL_CELL_NUMBER and value == value // 1:
                return int(value)

            # Datetime
            if ctype == xlrd.XL_CELL_DATE:
                return xlrd.xldate.xldate_as_datetime(value, self.__book.datemode)

            return value

        for x in range(0, self.__sheet.nrows):
            row_number = x + 1
            row = []
            for y, value in enumerate(self.__sheet.row_values(x)):
                value = type_value(self.__sheet.cell(x, y).ctype, value)
                if self.__fill_merged_cells:
                    # Replace the value with the merged range's top-left
                    # cell value when (x, y) lies inside a merged range
                    for xlo, xhi, ylo, yhi in self.__sheet.merged_cells:
                        if x in range(xlo, xhi) and y in range(ylo, yhi):
                            value = type_value(self.__sheet.cell(xlo, ylo).ctype,
                                               self.__sheet.cell_value(xlo, ylo))
                row.append(value)
            yield (row_number, None, row)
class XLSXParser(Parser):
    """Parser to parse Excel modern `xlsx` data format.
    """

    # Public

    options = [
        "sheet",
        "workbook_cache",
        "fill_merged_cells",
        "preserve_formatting",
        "adjust_floating_point_error",
    ]

    def __init__(
        self,
        loader,
        force_parse=False,
        sheet=1,
        workbook_cache=None,
        fill_merged_cells=False,
        preserve_formatting=False,
        adjust_floating_point_error=False,
    ):
        # `sheet` is a sheet name (str) or a 1-based index (int).
        # `workbook_cache` is a dict mapping remote sources to local
        # temp-file paths so repeated opens skip the download.
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__workbook_cache = workbook_cache
        self.__fill_merged_cells = fill_merged_cells
        self.__preserve_formatting = preserve_formatting
        self.__adjust_floating_point_error = adjust_floating_point_error
        self.__extended_rows = None
        self.__encoding = None
        self.__fragment = None
        self.__force_parse = force_parse
        self.__bytes = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding

        # Remote
        # Create copy for remote source
        # For remote stream we need local copy (will be deleted on close by Python)
        # https://docs.python.org/3.5/library/tempfile.html#tempfile.TemporaryFile
        if getattr(self.__loader, "remote", False):
            # Cached
            if self.__workbook_cache is not None and source in self.__workbook_cache:
                self.__bytes = io.open(self.__workbook_cache[source], "rb")
            # Not cached
            else:
                prefix = "tabulator-"
                # Keep the temp file on disk when caching is enabled;
                # otherwise NamedTemporaryFile deletes it on close
                delete = self.__workbook_cache is None
                source_bytes = self.__loader.load(source, mode="b", encoding=encoding)
                target_bytes = NamedTemporaryFile(prefix=prefix, delete=delete)
                shutil.copyfileobj(source_bytes, target_bytes)
                source_bytes.close()
                target_bytes.seek(0)
                self.__bytes = target_bytes
                if self.__workbook_cache is not None:
                    self.__workbook_cache[source] = target_bytes.name
                    # Cached copies are removed at interpreter exit instead
                    atexit.register(os.remove, target_bytes.name)

        # Local
        else:
            self.__bytes = self.__loader.load(source, mode="b", encoding=encoding)

        # Get book
        # To fill merged cells we can't use read-only because
        # `sheet.merged_cell_ranges` is not available in this mode
        self.__book = openpyxl.load_workbook(
            self.__bytes, read_only=not self.__fill_merged_cells, data_only=True
        )

        # Get sheet
        try:
            if isinstance(self.__sheet_pointer, six.string_types):
                # Select by sheet name
                self.__sheet = self.__book[self.__sheet_pointer]
            else:
                # Select by 1-based pointer (worksheets list is 0-based)
                self.__sheet = self.__book.worksheets[self.__sheet_pointer - 1]
        except (KeyError, IndexError):
            message = 'Excel document "%s" doesn\'t have a sheet "%s"'
            raise exceptions.SourceError(message % (source, self.__sheet_pointer))
        self.__fragment = self.__sheet.title
        self.__process_merged_cells()

        # Reset parser
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        # Rewind the underlying stream and recreate the row generator
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def fragment(self):
        # Title of the selected sheet (available after open())
        return self.__fragment

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        for row_number, row in enumerate(self.__sheet.iter_rows(), start=1):
            yield (
                row_number,
                None,
                extract_row_values(
                    row, self.__preserve_formatting, self.__adjust_floating_point_error,
                ),
            )

    def __process_merged_cells(self):
        # When requested, unmerge every merged range and copy its
        # top-left value into all cells of the range so that unmerged
        # cells don't read back as empty
        if self.__fill_merged_cells:
            for merged_cell_range in list(self.__sheet.merged_cells.ranges):
                merged_cell_range = str(merged_cell_range)
                self.__sheet.unmerge_cells(merged_cell_range)
                merged_rows = openpyxl.utils.rows_from_range(merged_cell_range)
                coordinates = list(chain.from_iterable(merged_rows))
                value = self.__sheet[coordinates[0]].value
                for coordinate in coordinates:
                    cell = self.__sheet[coordinate]
                    cell.value = value
# Internal

# Excel date/time format codes and their strftime equivalents
EXCEL_CODES = {
    "yyyy": "%Y",
    "yy": "%y",
    "dddd": "%A",
    "ddd": "%a",
    "dd": "%d",
    "d": "%-d",
    # Different from excel as there is no J-D in strftime
    "mmmmmm": "%b",
    "mmmm": "%B",
    "mmm": "%b",
    "hh": "%H",
    "h": "%-H",
    "ss": "%S",
    "s": "%-S",
    # Possibly different from excel as there is no am/pm in strftime
    "am/pm": "%p",
    # Different from excel as there is no A/P or a/p in strftime
    "a/p": "%p",
}

# "m"/"mm" are ambiguous in Excel: minutes next to hours/seconds,
# months otherwise — hence the two lookup tables
EXCEL_MINUTE_CODES = {
    "mm": "%M",
    "m": "%-M",
}
EXCEL_MONTH_CODES = {
    "mm": "%m",
    "m": "%-m",
}

# Characters passed through verbatim (they terminate a pending code)
EXCEL_MISC_CHARS = [
    "$",
    "+",
    "(",
    ":",
    "^",
    "'",
    "{",
    "<",
    "=",
    "-",
    "/",
    ")",
    "!",
    "&",
    "~",
    "}",
    ">",
    " ",
]

EXCEL_ESCAPE_CHAR = "\\"
EXCEL_SECTION_DIVIDER = ";"


def convert_excel_date_format_string(excel_date):
    """
    Created using documentation here:
    https://support.office.com/en-us/article/review-guidelines-for-customizing-a-number-format-c0a1d1fa-d3f4-4018-96b7-9c9354dd99f5

    Translates an Excel date format string (e.g. "dd/mm/yyyy") into an
    equivalent strftime format string, or returns None when the Excel
    format contains an unsupported code.
    """
    # The python date string that is being built
    python_date = ""
    # The excel code currently being parsed
    excel_code = ""
    prev_code = ""
    # If the previous character was the escape character
    char_escaped = False
    # If we are in a quotation block (surrounded by "")
    quotation_block = False
    # Variables used for checking if a code should be a minute or a month
    checking_minute_or_month = False
    minute_or_month_buffer = ""

    for c in excel_date:
        ec = excel_code.lower()
        # The previous character was an escape, the next character should be added normally
        if char_escaped:
            if checking_minute_or_month:
                minute_or_month_buffer += c
            else:
                python_date += c
            char_escaped = False
            continue
        # Inside a quotation block
        if quotation_block:
            if c == '"':
                # Quotation block should now end
                quotation_block = False
            elif checking_minute_or_month:
                minute_or_month_buffer += c
            else:
                python_date += c
            continue
        # The start of a quotation block
        if c == '"':
            quotation_block = True
            continue
        if c == EXCEL_SECTION_DIVIDER:
            # We ignore excel sections for datetimes
            break

        is_escape_char = c == EXCEL_ESCAPE_CHAR
        # The am/pm and a/p code add some complications, need to make sure we are not that code
        is_misc_char = c in EXCEL_MISC_CHARS and (
            c != "/" or (ec != "am" and ec != "a")
        )
        new_excel_code = False

        # Handle a new code without a different characeter in between
        if (
            ec
            and not is_escape_char
            and not is_misc_char
            # If the code does not start with c, we are in a new code
            and not ec.startswith(c.lower())
            # other than the case where we are building up
            # am/pm (minus the case where it is fully built), we are in a new code
            and (not ec.startswith("a") or ec == "am/pm")
        ):
            new_excel_code = True

        # Code is finished, check if it is a proper code
        if (is_escape_char or is_misc_char or new_excel_code) and ec:
            # Checking if the previous code should have been minute or month
            if checking_minute_or_month:
                if ec == "ss" or ec == "s":
                    # It should be a minute!
                    minute_or_month_buffer = (
                        EXCEL_MINUTE_CODES[prev_code] + minute_or_month_buffer
                    )
                else:
                    # It should be a months!
                    minute_or_month_buffer = (
                        EXCEL_MONTH_CODES[prev_code] + minute_or_month_buffer
                    )
                python_date += minute_or_month_buffer
                checking_minute_or_month = False
                minute_or_month_buffer = ""

            if ec in EXCEL_CODES:
                python_date += EXCEL_CODES[ec]
            # Handle months/minutes differently
            elif ec in EXCEL_MINUTE_CODES:
                # If preceded by hours, we know this is referring to minutes
                if prev_code == "h" or prev_code == "hh":
                    python_date += EXCEL_MINUTE_CODES[ec]
                else:
                    # Have to check if the next code is ss or s
                    checking_minute_or_month = True
                    minute_or_month_buffer = ""
            else:
                # Have to abandon this attempt to convert because the code is not recognized
                return None
            prev_code = ec
            excel_code = ""
        if is_escape_char:
            char_escaped = True
        elif is_misc_char:
            # Add the misc char
            if checking_minute_or_month:
                minute_or_month_buffer += c
            else:
                python_date += c
        else:
            # Just add to the code
            excel_code += c

    # Complete, check if there is still a buffer
    if checking_minute_or_month:
        # We know it's a month because there were no more codes after
        minute_or_month_buffer = EXCEL_MONTH_CODES[prev_code] + minute_or_month_buffer
        python_date += minute_or_month_buffer
    if excel_code:
        ec = excel_code.lower()
        if ec in EXCEL_CODES:
            python_date += EXCEL_CODES[ec]
        elif ec in EXCEL_MINUTE_CODES:
            if prev_code == "h" or prev_code == "hh":
                python_date += EXCEL_MINUTE_CODES[ec]
            else:
                python_date += EXCEL_MONTH_CODES[ec]
        else:
            return None
    return python_date
def eformat(f, prec, exp_digits):
    """
    Formats to Scientific Notation, including precise exponent digits

    """
    rendered = "%.*e" % (prec, f)
    mantissa, _, exponent = rendered.partition("e")
    # add 1 to digits as 1 is taken by sign +/-
    return "%sE%+0*d" % (mantissa, exp_digits + 1, int(exponent))


def convert_excel_number_format_string(
    excel_number, value,
):
    """
    A basic attempt to convert excel number_format to a number string

    The important goal here is to get proper amount of rounding
    """
    if "@" in excel_number:
        # We don't try to parse complicated strings
        return str(value)

    # A trailing '%' means the value is scaled by 100 and suffixed later
    percentage = excel_number.endswith("%")
    if percentage:
        value = value * 100
        excel_number = excel_number[:-1]
    if excel_number == "General":
        return value

    # The second ';'-separated section, when present, formats negatives
    sections = excel_number.split(";")
    excel_number = sections[1] if (value < 0 and len(sections) > 1) else sections[0]

    parts = excel_number.split(".")

    if len(parts) > 2:
        return None

    if len(parts) < 2:
        # No decimals
        formatted = "{0:.0f}".format(value)
    elif re.match(r"^#+0*E\+0*$", parts[1]):
        # Currently we do not support "engineering notation"
        return value
    elif re.match(r"^0*E\+0*$", parts[1]):
        # Handle scientific notation

        # Note, it will only actually be returned as a string if
        # type is not inferred
        precision = len(parts[1]) - len(parts[1].lstrip("0"))
        exponent_digits = len(parts[1]) - len(parts[1].rstrip("0"))
        return eformat(value, precision, exponent_digits)
    else:
        # Only the 0, # and ? characters provide precision information
        digits = "".join(ch for ch in parts[1] if ch in ["0", "#", "?"])
        # Trailing '#' positions are optional: drop them when they
        # render as zero
        optional = len(digits) - len(digits.rstrip("#"))
        formatted = ("{0:." + str(len(digits)) + "f}").format(value)
        for _ in range(optional):
            if formatted.endswith("0"):
                formatted = formatted[:-1]

    if percentage:
        return formatted + "%"

    return formatted
def extract_row_values(
    row, preserve_formatting=False, adjust_floating_point_error=False,
):
    """Return the list of cell values for an openpyxl `row`.

    With `preserve_formatting`, date/time and numeric cells are rendered
    as strings according to each cell's Excel number format (when the
    format can be converted); otherwise raw cell values are returned.
    """
    if preserve_formatting:
        values = []
        for cell in row:
            number_format = cell.number_format or ""
            value = cell.value

            if isinstance(cell.value, datetime.datetime) or isinstance(
                cell.value, datetime.time
            ):
                # Render temporal values via the converted strftime format;
                # fall back to the raw value when conversion returns None
                temporal_format = convert_excel_date_format_string(number_format)
                if temporal_format:
                    value = cell.value.strftime(temporal_format)
            elif (
                adjust_floating_point_error
                and isinstance(cell.value, float)
                and number_format == "General"
            ):
                # We have a float with format General
                # Calculate the number of integer digits
                integer_digits = len(str(int(cell.value)))
                # Set the precision to 15 minus the number of integer digits
                precision = 15 - (integer_digits)
                value = round(cell.value, precision)
            elif isinstance(cell.value, (int, float)):
                new_value = convert_excel_number_format_string(
                    number_format, cell.value,
                )
                if new_value:
                    value = new_value
            values.append(value)
        return values
    return list(cell.value for cell in row)
def validate(source, scheme=None, format=None):
    """Check if tabulator is able to load the source.

    # Arguments
        source (Union[str, IO]): The source path or IO object.
        scheme (str, optional): The source scheme. Auto-detect by default.
        format (str, optional): The source file format. Auto-detect by default.

    # Raises
        SchemeError: The file scheme is not supported.
        FormatError: The file format is not supported.

    # Returns
        bool: Whether tabulator is able to load the source file.

    """

    # Get scheme and format
    detected_scheme, detected_format = helpers.detect_scheme_and_format(source)
    scheme = scheme or detected_scheme
    format = format or detected_format

    # Validate scheme and format
    # NOTE: a `None` scheme is allowed (not validated against the loader
    # registry); the format is always validated
    if scheme is not None:
        if scheme not in config.LOADERS:
            raise exceptions.SchemeError('Scheme "%s" is not supported' % scheme)
    if format not in config.PARSERS:
        raise exceptions.FormatError('Format "%s" is not supported' % format)

    return True
class CSVWriter(Writer):
    """CSV writer.
    """

    # Public

    options = [
        'delimiter',
    ]

    def __init__(self, **options):

        # Make bytes
        # On Python 2 dialect options are normalized to byte strings
        if six.PY2:
            for name in list(options):
                if isinstance(options[name], six.string_types):
                    options[name] = str(options[name])

        # Set attributes
        self.__options = options

    def write(self, source, target, headers, encoding=None):
        """Write `source` rows (prefixed by `headers` when given) to the
        `target` CSV file and return the number of data rows written."""
        helpers.ensure_dir(target)
        written = 0
        with io.open(target, 'wb') as fobj:
            csv_writer = unicodecsv.writer(fobj, encoding=encoding, **self.__options)
            if headers:
                csv_writer.writerow(headers)
            for row in source:
                csv_writer.writerow(row)
                written += 1
        return written
16 | """ 17 | 18 | # Public 19 | 20 | options = [ 21 | 'keyed', 22 | ] 23 | 24 | def __init__(self, keyed=False): 25 | self.__keyed = keyed 26 | 27 | def write(self, source, target, headers, encoding=None): 28 | helpers.ensure_dir(target) 29 | data = [] 30 | count = 0 31 | if not self.__keyed: 32 | data.append(headers) 33 | for row in source: 34 | if self.__keyed: 35 | row = dict(zip(headers, row)) 36 | data.append(row) 37 | count += 1 38 | with open(target, 'w') as file: 39 | json.dump(data, file, indent=2) 40 | return count 41 | -------------------------------------------------------------------------------- /tabulator/writers/sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from sqlalchemy import create_engine, MetaData, Table, Column, String 8 | from ..writer import Writer 9 | from .. import exceptions 10 | 11 | 12 | # Module API 13 | 14 | class SQLWriter(Writer): 15 | """SQL writer. 
16 | """ 17 | 18 | # Public 19 | 20 | options = [ 21 | 'table', 22 | ] 23 | 24 | def __init__(self, table=None, **options): 25 | 26 | # Ensure table 27 | if table is None: 28 | raise exceptions.TabulatorException('Format `sql` requires `table` option.') 29 | 30 | self.__table = table 31 | 32 | def write(self, source, target, headers, encoding=None): 33 | engine = create_engine(target) 34 | count = 0 35 | buffer = [] 36 | buffer_size = 1000 37 | with engine.begin() as conn: 38 | meta = MetaData() 39 | columns = [Column(header, String()) for header in headers] 40 | table = Table(self.__table, meta, *columns) 41 | meta.create_all(conn) 42 | for row in source: 43 | count += 1 44 | buffer.append(row) 45 | if len(buffer) > buffer_size: 46 | conn.execute(table.insert().values(buffer)) 47 | buffer = [] 48 | if len(buffer): 49 | conn.execute(table.insert().values(buffer)) 50 | return count 51 | -------------------------------------------------------------------------------- /tabulator/writers/xlsx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import openpyxl 9 | from ..writer import Writer 10 | from .. import helpers 11 | 12 | 13 | # Module API 14 | 15 | class XLSXWriter(Writer): 16 | """XLSX writer. 
17 | """ 18 | 19 | # Public 20 | 21 | options = [ 22 | 'sheet', 23 | ] 24 | 25 | def __init__(self, **options): 26 | 27 | # Make bytes 28 | if six.PY2: 29 | for key, value in options.items(): 30 | if isinstance(value, six.string_types): 31 | options[key] = str(value) 32 | 33 | # Set attributes 34 | self.__options = options 35 | 36 | def write(self, source, target, headers, encoding=None): 37 | helpers.ensure_dir(target) 38 | count = 0 39 | wb = openpyxl.Workbook(write_only=True) 40 | ws = wb.create_sheet(title=self.__options.get('sheet')) 41 | ws.append(headers) 42 | for row in source: 43 | ws.append(row) 44 | count += 1 45 | wb.save(target) 46 | return count 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | import sqlite3 9 | 10 | 11 | # Fixtures 12 | 13 | @pytest.fixture 14 | def database_url(tmpdir): 15 | path = str(tmpdir.join('database.db')) 16 | conn = sqlite3.connect(path) 17 | conn.execute('CREATE TABLE data (id INTEGER PRIMARY KEY, name TEXT)') 18 | conn.execute('INSERT INTO data VALUES (1, "english"), (2, "中国人")') 19 | conn.commit() 20 | yield 'sqlite:///%s' % path 21 | conn.close() 22 | -------------------------------------------------------------------------------- /tests/formats/__init__.py: -------------------------------------------------------------------------------- 
# Base URL for fetching fixtures from the repository in remote tests
BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s'


# Read

def test_stream_local_csv():
    with Stream('data/table.csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_local_csv_with_bom():
    # BOM must be stripped transparently
    with Stream('data/special/bom.csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_local_csv_with_bom_with_encoding():
    # BOM must be stripped even with an explicit encoding
    with Stream('data/special/bom.csv', encoding='utf-8') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_csv_excel():
    source = 'value1,value2\nvalue3,value4'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_excel_tab():
    source = 'value1\tvalue2\nvalue3\tvalue4'
    with Stream(source, scheme='text', format='csv', delimiter='\t') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_unix():
    source = '"value1","value2"\n"value3","value4"'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_escaping():
    # Backslash-escaped quotes must survive parsing
    with Stream('data/special/escaping.csv', escapechar='\\') as stream:
        assert stream.read() == [
            ['ID', 'Test'],
            ['1', 'Test line 1'],
            ['2', 'Test " line 2'],
            ['3', 'Test " line 3'],
        ]


def test_stream_csv_doublequote():
    # Every row of the fixture must keep its 17 columns intact
    with Stream('data/special/doublequote.csv') as stream:
        for row in stream:
            assert len(row) == 17


def test_stream_stream_csv():
    # Source given as an already-open binary stream
    source = io.open('data/table.csv', mode='rb')
    with Stream(source, format='csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_text_csv():
    # Source given inline via the text scheme
    source = 'text://id,name\n1,english\n2,中国人\n'
    with Stream(source, format='csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


@pytest.mark.remote
def test_stream_remote_csv():
    with Stream(BASE_URL % 'data/table.csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


@pytest.mark.remote
def test_stream_remote_csv_non_ascii_url():
    # URL containing a non-ASCII character must be loadable
    with Stream('http://data.defra.gov.uk/ops/government_procurement_card/over_£500_GPC_apr_2013.csv') as stream:
        assert stream.sample[0] == [
            'Entity',
            'Transaction Posting Date',
            'Merchant Name',
            'Amount',
            'Description']


def test_stream_csv_delimiter():
    source = '"value1";"value2"\n"value3";"value4"'
    with Stream(source, scheme='text', format='csv', delimiter=';') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_escapechar():
    source = 'value1%,value2\nvalue3%,value4'
    with Stream(source, scheme='text', format='csv', escapechar='%') as stream:
        assert stream.read() == [['value1,value2'], ['value3,value4']]


def test_stream_csv_quotechar():
    source = '%value1,value2%\n%value3,value4%'
    with Stream(source, scheme='text', format='csv', quotechar='%') as stream:
        assert stream.read() == [['value1,value2'], ['value3,value4']]


def test_stream_csv_skipinitialspace():
    source = 'value1, value2\nvalue3, value4'
    with Stream(source, scheme='text', format='csv', skipinitialspace=True) as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


# Dialect sniffing

def test_stream_csv_detect_delimiter_tab():
    source = 'a1\tb1\tc1A,c1B\na2\tb2\tc2\n'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['a1', 'b1', 'c1A,c1B'], ['a2', 'b2', 'c2']]


def test_stream_csv_detect_delimiter_semicolon():
    source = 'a1;b1\na2;b2\n'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['a1', 'b1'], ['a2', 'b2']]


def test_stream_csv_detect_delimiter_pipe():
    source = 'a1|b1\na2|b2\n'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['a1', 'b1'], ['a2', 'b2']]


def test_stream_csv_dialect_should_not_persist_if_sniffing_fails_issue_goodtables_228():
    # A failed sniff on source1 must not leak its dialect into source2
    source1 = 'a;b;c\n#comment'
    source2 = 'a,b,c\n#comment'
    with Stream(source1, scheme='text', format='csv', headers=1, delimiter=';') as stream:
        assert stream.headers == ['a', 'b', 'c']
    with Stream(source2, scheme='text', format='csv', headers=1) as stream:
        assert stream.headers == ['a', 'b', 'c']
with Stream(source, scheme='text', format='csv', quotechar='') as stream: 155 | stream.read() == ['value1', 'value2"', 'value3'] 156 | 157 | 158 | # Write 159 | 160 | def test_stream_save_csv(tmpdir): 161 | source = 'data/table.csv' 162 | target = str(tmpdir.join('table.csv')) 163 | with Stream(source, headers=1) as stream: 164 | assert stream.save(target) == 2 165 | with Stream(target, headers=1) as stream: 166 | assert stream.headers == ['id', 'name'] 167 | assert stream.read(extended=True) == [ 168 | (2, ['id', 'name'], ['1', 'english']), 169 | (3, ['id', 'name'], ['2', '中国人']), 170 | ] 171 | 172 | 173 | # Internal 174 | 175 | def test_parser_csv(): 176 | 177 | source = 'data/table.csv' 178 | encoding = None 179 | loader = Mock() 180 | loader.load = Mock(return_value=io.open(source, encoding='utf-8')) 181 | parser = CSVParser(loader) 182 | 183 | assert parser.closed 184 | parser.open(source, encoding=encoding) 185 | assert not parser.closed 186 | 187 | assert list(parser.extended_rows) == [ 188 | (1, None, ['id', 'name']), 189 | (2, None, ['1', 'english']), 190 | (3, None, ['2', '中国人'])] 191 | 192 | assert len(list(parser.extended_rows)) == 0 193 | parser.reset() 194 | assert len(list(parser.extended_rows)) == 3 195 | 196 | parser.close() 197 | assert parser.closed 198 | -------------------------------------------------------------------------------- /tests/formats/test_datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import json 9 | import pytest 10 | from mock import Mock 11 | from tabulator import Stream 12 | from tabulator.parsers.datapackage import DataPackageParser 13 | 14 | 15 | # Read 16 | 17 | 18 | def test_stream_datapackage(): 19 | with Stream('data/datapackage.json', resource=0, headers=1) as 
stream: 20 | assert stream.fragment == 'first-resource' 21 | assert stream.headers == ['id', 'name'] 22 | assert stream.read(keyed=True) == [ 23 | {'id': 1, 'name': 'english'}, 24 | {'id': 2, 'name': '中国人'}] 25 | 26 | 27 | def test_second_resource(): 28 | with Stream('data/datapackage.json', resource=1, headers=1) as stream: 29 | assert stream.fragment == 'number-two' 30 | assert stream.headers == ['id', 'name'] 31 | assert stream.read(keyed=True) == [ 32 | {'id': 1, 'name': '中国人'}, 33 | {'id': 2, 'name': 'english'} 34 | ] 35 | 36 | 37 | def test_named_resource(): 38 | curdir = os.getcwd() 39 | try: 40 | os.chdir('data/') 41 | with Stream('datapackage.json', resource='number-two', headers=1) as stream: 42 | assert stream.fragment == 'number-two' 43 | assert stream.headers == ['id', 'name'] 44 | assert stream.read(keyed=True) == [ 45 | {'id': 1, 'name': '中国人'}, 46 | {'id': 2, 'name': 'english'}, 47 | ] 48 | finally: 49 | os.chdir(curdir) 50 | 51 | 52 | # Internal 53 | 54 | def test_datapackage_parser(): 55 | 56 | source = 'data/datapackage.json' 57 | parser = DataPackageParser(None) 58 | 59 | assert parser.closed is True 60 | parser.open(source) 61 | assert parser.closed is False 62 | 63 | assert list(parser.extended_rows) == [ 64 | (1, ['id', 'name'], [1, 'english']), 65 | (2, ['id', 'name'], [2, '中国人']), 66 | ] 67 | 68 | assert len(list(parser.extended_rows)) == 0 69 | parser.reset() 70 | assert len(list(parser.extended_rows)) == 2 71 | 72 | parser.close() 73 | assert parser.closed 74 | 75 | 76 | def test_datapackage_list(): 77 | curdir= os.getcwd() 78 | try: 79 | os.chdir('data/') 80 | stream = json.load(open('datapackage.json')) 81 | 82 | parser = DataPackageParser(None) 83 | parser.open(stream) 84 | 85 | assert list(parser.extended_rows) == [ 86 | (1, ['id', 'name'], [1, 'english']), 87 | (2, ['id', 'name'], [2, '中国人']) 88 | ] 89 | finally: 90 | os.chdir(curdir) 91 | -------------------------------------------------------------------------------- 
/tests/formats/test_gsheet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tabulator import Stream, exceptions 9 | 10 | 11 | # Read 12 | 13 | @pytest.mark.remote 14 | def test_stream_gsheet(): 15 | source = 'https://docs.google.com/spreadsheets/d/1mHIWnDvW9cALRMq9OdNfRwjAthCUFUOACPp0Lkyl7b4/edit?usp=sharing' 16 | with Stream(source) as stream: 17 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 18 | 19 | 20 | @pytest.mark.remote 21 | def test_stream_gsheet_with_gid(): 22 | source = 'https://docs.google.com/spreadsheets/d/1mHIWnDvW9cALRMq9OdNfRwjAthCUFUOACPp0Lkyl7b4/edit#gid=960698813' 23 | with Stream(source) as stream: 24 | assert stream.read() == [['id', 'name'], ['2', '中国人'], ['3', 'german']] 25 | 26 | 27 | @pytest.mark.remote 28 | def test_stream_gsheet_bad_url(): 29 | stream = Stream('https://docs.google.com/spreadsheets/d/bad') 30 | with pytest.raises(exceptions.HTTPError) as excinfo: 31 | stream.open() 32 | -------------------------------------------------------------------------------- /tests/formats/test_html.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from mock import Mock 10 | from six import StringIO 11 | from tabulator import exceptions, Stream 12 | 13 | 14 | # Read 15 | 16 | @pytest.mark.parametrize('source, selector', [ 17 | ('data/table1.html', 'table'), 18 | ('data/table2.html', 'table'), 19 | ('data/table3.html', '.mememe'), 20 | ('data/table4.html', ''), 21 | ]) 22 | def test_stream_html(source, 
selector): 23 | with Stream(source, selector=selector, headers=1, encoding='utf8') as stream: 24 | assert stream.headers == ['id', 'name'] 25 | assert stream.read(keyed=True) == [ 26 | {'id': '1', 'name': 'english'}, 27 | {'id': '2', 'name': '中国人'}] 28 | 29 | def test_stream_html_raw_html(): 30 | with Stream('data/table3.html', selector='.mememe', headers=1, encoding='utf8', raw_html=True) as stream: 31 | assert stream.headers == ['id', 'name'] 32 | assert stream.read(keyed=True) == [ 33 | {'id': '1', 'name': 'english'}, 34 | {'id': '2', 'name': '中国人'}] 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/formats/test_inline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from collections import OrderedDict 9 | from tabulator import Stream, exceptions 10 | 11 | 12 | # Read 13 | 14 | def test_stream_inline(): 15 | source = [['id', 'name'], ['1', 'english'], ['2', '中国人']] 16 | with Stream(source) as stream: 17 | assert stream.headers is None 18 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 19 | 20 | 21 | def test_stream_inline_iterator(): 22 | source = iter([['id', 'name'], ['1', 'english'], ['2', '中国人']]) 23 | with Stream(source) as stream: 24 | assert stream.headers is None 25 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 26 | 27 | 28 | def test_stream_inline_iterator(): 29 | def generator(): 30 | yield ['id', 'name'] 31 | yield ['1', 'english'] 32 | yield ['2', '中国人'] 33 | with pytest.raises(exceptions.SourceError) as excinfo: 34 | iterator = generator() 35 | Stream(iterator).open() 36 | assert 'callable' in str(excinfo.value) 37 | 38 | 39 | def test_stream_inline_generator(): 40 | def generator(): 41 | 
yield ['id', 'name'] 42 | yield ['1', 'english'] 43 | yield ['2', '中国人'] 44 | with Stream(generator) as stream: 45 | assert stream.headers is None 46 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 47 | 48 | 49 | def test_stream_inline_keyed(): 50 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 51 | with Stream(source, format='inline') as stream: 52 | assert stream.headers is None 53 | assert stream.read() == [['1', 'english'], ['2', '中国人']] 54 | 55 | 56 | def test_stream_inline_keyed_with_headers_argument(): 57 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 58 | with Stream(source, format='inline', headers=['name', 'id']) as stream: 59 | assert stream.headers == ['name', 'id'] 60 | assert stream.read() == [['english', '1'], ['中国人', '2']] 61 | 62 | 63 | def test_stream_inline_ordered_dict(): 64 | source = [ 65 | OrderedDict([('name', 'english'), ('id', '1')]), 66 | OrderedDict([('name', '中国人'), ('id', '2')]), 67 | ] 68 | with Stream(source, headers=1) as stream: 69 | assert stream.headers == ['name', 'id'] 70 | assert stream.read() == [['english', '1'], ['中国人', '2']] 71 | 72 | 73 | # Write 74 | 75 | def test_stream_save_inline_keyed_with_headers_argument(tmpdir): 76 | source = [{'key1': 'value1', 'key2': 'value2'}] 77 | target = str(tmpdir.join('table.csv')) 78 | with Stream(source, headers=['key2', 'key1']) as stream: 79 | stream.save(target) 80 | with Stream(target, headers=1) as stream: 81 | assert stream.headers == ['key2', 'key1'] 82 | assert stream.read() == [['value2', 'value1']] 83 | -------------------------------------------------------------------------------- /tests/formats/test_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | 
import json 9 | import pytest 10 | from mock import Mock 11 | from tabulator import Stream, exceptions 12 | from tabulator.parsers.json import JSONParser 13 | BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s' 14 | 15 | 16 | # Read 17 | 18 | def test_stream_local_json_dicts(): 19 | with Stream('data/table-dicts.json') as stream: 20 | assert stream.headers is None 21 | assert stream.read() == [[1, 'english'], [2, '中国人']] 22 | 23 | 24 | def test_stream_local_json_lists(): 25 | with Stream('data/table-lists.json') as stream: 26 | assert stream.headers is None 27 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 28 | 29 | 30 | def test_stream_text_json_dicts(): 31 | source = '[{"id": 1, "name": "english" }, {"id": 2, "name": "中国人" }]' 32 | with Stream(source, scheme='text', format='json') as stream: 33 | assert stream.headers is None 34 | assert stream.read() == [[1, 'english'], [2, '中国人']] 35 | 36 | 37 | def test_stream_text_json_dicts_with_headers_argument(): 38 | source = '[{"id": 1, "name": "english" }, {"id": 2, "name": "中国人" }]' 39 | with Stream(source, scheme='text', format='json', headers=['name', 'id']) as stream: 40 | assert stream.headers == ['name', 'id'] 41 | assert stream.read() == [['english', 1], ['中国人', 2]] 42 | 43 | 44 | def test_stream_text_json_lists(): 45 | source = '[["id", "name"], [1, "english"], [2, "中国人"]]' 46 | with Stream(source, scheme='text', format='json') as stream: 47 | assert stream.headers is None 48 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 49 | 50 | 51 | @pytest.mark.remote 52 | def test_stream_remote_json_dicts(): 53 | with Stream(BASE_URL % 'data/table-dicts.json') as stream: 54 | assert stream.headers is None 55 | assert stream.read() == [[1, 'english'], [2, '中国人']] 56 | 57 | 58 | @pytest.mark.remote 59 | def test_stream_remote_json_lists(): 60 | with Stream(BASE_URL % 'data/table-lists.json') as stream: 61 | assert stream.headers is None 62 | assert 
stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 63 | 64 | 65 | # Write 66 | 67 | def test_stream_save_json(tmpdir): 68 | source = 'data/table.csv' 69 | target = str(tmpdir.join('table.json')) 70 | with Stream(source, headers=1) as stream: 71 | assert stream.save(target) == 2 72 | with open(target) as file: 73 | assert json.load(file) == [ 74 | ['id', 'name'], 75 | ['1', 'english'], 76 | ['2', '中国人'], 77 | ] 78 | 79 | 80 | def test_stream_save_json_keyed(tmpdir): 81 | source = 'data/table.csv' 82 | target = str(tmpdir.join('table.json')) 83 | with Stream(source, headers=1) as stream: 84 | assert stream.save(target, keyed=True) == 2 85 | with open(target) as file: 86 | assert json.load(file) == [ 87 | {'id': '1', 'name': 'english'}, 88 | {'id': '2', 'name': '中国人'}, 89 | ] 90 | 91 | 92 | # Internal 93 | 94 | def test_parser_json(): 95 | 96 | source = 'data/table-dicts.json' 97 | encoding = None 98 | loader = Mock() 99 | loader.load = Mock(return_value=io.open(source, 'rb')) 100 | parser = JSONParser(loader) 101 | 102 | assert parser.closed 103 | parser.open(source, encoding=encoding) 104 | assert not parser.closed 105 | 106 | assert list(parser.extended_rows) == [ 107 | (1, ['id', 'name'], [1, 'english']), 108 | (2, ['id', 'name'], [2, '中国人'])] 109 | 110 | assert len(list(parser.extended_rows)) == 0 111 | parser.reset() 112 | assert len(list(parser.extended_rows)) == 2 113 | 114 | parser.close() 115 | assert parser.closed 116 | -------------------------------------------------------------------------------- /tests/formats/test_ndjson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from mock import Mock 10 | from six import StringIO 11 | from tabulator import exceptions, Stream 12 | from 
tabulator.parsers.ndjson import NDJSONParser 13 | 14 | 15 | # Read 16 | 17 | def test_stream_ndjson(): 18 | with Stream('data/table.ndjson', headers=1) as stream: 19 | assert stream.headers == ['id', 'name'] 20 | assert stream.read(keyed=True) == [ 21 | {'id': 1, 'name': 'english'}, 22 | {'id': 2, 'name': '中国人'}] 23 | 24 | 25 | # Internal 26 | 27 | def test_parser_ndjson(): 28 | 29 | source = 'data/table.ndjson' 30 | encoding = None 31 | loader = Mock() 32 | loader.load = Mock(return_value=io.open(source, encoding='utf-8')) 33 | parser = NDJSONParser(loader) 34 | 35 | assert parser.closed is True 36 | parser.open(source, encoding=encoding) 37 | assert parser.closed is False 38 | 39 | assert list(parser.extended_rows) == [ 40 | (1, ['id', 'name'], [1, 'english']), 41 | (2, ['id', 'name'], [2, '中国人']), 42 | ] 43 | 44 | assert len(list(parser.extended_rows)) == 0 45 | parser.reset() 46 | assert len(list(parser.extended_rows)) == 2 47 | 48 | parser.close() 49 | assert parser.closed 50 | 51 | 52 | def test_parser_ndjson_list(): 53 | stream = StringIO( 54 | '[1, 2, 3]\n' 55 | '[4, 5, 6]\n' 56 | ) 57 | 58 | loader = Mock(load=Mock(return_value=stream)) 59 | parser = NDJSONParser(loader) 60 | parser.open(None) 61 | 62 | assert list(parser.extended_rows) == [ 63 | (1, None, [1, 2, 3]), 64 | (2, None, [4, 5, 6]), 65 | ] 66 | 67 | 68 | def test_parser_ndjson_scalar(): 69 | stream = StringIO( 70 | '1\n' 71 | '2\n' 72 | ) 73 | 74 | loader = Mock(load=Mock(return_value=stream)) 75 | parser = NDJSONParser(loader) 76 | parser.open(None) 77 | 78 | with pytest.raises(exceptions.SourceError): 79 | list(parser.extended_rows) 80 | -------------------------------------------------------------------------------- /tests/formats/test_ods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ 
import unicode_literals 6 | from datetime import datetime 7 | 8 | import io 9 | import pytest 10 | from mock import Mock 11 | from tabulator import Stream, exceptions 12 | from tabulator.parsers.ods import ODSParser 13 | BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s' 14 | 15 | 16 | # Read 17 | 18 | def test_stream_ods(): 19 | with Stream('data/table.ods', headers=1) as stream: 20 | assert stream.headers == ['id', 'name'] 21 | assert stream.read(keyed=True) == [ 22 | {'id': 1, 'name': 'english'}, 23 | {'id': 2, 'name': '中国人'}, 24 | ] 25 | 26 | 27 | @pytest.mark.remote 28 | def test_stream_ods_remote(): 29 | source = BASE_URL % 'data/table.ods' 30 | with Stream(source) as stream: 31 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 32 | 33 | 34 | def test_stream_ods_sheet_by_index(): 35 | with Stream('data/table.ods', sheet=1) as stream: 36 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 37 | 38 | 39 | def test_stream_ods_sheet_by_index_not_existent(): 40 | with pytest.raises(exceptions.SourceError) as excinfo: 41 | Stream('data/table.ods', sheet=3).open() 42 | assert 'sheet "3"' in str(excinfo.value) 43 | 44 | 45 | def test_stream_ods_sheet_by_name(): 46 | with Stream('data/table.ods', sheet='Лист1') as stream: 47 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 48 | 49 | 50 | def test_stream_ods_sheet_by_index_not_existent_2(): 51 | with pytest.raises(exceptions.SourceError) as excinfo: 52 | Stream('data/table.ods', sheet='not-existent').open() 53 | assert 'sheet "not-existent"' in str(excinfo.value) 54 | 55 | 56 | def test_stream_ods_with_boolean(): 57 | with Stream('data/special/table-with-booleans.ods') as stream: 58 | assert stream.headers is None 59 | assert stream.read() == [['id', 'boolean'], [1, True], [2, False]] 60 | 61 | 62 | def test_stream_ods_with_ints_floats_dates(): 63 | source = 'data/special/table-with-ints-floats-dates.ods' 64 | with Stream(source) as 
stream: 65 | assert stream.read() == [['Int', 'Float', 'Date', 'Datetime'], 66 | [2013, 3.3, datetime(2009, 8, 16).date(), datetime(2009, 8, 16, 5, 43, 21)], 67 | [1997, 5.6, datetime(2009, 9, 20).date(), datetime(2009, 9, 20, 15, 30, 0)], 68 | [1969, 11.7, datetime(2012, 8, 23).date(), datetime(2012, 8, 23, 20, 40, 59)]] 69 | 70 | 71 | # Internal 72 | 73 | def test_parser_ods(): 74 | 75 | source = 'data/table.ods' 76 | encoding = None 77 | loader = Mock() 78 | loader.load = Mock(return_value=io.open(source, 'rb')) 79 | parser = ODSParser(loader) 80 | 81 | assert parser.closed 82 | parser.open(source, encoding=encoding) 83 | assert not parser.closed 84 | 85 | assert list(parser.extended_rows) == [ 86 | (1, None, ['id', 'name']), 87 | (2, None, [1.0, 'english']), 88 | (3, None, [2.0, '中国人']), 89 | ] 90 | 91 | assert len(list(parser.extended_rows)) == 0 92 | parser.reset() 93 | assert len(list(parser.extended_rows)) == 3 94 | 95 | parser.close() 96 | assert parser.closed 97 | -------------------------------------------------------------------------------- /tests/formats/test_sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tabulator import Stream, exceptions 9 | 10 | 11 | # Read 12 | 13 | def test_stream_format_sql(database_url): 14 | with Stream(database_url, table='data') as stream: 15 | assert stream.read() == [[1, 'english'], [2, '中国人']] 16 | 17 | 18 | def test_stream_format_sql_order_by(database_url): 19 | with Stream(database_url, table='data', order_by='id') as stream: 20 | assert stream.read() == [[1, 'english'], [2, '中国人']] 21 | 22 | 23 | def test_stream_format_sql_order_by_desc(database_url): 24 | with Stream(database_url, table='data', order_by='id desc') as stream: 25 | assert stream.read() == 
[[2, '中国人'], [1, 'english']] 26 | 27 | 28 | def test_stream_format_sql_table_is_required_error(database_url): 29 | with pytest.raises(exceptions.TabulatorException) as excinfo: 30 | Stream(database_url).open() 31 | assert 'table' in str(excinfo.value) 32 | 33 | 34 | def test_stream_format_sql_headers(database_url): 35 | with Stream(database_url, table='data', headers=1) as stream: 36 | assert stream.headers == ['id', 'name'] 37 | assert stream.read() == [[1, 'english'], [2, '中国人']] 38 | 39 | 40 | # Write 41 | 42 | def test_stream_save_sqlite(database_url): 43 | source = 'data/table.csv' 44 | with Stream(source, headers=1) as stream: 45 | assert stream.save(database_url, table='test_stream_save_sqlite') == 2 46 | with Stream(database_url, table='test_stream_save_sqlite', order_by='id', headers=1) as stream: 47 | assert stream.read() == [['1', 'english'], ['2', '中国人']] 48 | assert stream.headers == ['id', 'name'] 49 | -------------------------------------------------------------------------------- /tests/formats/test_tsv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from mock import Mock 9 | from tabulator import Stream 10 | from tabulator.parsers.tsv import TSVParser 11 | 12 | 13 | # Read 14 | 15 | def test_stream_format_tsv(): 16 | with Stream('data/table.tsv') as stream: 17 | assert stream.headers is None 18 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人'], ['3', None]] 19 | 20 | 21 | # Internal 22 | 23 | def test_parser_tsv(): 24 | 25 | source = 'data/table.tsv' 26 | encoding = None 27 | loader = Mock() 28 | loader.load = Mock(return_value=io.open(source)) 29 | parser = TSVParser(loader) 30 | 31 | assert parser.closed 32 | parser.open(source, encoding=encoding) 33 | assert not 
parser.closed 34 | 35 | assert list(parser.extended_rows) == [ 36 | (1, None, ['id', 'name']), 37 | (2, None, ['1', 'english']), 38 | (3, None, ['2', '中国人']), 39 | (4, None, ['3', None])] 40 | 41 | assert len(list(parser.extended_rows)) == 0 42 | parser.reset() 43 | assert len(list(parser.extended_rows)) == 4 44 | 45 | parser.close() 46 | assert parser.closed 47 | -------------------------------------------------------------------------------- /tests/formats/test_xls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from datetime import datetime 10 | from mock import Mock 11 | from tabulator import parsers 12 | from tabulator import Stream, exceptions 13 | from tabulator.parsers.xls import XLSParser 14 | BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s' 15 | 16 | 17 | # Read 18 | 19 | def test_stream_local_xls(): 20 | with Stream('data/table.xls') as stream: 21 | assert stream.headers is None 22 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 23 | 24 | 25 | @pytest.mark.remote 26 | def test_stream_remote_xls(): 27 | with Stream(BASE_URL % 'data/table.xls') as stream: 28 | assert stream.headers is None 29 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 30 | 31 | 32 | def test_stream_xls_sheet_by_index(): 33 | source = 'data/special/sheet2.xls' 34 | with Stream(source, sheet=2) as stream: 35 | assert stream.fragment == 'Sheet2' 36 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 37 | 38 | 39 | def test_stream_xls_sheet_by_index_not_existent(): 40 | source = 'data/special/sheet2.xls' 41 | with pytest.raises(exceptions.SourceError) as excinfo: 42 | Stream(source, sheet=3).open() 43 | assert 'sheet "3"' in 
str(excinfo.value) 44 | 45 | 46 | def test_stream_xls_sheet_by_name(): 47 | source = 'data/special/sheet2.xls' 48 | with Stream(source, sheet='Sheet2') as stream: 49 | assert stream.fragment == 'Sheet2' 50 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 51 | 52 | 53 | def test_stream_xls_sheet_by_name_not_existent(): 54 | source = 'data/special/sheet2.xls' 55 | with pytest.raises(exceptions.SourceError) as excinfo: 56 | Stream(source, sheet='not-existent').open() 57 | assert 'sheet "not-existent"' in str(excinfo.value) 58 | 59 | 60 | def test_stream_xlsx_merged_cells(): 61 | source = 'data/special/merged-cells.xls' 62 | with Stream(source) as stream: 63 | assert stream.read() == [['data', ''], ['', ''], ['', '']] 64 | 65 | 66 | def test_stream_xlsx_merged_cells_fill(): 67 | source = 'data/special/merged-cells.xls' 68 | with Stream(source, fill_merged_cells=True) as stream: 69 | assert stream.read() == [['data', 'data'], ['data', 'data'], ['data', 'data']] 70 | 71 | 72 | def test_stream_xls_with_boolean(): 73 | with Stream('data/special/table-with-booleans.xls') as stream: 74 | assert stream.headers is None 75 | assert stream.read() == [['id', 'boolean'], [1, True], [2, False]] 76 | 77 | 78 | def test_stream_xlsx_merged_cells_boolean(): 79 | source = 'data/special/merged-cells-boolean.xls' 80 | with Stream(source) as stream: 81 | assert stream.read() == [[True, ''], ['', ''], ['', '']] 82 | 83 | 84 | def test_stream_xlsx_merged_cells_fill_boolean(): 85 | source = 'data/special/merged-cells-boolean.xls' 86 | with Stream(source, fill_merged_cells=True) as stream: 87 | assert stream.read() == [[True, True], [True, True], [True, True]] 88 | 89 | 90 | def test_stream_xls_with_ints_floats_dates(): 91 | source = 'data/special/table-with-ints-floats-dates.xls' 92 | with Stream(source) as stream: 93 | assert stream.read() == [['Int', 'Float', 'Date'], 94 | [2013, 3.3, datetime(2009, 8, 16)], 95 | [1997, 5.6, datetime(2009, 9, 20)], 96 | [1969, 11.7, 
datetime(2012, 8, 23)]] 97 | 98 | @pytest.mark.skip 99 | @pytest.mark.remote 100 | def test_fix_for_2007_xls(): 101 | source = 'https://ams3.digitaloceanspaces.com/budgetkey-files/spending-reports/2018-3-משרד התרבות והספורט-לשכת הפרסום הממשלתית-2018-10-22-c457.xls' 102 | with Stream(source) as stream: 103 | assert len(stream.read()) > 10 104 | 105 | # Internal 106 | 107 | def test_parser_xls(): 108 | 109 | source = 'data/table.xls' 110 | encoding = None 111 | loader = Mock() 112 | loader.load = Mock(return_value=io.open(source, 'rb')) 113 | parser = XLSParser(loader) 114 | 115 | assert parser.closed 116 | parser.open(source, encoding=encoding) 117 | assert not parser.closed 118 | 119 | assert list(parser.extended_rows) == [ 120 | (1, None, ['id', 'name']), 121 | (2, None, [1, 'english']), 122 | (3, None, [2, '中国人'])] 123 | 124 | assert len(list(parser.extended_rows)) == 0 125 | parser.reset() 126 | assert len(list(parser.extended_rows)) == 3 127 | 128 | parser.close() 129 | assert parser.closed 130 | -------------------------------------------------------------------------------- /tests/formats/test_xlsx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from mock import Mock 10 | from tabulator import Stream, exceptions 11 | from tabulator.parsers.xlsx import XLSXParser 12 | 13 | BASE_URL = "https://raw.githubusercontent.com/frictionlessdata/tabulator-py/master/%s" 14 | 15 | 16 | # Read 17 | 18 | 19 | def test_stream_xlsx_stream(): 20 | source = io.open("data/table.xlsx", mode="rb") 21 | with Stream(source, format="xlsx") as stream: 22 | assert stream.headers is None 23 | assert stream.read() == [["id", "name"], [1.0, "english"], [2.0, "中国人"]] 24 | 25 | 26 | @pytest.mark.remote 27 | def 
def test_stream_xlsx_sheet_by_index():
    path = 'data/special/sheet2.xlsx'
    # Sheets are addressed with a 1-based index.
    with Stream(path, sheet=2) as stream:
        assert stream.fragment == 'Sheet2'
        assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']]


def test_stream_xlsx_sheet_by_index_not_existent():
    path = 'data/special/sheet2.xlsx'
    with pytest.raises(exceptions.SourceError) as excinfo:
        Stream(path, sheet=3).open()
    assert 'sheet "3"' in str(excinfo.value)


def test_stream_xlsx_sheet_by_name():
    path = 'data/special/sheet2.xlsx'
    with Stream(path, sheet='Sheet2') as stream:
        assert stream.fragment == 'Sheet2'
        assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']]


def test_stream_xlsx_sheet_by_name_not_existent():
    path = 'data/special/sheet2.xlsx'
    with pytest.raises(exceptions.SourceError) as excinfo:
        Stream(path, sheet='not-existent').open()
    assert 'sheet "not-existent"' in str(excinfo.value)


def test_stream_xlsx_merged_cells():
    # Without fill_merged_cells only the top-left cell of a merged
    # range keeps its value.
    with Stream('data/special/merged-cells.xlsx') as stream:
        assert stream.read() == [['data', None]]


def test_stream_xlsx_merged_cells_fill():
    with Stream('data/special/merged-cells.xlsx', fill_merged_cells=True) as stream:
        assert stream.read() == [['data', 'data'], ['data', 'data'], ['data', 'data']]


def test_stream_xlsx_adjust_floating_point_error():
    path = 'data/special/adjust_floating_point_error.xlsx'
    options = dict(headers=1, ignore_blank_headers=True, preserve_formatting=True)
    # Without adjustment the raw IEEE-754 representation leaks through.
    with Stream(path, **options) as stream:
        assert stream.read(keyed=True)[1]['actual PO4 (values)'] == 274.65999999999997
    # With adjustment the value is rounded back to its display form.
    with Stream(path, adjust_floating_point_error=True, **options) as stream:
        assert stream.read(keyed=True)[1]['actual PO4 (values)'] == 274.66


def test_stream_xlsx_preserve_formatting():
    path = 'data/special/preserve-formatting.xlsx'
    with Stream(
        path, headers=1, ignore_blank_headers=True, preserve_formatting=True
    ) as stream:
        expected = {
            # general
            'empty': None,
            # numeric
            '0': '1001',
            '0.00': '1000.56',
            '0.0000': '1000.5577',
            '0.00000': '1000.55770',
            '0.0000#': '1000.5577',
            # temporal
            'm/d/yy': '5/20/40',
            'd-mmm': '20-May',
            'mm/dd/yy': '05/20/40',
            'mmddyy': '052040',
            'mmddyyam/pmdd': '052040AM20',
            'at_symbol': '259.153',
        }
        assert stream.read(keyed=True) == [expected]


def test_stream_xlsx_preserve_formatting_percentage():
    with Stream(
        'data/special/preserve-formatting-percentage.xlsx',
        headers=1,
        preserve_formatting=True,
    ) as stream:
        assert stream.read() == [[123, '52.00%'], [456, '30.00%'], [789, '6.00%']]


def test_stream_xlsx_preserve_formatting_number_multicode():
    with Stream(
        'data/special/number_format_multicode.xlsx',
        headers=1,
        ignore_blank_headers=True,
        preserve_formatting=True,
    ) as stream:
        assert stream.read() == [['4.5'], ['-9.032'], ['15.8']]


def test_stream_xlsx_scientific_notation():
    with Stream(
        'data/special/test_scientific_notation.xlsx',
        headers=1,
        preserve_formatting=True,
    ) as stream:
        assert stream.read() == [['4.273E-07']]


def test_stream_xlsx_workbook_cache():
    cache = {}
    url = BASE_URL % 'data/special/sheets.xlsx'
    for name in ['Sheet1', 'Sheet2', 'Sheet3']:
        # The workbook is downloaded once and reused for every sheet:
        # the cache holds a single entry no matter how many sheets open.
        with Stream(url, sheet=name, workbook_cache=cache) as stream:
            assert len(cache) == 1
            assert stream.read()


# Write


def test_stream_save_xlsx(tmpdir):
    target = str(tmpdir.join('table.xlsx'))
    with Stream('data/table.csv', headers=1) as stream:
        assert stream.save(target) == 2
    # Re-open the saved workbook and verify its contents.
    with Stream(target, headers=1) as stream:
        assert stream.headers == ['id', 'name']
        assert stream.read(extended=True) == [
            (2, ['id', 'name'], ['1', 'english']),
            (3, ['id', 'name'], ['2', '中国人']),
        ]


def test_stream_save_xlsx_sheet_name(tmpdir):
    target = str(tmpdir.join('table.xlsx'))
    with Stream('data/table.csv', headers=1) as stream:
        assert stream.save(target, sheet='my-data') == 2
    # The custom sheet name must round-trip through save/open.
    with Stream(target, headers=1, sheet='my-data') as stream:
        assert stream.headers == ['id', 'name']
        assert stream.read(extended=True) == [
            (2, ['id', 'name'], ['1', 'english']),
            (3, ['id', 'name'], ['2', '中国人']),
        ]


# Internal


def test_parser_xlsx():
    path = 'data/table.xlsx'
    loader = Mock()
    loader.load = Mock(return_value=io.open(path, 'rb'))
    parser = XLSXParser(loader)

    # The parser starts closed and opens on demand.
    assert parser.closed
    parser.open(path, encoding=None)
    assert not parser.closed

    assert list(parser.extended_rows) == [
        (1, None, ['id', 'name']),
        (2, None, [1.0, 'english']),
        (3, None, [2.0, '中国人']),
    ]

    # The row iterator is exhausted until the parser is reset.
    assert list(parser.extended_rows) == []
    parser.reset()
    assert len(list(parser.extended_rows)) == 3

    parser.close()
    assert parser.closed
https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tests/schemes/__init__.py -------------------------------------------------------------------------------- /tests/schemes/test_aws.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import six 9 | import sys 10 | import boto3 11 | import pytest 12 | import string 13 | import random 14 | import subprocess 15 | from moto import mock_s3 16 | from tabulator import Stream, exceptions 17 | 18 | # Setup 19 | 20 | S3_ENDPOINT_URL = os.environ['S3_ENDPOINT_URL'] = 'http://localhost:5000' 21 | 22 | 23 | # Read 24 | 25 | # https://github.com/frictionlessdata/tabulator-py/issues/271 26 | @pytest.mark.skip 27 | def test_stream_s3(s3_client, bucket): 28 | 29 | # Upload a file 30 | s3_client.put_object( 31 | ACL='private', 32 | Body=open('data/table.csv', 'rb'), 33 | Bucket=bucket, 34 | ContentType='text/csv', 35 | Key='table.csv') 36 | 37 | # Check the file 38 | with Stream('s3://%s/table.csv' % bucket) as stream: 39 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 40 | 41 | 42 | # https://github.com/frictionlessdata/tabulator-py/issues/271 43 | @pytest.mark.skip 44 | def test_stream_s3_endpoint_url(s3_client, bucket): 45 | 46 | # Upload a file 47 | s3_client.put_object( 48 | ACL='private', 49 | Body=open('data/table.csv', 'rb'), 50 | Bucket=bucket, 51 | ContentType='text/csv', 52 | Key='table.csv') 53 | 54 | # Check the file 55 | with Stream('s3://%s/table.csv' % bucket, s3_endpoint_url=S3_ENDPOINT_URL) as stream: 56 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 57 | 58 | 59 | # https://github.com/frictionlessdata/tabulator-py/issues/271 60 | @pytest.mark.skip 61 | def 
test_stream_s3_non_existent_file(s3_client, bucket): 62 | with pytest.raises(exceptions.IOError): 63 | Stream('s3://%s/table.csv' % bucket).open() 64 | 65 | 66 | # Fixtures 67 | 68 | @pytest.fixture(scope='module') 69 | def s3_client(): 70 | subprocess.Popen('moto_server', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 71 | s3_client = boto3.client('s3', endpoint_url=S3_ENDPOINT_URL) 72 | yield s3_client 73 | os.system('pkill moto_server') 74 | 75 | 76 | @pytest.fixture 77 | def bucket(s3_client): 78 | bucket = 'bucket_%s' % ''.join(random.choice(string.digits) for _ in range(16)) 79 | s3_client.create_bucket(Bucket=bucket, ACL='public-read') 80 | return bucket 81 | -------------------------------------------------------------------------------- /tests/schemes/test_local.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import sys 8 | import pytest 9 | from tabulator import Stream 10 | from importlib import import_module 11 | from tabulator.loaders.local import LocalLoader 12 | 13 | 14 | # Read 15 | 16 | def test_stream_file(): 17 | with Stream('data/table.csv') as stream: 18 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 19 | 20 | 21 | @pytest.mark.skipif(sys.version_info < (3, 4), reason='not supported') 22 | def test_stream_file_pathlib_path(): 23 | pathlib = import_module('pathlib') 24 | with Stream(pathlib.Path('data/table.csv')) as stream: 25 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 26 | 27 | 28 | # Internal 29 | 30 | def test_loader_local_t(): 31 | loader = LocalLoader() 32 | chars = loader.load('data/table.csv', encoding='utf-8') 33 | assert chars.read() == 'id,name\n1,english\n2,中国人\n' 34 | 35 | 36 | def test_loader_local_b(): 37 | spec = 
'中国人'.encode('utf-8') 38 | loader = LocalLoader() 39 | chars = loader.load('data/table.csv', mode='b', encoding='utf-8') 40 | assert chars.read() == b'id,name\n1,english\n2,' + spec + b'\n' 41 | -------------------------------------------------------------------------------- /tests/schemes/test_remote.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tabulator import Stream 9 | from tabulator.loaders.remote import RemoteLoader 10 | from tabulator.exceptions import HTTPError 11 | from time import time 12 | 13 | BASE_URL = 'https://raw.githubusercontent.com/frictionlessdata/tabulator-py/master/%s' 14 | 15 | 16 | # Read 17 | 18 | @pytest.mark.remote 19 | def test_stream_https(): 20 | with Stream(BASE_URL % 'data/table.csv') as stream: 21 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 22 | 23 | 24 | @pytest.mark.remote 25 | def test_stream_https_latin1(): 26 | # Github returns wrong encoding `utf-8` 27 | with Stream(BASE_URL % 'data/special/latin1.csv') as stream: 28 | assert stream.read() 29 | 30 | 31 | # Internal 32 | 33 | @pytest.mark.remote 34 | def test_loader_remote_t(): 35 | loader = RemoteLoader() 36 | chars = loader.load(BASE_URL % 'data/table.csv', encoding='utf-8') 37 | assert chars.read() == 'id,name\n1,english\n2,中国人\n' 38 | 39 | 40 | @pytest.mark.remote 41 | def test_loader_remote_b(): 42 | spec = '中国人'.encode('utf-8') 43 | loader = RemoteLoader() 44 | chars = loader.load(BASE_URL % 'data/table.csv', mode='b', encoding='utf-8') 45 | assert chars.read() == b'id,name\n1,english\n2,' + spec + b'\n' 46 | 47 | 48 | @pytest.mark.skip 49 | @pytest.mark.remote 50 | def test_loader_no_timeout(): 51 | loader = RemoteLoader() 52 | t = time() 53 | chars = 
loader.load('https://httpstat.us/200?sleep=5000', mode='b', encoding='utf-8') 54 | assert time() - t > 5 55 | assert chars.read() == b'200 OK' 56 | t = time() 57 | 58 | 59 | @pytest.mark.remote 60 | def test_loader_has_timeout(): 61 | loader = RemoteLoader(http_timeout=1) 62 | t = time() 63 | with pytest.raises(HTTPError): 64 | chars = loader.load('https://httpstat.us/200?sleep=5000', mode='b', encoding='utf-8') 65 | assert time() - t < 5 66 | assert time() - t > 1 67 | -------------------------------------------------------------------------------- /tests/schemes/test_stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from tabulator import Stream 9 | 10 | 11 | # Read 12 | 13 | def test_stream_stream(): 14 | source = io.open('data/table.csv', mode='rb') 15 | with Stream(source, format='csv') as stream: 16 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 17 | -------------------------------------------------------------------------------- /tests/schemes/test_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from tabulator import Stream 8 | from tabulator.loaders.text import TextLoader 9 | 10 | 11 | # Read 12 | 13 | def test_stream_text(): 14 | source = 'text://value1,value2\nvalue3,value4' 15 | with Stream(source, format='csv') as stream: 16 | assert stream.read() == [['value1', 'value2'], ['value3', 'value4']] 17 | 18 | 19 | # Internal 20 | 21 | def test_load_t(): 22 | loader = TextLoader() 23 | chars = loader.load('id,name\n1,english\n2,中国人\n', 
encoding='utf-8') 24 | assert chars.read() == 'id,name\n1,english\n2,中国人\n' 25 | 26 | def test_load_b(): 27 | spec = '中国人'.encode('utf-8') 28 | loader = TextLoader() 29 | chars = loader.load('id,name\n1,english\n2,中国人\n', mode='b', encoding='utf-8') 30 | assert chars.read() == b'id,name\n1,english\n2,' + spec + b'\n' 31 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | # from __future__ import unicode_literals 6 | 7 | from click.testing import CliRunner 8 | from tabulator.cli import cli 9 | 10 | 11 | # Tests 12 | 13 | def test_cli(): 14 | runner = CliRunner() 15 | result = runner.invoke(cli, ['data/table.csv']) 16 | assert result.exit_code == 0 17 | assert result.output.startswith('id,name\n1,english\n2,') 18 | 19 | 20 | def test_cli_version(): 21 | runner = CliRunner() 22 | result = runner.invoke(cli, ['--version']) 23 | assert result.exit_code == 0 24 | assert len(result.output.split('.')) == 3 25 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from tabulator import helpers, config 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('source, scheme, format', [ 15 | ('text://path', 'text', None), 16 | ('stream://path', 'stream', None), 17 | ('file://path', 'file', None), 18 | ('ftp://path', 'ftp', None), 19 | ('ftps://path', 'ftps', None), 20 | ('http://path', 'http', None), 21 | ('https://path', 'https', 
None), 22 | ('xxx://path', 'xxx', None), 23 | ('xx://path', 'xx', None), 24 | ('XXX://path', 'xxx', None), 25 | ('XX://path', 'xx', None), 26 | ('c://path', 'file', None), 27 | ('c:\\path', 'file', None), 28 | (r'c:\path', 'file', None), 29 | ('http//path', 'file', None), 30 | ('path', 'file', None), 31 | ('path.CsV', 'file', 'csv'), 32 | ('http://someplace.com/foo/path.csv?foo=bar#baz', 'http', 'csv'), 33 | ('http://someplace.com/foo/path?foo=bar&format=csv#baz', 'http', 'csv'), 34 | ('https://docs.google.com/spreadsheets/d/X/edit?usp=sharing', None, 'gsheet'), 35 | ('https://docs.google.com/spreadsheets/d/X/export?format=csv&gid=0&single=true', 'https', 'csv'), 36 | ('https://docs.google.com/spreadsheets/d/X/pub?gid=0&single=true&output=csv', 'https', 'csv'), 37 | ]) 38 | def test_detect_scheme_and_format(source, scheme, format): 39 | assert helpers.detect_scheme_and_format(source) == (scheme, format) 40 | 41 | 42 | def test_detect_encoding(): 43 | with io.open('Makefile', 'rb') as fp: 44 | sample = fp.read(config.DEFAULT_BYTES_SAMPLE_SIZE) 45 | assert helpers.detect_encoding(sample) == 'utf-8' 46 | 47 | 48 | def test_detect_encoding_windows_1252(): 49 | sample = b'A\n' * 300 + b'\xff\xff' 50 | try: 51 | import cchardet 52 | assert helpers.detect_encoding(sample) == 'cp1252' 53 | except ImportError: 54 | assert helpers.detect_encoding(sample) == 'iso8859-1' 55 | 56 | 57 | def test_detect_encoding_utf_16_be(): 58 | sample = u'\uFEFFthen some text'.encode('utf-16-be') 59 | assert helpers.detect_encoding(sample) == 'utf-16' 60 | 61 | 62 | def test_detect_encoding_utf_16_le(): 63 | sample = u'\uFEFFthen some text'.encode('utf-16-le') 64 | assert helpers.detect_encoding(sample) == 'utf-16' 65 | 66 | 67 | def test_detect_encoding_unknown(): 68 | sample = b'\xff\x81' 69 | assert helpers.detect_encoding(sample) == 'utf-8' 70 | 71 | 72 | def test_reset_stream_seekable(): 73 | file = io.open(__file__) 74 | file.seek(1) 75 | assert file.tell() == 1 76 | 
helpers.reset_stream(file) 77 | assert file.tell() == 0 78 | 79 | 80 | def test_reset_stream_not_seekable(): 81 | with pytest.raises(Exception): 82 | helpers.reset_stream('not_seekable') 83 | 84 | 85 | def test_requote_uri(): 86 | url = 'http://next.openspending.org/fdp-adapter/convert?url=https%3A%2F%2Fraw.githubusercontent.com%2Fkravets-levko%2Fdata%2Fmaster%2Ftest.xlsx.csv' 87 | url1 = 'http://data.defra.gov.uk/ops/government_procurement_card/over_£500_GPC_apr_2013.csv' 88 | url2 = 'http://data.defra.gov.uk/ops/government_procurement_card/over_%C2%A3500_GPC_apr_2013.csv' 89 | assert helpers.requote_uri(url) == url 90 | assert helpers.requote_uri(url1) == url2 91 | 92 | 93 | def test_import_attribute(): 94 | assert helpers.import_attribute('tabulator.helpers') == helpers 95 | 96 | 97 | def test_import_attribute_import_error(): 98 | with pytest.raises((ImportError, AttributeError)): 99 | helpers.import_attribute('tabulator.bad_name') 100 | 101 | 102 | def test_extract_options(): 103 | names = ['opt1', 'opt2'] 104 | options = {'opt1': 1, 'opt2': 2, 'opt3': 3} 105 | extracted_options = helpers.extract_options(options, names) 106 | assert options == {'opt3': 3} 107 | assert extracted_options == {'opt1': 1, 'opt2': 2} 108 | 109 | 110 | @pytest.mark.parametrize('sample', [ 111 | ('\n\n\t ', True), 112 | ('', True), 113 | ('col1,col2\nval1,', False), 114 | ('val1,', False), 115 | ]) 116 | def test_detect_html(sample): 117 | text, is_html = sample 118 | assert helpers.detect_html(text) is is_html 119 | 120 | 121 | def test_stringify_value(): 122 | sample = '\u4e9c'.encode('utf-8-sig').decode("utf-8") 123 | assert helpers.stringify_value(sample) == sample 124 | 125 | 126 | def test_stringify_value_none(): 127 | assert helpers.stringify_value(None) == '' 128 | 129 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from tabulator import validate, exceptions 10 | 11 | 12 | # Tests 13 | 14 | def test_validate_test_schemes(): 15 | # Supported 16 | assert validate('path.csv') 17 | assert validate('file://path.csv') 18 | assert validate('http://example.com/path.csv') 19 | assert validate('https://example.com/path.csv') 20 | assert validate('ftp://example.com/path.csv') 21 | assert validate('ftps://example.com/path.csv') 22 | assert validate('path.csv', scheme='file') 23 | # Not supported 24 | with pytest.raises(exceptions.SchemeError) as excinfo: 25 | validate('ssh://example.com/path.csv') 26 | with pytest.raises(exceptions.SchemeError) as excinfo: 27 | validate('bad://example.com/path.csv') 28 | 29 | 30 | def test_validate_test_formats(): 31 | # Supported 32 | assert validate('path.csv') 33 | assert validate('path.json') 34 | assert validate('path.jsonl') 35 | assert validate('path.ndjson') 36 | assert validate('path.tsv') 37 | assert validate('path.xls') 38 | assert validate('path.ods') 39 | assert validate('path.no-format', format='csv') 40 | # Not supported 41 | with pytest.raises(exceptions.FormatError) as excinfo: 42 | validate('path.txt') 43 | with pytest.raises(exceptions.FormatError) as excinfo: 44 | validate('path.bad') 45 | 46 | 47 | def test_validate_test_special(): 48 | # Gsheet 49 | assert validate('https://docs.google.com/spreadsheets/d/id', format='csv') 50 | # File-like 51 | assert validate(io.open('data/table.csv', encoding='utf-8'), format='csv') 52 | # Text 53 | assert validate('text://name,value\n1,2', format='csv') 54 | # Inline 55 | assert validate([{'name': 'value'}]) 56 | --------------------------------------------------------------------------------