├── .github ├── issue_template.md ├── pull_request_template.md ├── stale.yml └── workflows │ └── release.yml ├── .gitignore ├── .travis.yml ├── LEAD.md ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── data ├── 2-files.zip ├── datapackage.json ├── matrix.csv ├── special │ ├── accent.csv │ ├── adjust_floating_point_error.xlsx │ ├── bom.csv │ ├── doublequote.csv │ ├── doublequote.csv.zip │ ├── escaping.csv │ ├── issue305.csv │ ├── issue320.xlsx │ ├── latin1.csv │ ├── long.csv │ ├── merged-cells-boolean.xls │ ├── merged-cells.xls │ ├── merged-cells.xlsx │ ├── multiline-headers.xlsx │ ├── number_format_multicode.xlsx │ ├── preserve-formatting-percentage.xlsx │ ├── preserve-formatting.xlsx │ ├── sheet2.xls │ ├── sheet2.xlsx │ ├── sheets.xlsx │ ├── skip-blank-at-the-end.csv │ ├── skip-rows-before-headers.csv │ ├── skip-rows.csv │ ├── skip-rows.xlsx │ ├── table-with-booleans.ods │ ├── table-with-booleans.xls │ ├── table-with-ints-floats-dates.ods │ ├── table-with-ints-floats-dates.xls │ ├── table.bad-format │ ├── table.csv.html │ ├── table.csv.zip │ └── test_scientific_notation.xlsx ├── table-dicts.json ├── table-lists.json ├── table-reverse.csv ├── table.csv ├── table.csv.gz ├── table.csv.zip ├── table.ndjson ├── table.ods ├── table.tsv ├── table.xls ├── table.xlsx ├── table1.html ├── table2.html ├── table3.html ├── table4.html └── table_unicode_headers.csv ├── examples ├── __init__.py └── stream.py ├── pylama.ini ├── pytest.ini ├── setup.cfg ├── setup.py ├── tabulator ├── VERSION ├── __init__.py ├── __main__.py ├── cli.py ├── config.py ├── exceptions.py ├── helpers.py ├── loader.py ├── loaders │ ├── __init__.py │ ├── aws.py │ ├── local.py │ ├── remote.py │ ├── stream.py │ └── text.py ├── parser.py ├── parsers │ ├── __init__.py │ ├── csv.py │ ├── datapackage.py │ ├── gsheet.py │ ├── html.py │ ├── inline.py │ ├── json.py │ ├── ndjson.py │ ├── ods.py │ ├── sql.py │ ├── tsv.py │ ├── xls.py │ └── xlsx.py ├── stream.py ├── validate.py ├── writer.py └── writers │ 
├── __init__.py │ ├── csv.py │ ├── json.py │ ├── sql.py │ └── xlsx.py └── tests ├── __init__.py ├── conftest.py ├── formats ├── __init__.py ├── test_csv.py ├── test_datapackage.py ├── test_gsheet.py ├── test_html.py ├── test_inline.py ├── test_json.py ├── test_ndjson.py ├── test_ods.py ├── test_sql.py ├── test_tsv.py ├── test_xls.py └── test_xlsx.py ├── schemes ├── __init__.py ├── test_aws.py ├── test_local.py ├── test_remote.py ├── test_stream.py └── test_text.py ├── test_cli.py ├── test_helpers.py ├── test_stream.py └── test_validate.py /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug share as much as possible to reproduce it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request. Make sure that tests pass before publishing it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 90 3 | 4 | # Number of days of inactivity before a stale issue is closed 5 | daysUntilClose: 30 6 | 7 | # Issues with these labels will never be considered stale 8 | exemptLabels: 9 | - feature 10 | - enhancement 11 | - bug 12 | 13 | # Label to use when marking an issue as stale 14 | staleLabel: wontfix 15 | 16 | # Comment to post when marking an issue as stale. 
Set to `false` to disable 17 | markComment: > 18 | This issue has been automatically marked as stale because it has not had 19 | recent activity. It will be closed if no further activity occurs. Thank you 20 | for your contributions. 21 | 22 | # Comment to post when closing a stale issue. Set to `false` to disable 23 | closeComment: false 24 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v1 14 | - name: Release 15 | uses: softprops/action-gh-release@v1 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .pytest_cache/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | # Node 61 | node_modules/ 62 | 63 | # Virtualenv 64 | venv/ 65 | venv2/ 66 | venv3/ 67 | 68 | # Tmux 69 | .tmuxp.yml 70 | 71 | # Project 72 | tmp 73 | .projectile 74 | .~lock* 75 | 76 | # Extra 77 | datapackage 78 | .#* 79 | .idea/ 80 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: 2 | xenial 3 | 4 | sudo: 5 | required 6 | 7 | language: 8 | python 9 | 10 | python: 11 | - 2.7 12 | - 3.6 13 | - 3.7 14 | - 3.8 15 | 16 | install: 17 | - make install 18 | - pip install coveralls 19 | 20 | script: 21 | - make test 22 | 23 | after_success: 24 | - coveralls 25 | 26 | jobs: 27 | include: 28 | - stage: release 29 | if: tag IS present 30 | python: 3.8 31 | deploy: 32 | provider: pypi 33 | user: okfn 34 | distributions: sdist bdist_wheel 35 | skip_cleanup: true 36 | on: 37 | tags: true 38 | password: 39 | secure: 
Iuf7V4+XHL6wwFYt4IyEe0vWLGO/uOpMJWQnO+1eUjmcQ1qi4E9vyEJvsJRzWKm5+/Lv9uFIRGlmpNWQzUPs5VnMc3LEBh7Clv/WIlRGvi+omCeWoEPAPUueF8qjBcvpT37QNzjB5QXJY074uAihmKh/DU2xA4K0yCB8YQefBHYeNBl0pNYVnELUW8BFmz0GE0lTwHOnM681vgR01LdPjrgIHVEvnTZkKYtDXc/cwkw610fqrFS10srnTX6KjjC/pgDm4WSuaUxbPycmriIhZR29QgAx24NO/wrdGdp5H8TIsvBFnNFlC4QuHfwiXdAKpjL6cMu2uMo639Sev/484XxTorg2QQvNhNAJtiESVAaqVviAlmUItGdmsw4xhZb0JK6NC8fOuOoccL4DBD6JtCyGurwSpznuGXh1DQUYZ7fTd5qaUDnzBuhYGc8XDvcj14XU4P5OKES4NdruRVJOwFiNSMOAT6wm8b2Ue6N+FvgsghjwUr9ESKBrPj0VoouC2+FGZWT65vt/3R9PhFuBdC6SgMLWHESBuU5GW9Bc2ucS3HUi+uUV1IGjpfIsc3qifojNJiaU7hSAggJs9QlXd7goH2fKhb9ro2klzcDKmpBLXmMk3uH0QRpv1dGUYFtgGeEFN93vP3cxYsXf8OvV+MuCxYYGgrGZu3h8fvbc5hY= 40 | -------------------------------------------------------------------------------- /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylama.ini 5 | include pytest.ini 6 | include README.md 7 | include tox.ini 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list readme release templates test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | install: 12 | pip install --upgrade -e .[datapackage,develop,ods,html] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | readme: 18 | pip install md-toc 19 | pip install referencer 20 | referencer $(PACKAGE) README.md --in-place 21 | md_toc -p github --header-levels 3 README.md 22 | sed -i '/(#$(PACKAGE)-py)/,+2d' README.md 23 | 24 | release: 25 | git checkout master && git pull origin && git fetch -p 26 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20 27 | @echo "\nReleasing v$(VERSION) in 10 seconds. 
Press to abort\n" && sleep 10 28 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)' 29 | git push --follow-tags 30 | 31 | templates: 32 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 33 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 34 | 35 | test: 36 | pylama $(PACKAGE) 37 | pytest --cov ${PACKAGE} --cov-report term-missing --cov-fail-under 90 38 | 39 | version: 40 | @echo $(VERSION) 41 | -------------------------------------------------------------------------------- /data/2-files.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/2-files.zip -------------------------------------------------------------------------------- /data/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test-tabulator", 3 | "resources": [ 4 | { "name": "first-resource", 5 | "path": "table.xls", 6 | "schema": { 7 | "fields": [ 8 | { 9 | "name": "id", 10 | "type": "number" 11 | }, 12 | { 13 | "name": "name", 14 | "type": "string" 15 | } 16 | ] 17 | } 18 | }, 19 | {"name": "number-two", "path": "table-reverse.csv", 20 | "schema": { 21 | "fields": [ 22 | { 23 | "name": "id", 24 | "type": "integer" 25 | }, 26 | { 27 | "name": "name", 28 | "type": "string" 29 | } 30 | ] 31 | }} 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /data/matrix.csv: -------------------------------------------------------------------------------- 1 | f1,f2,f3,f4 2 | 11,12,13,14 3 | 21,22,23,24 4 | 31,32,33,34 5 | 41,42,43,44 6 | -------------------------------------------------------------------------------- /data/special/accent.csv: -------------------------------------------------------------------------------- 1 | 
n_amenageur;n_operateur;n_enseigne;id_station;n_station;ad_station;code_insee;Xlongitude;Ylatitude;nbre_pdc;id_pdc;puiss_max;type_prise;acces_recharge;accessibilité;observations;date_maj 2 | XXX;YYY;ZZZ;FR*A17*P*ZZZ*3*_*_*_;Parking 1;D109A;06090;6.92641;43.59413;2;FR*A17*E*ZZZ*3*1*1*_;22.0;T2 - E/F;Payant (badge, appli et QR code);24h/24;RAS;2018/03/31 3 | XXX;YYY;ZZZ;FR*A17*P*ZZZ*3*_*_*_;Parking 1;D109A;06090;6.92641;43.59413;2;FR*A17*E*ZZZ*3*1*2*_;22.0;T2 - E/F;Payant (badge, appli et QR code);24h/24;RAS;2018/3/31 4 | -------------------------------------------------------------------------------- /data/special/adjust_floating_point_error.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/adjust_floating_point_error.xlsx -------------------------------------------------------------------------------- /data/special/bom.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /data/special/doublequote.csv: -------------------------------------------------------------------------------- 1 | "INCISO","NOMBREINCISO","UE","NOMBREUE","AÑO","CODIGOAP","NOMBREAP","DESCRIPCIONAP","CODIGOPROGRAMA","NOMBREPROGRAMA","DESCRIPCIONPROGRAMA","INCISOCODIGO","UECODIGO","ue_cod","presupuestado","pfi","pbi" 2 | "A.N.E.P.","Administración Nacional de Educación Pública","CES","Consejo de Educación Secundaria",2019,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos 
relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",608,"Inversiones edilicias y equipamiento"," ",25,3,25003,74350819,"Inversión",".003465" 3 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2015,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",5,"Formación en Educación","Formación de profesionales de la educación, incluyendo la formación de educadores sociales. La Ley General de Educación Nº 18.43, en su artículo nro. 31 define la formación en educación. Ésta... 
'""se concebirá como enseñanza terciaria universitaria y abarcará la formación de maestros, maestros técnicos, profesores, profesores de educación física y educadores sociales, así como de otras formaciones que el Sistema Nacional de Educación requiera'"".",25,5,25005,117396841,"Funcionamiento",".007937" 4 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2015,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",5,"Formación en Educación","Formación de profesionales de la educación, incluyendo la formación de educadores sociales. La Ley General de Educación Nº 18.43, en su artículo nro. 31 define la formación en educación. Ésta... 
'""se concebirá como enseñanza terciaria universitaria y abarcará la formación de maestros, maestros técnicos, profesores, profesores de educación física y educadores sociales, así como de otras formaciones que el Sistema Nacional de Educación requiera'"".",25,5,25005,38107510,"Inversión",".002576" 5 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2015,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",5,"Formación en Educación","Formación de profesionales de la educación, incluyendo la formación de educadores sociales. La Ley General de Educación Nº 18.43, en su artículo nro. 31 define la formación en educación. Ésta... 
'""se concebirá como enseñanza terciaria universitaria y abarcará la formación de maestros, maestros técnicos, profesores, profesores de educación física y educadores sociales, así como de otras formaciones que el Sistema Nacional de Educación requiera'"".",25,5,25005,1616869527,"Personal",".109327" 6 | "A.N.E.P.","Administración Nacional de Educación Pública","CFed","Consejo de Formación en Educación",2016,8,"EDUCACIÓN","Comprende los servicios prestados a alumnos y estudiantes, y los servicios docentes; la formulación y administración de políticas educativas, aplicación de normas, regulación, autorización y supervisión de los centros de enseñanza, así como la investigación aplicada y el desarrollo experimental en relación con los asuntos y servicios docentes; los gastos relacionados con la administración y gestión de centros educativos que imparten educación preescolar, primaria, secundaria y universitaria, así como otras instituciones que brindan formación y capacitación. Comprende la prestación de servicios auxiliares de la educación, como transporte, seguridad, alimentación, alojamiento, atención médica y odontológica y otros servicios auxiliares conexos. 
Comprende además las escuelas y academias militares, las escuelas de policía que imparten enseñanza general y la formación para el servicio exterior.",601,"Administración de la Educación y Gestión de Políticas Transversales"," ",25,5,25005,25856879,"Funcionamiento",".001586" 7 | -------------------------------------------------------------------------------- /data/special/doublequote.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/doublequote.csv.zip -------------------------------------------------------------------------------- /data/special/escaping.csv: -------------------------------------------------------------------------------- 1 | ID, Test 2 | 1, "Test line 1" 3 | 2, "Test "" line 2" 4 | 3, "Test \" line 3" 5 | -------------------------------------------------------------------------------- /data/special/issue320.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/issue320.xlsx -------------------------------------------------------------------------------- /data/special/latin1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/latin1.csv -------------------------------------------------------------------------------- /data/special/long.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,a 3 | 2,b 4 | 3,c 5 | 4,d 6 | 5,e 7 | 6,f 8 | -------------------------------------------------------------------------------- /data/special/merged-cells-boolean.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/merged-cells-boolean.xls -------------------------------------------------------------------------------- /data/special/merged-cells.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/merged-cells.xls -------------------------------------------------------------------------------- /data/special/merged-cells.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/merged-cells.xlsx -------------------------------------------------------------------------------- /data/special/multiline-headers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/multiline-headers.xlsx -------------------------------------------------------------------------------- /data/special/number_format_multicode.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/number_format_multicode.xlsx -------------------------------------------------------------------------------- /data/special/preserve-formatting-percentage.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/preserve-formatting-percentage.xlsx -------------------------------------------------------------------------------- /data/special/preserve-formatting.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/preserve-formatting.xlsx -------------------------------------------------------------------------------- /data/special/sheet2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/sheet2.xls -------------------------------------------------------------------------------- /data/special/sheet2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/sheet2.xlsx -------------------------------------------------------------------------------- /data/special/sheets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/sheets.xlsx -------------------------------------------------------------------------------- /data/special/skip-blank-at-the-end.csv: -------------------------------------------------------------------------------- 1 | test1,test2 2 | #testing comment 3 | 1,2 4 | 5 | -------------------------------------------------------------------------------- /data/special/skip-rows-before-headers.csv: -------------------------------------------------------------------------------- 1 | # it's a comment! 2 | id,name 3 | 1,english 4 | 2,中国人 5 | -------------------------------------------------------------------------------- /data/special/skip-rows.csv: -------------------------------------------------------------------------------- 1 | # it's a comment! 2 | id,name 3 | 1,english 4 | # it's a comment! 
5 | 2,中国人 6 | -------------------------------------------------------------------------------- /data/special/skip-rows.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/skip-rows.xlsx -------------------------------------------------------------------------------- /data/special/table-with-booleans.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-booleans.ods -------------------------------------------------------------------------------- /data/special/table-with-booleans.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-booleans.xls -------------------------------------------------------------------------------- /data/special/table-with-ints-floats-dates.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-ints-floats-dates.ods -------------------------------------------------------------------------------- /data/special/table-with-ints-floats-dates.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table-with-ints-floats-dates.xls -------------------------------------------------------------------------------- /data/special/table.bad-format: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | 
-------------------------------------------------------------------------------- /data/special/table.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/table.csv.zip -------------------------------------------------------------------------------- /data/special/test_scientific_notation.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/special/test_scientific_notation.xlsx -------------------------------------------------------------------------------- /data/table-dicts.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "name": "english" 5 | }, 6 | { 7 | "id": 2, 8 | "name": "中国人" 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /data/table-lists.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["id", "name"], 3 | [1, "english"], 4 | [2, "中国人"] 5 | ] 6 | -------------------------------------------------------------------------------- /data/table-reverse.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,中国人 3 | 2,english 4 | -------------------------------------------------------------------------------- /data/table.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /data/table.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.csv.gz 
-------------------------------------------------------------------------------- /data/table.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.csv.zip -------------------------------------------------------------------------------- /data/table.ndjson: -------------------------------------------------------------------------------- 1 | {"id":1,"name":"english"} 2 | {"id":2,"name":"中国人"} 3 | -------------------------------------------------------------------------------- /data/table.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.ods -------------------------------------------------------------------------------- /data/table.tsv: -------------------------------------------------------------------------------- 1 | id name 2 | 1 english 3 | 2 中国人 4 | 3 \N 5 | -------------------------------------------------------------------------------- /data/table.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.xls -------------------------------------------------------------------------------- /data/table.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/data/table.xlsx -------------------------------------------------------------------------------- /data/table1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
idname
1english
2中国人
21 | 22 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
idname
1english
2中国人
27 | 28 | 29 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
idname
3french
4עברית
21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
idname
1english
2中国人
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
idname
3french
4עברית
49 | 50 | 2 | 3 | id 4 | name 5 | 6 | 7 | 1 8 | english 9 | 10 | 11 | 2 12 | 中国人 13 | 14 | -------------------------------------------------------------------------------- /data/table_unicode_headers.csv: -------------------------------------------------------------------------------- 1 | id,国人 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/examples/__init__.py -------------------------------------------------------------------------------- /examples/stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import sys 9 | from tabulator import Stream 10 | 11 | 12 | print('Parse csv format:') 13 | source = 'data/table.csv' 14 | with Stream(source, headers='row1') as stream: 15 | print(stream.headers) 16 | for row in stream: 17 | print(row) 18 | 19 | 20 | print('\nParse linear tsv format:') 21 | source = 'data/table.tsv' 22 | with Stream(source, headers='row1') as stream: 23 | print(stream.headers) 24 | for row in stream: 25 | print(row) 26 | 27 | 28 | print('\nParse json with dicts:') 29 | source = 'file://data/table-dicts.json' 30 | with Stream(source) as stream: 31 | print(stream.headers) 32 | for row in stream: 33 | print(row) 34 | 35 | 36 | print('\nParse json with lists:') 37 | source = 'file://data/table-lists.json' 38 | with Stream(source, headers='row1') as stream: 39 | print(stream.headers) 40 | for row in stream: 41 | print(row) 42 | 43 | 44 | print('\nParse xls format:') 45 | source = 'data/table.xls' 46 | with Stream(source, headers='row1') as 
stream: 47 | print(stream.headers) 48 | for row in stream: 49 | print(row) 50 | 51 | 52 | print('\nParse xlsx format:') 53 | source = 'data/table.xlsx' 54 | with Stream(source, headers='row1') as stream: 55 | print(stream.headers) 56 | for row in stream: 57 | print(row) 58 | 59 | 60 | # print('\nLoad from stream scheme:') 61 | source = io.open('data/table.csv', mode='rb') 62 | with Stream(source, headers='row1', format='csv') as stream: 63 | print(stream.headers) 64 | for row in stream: 65 | print(row) 66 | 67 | 68 | print('\nLoad from text scheme:') 69 | source = 'text://id,name\n1,english\n2,中国人\n' 70 | with Stream(source, headers='row1', format='csv') as stream: 71 | print(stream.headers) 72 | for row in stream: 73 | print(row) 74 | 75 | 76 | print('\nLoad from http scheme:') 77 | source = 'https://raw.githubusercontent.com' 78 | source += '/okfn/tabulator-py/master/data/table.csv' 79 | with Stream(source, headers='row1') as stream: 80 | print(stream.headers) 81 | for row in stream: 82 | print(row) 83 | 84 | 85 | print('\nUsage of inline lists:') 86 | source = [['id', 'name'], ['1', 'english'], ('2', '中国人')] 87 | with Stream(source, headers='row1') as stream: 88 | print(stream.headers) 89 | for row in stream: 90 | print(row) 91 | 92 | 93 | print('\nUsage of inline lists (keyed):') 94 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 95 | with Stream(source) as stream: 96 | print(stream.headers) 97 | for row in stream: 98 | print(row) 99 | 100 | 101 | print('\nIter with keyed rows representation:') 102 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 103 | with Stream(source, headers=1) as stream: 104 | print(stream.headers) 105 | for row in stream.iter(keyed=True): 106 | print(row) 107 | 108 | 109 | print('\nTable reset and read limit:') 110 | source = 'data/table.csv' 111 | with Stream(source, headers='row1') as stream: 112 | print(stream.headers) 113 | print(stream.read(limit=1)) 114 | stream.reset() 115 | 
print(stream.read(limit=1)) 116 | 117 | 118 | print('\nLate headers (on a second row):') 119 | source = 'data/special/late_headers.csv' 120 | with Stream(source, headers='row2') as stream: 121 | print(stream.headers) 122 | for row in stream: 123 | print(row) 124 | 125 | 126 | print('\nSpaces in headers:') 127 | source = 'https://raw.githubusercontent.com/datasets/gdp/master/data/gdp.csv' 128 | with Stream(source, headers='row1') as stream: 129 | print(stream.headers) 130 | for row in stream.read(limit=5): 131 | print(row) 132 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | ignore = E128, E301,E306,E731 4 | 5 | [pylama:pep8] 6 | max_line_length = 120 7 | 8 | [pylama:mccabe] 9 | complexity = 36 10 | 11 | [pylama:*/__init__.py] 12 | ignore = W0611 13 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | markers = 4 | remote: marks tests as requiring Internet (deselect with '-m "not remote"') 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*paths): 13 | """Read a text file.""" 14 | basedir = os.path.dirname(__file__) 15 | fullpath = 
os.path.join(basedir, *paths) 16 | contents = io.open(fullpath, encoding='utf-8').read().strip() 17 | return contents 18 | 19 | 20 | # Prepare 21 | PACKAGE = 'tabulator' 22 | INSTALL_REQUIRES = [ 23 | # General 24 | 'six>=1.9', 25 | 'click>=6.0', 26 | 'requests>=2.8', 27 | 'chardet>=3.0', 28 | 'boto3>=1.9', 29 | # Format: csv 30 | 'unicodecsv>=0.14', 31 | # Format: json 32 | 'ijson>=3.0.3', 33 | # Format: ndjson 34 | 'jsonlines>=1.1', 35 | # Format: sql 36 | 'sqlalchemy>=0.9.6', 37 | # Format: tsv 38 | 'linear-tsv>=1.0', 39 | # Format: xls 40 | 'xlrd>=1.0', 41 | # Format: xlsx 42 | 'openpyxl>=2.6', 43 | ] 44 | INSTALL_FORMAT_DATAPACKAGE_REQUIRES = [ 45 | 'datapackage>=1.12', 46 | ] 47 | INSTALL_FORMAT_ODS_REQUIRES = [ 48 | 'ezodf>=0.3', 49 | 'lxml>=3.0', 50 | ] 51 | INSTALL_PARSER_HTML_REQUIRES = [ 52 | 'pyquery<1.4.2', 53 | ] 54 | INSTALL_CCHARDET_REQUIRES = [ 55 | 'cchardet>=2.0', 56 | ] 57 | TESTS_REQUIRE = [ 58 | 'mock', 59 | 'pylama', 60 | 'pytest', 61 | 'pytest-cov', 62 | 'moto[server]', 63 | ] 64 | README = read('README.md') 65 | VERSION = read(PACKAGE, 'VERSION') 66 | PACKAGES = find_packages(exclude=['examples', 'tests']) 67 | 68 | 69 | # Run 70 | setup( 71 | name=PACKAGE, 72 | version=VERSION, 73 | packages=PACKAGES, 74 | include_package_data=True, 75 | install_requires=INSTALL_REQUIRES, 76 | tests_require=TESTS_REQUIRE, 77 | extras_require={ 78 | 'datapackage': INSTALL_FORMAT_DATAPACKAGE_REQUIRES, 79 | 'develop': TESTS_REQUIRE, 80 | 'ods': INSTALL_FORMAT_ODS_REQUIRES, 81 | 'html': INSTALL_PARSER_HTML_REQUIRES, 82 | 'cchardet': INSTALL_CCHARDET_REQUIRES, 83 | }, 84 | entry_points={ 85 | 'console_scripts': [ 86 | 'tabulator = tabulator.__main__:cli', 87 | ] 88 | }, 89 | zip_safe=False, 90 | long_description=README, 91 | long_description_content_type='text/markdown', 92 | description='Consistent interface for stream reading and writing tabular data (csv/xls/json/etc)', 93 | author='Open Knowledge Foundation', 94 | author_email='info@okfn.org', 95 | 
url='https://github.com/frictionlessdata/tabulator-py', 96 | license='MIT', 97 | keywords=[ 98 | 'frictionless data', 99 | ], 100 | classifiers=[ 101 | 'Development Status :: 4 - Beta', 102 | 'Environment :: Web Environment', 103 | 'Intended Audience :: Developers', 104 | 'License :: OSI Approved :: MIT License', 105 | 'Operating System :: OS Independent', 106 | 'Programming Language :: Python :: 2', 107 | 'Programming Language :: Python :: 2.7', 108 | 'Programming Language :: Python :: 3', 109 | 'Programming Language :: Python :: 3.3', 110 | 'Programming Language :: Python :: 3.4', 111 | 'Programming Language :: Python :: 3.5', 112 | 'Programming Language :: Python :: 3.6', 113 | 'Programming Language :: Python :: 3.7', 114 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 115 | 'Topic :: Software Development :: Libraries :: Python Modules' 116 | ], 117 | ) 118 | -------------------------------------------------------------------------------- /tabulator/VERSION: -------------------------------------------------------------------------------- 1 | 1.53.5 2 | -------------------------------------------------------------------------------- /tabulator/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from . 
import config 7 | __version__ = config.VERSION 8 | 9 | 10 | # Module API 11 | 12 | from .cli import cli 13 | from .stream import Stream 14 | from .loader import Loader 15 | from .parser import Parser 16 | from .writer import Writer 17 | from .validate import validate 18 | from .exceptions import TabulatorException 19 | from .exceptions import SourceError 20 | from .exceptions import SchemeError 21 | from .exceptions import FormatError 22 | from .exceptions import EncodingError 23 | from .exceptions import CompressionError 24 | 25 | # Deprecated 26 | 27 | from . import exceptions 28 | from .exceptions import IOError 29 | from .exceptions import LoadingError 30 | from .exceptions import HTTPError 31 | -------------------------------------------------------------------------------- /tabulator/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import cli 2 | 3 | 4 | # Module API 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /tabulator/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | # from __future__ import unicode_literals 6 | 7 | import six 8 | import click 9 | import tabulator 10 | from . import config 11 | from . 
import exceptions 12 | 13 | 14 | # Module API 15 | 16 | @click.command(help='') 17 | @click.argument('source') 18 | @click.option('--headers', type=click.INT) 19 | @click.option('--scheme') 20 | @click.option('--format') 21 | @click.option('--encoding') 22 | @click.option('--limit', type=click.INT) 23 | @click.option('--sheet') 24 | @click.option('--fill-merged-cells', is_flag=True, default=None) 25 | @click.option('--preserve-formatting', is_flag=True, default=None) 26 | @click.option('--adjust-floating-point-error', is_flag=True, default=None) 27 | @click.option('--table') 28 | @click.option('--order_by') 29 | @click.option('--resource') 30 | @click.option('--property') 31 | @click.option('--keyed', is_flag=True, default=None) 32 | @click.version_option(config.VERSION, message='%(version)s') 33 | def cli(source, limit, **options): 34 | """Command-line interface 35 | 36 | ``` 37 | Usage: tabulator [OPTIONS] SOURCE 38 | 39 | Options: 40 | --headers INTEGER 41 | --scheme TEXT 42 | --format TEXT 43 | --encoding TEXT 44 | --limit INTEGER 45 | --sheet TEXT/INTEGER (excel) 46 | --fill-merged-cells BOOLEAN (excel) 47 | --preserve-formatting BOOLEAN (excel) 48 | --adjust-floating-point-error BOOLEAN (excel) 49 | --table TEXT (sql) 50 | --order_by TEXT (sql) 51 | --resource TEXT/INTEGER (datapackage) 52 | --property TEXT (json) 53 | --keyed BOOLEAN (json) 54 | --version Show the version and exit. 55 | --help Show this message and exit. 
56 | ``` 57 | 58 | """ 59 | 60 | # Normalize options 61 | options = {key: value for key, value in options.items() if value is not None} 62 | try: 63 | options['sheet'] = int(options.get('sheet')) 64 | options['resource'] = int(options.get('resource')) 65 | except Exception: 66 | pass 67 | 68 | # Read the table 69 | try: 70 | with tabulator.Stream(source, **options) as stream: 71 | cast = str 72 | if six.PY2: 73 | cast = unicode # noqa 74 | if stream.headers: 75 | click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True)) 76 | for count, row in enumerate(stream, start=1): 77 | click.echo(','.join(map(cast, row))) 78 | if count == limit: 79 | break 80 | except exceptions.TabulatorException as exception: 81 | click.echo('[error] %s' % str(exception)) 82 | exit(1) 83 | -------------------------------------------------------------------------------- /tabulator/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | 10 | 11 | # General 12 | 13 | VERSION = io.open(os.path.join(os.path.dirname(__file__), 'VERSION')).read().strip() 14 | DEFAULT_SCHEME = 'file' 15 | DEFAULT_ENCODING = 'utf-8' 16 | DEFAULT_SAMPLE_SIZE = 100 17 | DEFAULT_BYTES_SAMPLE_SIZE = 10000 18 | SUPPORTED_COMPRESSION = ['zip', 'gz'] 19 | SUPPORTED_HASHING_ALGORITHMS = ['md5', 'sha1', 'sha256', 'sha512'] 20 | ENCODING_CONFIDENCE = 0.5 21 | HTTP_HEADERS = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) ' + 23 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 24 | 'Chrome/54.0.2840.87 Safari/537.36' 25 | } 26 | CSV_SAMPLE_LINES = 100 27 | # http://docs.sqlalchemy.org/en/latest/dialects/index.html 28 | SQL_SCHEMES = ['firebird', 'mssql', 'mysql', 'oracle', 'postgresql', 'sqlite', 'sybase'] 29 | 
S3_DEFAULT_ENDPOINT_URL = 'https://s3.amazonaws.com' 30 | 31 | # Loaders 32 | 33 | LOADERS = { 34 | 's3': 'tabulator.loaders.aws.AWSLoader', 35 | 'file': 'tabulator.loaders.local.LocalLoader', 36 | 'http': 'tabulator.loaders.remote.RemoteLoader', 37 | 'https': 'tabulator.loaders.remote.RemoteLoader', 38 | 'ftp': 'tabulator.loaders.remote.RemoteLoader', 39 | 'ftps': 'tabulator.loaders.remote.RemoteLoader', 40 | 'stream': 'tabulator.loaders.stream.StreamLoader', 41 | 'text': 'tabulator.loaders.text.TextLoader', 42 | } 43 | 44 | # Parsers 45 | 46 | PARSERS = { 47 | 'csv': 'tabulator.parsers.csv.CSVParser', 48 | 'datapackage': 'tabulator.parsers.datapackage.DataPackageParser', 49 | 'gsheet': 'tabulator.parsers.gsheet.GsheetParser', 50 | 'html': 'tabulator.parsers.html.HTMLTableParser', 51 | 'inline': 'tabulator.parsers.inline.InlineParser', 52 | 'json': 'tabulator.parsers.json.JSONParser', 53 | 'jsonl': 'tabulator.parsers.ndjson.NDJSONParser', 54 | 'ndjson': 'tabulator.parsers.ndjson.NDJSONParser', 55 | 'ods': 'tabulator.parsers.ods.ODSParser', 56 | 'sql': 'tabulator.parsers.sql.SQLParser', 57 | 'tsv': 'tabulator.parsers.tsv.TSVParser', 58 | 'xls': 'tabulator.parsers.xls.XLSParser', 59 | 'xlsx': 'tabulator.parsers.xlsx.XLSXParser', 60 | } 61 | 62 | # Writers 63 | 64 | WRITERS = { 65 | 'csv': 'tabulator.writers.csv.CSVWriter', 66 | 'json': 'tabulator.writers.json.JSONWriter', 67 | 'xlsx': 'tabulator.writers.xlsx.XLSXWriter', 68 | 'sql': 'tabulator.writers.sql.SQLWriter', 69 | } 70 | -------------------------------------------------------------------------------- /tabulator/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | class TabulatorException(Exception): 11 | """Base class for all tabulator 
exceptions. 12 | """ 13 | pass 14 | 15 | 16 | class SourceError(TabulatorException): 17 | """The source file could not be parsed correctly. 18 | """ 19 | pass 20 | 21 | 22 | class SchemeError(TabulatorException): 23 | """The file scheme is not supported. 24 | """ 25 | pass 26 | 27 | 28 | class FormatError(TabulatorException): 29 | """The file format is unsupported or invalid. 30 | """ 31 | pass 32 | 33 | 34 | class EncodingError(TabulatorException): 35 | """Encoding error 36 | """ 37 | pass 38 | 39 | 40 | class CompressionError(TabulatorException): 41 | """Compression error 42 | """ 43 | pass 44 | 45 | 46 | # Deprecated 47 | 48 | OptionsError = TabulatorException 49 | ResetError = TabulatorException 50 | 51 | 52 | class IOError(SchemeError): 53 | """Local loading error 54 | """ 55 | pass 56 | 57 | 58 | class LoadingError(IOError): 59 | """Local loading error 60 | """ 61 | pass 62 | 63 | 64 | class HTTPError(LoadingError): 65 | """Remote loading error 66 | """ 67 | pass 68 | -------------------------------------------------------------------------------- /tabulator/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import re 9 | import six 10 | import codecs 11 | import hashlib 12 | from copy import copy 13 | from importlib import import_module 14 | from six.moves.urllib.parse import parse_qs, urlparse, urlunparse 15 | from . import exceptions 16 | from . import config 17 | 18 | 19 | # Module API 20 | 21 | def detect_scheme_and_format(source): 22 | """Detect scheme and format based on source and return as a tuple. 23 | 24 | Scheme is a minimum 2 letters before `://` (will be lower cased). 
25 | For example `http` from `http://example.com/table.csv` 26 | 27 | """ 28 | 29 | # Scheme: stream 30 | if hasattr(source, 'read'): 31 | return ('stream', None) 32 | 33 | # Format: inline 34 | if not isinstance(source, six.string_types): 35 | return (None, 'inline') 36 | 37 | # Format: gsheet 38 | if 'docs.google.com/spreadsheets' in source: 39 | if 'export' not in source and 'pub' not in source: 40 | return (None, 'gsheet') 41 | elif 'csv' in source: 42 | return ('https', 'csv') 43 | 44 | # Format: sql 45 | for sql_scheme in config.SQL_SCHEMES: 46 | if source.startswith('%s://' % sql_scheme): 47 | return (None, 'sql') 48 | 49 | # General 50 | parsed = urlparse(source) 51 | scheme = parsed.scheme.lower() 52 | if len(scheme) < 2: 53 | scheme = config.DEFAULT_SCHEME 54 | format = os.path.splitext(parsed.path or parsed.netloc)[1][1:].lower() or None 55 | if format is None: 56 | # Test if query string contains a "format=" parameter. 57 | query_string = parse_qs(parsed.query) 58 | query_string_format = query_string.get("format") 59 | if query_string_format is not None and len(query_string_format) == 1: 60 | format = query_string_format[0] 61 | 62 | # Format: datapackage 63 | if parsed.path.endswith('datapackage.json'): 64 | return (None, 'datapackage') 65 | 66 | return (scheme, format) 67 | 68 | 69 | # TODO: consider merging cp1252/iso8859-1 70 | def detect_encoding(sample, encoding=None): 71 | """Detect encoding of a byte string sample. 
72 | """ 73 | # To reduce tabulator import time 74 | try: 75 | from cchardet import detect 76 | except ImportError: 77 | from chardet import detect 78 | if encoding is not None: 79 | return normalize_encoding(sample, encoding) 80 | result = detect(sample) 81 | confidence = result['confidence'] or 0 82 | encoding = result['encoding'] or 'ascii' 83 | encoding = normalize_encoding(sample, encoding) 84 | if confidence < config.ENCODING_CONFIDENCE: 85 | encoding = config.DEFAULT_ENCODING 86 | if encoding == 'ascii': 87 | encoding = config.DEFAULT_ENCODING 88 | return encoding 89 | 90 | 91 | def normalize_encoding(sample, encoding): 92 | """Normalize encoding including 'utf-8-sig', 'utf-16-be', utf-16-le tweaks. 93 | """ 94 | encoding = codecs.lookup(encoding).name 95 | # Work around 'Incorrect detection of utf-8-sig encoding' 96 | # 97 | if encoding == 'utf-8': 98 | if sample.startswith(codecs.BOM_UTF8): 99 | encoding = 'utf-8-sig' 100 | # Use the BOM stripping name (without byte-order) for UTF-16 encodings 101 | elif encoding == 'utf-16-be': 102 | if sample.startswith(codecs.BOM_UTF16_BE): 103 | encoding = 'utf-16' 104 | elif encoding == 'utf-16-le': 105 | if sample.startswith(codecs.BOM_UTF16_LE): 106 | encoding = 'utf-16' 107 | return encoding 108 | 109 | 110 | def detect_html(text): 111 | """Detect if text is HTML. 112 | """ 113 | pattern = re.compile('\\s*<(!doctype|html)', re.IGNORECASE) 114 | return bool(pattern.match(text)) 115 | 116 | 117 | def reset_stream(stream): 118 | """Reset stream pointer to the first element. 119 | 120 | If stream is not seekable raise Exception. 121 | 122 | """ 123 | try: 124 | position = stream.tell() 125 | except Exception: 126 | position = True 127 | if position != 0: 128 | try: 129 | stream.seek(0) 130 | except Exception: 131 | message = 'It\'s not possible to reset this stream' 132 | raise exceptions.TabulatorException(message) 133 | 134 | 135 | def ensure_dir(path): 136 | """Ensure path directory exists. 
137 | """ 138 | dirpath = os.path.dirname(path) 139 | if dirpath and not os.path.exists(dirpath): 140 | os.makedirs(dirpath) 141 | 142 | 143 | def requote_uri(uri): 144 | """Requote uri if it contains non-ascii chars, spaces etc. 145 | """ 146 | # To reduce tabulator import time 147 | import requests.utils 148 | if six.PY2: 149 | def url_encode_non_ascii(bytes): 150 | pattern = '[\x80-\xFF]' 151 | replace = lambda c: ('%%%02x' % ord(c.group(0))).upper() 152 | return re.sub(pattern, replace, bytes) 153 | parts = urlparse(uri) 154 | uri = urlunparse( 155 | part.encode('idna') if index == 1 156 | else url_encode_non_ascii(part.encode('utf-8')) 157 | for index, part in enumerate(parts)) 158 | return requests.utils.requote_uri(uri) 159 | 160 | 161 | def import_attribute(path): 162 | """Import attribute by path like `package.module.attribute` 163 | """ 164 | module_name, attribute_name = path.rsplit('.', 1) 165 | module = import_module(module_name) 166 | attribute = getattr(module, attribute_name) 167 | return attribute 168 | 169 | 170 | def extract_options(options, names): 171 | """Return options for names and remove it from given options in-place. 172 | """ 173 | result = {} 174 | for name, value in copy(options).items(): 175 | if name in names: 176 | result[name] = value 177 | del options[name] 178 | return result 179 | 180 | 181 | def stringify_value(value): 182 | """Convert any value to string. 183 | """ 184 | if value is None: 185 | return u'' 186 | isoformat = getattr(value, 'isoformat', None) 187 | if isoformat is not None: 188 | value = isoformat() 189 | return type(u'')(value) 190 | 191 | 192 | class BytesStatsWrapper(object): 193 | """This class is intended to be used as 194 | 195 | stats = {'size': 0, 'hash': ''} 196 | bytes = BytesStatsWrapper(bytes, stats) 197 | 198 | It will be updating the stats during reading. 
199 | 200 | """ 201 | 202 | def __init__(self, bytes, stats): 203 | self.__hasher = getattr(hashlib, stats['hashing_algorithm'])() 204 | self.__bytes = bytes 205 | self.__stats = stats 206 | 207 | def __getattr__(self, name): 208 | return getattr(self.__bytes, name) 209 | 210 | @property 211 | def closed(self): 212 | return self.__bytes.closed 213 | 214 | def read1(self, size=None): 215 | chunk = self.__bytes.read1(size) 216 | self.__hasher.update(chunk) 217 | self.__stats['size'] += len(chunk) 218 | self.__stats['hash'] = self.__hasher.hexdigest() 219 | return chunk 220 | -------------------------------------------------------------------------------- /tabulator/loader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from six import add_metaclass 8 | from abc import ABCMeta, abstractmethod 9 | 10 | 11 | # Module API 12 | 13 | @add_metaclass(ABCMeta) 14 | class Loader(object): 15 | """Abstract class implemented by the data loaders 16 | 17 | The loaders inherit and implement this class' methods to add support for a 18 | new scheme (e.g. ssh). 19 | 20 | # Arguments 21 | bytes_sample_size (int): Sample size in bytes 22 | **options (dict): Loader options 23 | 24 | """ 25 | 26 | # Public 27 | 28 | options = [] 29 | 30 | def __init__(self, bytes_sample_size, **options): 31 | pass 32 | 33 | @abstractmethod 34 | def load(self, source, mode='t', encoding=None): 35 | """Load source file. 36 | 37 | # Arguments 38 | source (str): Path to tabular source file. 39 | mode (str, optional): 40 | Text stream mode, `t` (text) or `b` (binary). Defaults to `t`. 41 | encoding (str, optional): 42 | Source encoding. Auto-detect by default. 43 | 44 | # Returns 45 | Union[TextIO, BinaryIO]: I/O stream opened either as text or binary. 
46 | 47 | """ 48 | pass 49 | -------------------------------------------------------------------------------- /tabulator/loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tabulator/loaders/__init__.py -------------------------------------------------------------------------------- /tabulator/loaders/aws.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import boto3 10 | from six.moves.urllib.parse import urlparse 11 | from ..loader import Loader 12 | from .. import exceptions 13 | from .. import helpers 14 | from .. import config 15 | 16 | 17 | # Module API 18 | 19 | class AWSLoader(Loader): 20 | """Loader to load source from the AWS. 
21 | """ 22 | 23 | # Public 24 | 25 | remote = True 26 | options = [ 27 | 's3_endpoint_url', 28 | ] 29 | 30 | def __init__(self, 31 | bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE, 32 | s3_endpoint_url=None): 33 | self.__bytes_sample_size = bytes_sample_size 34 | self.__s3_endpoint_url = ( 35 | s3_endpoint_url or 36 | os.environ.get('S3_ENDPOINT_URL') or 37 | config.S3_DEFAULT_ENDPOINT_URL) 38 | self.__s3_client = boto3.client('s3', endpoint_url=self.__s3_endpoint_url) 39 | self.__stats = None 40 | 41 | def attach_stats(self, stats): 42 | self.__stats = stats 43 | 44 | def load(self, source, mode='t', encoding=None): 45 | 46 | # Prepare bytes 47 | try: 48 | parts = urlparse(source, allow_fragments=False) 49 | response = self.__s3_client.get_object(Bucket=parts.netloc, Key=parts.path[1:]) 50 | # https://github.com/frictionlessdata/tabulator-py/issues/271 51 | bytes = io.BufferedRandom(io.BytesIO()) 52 | bytes.write(response['Body'].read()) 53 | bytes.seek(0) 54 | if self.__stats: 55 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 56 | except Exception as exception: 57 | raise exceptions.LoadingError(str(exception)) 58 | 59 | # Return bytes 60 | if mode == 'b': 61 | return bytes 62 | 63 | # Detect encoding 64 | if self.__bytes_sample_size: 65 | sample = bytes.read(self.__bytes_sample_size) 66 | bytes.seek(0) 67 | encoding = helpers.detect_encoding(sample, encoding) 68 | 69 | # Prepare chars 70 | chars = io.TextIOWrapper(bytes, encoding) 71 | 72 | return chars 73 | -------------------------------------------------------------------------------- /tabulator/loaders/local.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from ..loader import Loader 9 | from .. import exceptions 10 | from .. 
import helpers 11 | from .. import config 12 | 13 | 14 | # Module API 15 | 16 | class LocalLoader(Loader): 17 | """Loader to load source from filesystem. 18 | """ 19 | 20 | # Public 21 | 22 | options = [] 23 | 24 | def __init__(self, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE): 25 | self.__bytes_sample_size = bytes_sample_size 26 | self.__stats = None 27 | 28 | def attach_stats(self, stats): 29 | self.__stats = stats 30 | 31 | def load(self, source, mode='t', encoding=None): 32 | 33 | # Prepare source 34 | scheme = 'file://' 35 | if source.startswith(scheme): 36 | source = source.replace(scheme, '', 1) 37 | 38 | # Prepare bytes 39 | try: 40 | bytes = io.open(source, 'rb') 41 | if self.__stats: 42 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 43 | except IOError as exception: 44 | raise exceptions.LoadingError(str(exception)) 45 | 46 | # Return bytes 47 | if mode == 'b': 48 | return bytes 49 | 50 | # Detect encoding 51 | if self.__bytes_sample_size: 52 | sample = bytes.read(self.__bytes_sample_size) 53 | bytes.seek(0) 54 | encoding = helpers.detect_encoding(sample, encoding) 55 | 56 | # Prepare chars 57 | chars = io.TextIOWrapper(bytes, encoding) 58 | 59 | return chars 60 | -------------------------------------------------------------------------------- /tabulator/loaders/remote.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import six 9 | import requests 10 | from ..loader import Loader 11 | from .. import exceptions 12 | from .. import helpers 13 | from .. import config 14 | 15 | 16 | # Module API 17 | 18 | class RemoteLoader(Loader): 19 | """Loader to load source from the web. 
20 | """ 21 | 22 | # Public 23 | 24 | remote = True 25 | options = [ 26 | 'http_session', 27 | 'http_stream', 28 | 'http_timeout', 29 | ] 30 | 31 | def __init__(self, 32 | bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE, 33 | http_session=None, 34 | http_stream=True, 35 | http_timeout=None): 36 | 37 | # Create default session 38 | if not http_session: 39 | http_session = requests.Session() 40 | http_session.headers.update(config.HTTP_HEADERS) 41 | 42 | # No stream support 43 | if six.PY2: 44 | http_stream = False 45 | 46 | # Set attributes 47 | self.__bytes_sample_size = bytes_sample_size 48 | self.__http_session = http_session 49 | self.__http_stream = http_stream 50 | self.__http_timeout = http_timeout 51 | self.__stats = None 52 | 53 | def attach_stats(self, stats): 54 | self.__stats = stats 55 | 56 | def load(self, source, mode='t', encoding=None): 57 | 58 | # Prepare source 59 | source = helpers.requote_uri(source) 60 | 61 | # Prepare bytes 62 | try: 63 | bytes = _RemoteStream(source, self.__http_session, self.__http_timeout).open() 64 | if not self.__http_stream: 65 | buffer = io.BufferedRandom(io.BytesIO()) 66 | buffer.write(bytes.read()) 67 | buffer.seek(0) 68 | bytes = buffer 69 | if self.__stats: 70 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 71 | except IOError as exception: 72 | raise exceptions.HTTPError(str(exception)) 73 | 74 | # Return bytes 75 | if mode == 'b': 76 | return bytes 77 | 78 | # Detect encoding 79 | if self.__bytes_sample_size: 80 | sample = bytes.read(self.__bytes_sample_size)[:self.__bytes_sample_size] 81 | bytes.seek(0) 82 | encoding = helpers.detect_encoding(sample, encoding) 83 | 84 | # Prepare chars 85 | chars = io.TextIOWrapper(bytes, encoding) 86 | 87 | return chars 88 | 89 | 90 | # Internal 91 | 92 | class _RemoteStream(object): 93 | 94 | # It's possible to implement cache for bytes sample 95 | # size to prevent additional HTTP calls used in seek 96 | 97 | # Public 98 | 99 | remote = True 100 | 101 | def 
__init__(self, source, session, timeout): 102 | self.__source = source 103 | self.__session = session 104 | self.__timeout = timeout 105 | 106 | def readable(self): 107 | return True 108 | 109 | def writable(self): 110 | return False 111 | 112 | def seekable(self): 113 | return True 114 | 115 | @property 116 | def closed(self): 117 | return self.__closed 118 | 119 | def open(self): 120 | self.__closed = False 121 | self.seek(0) 122 | return self 123 | 124 | def close(self): 125 | self.__closed = True 126 | 127 | def tell(self): 128 | return self.__response.raw.tell() 129 | 130 | def flush(self): 131 | pass 132 | 133 | def read(self, size=None): 134 | return self.__response.raw.read(size) 135 | 136 | def read1(self, size=None): 137 | return self.__response.raw.read(size) 138 | 139 | def seek(self, offset, whence=0): 140 | assert offset == 0 141 | assert whence == 0 142 | self.__response = self.__session.get(self.__source, stream=True, timeout=self.__timeout) 143 | self.__response.raise_for_status() 144 | self.__response.raw.decode_content = True 145 | -------------------------------------------------------------------------------- /tabulator/loaders/stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from ..loader import Loader 9 | from .. import exceptions 10 | from .. import helpers 11 | from .. import config 12 | 13 | 14 | # Module API 15 | 16 | class StreamLoader(Loader): 17 | """Loader to load source from file-like byte stream. 
18 | """ 19 | 20 | # Public 21 | 22 | options = [] 23 | 24 | def __init__(self, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE): 25 | self.__bytes_sample_size = bytes_sample_size 26 | self.__stats = None 27 | 28 | def attach_stats(self, stats): 29 | self.__stats = stats 30 | 31 | def load(self, source, mode='t', encoding=None): 32 | 33 | # Support only bytes 34 | if hasattr(source, 'encoding'): 35 | message = 'Only byte streams are supported.' 36 | raise exceptions.SourceError(message) 37 | 38 | # Prepare bytes 39 | bytes = source 40 | if self.__stats: 41 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 42 | 43 | # Return bytes 44 | if mode == 'b': 45 | return bytes 46 | 47 | # Detect encoding 48 | if self.__bytes_sample_size: 49 | sample = bytes.read(self.__bytes_sample_size) 50 | bytes.seek(0) 51 | encoding = helpers.detect_encoding(sample, encoding) 52 | 53 | # Prepare chars 54 | chars = io.TextIOWrapper(bytes, encoding) 55 | 56 | return chars 57 | -------------------------------------------------------------------------------- /tabulator/loaders/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from ..loader import Loader 9 | from .. import helpers 10 | from .. import config 11 | 12 | 13 | # Module API 14 | 15 | class TextLoader(Loader): 16 | """Loader to load source from text. 
17 | """ 18 | 19 | # Public 20 | 21 | options = [] 22 | 23 | def __init__(self, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE): 24 | self.__bytes_sample_size = bytes_sample_size 25 | self.__stats = None 26 | 27 | def attach_stats(self, stats): 28 | self.__stats = stats 29 | 30 | def load(self, source, mode='t', encoding=None): 31 | 32 | # Prepare source 33 | scheme = 'text://' 34 | if source.startswith(scheme): 35 | source = source.replace(scheme, '', 1) 36 | 37 | # Prepare bytes 38 | bytes = io.BufferedRandom(io.BytesIO()) 39 | bytes.write(source.encode(encoding or config.DEFAULT_ENCODING)) 40 | bytes.seek(0) 41 | if self.__stats: 42 | bytes = helpers.BytesStatsWrapper(bytes, self.__stats) 43 | 44 | # Return bytes 45 | if mode == 'b': 46 | return bytes 47 | 48 | # Prepare chars 49 | chars = io.TextIOWrapper(bytes, encoding) 50 | 51 | return chars 52 | -------------------------------------------------------------------------------- /tabulator/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from six import add_metaclass 8 | from abc import ABCMeta, abstractmethod 9 | 10 | 11 | # Module API 12 | 13 | @add_metaclass(ABCMeta) 14 | class Parser(object): 15 | """Abstract class implemented by the data parsers. 16 | 17 | The parsers inherit and implement this class' methods to add support for a 18 | new file type. 19 | 20 | # Arguments 21 | loader (tabulator.Loader): Loader instance to read the file. 22 | force_parse (bool): 23 | When `True`, the parser yields an empty extended 24 | row tuple `(row_number, None, [])` when there is an error parsing a 25 | row. Otherwise, it stops the iteration by raising the exception 26 | `tabulator.exceptions.SourceError`. 
27 | **options (dict): Loader options 28 | 29 | """ 30 | 31 | # Public 32 | 33 | options = [] 34 | 35 | def __init__(self, loader, force_parse, **options): 36 | pass 37 | 38 | @property 39 | @abstractmethod 40 | def closed(self): 41 | """Flag telling if the parser is closed. 42 | 43 | # Returns 44 | bool: whether closed 45 | 46 | """ 47 | pass # pragma: no cover 48 | 49 | @abstractmethod 50 | def open(self, source, encoding=None): 51 | """Open underlying file stream in the beginning of the file. 52 | 53 | The parser gets a byte or text stream from the `tabulator.Loader` 54 | instance and start emitting items. 55 | 56 | # Arguments 57 | source (str): Path to source table. 58 | encoding (str, optional): Source encoding. Auto-detect by default. 59 | 60 | # Returns 61 | None 62 | 63 | """ 64 | pass # pragma: no cover 65 | 66 | @abstractmethod 67 | def close(self): 68 | """Closes underlying file stream. 69 | """ 70 | pass # pragma: no cover 71 | 72 | @abstractmethod 73 | def reset(self): 74 | """Resets underlying stream and current items list. 75 | 76 | After `reset()` is called, iterating over the items will start from the beginning. 77 | """ 78 | pass # pragma: no cover 79 | 80 | @property 81 | @abstractmethod 82 | def encoding(self): 83 | """Encoding 84 | 85 | # Returns 86 | str: encoding 87 | 88 | """ 89 | pass # pragma: no cover 90 | 91 | @property 92 | @abstractmethod 93 | def extended_rows(self): 94 | """Returns extended rows iterator. 95 | 96 | The extended rows are tuples containing `(row_number, headers, row)`, 97 | 98 | # Raises 99 | SourceError: 100 | If `force_parse` is `False` and 101 | a row can't be parsed, this exception will be raised. 102 | Otherwise, an empty extended row is returned (i.e. 103 | `(row_number, None, [])`). 
104 | 105 | Returns: 106 | Iterator[Tuple[int, List[str], List[Any]]]: 107 | Extended rows containing 108 | `(row_number, headers, row)`, where `headers` is a list of the 109 | header names (can be `None`), and `row` is a list of row 110 | values. 111 | 112 | """ 113 | pass # pragma: no cover 114 | -------------------------------------------------------------------------------- /tabulator/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tabulator/parsers/__init__.py -------------------------------------------------------------------------------- /tabulator/parsers/csv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import csv 8 | import six 9 | from itertools import chain 10 | from codecs import iterencode 11 | from ..parser import Parser 12 | from .. import helpers 13 | from .. import config 14 | 15 | 16 | # Module API 17 | 18 | class CSVParser(Parser): 19 | """Parser to parse CSV data format. 
20 | """ 21 | 22 | # Public 23 | 24 | options = [ 25 | 'delimiter', 26 | 'doublequote', 27 | 'escapechar', 28 | 'quotechar', 29 | 'quoting', 30 | 'skipinitialspace', 31 | 'lineterminator' 32 | ] 33 | 34 | def __init__(self, loader, force_parse=False, **options): 35 | 36 | # Make bytes 37 | if six.PY2: 38 | for key, value in options.items(): 39 | if isinstance(value, six.string_types): 40 | options[key] = str(value) 41 | 42 | # Set attributes 43 | self.__loader = loader 44 | self.__options = options 45 | self.__force_parse = force_parse 46 | self.__extended_rows = None 47 | self.__encoding = None 48 | self.__dialect = None 49 | self.__chars = None 50 | 51 | @property 52 | def closed(self): 53 | return self.__chars is None or self.__chars.closed 54 | 55 | def open(self, source, encoding=None): 56 | self.close() 57 | self.__chars = self.__loader.load(source, encoding=encoding) 58 | self.__encoding = getattr(self.__chars, 'encoding', encoding) 59 | if self.__encoding: 60 | self.__encoding.lower() 61 | self.reset() 62 | 63 | def close(self): 64 | if not self.closed: 65 | self.__chars.close() 66 | 67 | def reset(self): 68 | helpers.reset_stream(self.__chars) 69 | self.__extended_rows = self.__iter_extended_rows() 70 | 71 | @property 72 | def encoding(self): 73 | return self.__encoding 74 | 75 | @property 76 | def dialect(self): 77 | if self.__dialect: 78 | dialect = { 79 | 'delimiter': self.__dialect.delimiter, 80 | 'doubleQuote': self.__dialect.doublequote, 81 | 'lineTerminator': self.__dialect.lineterminator, 82 | 'quoteChar': self.__dialect.quotechar, 83 | 'skipInitialSpace': self.__dialect.skipinitialspace, 84 | } 85 | if self.__dialect.escapechar is not None: 86 | dialect['escapeChar'] = self.__dialect.escapechar 87 | return dialect 88 | 89 | @property 90 | def extended_rows(self): 91 | return self.__extended_rows 92 | 93 | # Private 94 | 95 | def __iter_extended_rows(self): 96 | 97 | # For PY2 encode/decode 98 | if six.PY2: 99 | # Reader requires utf-8 encoded 
stream 100 | bytes = iterencode(self.__chars, 'utf-8') 101 | sample, dialect = self.__prepare_dialect(bytes) 102 | items = csv.reader(chain(sample, bytes), dialect=dialect) 103 | for row_number, item in enumerate(items, start=1): 104 | values = [] 105 | for value in item: 106 | value = value.decode('utf-8') 107 | values.append(value) 108 | yield (row_number, None, list(values)) 109 | 110 | # For PY3 use chars 111 | else: 112 | sample, dialect = self.__prepare_dialect(self.__chars) 113 | items = csv.reader(chain(sample, self.__chars), dialect=dialect) 114 | for row_number, item in enumerate(items, start=1): 115 | yield (row_number, None, list(item)) 116 | 117 | def __prepare_dialect(self, stream): 118 | 119 | # Get sample 120 | sample = [] 121 | while True: 122 | try: 123 | sample.append(next(stream)) 124 | except StopIteration: 125 | break 126 | if len(sample) >= config.CSV_SAMPLE_LINES: 127 | break 128 | 129 | # Get dialect 130 | try: 131 | separator = b'' if six.PY2 else '' 132 | delimiter = self.__options.get('delimiter', ',\t;|') 133 | dialect = csv.Sniffer().sniff(separator.join(sample), delimiter) 134 | if not dialect.escapechar: 135 | dialect.doublequote = True 136 | except csv.Error: 137 | class dialect(csv.excel): 138 | pass 139 | for key, value in self.__options.items(): 140 | setattr(dialect, key, value) 141 | # https://github.com/frictionlessdata/FrictionlessDarwinCore/issues/1 142 | if getattr(dialect, 'quotechar', None) == '': 143 | setattr(dialect, 'quoting', csv.QUOTE_NONE) 144 | 145 | self.__dialect = dialect 146 | return sample, dialect 147 | -------------------------------------------------------------------------------- /tabulator/parsers/datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | 
import datapackage 9 | from ..parser import Parser 10 | from .. import exceptions 11 | 12 | 13 | # Module API 14 | 15 | class DataPackageParser(Parser): 16 | """Parser to extract data from Tabular Data Packages. 17 | """ 18 | 19 | # Public 20 | 21 | options = [ 22 | 'resource', 23 | ] 24 | 25 | def __init__(self, loader, force_parse=False, resource=0): 26 | self.__force_parse = force_parse 27 | self.__resource_pointer = resource 28 | self.__extended_rows = None 29 | self.__encoding = None 30 | self.__fragment = None 31 | self.__resource = None 32 | 33 | @property 34 | def closed(self): 35 | return self.__extended_rows is None 36 | 37 | def open(self, source, encoding=None): 38 | self.close() 39 | package = datapackage.DataPackage(source) 40 | if isinstance(self.__resource_pointer, six.string_types): 41 | self.__resource = package.get_resource(self.__resource_pointer) 42 | else: 43 | try: 44 | self.__resource = package.resources[self.__resource_pointer] 45 | except (TypeError, IndexError): 46 | pass 47 | if not self.__resource: 48 | message = 'Data package "%s" doesn\'t have resource "%s"' 49 | raise exceptions.SourceError(message % (source, self.__resource_pointer)) 50 | self.__resource.infer() 51 | self.__encoding = self.__resource.descriptor.get('encoding') 52 | self.__fragment = self.__resource.name 53 | self.reset() 54 | 55 | def close(self): 56 | if not self.closed: 57 | self.__extended_rows = None 58 | 59 | def reset(self): 60 | self.__extended_rows = self.__iter_extended_rows() 61 | 62 | @property 63 | def encoding(self): 64 | return self.__encoding 65 | 66 | @property 67 | def fragment(self): 68 | return self.__fragment 69 | 70 | @property 71 | def extended_rows(self): 72 | return self.__extended_rows 73 | 74 | # Private 75 | 76 | def __iter_extended_rows(self): 77 | for row_number, headers, row in self.__resource.iter(extended=True): 78 | yield (row_number - 1, headers, row) 79 | 
class GsheetParser(Parser):
    """Parser to parse Google Spreadsheets.

    Translates a Google Sheets URL into its CSV export URL and delegates
    the actual parsing to an internal CSV `Stream`.
    """

    # Public

    options = []

    def __init__(self, loader, force_parse=False):
        self.__loader = loader
        self.__force_parse = force_parse
        self.__stream = None
        self.__encoding = None

    @property
    def closed(self):
        return self.__stream is None or self.__stream.closed

    def open(self, source, encoding=None):
        self.close()
        export_url = 'https://docs.google.com/spreadsheets/d/%s/export?format=csv&id=%s'
        # Extract the document key and optional sheet gid from the URL
        match = re.search(r'.*/d/(?P<key>[^/]+)/.*?(?:gid=(?P<gid>\d+))?$', source)
        key, gid = '', ''
        if match:
            key = match.group('key')
            gid = match.group('gid')
        url = export_url % (key, key)
        if gid:
            url = '%s&gid=%s' % (url, gid)
        self.__stream = Stream(
            url, format='csv', encoding=encoding, force_parse=self.__force_parse).open()
        self.__extended_rows = self.__stream.iter(extended=True)
        self.__encoding = encoding

    def close(self):
        if not self.closed:
            self.__stream.close()

    def reset(self):
        self.__stream.reset()
        self.__extended_rows = self.__stream.iter(extended=True)

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows
class HTMLTableParser(Parser):
    """Parser to extract data out of HTML tables."""

    # Public

    options = [
        'selector',
        'raw_html'
    ]

    def __init__(self, loader, force_parse=False, selector='table', raw_html=False):
        self.__loader = loader
        self.__selector = selector
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None
        # Cell extractor: inner HTML when raw_html is requested, text otherwise
        self.__extractor = (lambda x: x.html()) if raw_html else (lambda x: x.text())

    @property
    def closed(self):
        return self.__chars is None or self.__chars.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__chars = self.__loader.load(source, encoding=encoding)
        # Normalize the encoding name. The previous code called
        # `self.__encoding.lower()` without assigning the result (a no-op).
        if self.__encoding:
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        if not self.closed:
            self.__chars.close()

    def reset(self):
        helpers.reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):

        # Get page content
        page = pq(self.__chars.read(), parser='html')

        # Find required table
        if self.__selector:
            table = pq(page.find(self.__selector)[0])
        else:
            table = page

        # Collect rows from thead/tbody and direct tr children
        rows = (
            table.children('thead').children('tr') +
            table.children('thead') +
            table.children('tr') +
            table.children('tbody').children('tr')
        )
        rows = [pq(r) for r in rows if len(r) > 0]
        # Extract cells from each non-empty row
        rows = [pq(tr).children('td,th') for tr in rows]
        rows = [[self.__extractor(pq(td)) for td in tr]
                for tr in rows if len(tr) > 0]

        # Yield rows
        for row_number, row in enumerate(rows, start=1):
            yield (row_number, None, row)
17 | """ 18 | 19 | # Public 20 | 21 | options = [] 22 | 23 | def __init__(self, loader, force_parse=False): 24 | self.__loader = loader 25 | self.__force_parse = force_parse 26 | self.__extended_rows = None 27 | self.__encoding = None 28 | self.__source = None 29 | 30 | @property 31 | def closed(self): 32 | return False 33 | 34 | def open(self, source, encoding=None): 35 | if hasattr(source, '__next__' if six.PY3 else 'next'): 36 | message = 'Only callable returning an iterator is supported' 37 | raise exceptions.SourceError(message) 38 | self.close() 39 | self.__source = source 40 | self.__encoding = encoding 41 | self.reset() 42 | 43 | def close(self): 44 | pass 45 | 46 | def reset(self): 47 | self.__extended_rows = self.__iter_extended_rows() 48 | 49 | @property 50 | def encoding(self): 51 | return self.__encoding 52 | 53 | @property 54 | def extended_rows(self): 55 | return self.__extended_rows 56 | 57 | # Private 58 | 59 | def __iter_extended_rows(self): 60 | items = self.__source 61 | if not hasattr(items, '__iter__'): 62 | items = items() 63 | for row_number, item in enumerate(items, start=1): 64 | if isinstance(item, (tuple, list)): 65 | yield (row_number, None, list(item)) 66 | elif isinstance(item, dict): 67 | keys = [] 68 | values = [] 69 | iterator = item.keys() 70 | if not isinstance(item, OrderedDict): 71 | iterator = sorted(iterator) 72 | for key in iterator: 73 | keys.append(key) 74 | values.append(item[key]) 75 | yield (row_number, list(keys), list(values)) 76 | else: 77 | if not self.__force_parse: 78 | message = 'Inline data item has to be tuple, list or dict' 79 | raise exceptions.SourceError(message) 80 | yield (row_number, None, []) 81 | -------------------------------------------------------------------------------- /tabulator/parsers/json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from 
class JSONParser(Parser):
    """Parser to parse JSON data format.

    Streams items with ijson, optionally from a nested `property`
    of the top-level document.
    """

    # Public

    options = [
        'property',
    ]

    def __init__(self, loader, force_parse=False, property=None):
        self.__loader = loader
        self.__property = property
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__bytes = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)
        # Normalize the encoding name. The previous code called
        # `self.__encoding.lower()` without assigning the result (a no-op).
        if self.__encoding:
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        # ijson path: top-level array items, or items of the named property
        path = 'item'
        if self.__property is not None:
            path = '%s.item' % self.__property
        items = ijson.items(self.__bytes, path)
        for row_number, item in enumerate(items, start=1):
            if isinstance(item, (tuple, list)):
                yield (row_number, None, list(item))
            elif isinstance(item, dict):
                # Dict items become keyed rows with deterministic key order
                keys = []
                values = []
                for key in sorted(item.keys()):
                    keys.append(key)
                    values.append(item[key])
                yield (row_number, list(keys), list(values))
            else:
                if not self.__force_parse:
                    message = 'JSON item has to be list or dict'
                    raise exceptions.SourceError(message)
                yield (row_number, None, [])
class ODSParser(Parser):
    """Parser to parse ODF Spreadsheets."""

    # Public

    options = [
        'sheet',
    ]

    def __init__(self, loader, force_parse=False, sheet=1):
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__bytes = None
        self.__book = None
        self.__sheet = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)

        # Load the whole document into memory for ezodf
        self.__book = ezodf.opendoc(BytesIO(self.__bytes.read()))

        # Resolve the sheet by name (string) or 1-based index (integer)
        pointer = self.__sheet_pointer
        try:
            if isinstance(pointer, six.string_types):
                self.__sheet = self.__book.sheets[pointer]
            else:
                self.__sheet = self.__book.sheets[pointer - 1]
        except (KeyError, IndexError):
            message = 'OpenOffice document "%s" doesn\'t have a sheet "%s"'
            raise exceptions.SourceError(message % (source, pointer))

        # Reset parser state
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):

        def type_value(cell):
            """Detects int value, date and datetime"""
            ctype = cell.value_type
            value = cell.value

            # ods numbers are float only; a float with no fractional
            # part is cast back into int
            if isinstance(value, float) and value == value // 1:
                return int(value)

            # Date-only values are 10 chars (YYYY-MM-DD); longer ones
            # carry a time component
            if ctype == 'date':
                if len(value) == 10:
                    return datetime.strptime(value, '%Y-%m-%d').date()
                return datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')

            return value

        for number, row in enumerate(self.__sheet.rows(), start=1):
            yield number, None, [type_value(cell) for cell in row]
16 | """ 17 | 18 | # Public 19 | 20 | options = [ 21 | 'table', 22 | 'order_by', 23 | ] 24 | 25 | def __init__(self, loader, force_parse=False, table=None, order_by=None): 26 | 27 | # Ensure table 28 | if table is None: 29 | raise exceptions.TabulatorException('Format `sql` requires `table` option.') 30 | 31 | # Set attributes 32 | self.__loader = loader 33 | self.__table = table 34 | self.__order_by = order_by 35 | self.__force_parse = force_parse 36 | self.__engine = None 37 | self.__extended_rows = None 38 | self.__encoding = None 39 | 40 | @property 41 | def closed(self): 42 | return self.__engine is None 43 | 44 | def open(self, source, encoding=None): 45 | self.close() 46 | self.__engine = create_engine(source) 47 | self.__engine.update_execution_options(stream_results=True) 48 | self.__encoding = encoding 49 | self.reset() 50 | 51 | def close(self): 52 | if not self.closed: 53 | self.__engine.dispose() 54 | self.__engine = None 55 | 56 | def reset(self): 57 | self.__extended_rows = self.__iter_extended_rows() 58 | 59 | @property 60 | def encoding(self): 61 | return self.__encoding 62 | 63 | @property 64 | def extended_rows(self): 65 | return self.__extended_rows 66 | 67 | # Private 68 | 69 | def __iter_extended_rows(self): 70 | table = sql.table(self.__table) 71 | order = sql.text(self.__order_by) if self.__order_by else None 72 | query = sql.select(['*']).select_from(table).order_by(order) 73 | result = self.__engine.execute(query) 74 | for row_number, row in enumerate(iter(result), start=1): 75 | yield (row_number, list(row.keys()), list(row)) 76 | -------------------------------------------------------------------------------- /tabulator/parsers/tsv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import tsv 8 | from 
class TSVParser(Parser):
    """Parser to parse linear TSV data format.

    See: http://dataprotocols.org/linear-tsv/

    """

    # Public

    options = []

    def __init__(self, loader, force_parse=False):
        self.__loader = loader
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__chars = None

    @property
    def closed(self):
        # Parser counts as closed until open() has attached a char stream
        return self.__chars is None or self.__chars.closed

    def open(self, source, encoding=None):
        """Open `source` and prepare the row iterator."""
        self.close()
        self.__chars = self.__loader.load(source, encoding=encoding)
        self.__encoding = getattr(self.__chars, 'encoding', encoding)
        if self.__encoding:
            # BUGFIX: previously `self.__encoding.lower()` was called
            # without assigning the result, so the normalization was a
            # no-op and the encoding could be reported in mixed case.
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        if not self.closed:
            self.__chars.close()

    def reset(self):
        # Rewind the character stream and recreate the row generator
        helpers.reset_stream(self.__chars)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        # `tsv.un` yields one tuple of unescaped fields per input line
        items = tsv.un(self.__chars)
        for row_number, item in enumerate(items, start=1):
            yield (row_number, None, list(item))
class XLSParser(Parser):
    """Parser to parse Excel data format.
    """

    # Public

    options = [
        'sheet',
        'fill_merged_cells',
    ]

    def __init__(self, loader, force_parse=False, sheet=1, fill_merged_cells=False):
        # `sheet` may be a sheet name (str) or a 1-based index (int);
        # `fill_merged_cells` copies a merged range's top-left value into
        # every cell of the range while iterating rows.
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__fill_merged_cells = fill_merged_cells
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__fragment = None
        self.__bytes = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        """Open `source`, load the workbook and select the target sheet.

        Raises `exceptions.SourceError` if the requested sheet is missing.
        """
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)

        # Get book
        file_contents = self.__bytes.read()
        try:
            # formatting_info exposes merged-cell metadata ...
            self.__book = xlrd.open_workbook(
                file_contents=file_contents,
                encoding_override=encoding,
                formatting_info=True,
                logfile=sys.stderr
            )
        except NotImplementedError:
            # ... but xlrd raises NotImplementedError for inputs where it
            # is unsupported, so retry without formatting info
            self.__book = xlrd.open_workbook(
                file_contents=file_contents,
                encoding_override=encoding,
                formatting_info=False,
                logfile=sys.stderr
            )

        # Get sheet
        try:
            if isinstance(self.__sheet_pointer, six.string_types):
                # Select by sheet name
                self.__sheet = self.__book.sheet_by_name(self.__sheet_pointer)
            else:
                # Select by 1-based pointer (xlrd indexing is 0-based)
                self.__sheet = self.__book.sheet_by_index(self.__sheet_pointer - 1)
        except (xlrd.XLRDError, IndexError):
            message = 'Excel document "%s" doesn\'t have a sheet "%s"'
            raise exceptions.SourceError(message % (source, self.__sheet_pointer))
        self.__fragment = self.__sheet.name

        # Reset parser
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        # Rewind the underlying stream and recreate the row generator
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def fragment(self):
        # Name of the selected sheet (available after open())
        return self.__fragment

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):

        def type_value(ctype, value):
            """ Detects boolean value, int value, datetime """

            # Boolean
            if ctype == xlrd.XL_CELL_BOOLEAN:
                return bool(value)

            # Excel numbers are only float
            # Float with no decimals can be cast into int
            if ctype == xlrd.XL_CELL_NUMBER and value == value // 1:
                return int(value)

            # Datetime
            if ctype == xlrd.XL_CELL_DATE:
                return xlrd.xldate.xldate_as_datetime(value, self.__book.datemode)

            return value

        for x in range(0, self.__sheet.nrows):
            row_number = x + 1
            row = []
            for y, value in enumerate(self.__sheet.row_values(x)):
                value = type_value(self.__sheet.cell(x, y).ctype, value)
                if self.__fill_merged_cells:
                    # Replace the value with the merged range's top-left
                    # cell value when (x, y) lies inside a merged range
                    for xlo, xhi, ylo, yhi in self.__sheet.merged_cells:
                        if x in range(xlo, xhi) and y in range(ylo, yhi):
                            value = type_value(self.__sheet.cell(xlo, ylo).ctype,
                                               self.__sheet.cell_value(xlo, ylo))
                row.append(value)
            yield (row_number, None, row)
class XLSXParser(Parser):
    """Parser to parse Excel modern `xlsx` data format.
    """

    # Public

    options = [
        "sheet",
        "workbook_cache",
        "fill_merged_cells",
        "preserve_formatting",
        "adjust_floating_point_error",
    ]

    def __init__(
        self,
        loader,
        force_parse=False,
        sheet=1,
        workbook_cache=None,
        fill_merged_cells=False,
        preserve_formatting=False,
        adjust_floating_point_error=False,
    ):
        # `sheet` is a sheet name (str) or a 1-based index (int).
        # `workbook_cache` is a dict mapping remote sources to local
        # temp-file paths so repeated opens skip the download.
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__workbook_cache = workbook_cache
        self.__fill_merged_cells = fill_merged_cells
        self.__preserve_formatting = preserve_formatting
        self.__adjust_floating_point_error = adjust_floating_point_error
        self.__extended_rows = None
        self.__encoding = None
        self.__fragment = None
        self.__force_parse = force_parse
        self.__bytes = None

    @property
    def closed(self):
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding

        # Remote
        # Create copy for remote source
        # For remote stream we need local copy (will be deleted on close by Python)
        # https://docs.python.org/3.5/library/tempfile.html#tempfile.TemporaryFile
        if getattr(self.__loader, "remote", False):
            # Cached
            if self.__workbook_cache is not None and source in self.__workbook_cache:
                self.__bytes = io.open(self.__workbook_cache[source], "rb")
            # Not cached
            else:
                prefix = "tabulator-"
                # Keep the temp file on disk when caching is enabled;
                # otherwise NamedTemporaryFile deletes it on close
                delete = self.__workbook_cache is None
                source_bytes = self.__loader.load(source, mode="b", encoding=encoding)
                target_bytes = NamedTemporaryFile(prefix=prefix, delete=delete)
                shutil.copyfileobj(source_bytes, target_bytes)
                source_bytes.close()
                target_bytes.seek(0)
                self.__bytes = target_bytes
                if self.__workbook_cache is not None:
                    self.__workbook_cache[source] = target_bytes.name
                    # Cached copies are removed at interpreter exit instead
                    atexit.register(os.remove, target_bytes.name)

        # Local
        else:
            self.__bytes = self.__loader.load(source, mode="b", encoding=encoding)

        # Get book
        # To fill merged cells we can't use read-only because
        # `sheet.merged_cell_ranges` is not available in this mode
        self.__book = openpyxl.load_workbook(
            self.__bytes, read_only=not self.__fill_merged_cells, data_only=True
        )

        # Get sheet
        try:
            if isinstance(self.__sheet_pointer, six.string_types):
                # Select by sheet name
                self.__sheet = self.__book[self.__sheet_pointer]
            else:
                # Select by 1-based pointer (worksheets list is 0-based)
                self.__sheet = self.__book.worksheets[self.__sheet_pointer - 1]
        except (KeyError, IndexError):
            message = 'Excel document "%s" doesn\'t have a sheet "%s"'
            raise exceptions.SourceError(message % (source, self.__sheet_pointer))
        self.__fragment = self.__sheet.title
        self.__process_merged_cells()

        # Reset parser
        self.reset()

    def close(self):
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        # Rewind the underlying stream and recreate the row generator
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        return self.__encoding

    @property
    def fragment(self):
        # Title of the selected sheet (available after open())
        return self.__fragment

    @property
    def extended_rows(self):
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        for row_number, row in enumerate(self.__sheet.iter_rows(), start=1):
            yield (
                row_number,
                None,
                extract_row_values(
                    row, self.__preserve_formatting, self.__adjust_floating_point_error,
                ),
            )

    def __process_merged_cells(self):
        # When requested, unmerge every merged range and copy its
        # top-left value into all cells of the range so that unmerged
        # cells don't read back as empty
        if self.__fill_merged_cells:
            for merged_cell_range in list(self.__sheet.merged_cells.ranges):
                merged_cell_range = str(merged_cell_range)
                self.__sheet.unmerge_cells(merged_cell_range)
                merged_rows = openpyxl.utils.rows_from_range(merged_cell_range)
                coordinates = list(chain.from_iterable(merged_rows))
                value = self.__sheet[coordinates[0]].value
                for coordinate in coordinates:
                    cell = self.__sheet[coordinate]
                    cell.value = value
# Internal

# Excel date/time format codes and their strftime equivalents
EXCEL_CODES = {
    "yyyy": "%Y",
    "yy": "%y",
    "dddd": "%A",
    "ddd": "%a",
    "dd": "%d",
    "d": "%-d",
    # Different from excel as there is no J-D in strftime
    "mmmmmm": "%b",
    "mmmm": "%B",
    "mmm": "%b",
    "hh": "%H",
    "h": "%-H",
    "ss": "%S",
    "s": "%-S",
    # Possibly different from excel as there is no am/pm in strftime
    "am/pm": "%p",
    # Different from excel as there is no A/P or a/p in strftime
    "a/p": "%p",
}

# "m"/"mm" are ambiguous in Excel: minutes next to hours/seconds,
# months otherwise — hence the two lookup tables
EXCEL_MINUTE_CODES = {
    "mm": "%M",
    "m": "%-M",
}
EXCEL_MONTH_CODES = {
    "mm": "%m",
    "m": "%-m",
}

# Characters passed through verbatim (they terminate a pending code)
EXCEL_MISC_CHARS = [
    "$",
    "+",
    "(",
    ":",
    "^",
    "'",
    "{",
    "<",
    "=",
    "-",
    "/",
    ")",
    "!",
    "&",
    "~",
    "}",
    ">",
    " ",
]

EXCEL_ESCAPE_CHAR = "\\"
EXCEL_SECTION_DIVIDER = ";"


def convert_excel_date_format_string(excel_date):
    """
    Created using documentation here:
    https://support.office.com/en-us/article/review-guidelines-for-customizing-a-number-format-c0a1d1fa-d3f4-4018-96b7-9c9354dd99f5

    Translates an Excel date format string (e.g. "dd/mm/yyyy") into an
    equivalent strftime format string, or returns None when the Excel
    format contains an unsupported code.
    """
    # The python date string that is being built
    python_date = ""
    # The excel code currently being parsed
    excel_code = ""
    prev_code = ""
    # If the previous character was the escape character
    char_escaped = False
    # If we are in a quotation block (surrounded by "")
    quotation_block = False
    # Variables used for checking if a code should be a minute or a month
    checking_minute_or_month = False
    minute_or_month_buffer = ""

    for c in excel_date:
        ec = excel_code.lower()
        # The previous character was an escape, the next character should be added normally
        if char_escaped:
            if checking_minute_or_month:
                minute_or_month_buffer += c
            else:
                python_date += c
            char_escaped = False
            continue
        # Inside a quotation block
        if quotation_block:
            if c == '"':
                # Quotation block should now end
                quotation_block = False
            elif checking_minute_or_month:
                minute_or_month_buffer += c
            else:
                python_date += c
            continue
        # The start of a quotation block
        if c == '"':
            quotation_block = True
            continue
        if c == EXCEL_SECTION_DIVIDER:
            # We ignore excel sections for datetimes
            break

        is_escape_char = c == EXCEL_ESCAPE_CHAR
        # The am/pm and a/p code add some complications, need to make sure we are not that code
        is_misc_char = c in EXCEL_MISC_CHARS and (
            c != "/" or (ec != "am" and ec != "a")
        )
        new_excel_code = False

        # Handle a new code without a different characeter in between
        if (
            ec
            and not is_escape_char
            and not is_misc_char
            # If the code does not start with c, we are in a new code
            and not ec.startswith(c.lower())
            # other than the case where we are building up
            # am/pm (minus the case where it is fully built), we are in a new code
            and (not ec.startswith("a") or ec == "am/pm")
        ):
            new_excel_code = True

        # Code is finished, check if it is a proper code
        if (is_escape_char or is_misc_char or new_excel_code) and ec:
            # Checking if the previous code should have been minute or month
            if checking_minute_or_month:
                if ec == "ss" or ec == "s":
                    # It should be a minute!
                    minute_or_month_buffer = (
                        EXCEL_MINUTE_CODES[prev_code] + minute_or_month_buffer
                    )
                else:
                    # It should be a months!
                    minute_or_month_buffer = (
                        EXCEL_MONTH_CODES[prev_code] + minute_or_month_buffer
                    )
                python_date += minute_or_month_buffer
                checking_minute_or_month = False
                minute_or_month_buffer = ""

            if ec in EXCEL_CODES:
                python_date += EXCEL_CODES[ec]
            # Handle months/minutes differently
            elif ec in EXCEL_MINUTE_CODES:
                # If preceded by hours, we know this is referring to minutes
                if prev_code == "h" or prev_code == "hh":
                    python_date += EXCEL_MINUTE_CODES[ec]
                else:
                    # Have to check if the next code is ss or s
                    checking_minute_or_month = True
                    minute_or_month_buffer = ""
            else:
                # Have to abandon this attempt to convert because the code is not recognized
                return None
            prev_code = ec
            excel_code = ""
        if is_escape_char:
            char_escaped = True
        elif is_misc_char:
            # Add the misc char
            if checking_minute_or_month:
                minute_or_month_buffer += c
            else:
                python_date += c
        else:
            # Just add to the code
            excel_code += c

    # Complete, check if there is still a buffer
    if checking_minute_or_month:
        # We know it's a month because there were no more codes after
        minute_or_month_buffer = EXCEL_MONTH_CODES[prev_code] + minute_or_month_buffer
        python_date += minute_or_month_buffer
    if excel_code:
        ec = excel_code.lower()
        if ec in EXCEL_CODES:
            python_date += EXCEL_CODES[ec]
        elif ec in EXCEL_MINUTE_CODES:
            if prev_code == "h" or prev_code == "hh":
                python_date += EXCEL_MINUTE_CODES[ec]
            else:
                python_date += EXCEL_MONTH_CODES[ec]
        else:
            return None
    return python_date
def eformat(f, prec, exp_digits):
    """
    Formats to Scientific Notation, including precise exponent digits

    """
    rendered = "%.*e" % (prec, f)
    mantissa, _, exponent = rendered.partition("e")
    # add 1 to digits as 1 is taken by sign +/-
    return "%sE%+0*d" % (mantissa, exp_digits + 1, int(exponent))


def convert_excel_number_format_string(
    excel_number, value,
):
    """
    A basic attempt to convert excel number_format to a number string

    The important goal here is to get proper amount of rounding
    """
    if "@" in excel_number:
        # We don't try to parse complicated strings
        return str(value)

    # A trailing '%' means the value is scaled by 100 and suffixed later
    percentage = excel_number.endswith("%")
    if percentage:
        value = value * 100
        excel_number = excel_number[:-1]
    if excel_number == "General":
        return value

    # The second ';'-separated section, when present, formats negatives
    sections = excel_number.split(";")
    excel_number = sections[1] if (value < 0 and len(sections) > 1) else sections[0]

    parts = excel_number.split(".")

    if len(parts) > 2:
        return None

    if len(parts) < 2:
        # No decimals
        formatted = "{0:.0f}".format(value)
    elif re.match(r"^#+0*E\+0*$", parts[1]):
        # Currently we do not support "engineering notation"
        return value
    elif re.match(r"^0*E\+0*$", parts[1]):
        # Handle scientific notation

        # Note, it will only actually be returned as a string if
        # type is not inferred
        precision = len(parts[1]) - len(parts[1].lstrip("0"))
        exponent_digits = len(parts[1]) - len(parts[1].rstrip("0"))
        return eformat(value, precision, exponent_digits)
    else:
        # Only the 0, # and ? characters provide precision information
        digits = "".join(ch for ch in parts[1] if ch in ["0", "#", "?"])
        # Trailing '#' positions are optional: drop them when they
        # render as zero
        optional = len(digits) - len(digits.rstrip("#"))
        formatted = ("{0:." + str(len(digits)) + "f}").format(value)
        for _ in range(optional):
            if formatted.endswith("0"):
                formatted = formatted[:-1]

    if percentage:
        return formatted + "%"

    return formatted
def extract_row_values(
    row, preserve_formatting=False, adjust_floating_point_error=False,
):
    """Return the list of cell values for an openpyxl `row`.

    With `preserve_formatting`, date/time and numeric cells are rendered
    as strings according to each cell's Excel number format (when the
    format can be converted); otherwise raw cell values are returned.
    """
    if preserve_formatting:
        values = []
        for cell in row:
            number_format = cell.number_format or ""
            value = cell.value

            if isinstance(cell.value, datetime.datetime) or isinstance(
                cell.value, datetime.time
            ):
                # Render temporal values via the converted strftime format;
                # fall back to the raw value when conversion returns None
                temporal_format = convert_excel_date_format_string(number_format)
                if temporal_format:
                    value = cell.value.strftime(temporal_format)
            elif (
                adjust_floating_point_error
                and isinstance(cell.value, float)
                and number_format == "General"
            ):
                # We have a float with format General
                # Calculate the number of integer digits
                integer_digits = len(str(int(cell.value)))
                # Set the precision to 15 minus the number of integer digits
                precision = 15 - (integer_digits)
                value = round(cell.value, precision)
            elif isinstance(cell.value, (int, float)):
                new_value = convert_excel_number_format_string(
                    number_format, cell.value,
                )
                if new_value:
                    value = new_value
            values.append(value)
        return values
    return list(cell.value for cell in row)
def validate(source, scheme=None, format=None):
    """Check if tabulator is able to load the source.

    # Arguments
        source (Union[str, IO]): The source path or IO object.
        scheme (str, optional): The source scheme. Auto-detect by default.
        format (str, optional): The source file format. Auto-detect by default.

    # Raises
        SchemeError: The file scheme is not supported.
        FormatError: The file format is not supported.

    # Returns
        bool: Whether tabulator is able to load the source file.

    """

    # Get scheme and format
    detected_scheme, detected_format = helpers.detect_scheme_and_format(source)
    scheme = scheme or detected_scheme
    format = format or detected_format

    # Validate scheme and format
    # NOTE: a `None` scheme is allowed (not validated against the loader
    # registry); the format is always validated
    if scheme is not None:
        if scheme not in config.LOADERS:
            raise exceptions.SchemeError('Scheme "%s" is not supported' % scheme)
    if format not in config.PARSERS:
        raise exceptions.FormatError('Format "%s" is not supported' % format)

    return True
class CSVWriter(Writer):
    """CSV writer.
    """

    # Public

    options = [
        'delimiter',
    ]

    def __init__(self, **options):

        # Make bytes
        # On Python 2 dialect options are normalized to byte strings
        if six.PY2:
            for name in list(options):
                if isinstance(options[name], six.string_types):
                    options[name] = str(options[name])

        # Set attributes
        self.__options = options

    def write(self, source, target, headers, encoding=None):
        """Write `source` rows (prefixed by `headers` when given) to the
        `target` CSV file and return the number of data rows written."""
        helpers.ensure_dir(target)
        written = 0
        with io.open(target, 'wb') as fobj:
            csv_writer = unicodecsv.writer(fobj, encoding=encoding, **self.__options)
            if headers:
                csv_writer.writerow(headers)
            for row in source:
                csv_writer.writerow(row)
                written += 1
        return written
16 | """ 17 | 18 | # Public 19 | 20 | options = [ 21 | 'keyed', 22 | ] 23 | 24 | def __init__(self, keyed=False): 25 | self.__keyed = keyed 26 | 27 | def write(self, source, target, headers, encoding=None): 28 | helpers.ensure_dir(target) 29 | data = [] 30 | count = 0 31 | if not self.__keyed: 32 | data.append(headers) 33 | for row in source: 34 | if self.__keyed: 35 | row = dict(zip(headers, row)) 36 | data.append(row) 37 | count += 1 38 | with open(target, 'w') as file: 39 | json.dump(data, file, indent=2) 40 | return count 41 | -------------------------------------------------------------------------------- /tabulator/writers/sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from sqlalchemy import create_engine, MetaData, Table, Column, String 8 | from ..writer import Writer 9 | from .. import exceptions 10 | 11 | 12 | # Module API 13 | 14 | class SQLWriter(Writer): 15 | """SQL writer. 
16 | """ 17 | 18 | # Public 19 | 20 | options = [ 21 | 'table', 22 | ] 23 | 24 | def __init__(self, table=None, **options): 25 | 26 | # Ensure table 27 | if table is None: 28 | raise exceptions.TabulatorException('Format `sql` requires `table` option.') 29 | 30 | self.__table = table 31 | 32 | def write(self, source, target, headers, encoding=None): 33 | engine = create_engine(target) 34 | count = 0 35 | buffer = [] 36 | buffer_size = 1000 37 | with engine.begin() as conn: 38 | meta = MetaData() 39 | columns = [Column(header, String()) for header in headers] 40 | table = Table(self.__table, meta, *columns) 41 | meta.create_all(conn) 42 | for row in source: 43 | count += 1 44 | buffer.append(row) 45 | if len(buffer) > buffer_size: 46 | conn.execute(table.insert().values(buffer)) 47 | buffer = [] 48 | if len(buffer): 49 | conn.execute(table.insert().values(buffer)) 50 | return count 51 | -------------------------------------------------------------------------------- /tabulator/writers/xlsx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import openpyxl 9 | from ..writer import Writer 10 | from .. import helpers 11 | 12 | 13 | # Module API 14 | 15 | class XLSXWriter(Writer): 16 | """XLSX writer. 
17 | """ 18 | 19 | # Public 20 | 21 | options = [ 22 | 'sheet', 23 | ] 24 | 25 | def __init__(self, **options): 26 | 27 | # Make bytes 28 | if six.PY2: 29 | for key, value in options.items(): 30 | if isinstance(value, six.string_types): 31 | options[key] = str(value) 32 | 33 | # Set attributes 34 | self.__options = options 35 | 36 | def write(self, source, target, headers, encoding=None): 37 | helpers.ensure_dir(target) 38 | count = 0 39 | wb = openpyxl.Workbook(write_only=True) 40 | ws = wb.create_sheet(title=self.__options.get('sheet')) 41 | ws.append(headers) 42 | for row in source: 43 | ws.append(row) 44 | count += 1 45 | wb.save(target) 46 | return count 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | import sqlite3 9 | 10 | 11 | # Fixtures 12 | 13 | @pytest.fixture 14 | def database_url(tmpdir): 15 | path = str(tmpdir.join('database.db')) 16 | conn = sqlite3.connect(path) 17 | conn.execute('CREATE TABLE data (id INTEGER PRIMARY KEY, name TEXT)') 18 | conn.execute('INSERT INTO data VALUES (1, "english"), (2, "中国人")') 19 | conn.commit() 20 | yield 'sqlite:///%s' % path 21 | conn.close() 22 | -------------------------------------------------------------------------------- /tests/formats/__init__.py: -------------------------------------------------------------------------------- 
# Base URL for fetching fixtures from the repository in remote tests
BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s'


# Read

def test_stream_local_csv():
    with Stream('data/table.csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_local_csv_with_bom():
    # BOM must be stripped transparently
    with Stream('data/special/bom.csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_local_csv_with_bom_with_encoding():
    # BOM must be stripped even with an explicit encoding
    with Stream('data/special/bom.csv', encoding='utf-8') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_csv_excel():
    source = 'value1,value2\nvalue3,value4'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_excel_tab():
    source = 'value1\tvalue2\nvalue3\tvalue4'
    with Stream(source, scheme='text', format='csv', delimiter='\t') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_unix():
    source = '"value1","value2"\n"value3","value4"'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_escaping():
    # Backslash-escaped quotes must survive parsing
    with Stream('data/special/escaping.csv', escapechar='\\') as stream:
        assert stream.read() == [
            ['ID', 'Test'],
            ['1', 'Test line 1'],
            ['2', 'Test " line 2'],
            ['3', 'Test " line 3'],
        ]


def test_stream_csv_doublequote():
    # Every row of the fixture must keep its 17 columns intact
    with Stream('data/special/doublequote.csv') as stream:
        for row in stream:
            assert len(row) == 17


def test_stream_stream_csv():
    # Source given as an already-open binary stream
    source = io.open('data/table.csv', mode='rb')
    with Stream(source, format='csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


def test_stream_text_csv():
    # Source given inline via the text scheme
    source = 'text://id,name\n1,english\n2,中国人\n'
    with Stream(source, format='csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


@pytest.mark.remote
def test_stream_remote_csv():
    with Stream(BASE_URL % 'data/table.csv') as stream:
        assert stream.headers is None
        assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]


@pytest.mark.remote
def test_stream_remote_csv_non_ascii_url():
    # URL containing a non-ASCII character must be loadable
    with Stream('http://data.defra.gov.uk/ops/government_procurement_card/over_£500_GPC_apr_2013.csv') as stream:
        assert stream.sample[0] == [
            'Entity',
            'Transaction Posting Date',
            'Merchant Name',
            'Amount',
            'Description']


def test_stream_csv_delimiter():
    source = '"value1";"value2"\n"value3";"value4"'
    with Stream(source, scheme='text', format='csv', delimiter=';') as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


def test_stream_csv_escapechar():
    source = 'value1%,value2\nvalue3%,value4'
    with Stream(source, scheme='text', format='csv', escapechar='%') as stream:
        assert stream.read() == [['value1,value2'], ['value3,value4']]


def test_stream_csv_quotechar():
    source = '%value1,value2%\n%value3,value4%'
    with Stream(source, scheme='text', format='csv', quotechar='%') as stream:
        assert stream.read() == [['value1,value2'], ['value3,value4']]


def test_stream_csv_skipinitialspace():
    source = 'value1, value2\nvalue3, value4'
    with Stream(source, scheme='text', format='csv', skipinitialspace=True) as stream:
        assert stream.read() == [['value1', 'value2'], ['value3', 'value4']]


# Dialect sniffing

def test_stream_csv_detect_delimiter_tab():
    source = 'a1\tb1\tc1A,c1B\na2\tb2\tc2\n'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['a1', 'b1', 'c1A,c1B'], ['a2', 'b2', 'c2']]


def test_stream_csv_detect_delimiter_semicolon():
    source = 'a1;b1\na2;b2\n'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['a1', 'b1'], ['a2', 'b2']]


def test_stream_csv_detect_delimiter_pipe():
    source = 'a1|b1\na2|b2\n'
    with Stream(source, scheme='text', format='csv') as stream:
        assert stream.read() == [['a1', 'b1'], ['a2', 'b2']]


def test_stream_csv_dialect_should_not_persist_if_sniffing_fails_issue_goodtables_228():
    # A failed sniff on source1 must not leak its dialect into source2
    source1 = 'a;b;c\n#comment'
    source2 = 'a,b,c\n#comment'
    with Stream(source1, scheme='text', format='csv', headers=1, delimiter=';') as stream:
        assert stream.headers == ['a', 'b', 'c']
    with Stream(source2, scheme='text', format='csv', headers=1) as stream:
        assert stream.headers == ['a', 'b', 'c']
with Stream(source, scheme='text', format='csv', quotechar='') as stream: 155 | stream.read() == ['value1', 'value2"', 'value3'] 156 | 157 | 158 | # Write 159 | 160 | def test_stream_save_csv(tmpdir): 161 | source = 'data/table.csv' 162 | target = str(tmpdir.join('table.csv')) 163 | with Stream(source, headers=1) as stream: 164 | assert stream.save(target) == 2 165 | with Stream(target, headers=1) as stream: 166 | assert stream.headers == ['id', 'name'] 167 | assert stream.read(extended=True) == [ 168 | (2, ['id', 'name'], ['1', 'english']), 169 | (3, ['id', 'name'], ['2', '中国人']), 170 | ] 171 | 172 | 173 | # Internal 174 | 175 | def test_parser_csv(): 176 | 177 | source = 'data/table.csv' 178 | encoding = None 179 | loader = Mock() 180 | loader.load = Mock(return_value=io.open(source, encoding='utf-8')) 181 | parser = CSVParser(loader) 182 | 183 | assert parser.closed 184 | parser.open(source, encoding=encoding) 185 | assert not parser.closed 186 | 187 | assert list(parser.extended_rows) == [ 188 | (1, None, ['id', 'name']), 189 | (2, None, ['1', 'english']), 190 | (3, None, ['2', '中国人'])] 191 | 192 | assert len(list(parser.extended_rows)) == 0 193 | parser.reset() 194 | assert len(list(parser.extended_rows)) == 3 195 | 196 | parser.close() 197 | assert parser.closed 198 | -------------------------------------------------------------------------------- /tests/formats/test_datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import json 9 | import pytest 10 | from mock import Mock 11 | from tabulator import Stream 12 | from tabulator.parsers.datapackage import DataPackageParser 13 | 14 | 15 | # Read 16 | 17 | 18 | def test_stream_datapackage(): 19 | with Stream('data/datapackage.json', resource=0, headers=1) as 
stream: 20 | assert stream.fragment == 'first-resource' 21 | assert stream.headers == ['id', 'name'] 22 | assert stream.read(keyed=True) == [ 23 | {'id': 1, 'name': 'english'}, 24 | {'id': 2, 'name': '中国人'}] 25 | 26 | 27 | def test_second_resource(): 28 | with Stream('data/datapackage.json', resource=1, headers=1) as stream: 29 | assert stream.fragment == 'number-two' 30 | assert stream.headers == ['id', 'name'] 31 | assert stream.read(keyed=True) == [ 32 | {'id': 1, 'name': '中国人'}, 33 | {'id': 2, 'name': 'english'} 34 | ] 35 | 36 | 37 | def test_named_resource(): 38 | curdir = os.getcwd() 39 | try: 40 | os.chdir('data/') 41 | with Stream('datapackage.json', resource='number-two', headers=1) as stream: 42 | assert stream.fragment == 'number-two' 43 | assert stream.headers == ['id', 'name'] 44 | assert stream.read(keyed=True) == [ 45 | {'id': 1, 'name': '中国人'}, 46 | {'id': 2, 'name': 'english'}, 47 | ] 48 | finally: 49 | os.chdir(curdir) 50 | 51 | 52 | # Internal 53 | 54 | def test_datapackage_parser(): 55 | 56 | source = 'data/datapackage.json' 57 | parser = DataPackageParser(None) 58 | 59 | assert parser.closed is True 60 | parser.open(source) 61 | assert parser.closed is False 62 | 63 | assert list(parser.extended_rows) == [ 64 | (1, ['id', 'name'], [1, 'english']), 65 | (2, ['id', 'name'], [2, '中国人']), 66 | ] 67 | 68 | assert len(list(parser.extended_rows)) == 0 69 | parser.reset() 70 | assert len(list(parser.extended_rows)) == 2 71 | 72 | parser.close() 73 | assert parser.closed 74 | 75 | 76 | def test_datapackage_list(): 77 | curdir= os.getcwd() 78 | try: 79 | os.chdir('data/') 80 | stream = json.load(open('datapackage.json')) 81 | 82 | parser = DataPackageParser(None) 83 | parser.open(stream) 84 | 85 | assert list(parser.extended_rows) == [ 86 | (1, ['id', 'name'], [1, 'english']), 87 | (2, ['id', 'name'], [2, '中国人']) 88 | ] 89 | finally: 90 | os.chdir(curdir) 91 | -------------------------------------------------------------------------------- 
/tests/formats/test_gsheet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tabulator import Stream, exceptions 9 | 10 | 11 | # Read 12 | 13 | @pytest.mark.remote 14 | def test_stream_gsheet(): 15 | source = 'https://docs.google.com/spreadsheets/d/1mHIWnDvW9cALRMq9OdNfRwjAthCUFUOACPp0Lkyl7b4/edit?usp=sharing' 16 | with Stream(source) as stream: 17 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 18 | 19 | 20 | @pytest.mark.remote 21 | def test_stream_gsheet_with_gid(): 22 | source = 'https://docs.google.com/spreadsheets/d/1mHIWnDvW9cALRMq9OdNfRwjAthCUFUOACPp0Lkyl7b4/edit#gid=960698813' 23 | with Stream(source) as stream: 24 | assert stream.read() == [['id', 'name'], ['2', '中国人'], ['3', 'german']] 25 | 26 | 27 | @pytest.mark.remote 28 | def test_stream_gsheet_bad_url(): 29 | stream = Stream('https://docs.google.com/spreadsheets/d/bad') 30 | with pytest.raises(exceptions.HTTPError) as excinfo: 31 | stream.open() 32 | -------------------------------------------------------------------------------- /tests/formats/test_html.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from mock import Mock 10 | from six import StringIO 11 | from tabulator import exceptions, Stream 12 | 13 | 14 | # Read 15 | 16 | @pytest.mark.parametrize('source, selector', [ 17 | ('data/table1.html', 'table'), 18 | ('data/table2.html', 'table'), 19 | ('data/table3.html', '.mememe'), 20 | ('data/table4.html', ''), 21 | ]) 22 | def test_stream_html(source, 
selector): 23 | with Stream(source, selector=selector, headers=1, encoding='utf8') as stream: 24 | assert stream.headers == ['id', 'name'] 25 | assert stream.read(keyed=True) == [ 26 | {'id': '1', 'name': 'english'}, 27 | {'id': '2', 'name': '中国人'}] 28 | 29 | def test_stream_html_raw_html(): 30 | with Stream('data/table3.html', selector='.mememe', headers=1, encoding='utf8', raw_html=True) as stream: 31 | assert stream.headers == ['id', 'name'] 32 | assert stream.read(keyed=True) == [ 33 | {'id': '1', 'name': 'english'}, 34 | {'id': '2', 'name': '中国人'}] 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/formats/test_inline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from collections import OrderedDict 9 | from tabulator import Stream, exceptions 10 | 11 | 12 | # Read 13 | 14 | def test_stream_inline(): 15 | source = [['id', 'name'], ['1', 'english'], ['2', '中国人']] 16 | with Stream(source) as stream: 17 | assert stream.headers is None 18 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 19 | 20 | 21 | def test_stream_inline_iterator(): 22 | source = iter([['id', 'name'], ['1', 'english'], ['2', '中国人']]) 23 | with Stream(source) as stream: 24 | assert stream.headers is None 25 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 26 | 27 | 28 | def test_stream_inline_iterator(): 29 | def generator(): 30 | yield ['id', 'name'] 31 | yield ['1', 'english'] 32 | yield ['2', '中国人'] 33 | with pytest.raises(exceptions.SourceError) as excinfo: 34 | iterator = generator() 35 | Stream(iterator).open() 36 | assert 'callable' in str(excinfo.value) 37 | 38 | 39 | def test_stream_inline_generator(): 40 | def generator(): 41 | 
yield ['id', 'name'] 42 | yield ['1', 'english'] 43 | yield ['2', '中国人'] 44 | with Stream(generator) as stream: 45 | assert stream.headers is None 46 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 47 | 48 | 49 | def test_stream_inline_keyed(): 50 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 51 | with Stream(source, format='inline') as stream: 52 | assert stream.headers is None 53 | assert stream.read() == [['1', 'english'], ['2', '中国人']] 54 | 55 | 56 | def test_stream_inline_keyed_with_headers_argument(): 57 | source = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}] 58 | with Stream(source, format='inline', headers=['name', 'id']) as stream: 59 | assert stream.headers == ['name', 'id'] 60 | assert stream.read() == [['english', '1'], ['中国人', '2']] 61 | 62 | 63 | def test_stream_inline_ordered_dict(): 64 | source = [ 65 | OrderedDict([('name', 'english'), ('id', '1')]), 66 | OrderedDict([('name', '中国人'), ('id', '2')]), 67 | ] 68 | with Stream(source, headers=1) as stream: 69 | assert stream.headers == ['name', 'id'] 70 | assert stream.read() == [['english', '1'], ['中国人', '2']] 71 | 72 | 73 | # Write 74 | 75 | def test_stream_save_inline_keyed_with_headers_argument(tmpdir): 76 | source = [{'key1': 'value1', 'key2': 'value2'}] 77 | target = str(tmpdir.join('table.csv')) 78 | with Stream(source, headers=['key2', 'key1']) as stream: 79 | stream.save(target) 80 | with Stream(target, headers=1) as stream: 81 | assert stream.headers == ['key2', 'key1'] 82 | assert stream.read() == [['value2', 'value1']] 83 | -------------------------------------------------------------------------------- /tests/formats/test_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | 
import json 9 | import pytest 10 | from mock import Mock 11 | from tabulator import Stream, exceptions 12 | from tabulator.parsers.json import JSONParser 13 | BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s' 14 | 15 | 16 | # Read 17 | 18 | def test_stream_local_json_dicts(): 19 | with Stream('data/table-dicts.json') as stream: 20 | assert stream.headers is None 21 | assert stream.read() == [[1, 'english'], [2, '中国人']] 22 | 23 | 24 | def test_stream_local_json_lists(): 25 | with Stream('data/table-lists.json') as stream: 26 | assert stream.headers is None 27 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 28 | 29 | 30 | def test_stream_text_json_dicts(): 31 | source = '[{"id": 1, "name": "english" }, {"id": 2, "name": "中国人" }]' 32 | with Stream(source, scheme='text', format='json') as stream: 33 | assert stream.headers is None 34 | assert stream.read() == [[1, 'english'], [2, '中国人']] 35 | 36 | 37 | def test_stream_text_json_dicts_with_headers_argument(): 38 | source = '[{"id": 1, "name": "english" }, {"id": 2, "name": "中国人" }]' 39 | with Stream(source, scheme='text', format='json', headers=['name', 'id']) as stream: 40 | assert stream.headers == ['name', 'id'] 41 | assert stream.read() == [['english', 1], ['中国人', 2]] 42 | 43 | 44 | def test_stream_text_json_lists(): 45 | source = '[["id", "name"], [1, "english"], [2, "中国人"]]' 46 | with Stream(source, scheme='text', format='json') as stream: 47 | assert stream.headers is None 48 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 49 | 50 | 51 | @pytest.mark.remote 52 | def test_stream_remote_json_dicts(): 53 | with Stream(BASE_URL % 'data/table-dicts.json') as stream: 54 | assert stream.headers is None 55 | assert stream.read() == [[1, 'english'], [2, '中国人']] 56 | 57 | 58 | @pytest.mark.remote 59 | def test_stream_remote_json_lists(): 60 | with Stream(BASE_URL % 'data/table-lists.json') as stream: 61 | assert stream.headers is None 62 | assert 
stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 63 | 64 | 65 | # Write 66 | 67 | def test_stream_save_json(tmpdir): 68 | source = 'data/table.csv' 69 | target = str(tmpdir.join('table.json')) 70 | with Stream(source, headers=1) as stream: 71 | assert stream.save(target) == 2 72 | with open(target) as file: 73 | assert json.load(file) == [ 74 | ['id', 'name'], 75 | ['1', 'english'], 76 | ['2', '中国人'], 77 | ] 78 | 79 | 80 | def test_stream_save_json_keyed(tmpdir): 81 | source = 'data/table.csv' 82 | target = str(tmpdir.join('table.json')) 83 | with Stream(source, headers=1) as stream: 84 | assert stream.save(target, keyed=True) == 2 85 | with open(target) as file: 86 | assert json.load(file) == [ 87 | {'id': '1', 'name': 'english'}, 88 | {'id': '2', 'name': '中国人'}, 89 | ] 90 | 91 | 92 | # Internal 93 | 94 | def test_parser_json(): 95 | 96 | source = 'data/table-dicts.json' 97 | encoding = None 98 | loader = Mock() 99 | loader.load = Mock(return_value=io.open(source, 'rb')) 100 | parser = JSONParser(loader) 101 | 102 | assert parser.closed 103 | parser.open(source, encoding=encoding) 104 | assert not parser.closed 105 | 106 | assert list(parser.extended_rows) == [ 107 | (1, ['id', 'name'], [1, 'english']), 108 | (2, ['id', 'name'], [2, '中国人'])] 109 | 110 | assert len(list(parser.extended_rows)) == 0 111 | parser.reset() 112 | assert len(list(parser.extended_rows)) == 2 113 | 114 | parser.close() 115 | assert parser.closed 116 | -------------------------------------------------------------------------------- /tests/formats/test_ndjson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from mock import Mock 10 | from six import StringIO 11 | from tabulator import exceptions, Stream 12 | from 
tabulator.parsers.ndjson import NDJSONParser 13 | 14 | 15 | # Read 16 | 17 | def test_stream_ndjson(): 18 | with Stream('data/table.ndjson', headers=1) as stream: 19 | assert stream.headers == ['id', 'name'] 20 | assert stream.read(keyed=True) == [ 21 | {'id': 1, 'name': 'english'}, 22 | {'id': 2, 'name': '中国人'}] 23 | 24 | 25 | # Internal 26 | 27 | def test_parser_ndjson(): 28 | 29 | source = 'data/table.ndjson' 30 | encoding = None 31 | loader = Mock() 32 | loader.load = Mock(return_value=io.open(source, encoding='utf-8')) 33 | parser = NDJSONParser(loader) 34 | 35 | assert parser.closed is True 36 | parser.open(source, encoding=encoding) 37 | assert parser.closed is False 38 | 39 | assert list(parser.extended_rows) == [ 40 | (1, ['id', 'name'], [1, 'english']), 41 | (2, ['id', 'name'], [2, '中国人']), 42 | ] 43 | 44 | assert len(list(parser.extended_rows)) == 0 45 | parser.reset() 46 | assert len(list(parser.extended_rows)) == 2 47 | 48 | parser.close() 49 | assert parser.closed 50 | 51 | 52 | def test_parser_ndjson_list(): 53 | stream = StringIO( 54 | '[1, 2, 3]\n' 55 | '[4, 5, 6]\n' 56 | ) 57 | 58 | loader = Mock(load=Mock(return_value=stream)) 59 | parser = NDJSONParser(loader) 60 | parser.open(None) 61 | 62 | assert list(parser.extended_rows) == [ 63 | (1, None, [1, 2, 3]), 64 | (2, None, [4, 5, 6]), 65 | ] 66 | 67 | 68 | def test_parser_ndjson_scalar(): 69 | stream = StringIO( 70 | '1\n' 71 | '2\n' 72 | ) 73 | 74 | loader = Mock(load=Mock(return_value=stream)) 75 | parser = NDJSONParser(loader) 76 | parser.open(None) 77 | 78 | with pytest.raises(exceptions.SourceError): 79 | list(parser.extended_rows) 80 | -------------------------------------------------------------------------------- /tests/formats/test_ods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ 
import unicode_literals 6 | from datetime import datetime 7 | 8 | import io 9 | import pytest 10 | from mock import Mock 11 | from tabulator import Stream, exceptions 12 | from tabulator.parsers.ods import ODSParser 13 | BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s' 14 | 15 | 16 | # Read 17 | 18 | def test_stream_ods(): 19 | with Stream('data/table.ods', headers=1) as stream: 20 | assert stream.headers == ['id', 'name'] 21 | assert stream.read(keyed=True) == [ 22 | {'id': 1, 'name': 'english'}, 23 | {'id': 2, 'name': '中国人'}, 24 | ] 25 | 26 | 27 | @pytest.mark.remote 28 | def test_stream_ods_remote(): 29 | source = BASE_URL % 'data/table.ods' 30 | with Stream(source) as stream: 31 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 32 | 33 | 34 | def test_stream_ods_sheet_by_index(): 35 | with Stream('data/table.ods', sheet=1) as stream: 36 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 37 | 38 | 39 | def test_stream_ods_sheet_by_index_not_existent(): 40 | with pytest.raises(exceptions.SourceError) as excinfo: 41 | Stream('data/table.ods', sheet=3).open() 42 | assert 'sheet "3"' in str(excinfo.value) 43 | 44 | 45 | def test_stream_ods_sheet_by_name(): 46 | with Stream('data/table.ods', sheet='Лист1') as stream: 47 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 48 | 49 | 50 | def test_stream_ods_sheet_by_index_not_existent_2(): 51 | with pytest.raises(exceptions.SourceError) as excinfo: 52 | Stream('data/table.ods', sheet='not-existent').open() 53 | assert 'sheet "not-existent"' in str(excinfo.value) 54 | 55 | 56 | def test_stream_ods_with_boolean(): 57 | with Stream('data/special/table-with-booleans.ods') as stream: 58 | assert stream.headers is None 59 | assert stream.read() == [['id', 'boolean'], [1, True], [2, False]] 60 | 61 | 62 | def test_stream_ods_with_ints_floats_dates(): 63 | source = 'data/special/table-with-ints-floats-dates.ods' 64 | with Stream(source) as 
stream: 65 | assert stream.read() == [['Int', 'Float', 'Date', 'Datetime'], 66 | [2013, 3.3, datetime(2009, 8, 16).date(), datetime(2009, 8, 16, 5, 43, 21)], 67 | [1997, 5.6, datetime(2009, 9, 20).date(), datetime(2009, 9, 20, 15, 30, 0)], 68 | [1969, 11.7, datetime(2012, 8, 23).date(), datetime(2012, 8, 23, 20, 40, 59)]] 69 | 70 | 71 | # Internal 72 | 73 | def test_parser_ods(): 74 | 75 | source = 'data/table.ods' 76 | encoding = None 77 | loader = Mock() 78 | loader.load = Mock(return_value=io.open(source, 'rb')) 79 | parser = ODSParser(loader) 80 | 81 | assert parser.closed 82 | parser.open(source, encoding=encoding) 83 | assert not parser.closed 84 | 85 | assert list(parser.extended_rows) == [ 86 | (1, None, ['id', 'name']), 87 | (2, None, [1.0, 'english']), 88 | (3, None, [2.0, '中国人']), 89 | ] 90 | 91 | assert len(list(parser.extended_rows)) == 0 92 | parser.reset() 93 | assert len(list(parser.extended_rows)) == 3 94 | 95 | parser.close() 96 | assert parser.closed 97 | -------------------------------------------------------------------------------- /tests/formats/test_sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tabulator import Stream, exceptions 9 | 10 | 11 | # Read 12 | 13 | def test_stream_format_sql(database_url): 14 | with Stream(database_url, table='data') as stream: 15 | assert stream.read() == [[1, 'english'], [2, '中国人']] 16 | 17 | 18 | def test_stream_format_sql_order_by(database_url): 19 | with Stream(database_url, table='data', order_by='id') as stream: 20 | assert stream.read() == [[1, 'english'], [2, '中国人']] 21 | 22 | 23 | def test_stream_format_sql_order_by_desc(database_url): 24 | with Stream(database_url, table='data', order_by='id desc') as stream: 25 | assert stream.read() == 
[[2, '中国人'], [1, 'english']] 26 | 27 | 28 | def test_stream_format_sql_table_is_required_error(database_url): 29 | with pytest.raises(exceptions.TabulatorException) as excinfo: 30 | Stream(database_url).open() 31 | assert 'table' in str(excinfo.value) 32 | 33 | 34 | def test_stream_format_sql_headers(database_url): 35 | with Stream(database_url, table='data', headers=1) as stream: 36 | assert stream.headers == ['id', 'name'] 37 | assert stream.read() == [[1, 'english'], [2, '中国人']] 38 | 39 | 40 | # Write 41 | 42 | def test_stream_save_sqlite(database_url): 43 | source = 'data/table.csv' 44 | with Stream(source, headers=1) as stream: 45 | assert stream.save(database_url, table='test_stream_save_sqlite') == 2 46 | with Stream(database_url, table='test_stream_save_sqlite', order_by='id', headers=1) as stream: 47 | assert stream.read() == [['1', 'english'], ['2', '中国人']] 48 | assert stream.headers == ['id', 'name'] 49 | -------------------------------------------------------------------------------- /tests/formats/test_tsv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from mock import Mock 9 | from tabulator import Stream 10 | from tabulator.parsers.tsv import TSVParser 11 | 12 | 13 | # Read 14 | 15 | def test_stream_format_tsv(): 16 | with Stream('data/table.tsv') as stream: 17 | assert stream.headers is None 18 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人'], ['3', None]] 19 | 20 | 21 | # Internal 22 | 23 | def test_parser_tsv(): 24 | 25 | source = 'data/table.tsv' 26 | encoding = None 27 | loader = Mock() 28 | loader.load = Mock(return_value=io.open(source)) 29 | parser = TSVParser(loader) 30 | 31 | assert parser.closed 32 | parser.open(source, encoding=encoding) 33 | assert not 
parser.closed 34 | 35 | assert list(parser.extended_rows) == [ 36 | (1, None, ['id', 'name']), 37 | (2, None, ['1', 'english']), 38 | (3, None, ['2', '中国人']), 39 | (4, None, ['3', None])] 40 | 41 | assert len(list(parser.extended_rows)) == 0 42 | parser.reset() 43 | assert len(list(parser.extended_rows)) == 4 44 | 45 | parser.close() 46 | assert parser.closed 47 | -------------------------------------------------------------------------------- /tests/formats/test_xls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from datetime import datetime 10 | from mock import Mock 11 | from tabulator import parsers 12 | from tabulator import Stream, exceptions 13 | from tabulator.parsers.xls import XLSParser 14 | BASE_URL = 'https://raw.githubusercontent.com/okfn/tabulator-py/master/%s' 15 | 16 | 17 | # Read 18 | 19 | def test_stream_local_xls(): 20 | with Stream('data/table.xls') as stream: 21 | assert stream.headers is None 22 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 23 | 24 | 25 | @pytest.mark.remote 26 | def test_stream_remote_xls(): 27 | with Stream(BASE_URL % 'data/table.xls') as stream: 28 | assert stream.headers is None 29 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 30 | 31 | 32 | def test_stream_xls_sheet_by_index(): 33 | source = 'data/special/sheet2.xls' 34 | with Stream(source, sheet=2) as stream: 35 | assert stream.fragment == 'Sheet2' 36 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 37 | 38 | 39 | def test_stream_xls_sheet_by_index_not_existent(): 40 | source = 'data/special/sheet2.xls' 41 | with pytest.raises(exceptions.SourceError) as excinfo: 42 | Stream(source, sheet=3).open() 43 | assert 'sheet "3"' in 
str(excinfo.value) 44 | 45 | 46 | def test_stream_xls_sheet_by_name(): 47 | source = 'data/special/sheet2.xls' 48 | with Stream(source, sheet='Sheet2') as stream: 49 | assert stream.fragment == 'Sheet2' 50 | assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']] 51 | 52 | 53 | def test_stream_xls_sheet_by_name_not_existent(): 54 | source = 'data/special/sheet2.xls' 55 | with pytest.raises(exceptions.SourceError) as excinfo: 56 | Stream(source, sheet='not-existent').open() 57 | assert 'sheet "not-existent"' in str(excinfo.value) 58 | 59 | 60 | def test_stream_xlsx_merged_cells(): 61 | source = 'data/special/merged-cells.xls' 62 | with Stream(source) as stream: 63 | assert stream.read() == [['data', ''], ['', ''], ['', '']] 64 | 65 | 66 | def test_stream_xlsx_merged_cells_fill(): 67 | source = 'data/special/merged-cells.xls' 68 | with Stream(source, fill_merged_cells=True) as stream: 69 | assert stream.read() == [['data', 'data'], ['data', 'data'], ['data', 'data']] 70 | 71 | 72 | def test_stream_xls_with_boolean(): 73 | with Stream('data/special/table-with-booleans.xls') as stream: 74 | assert stream.headers is None 75 | assert stream.read() == [['id', 'boolean'], [1, True], [2, False]] 76 | 77 | 78 | def test_stream_xlsx_merged_cells_boolean(): 79 | source = 'data/special/merged-cells-boolean.xls' 80 | with Stream(source) as stream: 81 | assert stream.read() == [[True, ''], ['', ''], ['', '']] 82 | 83 | 84 | def test_stream_xlsx_merged_cells_fill_boolean(): 85 | source = 'data/special/merged-cells-boolean.xls' 86 | with Stream(source, fill_merged_cells=True) as stream: 87 | assert stream.read() == [[True, True], [True, True], [True, True]] 88 | 89 | 90 | def test_stream_xls_with_ints_floats_dates(): 91 | source = 'data/special/table-with-ints-floats-dates.xls' 92 | with Stream(source) as stream: 93 | assert stream.read() == [['Int', 'Float', 'Date'], 94 | [2013, 3.3, datetime(2009, 8, 16)], 95 | [1997, 5.6, datetime(2009, 9, 20)], 96 | [1969, 11.7, 
datetime(2012, 8, 23)]] 97 | 98 | @pytest.mark.skip 99 | @pytest.mark.remote 100 | def test_fix_for_2007_xls(): 101 | source = 'https://ams3.digitaloceanspaces.com/budgetkey-files/spending-reports/2018-3-משרד התרבות והספורט-לשכת הפרסום הממשלתית-2018-10-22-c457.xls' 102 | with Stream(source) as stream: 103 | assert len(stream.read()) > 10 104 | 105 | # Internal 106 | 107 | def test_parser_xls(): 108 | 109 | source = 'data/table.xls' 110 | encoding = None 111 | loader = Mock() 112 | loader.load = Mock(return_value=io.open(source, 'rb')) 113 | parser = XLSParser(loader) 114 | 115 | assert parser.closed 116 | parser.open(source, encoding=encoding) 117 | assert not parser.closed 118 | 119 | assert list(parser.extended_rows) == [ 120 | (1, None, ['id', 'name']), 121 | (2, None, [1, 'english']), 122 | (3, None, [2, '中国人'])] 123 | 124 | assert len(list(parser.extended_rows)) == 0 125 | parser.reset() 126 | assert len(list(parser.extended_rows)) == 3 127 | 128 | parser.close() 129 | assert parser.closed 130 | -------------------------------------------------------------------------------- /tests/formats/test_xlsx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from mock import Mock 10 | from tabulator import Stream, exceptions 11 | from tabulator.parsers.xlsx import XLSXParser 12 | 13 | BASE_URL = "https://raw.githubusercontent.com/frictionlessdata/tabulator-py/master/%s" 14 | 15 | 16 | # Read 17 | 18 | 19 | def test_stream_xlsx_stream(): 20 | source = io.open("data/table.xlsx", mode="rb") 21 | with Stream(source, format="xlsx") as stream: 22 | assert stream.headers is None 23 | assert stream.read() == [["id", "name"], [1.0, "english"], [2.0, "中国人"]] 24 | 25 | 26 | @pytest.mark.remote 27 | def 
def test_stream_xlsx_sheet_by_index():
    path = 'data/special/sheet2.xlsx'
    # Sheets are addressed with a 1-based index.
    with Stream(path, sheet=2) as stream:
        assert stream.fragment == 'Sheet2'
        assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']]


def test_stream_xlsx_sheet_by_index_not_existent():
    path = 'data/special/sheet2.xlsx'
    with pytest.raises(exceptions.SourceError) as excinfo:
        Stream(path, sheet=3).open()
    assert 'sheet "3"' in str(excinfo.value)


def test_stream_xlsx_sheet_by_name():
    path = 'data/special/sheet2.xlsx'
    with Stream(path, sheet='Sheet2') as stream:
        assert stream.fragment == 'Sheet2'
        assert stream.read() == [['id', 'name'], [1, 'english'], [2, '中国人']]


def test_stream_xlsx_sheet_by_name_not_existent():
    path = 'data/special/sheet2.xlsx'
    with pytest.raises(exceptions.SourceError) as excinfo:
        Stream(path, sheet='not-existent').open()
    assert 'sheet "not-existent"' in str(excinfo.value)


def test_stream_xlsx_merged_cells():
    # Without fill_merged_cells only the top-left cell of a merged
    # range keeps its value.
    with Stream('data/special/merged-cells.xlsx') as stream:
        assert stream.read() == [['data', None]]


def test_stream_xlsx_merged_cells_fill():
    with Stream('data/special/merged-cells.xlsx', fill_merged_cells=True) as stream:
        assert stream.read() == [['data', 'data'], ['data', 'data'], ['data', 'data']]


def test_stream_xlsx_adjust_floating_point_error():
    path = 'data/special/adjust_floating_point_error.xlsx'
    options = dict(headers=1, ignore_blank_headers=True, preserve_formatting=True)
    # Without adjustment the raw IEEE-754 representation leaks through.
    with Stream(path, **options) as stream:
        assert stream.read(keyed=True)[1]['actual PO4 (values)'] == 274.65999999999997
    # With adjustment the value is rounded back to its display form.
    with Stream(path, adjust_floating_point_error=True, **options) as stream:
        assert stream.read(keyed=True)[1]['actual PO4 (values)'] == 274.66


def test_stream_xlsx_preserve_formatting():
    path = 'data/special/preserve-formatting.xlsx'
    with Stream(
        path, headers=1, ignore_blank_headers=True, preserve_formatting=True
    ) as stream:
        expected = {
            # general
            'empty': None,
            # numeric
            '0': '1001',
            '0.00': '1000.56',
            '0.0000': '1000.5577',
            '0.00000': '1000.55770',
            '0.0000#': '1000.5577',
            # temporal
            'm/d/yy': '5/20/40',
            'd-mmm': '20-May',
            'mm/dd/yy': '05/20/40',
            'mmddyy': '052040',
            'mmddyyam/pmdd': '052040AM20',
            'at_symbol': '259.153',
        }
        assert stream.read(keyed=True) == [expected]


def test_stream_xlsx_preserve_formatting_percentage():
    with Stream(
        'data/special/preserve-formatting-percentage.xlsx',
        headers=1,
        preserve_formatting=True,
    ) as stream:
        assert stream.read() == [[123, '52.00%'], [456, '30.00%'], [789, '6.00%']]


def test_stream_xlsx_preserve_formatting_number_multicode():
    with Stream(
        'data/special/number_format_multicode.xlsx',
        headers=1,
        ignore_blank_headers=True,
        preserve_formatting=True,
    ) as stream:
        assert stream.read() == [['4.5'], ['-9.032'], ['15.8']]


def test_stream_xlsx_scientific_notation():
    with Stream(
        'data/special/test_scientific_notation.xlsx',
        headers=1,
        preserve_formatting=True,
    ) as stream:
        assert stream.read() == [['4.273E-07']]


def test_stream_xlsx_workbook_cache():
    cache = {}
    url = BASE_URL % 'data/special/sheets.xlsx'
    for name in ['Sheet1', 'Sheet2', 'Sheet3']:
        # The workbook is downloaded once and reused for every sheet:
        # the cache holds a single entry no matter how many sheets open.
        with Stream(url, sheet=name, workbook_cache=cache) as stream:
            assert len(cache) == 1
            assert stream.read()


# Write


def test_stream_save_xlsx(tmpdir):
    target = str(tmpdir.join('table.xlsx'))
    with Stream('data/table.csv', headers=1) as stream:
        assert stream.save(target) == 2
    # Re-open the saved workbook and verify its contents.
    with Stream(target, headers=1) as stream:
        assert stream.headers == ['id', 'name']
        assert stream.read(extended=True) == [
            (2, ['id', 'name'], ['1', 'english']),
            (3, ['id', 'name'], ['2', '中国人']),
        ]


def test_stream_save_xlsx_sheet_name(tmpdir):
    target = str(tmpdir.join('table.xlsx'))
    with Stream('data/table.csv', headers=1) as stream:
        assert stream.save(target, sheet='my-data') == 2
    # The custom sheet name must round-trip through save/open.
    with Stream(target, headers=1, sheet='my-data') as stream:
        assert stream.headers == ['id', 'name']
        assert stream.read(extended=True) == [
            (2, ['id', 'name'], ['1', 'english']),
            (3, ['id', 'name'], ['2', '中国人']),
        ]


# Internal


def test_parser_xlsx():
    path = 'data/table.xlsx'
    loader = Mock()
    loader.load = Mock(return_value=io.open(path, 'rb'))
    parser = XLSXParser(loader)

    # The parser starts closed and opens on demand.
    assert parser.closed
    parser.open(path, encoding=None)
    assert not parser.closed

    assert list(parser.extended_rows) == [
        (1, None, ['id', 'name']),
        (2, None, [1.0, 'english']),
        (3, None, [2.0, '中国人']),
    ]

    # The row iterator is exhausted until the parser is reset.
    assert list(parser.extended_rows) == []
    parser.reset()
    assert len(list(parser.extended_rows)) == 3

    parser.close()
    assert parser.closed
https://raw.githubusercontent.com/frictionlessdata/tabulator-py/aecfa88d7ae8c633b1f8b691d061c5e9609ad95d/tests/schemes/__init__.py -------------------------------------------------------------------------------- /tests/schemes/test_aws.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import six 9 | import sys 10 | import boto3 11 | import pytest 12 | import string 13 | import random 14 | import subprocess 15 | from moto import mock_s3 16 | from tabulator import Stream, exceptions 17 | 18 | # Setup 19 | 20 | S3_ENDPOINT_URL = os.environ['S3_ENDPOINT_URL'] = 'http://localhost:5000' 21 | 22 | 23 | # Read 24 | 25 | # https://github.com/frictionlessdata/tabulator-py/issues/271 26 | @pytest.mark.skip 27 | def test_stream_s3(s3_client, bucket): 28 | 29 | # Upload a file 30 | s3_client.put_object( 31 | ACL='private', 32 | Body=open('data/table.csv', 'rb'), 33 | Bucket=bucket, 34 | ContentType='text/csv', 35 | Key='table.csv') 36 | 37 | # Check the file 38 | with Stream('s3://%s/table.csv' % bucket) as stream: 39 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 40 | 41 | 42 | # https://github.com/frictionlessdata/tabulator-py/issues/271 43 | @pytest.mark.skip 44 | def test_stream_s3_endpoint_url(s3_client, bucket): 45 | 46 | # Upload a file 47 | s3_client.put_object( 48 | ACL='private', 49 | Body=open('data/table.csv', 'rb'), 50 | Bucket=bucket, 51 | ContentType='text/csv', 52 | Key='table.csv') 53 | 54 | # Check the file 55 | with Stream('s3://%s/table.csv' % bucket, s3_endpoint_url=S3_ENDPOINT_URL) as stream: 56 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 57 | 58 | 59 | # https://github.com/frictionlessdata/tabulator-py/issues/271 60 | @pytest.mark.skip 61 | def 
test_stream_s3_non_existent_file(s3_client, bucket): 62 | with pytest.raises(exceptions.IOError): 63 | Stream('s3://%s/table.csv' % bucket).open() 64 | 65 | 66 | # Fixtures 67 | 68 | @pytest.fixture(scope='module') 69 | def s3_client(): 70 | subprocess.Popen('moto_server', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 71 | s3_client = boto3.client('s3', endpoint_url=S3_ENDPOINT_URL) 72 | yield s3_client 73 | os.system('pkill moto_server') 74 | 75 | 76 | @pytest.fixture 77 | def bucket(s3_client): 78 | bucket = 'bucket_%s' % ''.join(random.choice(string.digits) for _ in range(16)) 79 | s3_client.create_bucket(Bucket=bucket, ACL='public-read') 80 | return bucket 81 | -------------------------------------------------------------------------------- /tests/schemes/test_local.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import sys 8 | import pytest 9 | from tabulator import Stream 10 | from importlib import import_module 11 | from tabulator.loaders.local import LocalLoader 12 | 13 | 14 | # Read 15 | 16 | def test_stream_file(): 17 | with Stream('data/table.csv') as stream: 18 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 19 | 20 | 21 | @pytest.mark.skipif(sys.version_info < (3, 4), reason='not supported') 22 | def test_stream_file_pathlib_path(): 23 | pathlib = import_module('pathlib') 24 | with Stream(pathlib.Path('data/table.csv')) as stream: 25 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 26 | 27 | 28 | # Internal 29 | 30 | def test_loader_local_t(): 31 | loader = LocalLoader() 32 | chars = loader.load('data/table.csv', encoding='utf-8') 33 | assert chars.read() == 'id,name\n1,english\n2,中国人\n' 34 | 35 | 36 | def test_loader_local_b(): 37 | spec = 
'中国人'.encode('utf-8') 38 | loader = LocalLoader() 39 | chars = loader.load('data/table.csv', mode='b', encoding='utf-8') 40 | assert chars.read() == b'id,name\n1,english\n2,' + spec + b'\n' 41 | -------------------------------------------------------------------------------- /tests/schemes/test_remote.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tabulator import Stream 9 | from tabulator.loaders.remote import RemoteLoader 10 | from tabulator.exceptions import HTTPError 11 | from time import time 12 | 13 | BASE_URL = 'https://raw.githubusercontent.com/frictionlessdata/tabulator-py/master/%s' 14 | 15 | 16 | # Read 17 | 18 | @pytest.mark.remote 19 | def test_stream_https(): 20 | with Stream(BASE_URL % 'data/table.csv') as stream: 21 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 22 | 23 | 24 | @pytest.mark.remote 25 | def test_stream_https_latin1(): 26 | # Github returns wrong encoding `utf-8` 27 | with Stream(BASE_URL % 'data/special/latin1.csv') as stream: 28 | assert stream.read() 29 | 30 | 31 | # Internal 32 | 33 | @pytest.mark.remote 34 | def test_loader_remote_t(): 35 | loader = RemoteLoader() 36 | chars = loader.load(BASE_URL % 'data/table.csv', encoding='utf-8') 37 | assert chars.read() == 'id,name\n1,english\n2,中国人\n' 38 | 39 | 40 | @pytest.mark.remote 41 | def test_loader_remote_b(): 42 | spec = '中国人'.encode('utf-8') 43 | loader = RemoteLoader() 44 | chars = loader.load(BASE_URL % 'data/table.csv', mode='b', encoding='utf-8') 45 | assert chars.read() == b'id,name\n1,english\n2,' + spec + b'\n' 46 | 47 | 48 | @pytest.mark.skip 49 | @pytest.mark.remote 50 | def test_loader_no_timeout(): 51 | loader = RemoteLoader() 52 | t = time() 53 | chars = 
loader.load('https://httpstat.us/200?sleep=5000', mode='b', encoding='utf-8') 54 | assert time() - t > 5 55 | assert chars.read() == b'200 OK' 56 | t = time() 57 | 58 | 59 | @pytest.mark.remote 60 | def test_loader_has_timeout(): 61 | loader = RemoteLoader(http_timeout=1) 62 | t = time() 63 | with pytest.raises(HTTPError): 64 | chars = loader.load('https://httpstat.us/200?sleep=5000', mode='b', encoding='utf-8') 65 | assert time() - t < 5 66 | assert time() - t > 1 67 | -------------------------------------------------------------------------------- /tests/schemes/test_stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | from tabulator import Stream 9 | 10 | 11 | # Read 12 | 13 | def test_stream_stream(): 14 | source = io.open('data/table.csv', mode='rb') 15 | with Stream(source, format='csv') as stream: 16 | assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] 17 | -------------------------------------------------------------------------------- /tests/schemes/test_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from tabulator import Stream 8 | from tabulator.loaders.text import TextLoader 9 | 10 | 11 | # Read 12 | 13 | def test_stream_text(): 14 | source = 'text://value1,value2\nvalue3,value4' 15 | with Stream(source, format='csv') as stream: 16 | assert stream.read() == [['value1', 'value2'], ['value3', 'value4']] 17 | 18 | 19 | # Internal 20 | 21 | def test_load_t(): 22 | loader = TextLoader() 23 | chars = loader.load('id,name\n1,english\n2,中国人\n', 
encoding='utf-8') 24 | assert chars.read() == 'id,name\n1,english\n2,中国人\n' 25 | 26 | def test_load_b(): 27 | spec = '中国人'.encode('utf-8') 28 | loader = TextLoader() 29 | chars = loader.load('id,name\n1,english\n2,中国人\n', mode='b', encoding='utf-8') 30 | assert chars.read() == b'id,name\n1,english\n2,' + spec + b'\n' 31 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | # from __future__ import unicode_literals 6 | 7 | from click.testing import CliRunner 8 | from tabulator.cli import cli 9 | 10 | 11 | # Tests 12 | 13 | def test_cli(): 14 | runner = CliRunner() 15 | result = runner.invoke(cli, ['data/table.csv']) 16 | assert result.exit_code == 0 17 | assert result.output.startswith('id,name\n1,english\n2,') 18 | 19 | 20 | def test_cli_version(): 21 | runner = CliRunner() 22 | result = runner.invoke(cli, ['--version']) 23 | assert result.exit_code == 0 24 | assert len(result.output.split('.')) == 3 25 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from tabulator import helpers, config 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('source, scheme, format', [ 15 | ('text://path', 'text', None), 16 | ('stream://path', 'stream', None), 17 | ('file://path', 'file', None), 18 | ('ftp://path', 'ftp', None), 19 | ('ftps://path', 'ftps', None), 20 | ('http://path', 'http', None), 21 | ('https://path', 'https', 
None), 22 | ('xxx://path', 'xxx', None), 23 | ('xx://path', 'xx', None), 24 | ('XXX://path', 'xxx', None), 25 | ('XX://path', 'xx', None), 26 | ('c://path', 'file', None), 27 | ('c:\\path', 'file', None), 28 | (r'c:\path', 'file', None), 29 | ('http//path', 'file', None), 30 | ('path', 'file', None), 31 | ('path.CsV', 'file', 'csv'), 32 | ('http://someplace.com/foo/path.csv?foo=bar#baz', 'http', 'csv'), 33 | ('http://someplace.com/foo/path?foo=bar&format=csv#baz', 'http', 'csv'), 34 | ('https://docs.google.com/spreadsheets/d/X/edit?usp=sharing', None, 'gsheet'), 35 | ('https://docs.google.com/spreadsheets/d/X/export?format=csv&gid=0&single=true', 'https', 'csv'), 36 | ('https://docs.google.com/spreadsheets/d/X/pub?gid=0&single=true&output=csv', 'https', 'csv'), 37 | ]) 38 | def test_detect_scheme_and_format(source, scheme, format): 39 | assert helpers.detect_scheme_and_format(source) == (scheme, format) 40 | 41 | 42 | def test_detect_encoding(): 43 | with io.open('Makefile', 'rb') as fp: 44 | sample = fp.read(config.DEFAULT_BYTES_SAMPLE_SIZE) 45 | assert helpers.detect_encoding(sample) == 'utf-8' 46 | 47 | 48 | def test_detect_encoding_windows_1252(): 49 | sample = b'A\n' * 300 + b'\xff\xff' 50 | try: 51 | import cchardet 52 | assert helpers.detect_encoding(sample) == 'cp1252' 53 | except ImportError: 54 | assert helpers.detect_encoding(sample) == 'iso8859-1' 55 | 56 | 57 | def test_detect_encoding_utf_16_be(): 58 | sample = u'\uFEFFthen some text'.encode('utf-16-be') 59 | assert helpers.detect_encoding(sample) == 'utf-16' 60 | 61 | 62 | def test_detect_encoding_utf_16_le(): 63 | sample = u'\uFEFFthen some text'.encode('utf-16-le') 64 | assert helpers.detect_encoding(sample) == 'utf-16' 65 | 66 | 67 | def test_detect_encoding_unknown(): 68 | sample = b'\xff\x81' 69 | assert helpers.detect_encoding(sample) == 'utf-8' 70 | 71 | 72 | def test_reset_stream_seekable(): 73 | file = io.open(__file__) 74 | file.seek(1) 75 | assert file.tell() == 1 76 | 
helpers.reset_stream(file) 77 | assert file.tell() == 0 78 | 79 | 80 | def test_reset_stream_not_seekable(): 81 | with pytest.raises(Exception): 82 | helpers.reset_stream('not_seekable') 83 | 84 | 85 | def test_requote_uri(): 86 | url = 'http://next.openspending.org/fdp-adapter/convert?url=https%3A%2F%2Fraw.githubusercontent.com%2Fkravets-levko%2Fdata%2Fmaster%2Ftest.xlsx.csv' 87 | url1 = 'http://data.defra.gov.uk/ops/government_procurement_card/over_£500_GPC_apr_2013.csv' 88 | url2 = 'http://data.defra.gov.uk/ops/government_procurement_card/over_%C2%A3500_GPC_apr_2013.csv' 89 | assert helpers.requote_uri(url) == url 90 | assert helpers.requote_uri(url1) == url2 91 | 92 | 93 | def test_import_attribute(): 94 | assert helpers.import_attribute('tabulator.helpers') == helpers 95 | 96 | 97 | def test_import_attribute_import_error(): 98 | with pytest.raises((ImportError, AttributeError)): 99 | helpers.import_attribute('tabulator.bad_name') 100 | 101 | 102 | def test_extract_options(): 103 | names = ['opt1', 'opt2'] 104 | options = {'opt1': 1, 'opt2': 2, 'opt3': 3} 105 | extracted_options = helpers.extract_options(options, names) 106 | assert options == {'opt3': 3} 107 | assert extracted_options == {'opt1': 1, 'opt2': 2} 108 | 109 | 110 | @pytest.mark.parametrize('sample', [ 111 | ('\n\n\t ', True), 112 | ('', True), 113 | ('col1,col2\nval1,', False), 114 | ('val1,', False), 115 | ]) 116 | def test_detect_html(sample): 117 | text, is_html = sample 118 | assert helpers.detect_html(text) is is_html 119 | 120 | 121 | def test_stringify_value(): 122 | sample = '\u4e9c'.encode('utf-8-sig').decode("utf-8") 123 | assert helpers.stringify_value(sample) == sample 124 | 125 | 126 | def test_stringify_value_none(): 127 | assert helpers.stringify_value(None) == '' 128 | 129 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import pytest 9 | from tabulator import validate, exceptions 10 | 11 | 12 | # Tests 13 | 14 | def test_validate_test_schemes(): 15 | # Supported 16 | assert validate('path.csv') 17 | assert validate('file://path.csv') 18 | assert validate('http://example.com/path.csv') 19 | assert validate('https://example.com/path.csv') 20 | assert validate('ftp://example.com/path.csv') 21 | assert validate('ftps://example.com/path.csv') 22 | assert validate('path.csv', scheme='file') 23 | # Not supported 24 | with pytest.raises(exceptions.SchemeError) as excinfo: 25 | validate('ssh://example.com/path.csv') 26 | with pytest.raises(exceptions.SchemeError) as excinfo: 27 | validate('bad://example.com/path.csv') 28 | 29 | 30 | def test_validate_test_formats(): 31 | # Supported 32 | assert validate('path.csv') 33 | assert validate('path.json') 34 | assert validate('path.jsonl') 35 | assert validate('path.ndjson') 36 | assert validate('path.tsv') 37 | assert validate('path.xls') 38 | assert validate('path.ods') 39 | assert validate('path.no-format', format='csv') 40 | # Not supported 41 | with pytest.raises(exceptions.FormatError) as excinfo: 42 | validate('path.txt') 43 | with pytest.raises(exceptions.FormatError) as excinfo: 44 | validate('path.bad') 45 | 46 | 47 | def test_validate_test_special(): 48 | # Gsheet 49 | assert validate('https://docs.google.com/spreadsheets/d/id', format='csv') 50 | # File-like 51 | assert validate(io.open('data/table.csv', encoding='utf-8'), format='csv') 52 | # Text 53 | assert validate('text://name,value\n1,2', format='csv') 54 | # Inline 55 | assert validate([{'name': 'value'}]) 56 | --------------------------------------------------------------------------------