├── dbs └── .gitkeep ├── csvapi ├── __init__.py ├── errors.py ├── security.py ├── uploadview.py ├── exportview.py ├── profiling.py ├── cli.py ├── webservice.py ├── type_tester.py ├── parser.py ├── parseview.py ├── utils.py └── tableview.py ├── tests ├── dbs │ └── .gitkeep ├── samples │ ├── test.xls │ ├── test.xlsx │ └── real_xls │ │ ├── file_example_XLS_50.xls │ │ └── tourisme-handicap-etablissements-21022020.xlsx └── test_api.py ├── setup.cfg ├── .circleci ├── images │ └── csvapi-circle │ │ └── Dockerfile └── config.yml ├── .gitignore ├── bumpr.rc ├── config.py ├── LICENSE ├── benchmark ├── bench.js ├── legacy │ ├── bench-apify.sh │ └── bench-parser.py └── bench.py ├── pyproject.toml ├── CHANGELOG.md ├── profiling-minimal.yml └── README.md /dbs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /csvapi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/dbs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/samples/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/test.xls -------------------------------------------------------------------------------- /tests/samples/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/test.xlsx -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | 4 | [tool:pytest] 5 | filterwarnings = 6 | ignore::DeprecationWarning 7 | -------------------------------------------------------------------------------- /.circleci/images/csvapi-circle/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cimg/python:3.9 2 | 3 | RUN sudo apt-get update && sudo apt-get install -y file 4 | -------------------------------------------------------------------------------- /tests/samples/real_xls/file_example_XLS_50.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/real_xls/file_example_XLS_50.xls -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dbs/*.db 2 | profiles/*.html 3 | *.egg-info/ 4 | *.pyc 5 | build/ 6 | dist/ 7 | reports/ 8 | .vscode/ 9 | .pytest_cache/ 10 | -------------------------------------------------------------------------------- /tests/samples/real_xls/tourisme-handicap-etablissements-21022020.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/real_xls/tourisme-handicap-etablissements-21022020.xlsx -------------------------------------------------------------------------------- /csvapi/errors.py: -------------------------------------------------------------------------------- 1 | class APIError(Exception): 2 | status = 500 3 | 4 | 
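# Raised by the views (e.g. APIError('Missing file.', status=400)) and turned into a JSON
# error response by the APIError handler in webservice.py, which reads `status` and to_dict().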
def __init__(self, message, status=None, payload=None): 5 | super().__init__(message) 6 | self.message = message 7 | if status is not None: 8 | self.status = status 9 | self.payload = payload 10 | 11 | def to_dict(self): 12 | rv = dict(self.payload or ()) 13 | rv['error'] = self.message 14 | rv['ok'] = False 15 | return rv 16 | -------------------------------------------------------------------------------- /bumpr.rc: -------------------------------------------------------------------------------- 1 | [bumpr] 2 | file = pyproject.toml 3 | regex = version\s*=\s*"(?P.+?)" 4 | vcs = git 5 | commit = true 6 | tag = true 7 | tag_format = v{version} 8 | push = true 9 | clean = rm -rf *egg-info build dist 10 | tests = poetry run pytest tests 11 | publish = poetry build 12 | files = 13 | README.md 14 | 15 | [bump] 16 | unsuffix = true 17 | 18 | [prepare] 19 | part = patch 20 | suffix = dev 21 | 22 | [changelog] 23 | file = CHANGELOG.md 24 | bump = ## {version} ({date:%Y-%m-%d}) 25 | prepare = ## Current (in progress) 26 | separator = 27 | -------------------------------------------------------------------------------- /csvapi/security.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from quart import current_app as app, request, jsonify 4 | 5 | 6 | def filter_referrers(): 7 | filters = app.config.get('REFERRERS_FILTER') 8 | if not filters: 9 | return None 10 | referrer = request.referrer 11 | if referrer: 12 | parsed = urlparse(referrer) 13 | for filter in filters: 14 | if parsed.hostname.endswith(filter): 15 | return None 16 | return jsonify({ 17 | 'ok': False, 18 | 'error': 'Unauthorized', 19 | }), 403 20 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | DB_ROOT_DIR = './dbs' 2 | CSV_CACHE_ENABLED = True 3 | MAX_WORKERS = 3 4 | DEBUG = True 5 | SENTRY_DSN = None 6 | FORCE_SSL = False 7 | # In bytes, cf `sniff_limit` https://agate.readthedocs.io/en/1.6.1/api/table.html#agate.Table.from_csv 8 | CSV_SNIFF_LIMIT = 4096 * 2 9 | # In bytes, csvapi will stop downloading files if they reach this size 10 | # Default to 100 Mo 11 | MAX_FILE_SIZE = 1024 * 1024 * 100 12 | # Set this to an array of hosts to filter out calls by referer (403 returned if no match) 13 | # It will also match subdomains 14 | # e.g. REFERRERS_FILTER = ['data.gouv.fr'] will match 'demo.data.gouv.fr' 15 | REFERRERS_FILTER = None 16 | PANDAS_PROFILING_CONFIG_MIN = False 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Etalab 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /benchmark/bench.js: -------------------------------------------------------------------------------- 1 | import http from 'k6/http'; 2 | import { sleep } from 'k6'; 3 | import crypto from 'k6/crypto'; 4 | 5 | export const options = { 6 | vus: 10, 7 | iterations: 20, 8 | }; 9 | 10 | export default function () { 11 | var toParse = "https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c" 12 | var base = "https://csvapi.data.gouv.fr"; 13 | 14 | // change me to invalidate cache 15 | var rdm = "2"; 16 | let toApify = `${toParse}?ts=${rdm}` 17 | let hash = crypto.md5(toApify, 'hex'); 18 | console.log(hash); 19 | 20 | // apify 1 21 | var apify = `${base}/apify?url=${toApify}`; 22 | http.get(apify); 23 | 24 | // analyze 1 25 | var analyze = `${base}/apify?analysis=yes&url=${toApify}`; 26 | http.get(analyze); 27 | 28 | // make 10 requests 29 | for (let id = 1; id <= 10; id++) { 30 | http.get("https://csvapi.data.gouv.fr/api/26bdf0d090dfbaecbe213c6f551a46ac", { 31 | tags: { name: 'request' }, 32 | }); 33 | sleep(0.1); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "csvapi" 3 | version = "2.2.1.dev" 4 | description = "An instant JSON API for your CSV" 5 | authors = ["Opendatateam "] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9,<4" 10 | click_default_group = "~1.2.2" 11 | click = "~8.1.3" 12 | agate = "~1.6.3" 13 | agate-sql = "~0.5.8" 14 | aiohttp = "~3.8.1" 15 | validators = "~0.20.0" 16 | agate-excel = "~0.2.5" 17 | Quart = "~0.18.0" 18 | quart-cors = "~0.5.0" 19 | sentry-sdk = "~1.9.8" 20 | cchardet = "~2.1.7" 21 | python-stdnum = "~1.17" 22 | aiosqlite = "~0.17.0" 23 | pandas = "~1.4.4" 24 | pandas-profiling = "~3.2.0" 25 | requests = "~2.28.1" 26 | boto3 = "~1.24.66" 27 | csv-detective = "~0.4.6" 28 | 29 | [tool.poetry.dev-dependencies] 30 | aioresponses = "~0.7.3" 31 | pytest = "~7.1.3" 32 | pytest-asyncio = "~0.19.0" 33 | flake8 = "~5.0.4" 34 | pytest-cov = "~3.0.0" 35 | bumpr = "^0.3.8" 36 | 37 | [tool.poetry.scripts] 38 | csvapi = "csvapi.cli:cli" 39 | 40 | [build-system] 41 | requires = ["poetry-core>=1.0.0"] 42 | build-backend = "poetry.core.masonry.api" 43 | -------------------------------------------------------------------------------- /csvapi/uploadview.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tempfile import NamedTemporaryFile 3 | 4 | from quart import request, current_app as app, jsonify 5 | from quart.views import MethodView 6 | 7 | from csvapi.errors import APIError 8 | from csvapi.utils import get_hash_bytes, already_exists 9 | from csvapi.parser import parse 10 | 11 | 12 | class UploadView(MethodView): 13 | 14 | async def post(self): 15 | files = await request.files 16 | _file = files.get('file') or files.get('filepond') 17 | if not _file: 18 | 
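# No usable file part in the multipart form data: the endpoint accepts either a plain
# 'file' field or a 'filepond' field (presumably sent by a FilePond upload widget).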
raise APIError('Missing file.', status=400) 19 | content_hash = get_hash_bytes(_file.read()) 20 | _file.seek(0) 21 | if not already_exists(content_hash): 22 | storage = app.config['DB_ROOT_DIR'] 23 | sniff_limit = app.config.get('CSV_SNIFF_LIMIT') 24 | try: 25 | _tmpfile = NamedTemporaryFile(delete=False) 26 | _file.save(_tmpfile) 27 | _tmpfile.close() 28 | parse(_tmpfile.name, content_hash, storage, sniff_limit=sniff_limit) 29 | finally: 30 | os.unlink(_tmpfile.name) 31 | 32 | scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme 33 | return jsonify({ 34 | 'ok': True, 35 | 'endpoint': f"{scheme}://{request.host}/api/{content_hash}" 36 | }) 37 | -------------------------------------------------------------------------------- /csvapi/exportview.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sqlite3 3 | 4 | from io import StringIO 5 | from pathlib import Path 6 | 7 | from quart import make_response 8 | 9 | from csvapi.errors import APIError 10 | from csvapi.tableview import TableView 11 | from csvapi.utils import get_db_info 12 | 13 | 14 | class ExportView(TableView): 15 | 16 | async def get(self, urlhash): 17 | "This will inherit sorting and filtering from TableView" 18 | db_info = get_db_info(urlhash) 19 | p = Path(db_info['db_path']) 20 | if not p.exists(): 21 | raise APIError('Database has probably been removed.', status=404) 22 | 23 | try: 24 | columns, rows_iter = await self.data(db_info, export=True) 25 | except (sqlite3.OperationalError, sqlite3.IntegrityError) as e: 26 | raise APIError('Error selecting data', status=400, payload=dict(details=str(e))) 27 | 28 | def make_line(line_data): 29 | line = StringIO() 30 | writer = csv.writer(line) 31 | writer.writerow(line_data) 32 | line.seek(0) 33 | return line.read().encode() 34 | 35 | async def _make_response(): 36 | yield make_line(columns) 37 | for line in rows_iter: 38 | yield make_line(line) 39 | 40 | response = await make_response(_make_response()) 41 | response.mimetype = 'text/csv' 42 | response.headers['Content-Disposition'] = f'attachment; filename={urlhash}.csv' 43 | return response 44 | -------------------------------------------------------------------------------- /benchmark/legacy/bench-apify.sh: -------------------------------------------------------------------------------- 1 | wget http://localhost:8001/apify?url=http://datanova.legroupe.laposte.fr/explore/dataset/laposte_poincont2/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true & 2 | wget http://localhost:8001/apify?url=https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true & 3 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_10.xls 4 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_50.xls 5 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_100.xls 6 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_1000.xls 7 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_5000.xls 8 | wait 9 | rm apify\?url\=* 10 | 11 | 12 | # workers = 1, app.run : 0m31.249s 13 | # workers = 3, app.run : 0m18.498s 14 | # workers = 5, app.run : 0m34.485s 15 | # workers = 10, app.run : 0m40.397s 16 | 17 | # workers = 3, hypercorn 
w = 1 : 0m36.761s 18 | # workers = 3, hypercorn w = 3 : 0m16.607s 19 | # workers = 3, hypercorn w = 5 : 0m34.812s 20 | 21 | # workers = 1, hypercorn w = 1 : 0m40.030s 22 | # workers = 1, hypercorn w = 3 : 0m40.920s 23 | # workers = 1, hypercorn w = 5 : 0m42.833s 24 | 25 | # workers = 5, hypercorn w = 5 : 0m16.097s 26 | 27 | # no shared executor, app.run : 0m35.871s 28 | # no shared executor, hypercorn w = 3 : 0m15.767s 29 | -------------------------------------------------------------------------------- /csvapi/profiling.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | import sqlite3 5 | 6 | from pandas_profiling import ProfileReport 7 | 8 | from csvapi.errors import APIError 9 | from csvapi.utils import get_db_info 10 | 11 | import json 12 | 13 | 14 | class CSVAPIProfileReport: 15 | 16 | def get_dataframe(self, db_info): 17 | dsn = 'file:{}?immutable=1'.format(db_info['db_path']) 18 | conn = sqlite3.connect(dsn, uri=True) 19 | sql = 'SELECT * FROM [{}]'.format(db_info['table_name']) 20 | df = pd.read_sql_query(sql, con=conn) 21 | return df 22 | 23 | async def get_minimal_profile(self, urlhash: str) -> dict: 24 | db_info = get_db_info(urlhash) 25 | p = Path(db_info['db_path']) 26 | if not p.exists(): 27 | raise APIError('Database has probably been removed or does not exist yet.', status=404) 28 | 29 | try: 30 | df = self.get_dataframe(db_info) 31 | profile = ProfileReport( 32 | df, minimal=True, 33 | vars=dict(num={"low_categorical_threshold": 0}), 34 | plot=dict(histogram={"bins": 10}), 35 | # this disables the ThreadPoolExecutor in pandas-profiling 36 | # remove it or set it to 0 to use the number of CPUs a pool size 37 | pool_size=1, 38 | progress_bar=False, 39 | ) 40 | profile_report = json.loads(profile.to_json()) 41 | return profile_report 42 | except (sqlite3.OperationalError, sqlite3.IntegrityError) as e: 43 | raise APIError('Error selecting data', status=400, payload=dict(details=str(e))) 44 | -------------------------------------------------------------------------------- /csvapi/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import ssl 3 | from click_default_group import DefaultGroup 4 | 5 | from csvapi.webservice import app 6 | 7 | RESPONSE_TIMEOUT = 5 * 60 # in seconds 8 | 9 | 10 | @click.group(cls=DefaultGroup, default='serve', default_if_no_args=True) 11 | @click.version_option() 12 | def cli(): 13 | """ 14 | csvapi! 
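Instantly publish a JSON API for a remote CSV or Excel file, backed by SQLite.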
15 | """ 16 | 17 | 18 | @click.option('--dbs', default='./dbs', 19 | type=click.Path(exists=True, file_okay=False), 20 | help='Where to store sqlite DBs') 21 | @click.option('-h', '--host', default='127.0.0.1', 22 | help='host for server, defaults to 127.0.0.1') 23 | @click.option('-p', '--port', default=8001, 24 | help='port for server, defaults to 8001') 25 | @click.option('--debug', is_flag=True, 26 | help='Enable debug mode - useful for development') 27 | @click.option('--reload', is_flag=True, 28 | help='Automatically reload if code change detected') 29 | @click.option('--cache/--no-cache', default=True, 30 | help='Do not parse CSV again if DB already exists') 31 | @click.option('--ssl-cert', default=None, 32 | help='Path to SSL certificate') 33 | @click.option('--ssl-key', default=None, 34 | help='Path to SSL key') 35 | @cli.command() 36 | def serve(dbs, host, port, debug, reload, cache, ssl_cert, ssl_key): 37 | ssl_context = None 38 | if ssl_cert and ssl_key: 39 | ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) 40 | ssl_context.load_cert_chain(certfile=ssl_cert, keyfile=ssl_key) 41 | app.config.update({ 42 | 'DB_ROOT_DIR': dbs, 43 | 'CSV_CACHE_ENABLED': cache, 44 | 'DEBUG': debug, 45 | 'RESPONSE_TIMEOUT': RESPONSE_TIMEOUT, 46 | }) 47 | app.run(host=host, port=port, debug=debug, use_reloader=reload, ssl=ssl_context) 48 | -------------------------------------------------------------------------------- /benchmark/legacy/bench-parser.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import warnings 3 | from pathlib import Path 4 | 5 | import click 6 | 7 | from csvapi.parser import detect_encoding 8 | from csvapi.parser import from_csv 9 | from csvapi.parser import detect_type, CSV_FILETYPES 10 | 11 | SNIFF_LIMIT = 4096 12 | FILES_DIR = "/Users/alexandre/Developer/Etalab/decapode/data/downloaded" 13 | 14 | 15 | @click.command() 16 | @click.option('--filename', prompt='Error file name') 17 | def run(filename): 18 | filename = filename if filename.endswith(".csv") else f"{filename}.csv" 19 | parsed = 0 20 | warning = 0 21 | not_csv = 0 22 | errors = [] 23 | 24 | for filepath in Path(FILES_DIR).glob("*.csv"): 25 | parsed += 1 26 | file_type = detect_type(filepath) 27 | if not any([supported in file_type for supported in CSV_FILETYPES]): 28 | # print(f"Not a CSV through magic number", file_type.strip()) 29 | not_csv += 1 30 | continue 31 | encoding = detect_encoding(filepath) 32 | try: 33 | with warnings.catch_warnings(record=True) as w: 34 | table = from_csv(filepath, encoding=encoding, sniff_limit=SNIFF_LIMIT) 35 | if any(["Column" in _w.message.__str__() for _w in w]): 36 | warning += 1 37 | except Exception as e: 38 | print('-----', filepath) 39 | print("ERROR", e) 40 | errors.append({ 41 | "filepath": filepath, 42 | "error": e.__str__() 43 | }) 44 | 45 | print(f"Errors: {len(errors)}/{parsed} ({round(len(errors) / parsed * 100, 2)}%)") 46 | print(f"Column warnings: {warning}/{parsed} ({round(warning / parsed * 100, 2)}%)") 47 | print(f"Not CSV (magic): {not_csv}/{parsed} ({round(not_csv / parsed * 100, 2)}%)") 48 | 49 | if not errors: 50 | return 51 | 52 | with open(filename, 'w') as ofile: 53 | writer = csv.DictWriter(ofile, fieldnames=errors[0].keys()) 54 | writer.writeheader() 55 | writer.writerows(errors) 56 | 57 | if __name__ == '__main__': 58 | run() 59 | -------------------------------------------------------------------------------- /csvapi/webservice.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | from quart import Quart, jsonify 5 | from quart_cors import cors 6 | from werkzeug.exceptions import NotFound 7 | 8 | from csvapi.errors import APIError 9 | from csvapi.tableview import TableView 10 | from csvapi.exportview import ExportView 11 | from csvapi.uploadview import UploadView 12 | from csvapi.parseview import ParseView 13 | from csvapi.security import filter_referrers 14 | 15 | app = Quart(__name__) 16 | app = cors(app, allow_origin='*') 17 | 18 | app.add_url_rule('/api/', view_func=TableView.as_view('table')) 19 | app.add_url_rule('/api//export', view_func=ExportView.as_view('export')) 20 | app.add_url_rule('/apify', view_func=ParseView.as_view('parse')) 21 | app.add_url_rule('/upload', view_func=UploadView.as_view('upload')) 22 | app.before_request(filter_referrers) 23 | 24 | 25 | conffile = os.environ.get('CSVAPI_CONFIG_FILE') or '../config.py' 26 | app.config.from_pyfile(conffile) 27 | 28 | 29 | def handle_and_print_error(error): 30 | sentry_id = None 31 | if app.config.get('SENTRY_DSN'): 32 | import sentry_sdk 33 | with sentry_sdk.push_scope() as scope: 34 | sentry_sdk.init( 35 | app.config['SENTRY_DSN'], 36 | traces_sample_rate=1.0 37 | ) 38 | scope.set_extra('debug', False) 39 | from sentry_sdk import capture_exception 40 | sentry_id = capture_exception(error) 41 | traceback.print_exc() 42 | return sentry_id 43 | 44 | 45 | @app.errorhandler(NotFound) 46 | def handle_not_found(error): 47 | response = jsonify({ 48 | 'ok': False, 49 | 'error': 'Not found', 50 | }) 51 | response.status_code = 404 52 | return response 53 | 54 | 55 | @app.errorhandler(APIError) 56 | def handle_api_error(error): 57 | error_id = handle_and_print_error(error) 58 | data = error.to_dict() 59 | app.logger.error(f"{data.get('error')}: {data.get('details', '')}") 60 | data['error_id'] = error_id 61 | response = jsonify(data) 62 | response.status_code = error.status 63 | return response 64 | 65 | 66 | @app.errorhandler(Exception) 67 | def handle_exceptions(error): 68 | """Serialize all errors to API""" 69 | error_id = handle_and_print_error(error) 70 | response = jsonify(error=str(error), error_id=error_id, ok=False) 71 | return response, 500 72 | -------------------------------------------------------------------------------- /csvapi/type_tester.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from agate.data_types.base import DataType 4 | from agate.data_types.boolean import Boolean 5 | from agate.data_types.date import Date 6 | from agate.data_types.date_time import DateTime 7 | from agate.data_types.number import Number 8 | from agate.data_types.text import Text 9 | from agate.data_types.time_delta import TimeDelta 10 | from agate.exceptions import CastError 11 | from agate.type_tester import TypeTester 12 | 13 | from agatesql import table as agatesqltable 14 | 15 | from sqlalchemy.types import VARCHAR 16 | 17 | from stdnum.fr.siren import is_valid as is_valid_siren 18 | from stdnum.fr.siret import is_valid as is_valid_siret 19 | 20 | 21 | class Time(DataType): 22 | # Detect an hour minute string. 23 | # Examples: 12:20, 9:50, 23:30 24 | def __init__(self, **kwargs): 25 | super(Time, self).__init__(**kwargs) 26 | 27 | def cast(self, d): 28 | if d is None: 29 | return d 30 | if re.match(r"^(?:[01]\d|2[0-3]|\d):[0-5]\d$", str(d)): 31 | return Text().cast(d) 32 | raise CastError('Can not parse value "%s" as time.' 
% d) 33 | 34 | 35 | class SirenSiret(DataType): 36 | # Detect a SIREN or SIRET number 37 | def __init__(self): 38 | super(SirenSiret, self).__init__() 39 | 40 | def cast(self, d): 41 | if d is None: 42 | return d 43 | if is_valid_siret(d) or is_valid_siren(d): 44 | return Text().cast(d) 45 | raise CastError('Can not parse value "%s" as a SIREN or SIRET.' % d) 46 | 47 | 48 | # agatesql needs to know the SQL equivalent of a type. 49 | # Tell agatesql how our custom types should be converted in SQL. 50 | # 51 | # Reference: 52 | # https://github.com/wireservice/agate-sql/blob/7466073d81289323851c21817ea33170e36ce2a5/agatesql/table.py#L21-L28 53 | agatesqltable.SQL_TYPE_MAP[Time] = VARCHAR 54 | agatesqltable.SQL_TYPE_MAP[SirenSiret] = VARCHAR 55 | 56 | 57 | def agate_tester(): 58 | # Override the original list of type checkers present in agate 59 | # to detect types. 60 | # 61 | # Original list here: 62 | # https://github.com/wireservice/agate/blob/e3078dca8b3566e8408e65981f79918c2f36f9fe/agate/type_tester.py#L64-L71 63 | return TypeTester( 64 | types=[ 65 | Boolean(), 66 | SirenSiret(), 67 | Number(), 68 | Time(), 69 | TimeDelta(), 70 | Date(), 71 | DateTime(), 72 | Text(), 73 | ] 74 | ) 75 | -------------------------------------------------------------------------------- /csvapi/parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import agate 4 | import cchardet as chardet 5 | 6 | from csvapi.utils import get_db_info 7 | from csvapi.type_tester import agate_tester 8 | import logging 9 | 10 | logging.captureWarnings(True) 11 | logging.getLogger("py.warnings").setLevel(logging.ERROR) 12 | 13 | SNIFF_LIMIT = 4096 14 | CSV_FILETYPES = ('text/plain', 'application/csv', 'text/csv') 15 | 16 | 17 | def detect_type(filepath): 18 | with os.popen(f'file {filepath} -b --mime-type') as proc: 19 | return proc.read().lower() 20 | 21 | 22 | def detect_encoding(filepath): 23 | with open(filepath, 'rb') as f: 24 | return chardet.detect(f.read()).get('encoding') 25 | 26 | 27 | def from_csv(filepath, encoding='utf-8', sniff_limit=SNIFF_LIMIT): 28 | """Try first w/ sniffing and then w/o sniffing if it fails, 29 | and then again by forcing ';' delimiter w/o sniffing""" 30 | kwargs = { 31 | 'sniff_limit': sniff_limit, 32 | 'encoding': encoding, 33 | 'column_types': agate_tester() 34 | } 35 | 36 | with open(filepath, 'rb') as fp: 37 | if len(fp.readlines()) < 2: 38 | raise ValueError 39 | 40 | try: 41 | return agate.Table.from_csv(filepath, **kwargs) 42 | except ValueError: 43 | try: 44 | kwargs.pop('sniff_limit') 45 | return agate.Table.from_csv(filepath, **kwargs) 46 | except ValueError: 47 | kwargs['delimiter'] = ';' 48 | return agate.Table.from_csv(filepath, **kwargs) 49 | 50 | 51 | def from_excel(filepath, xlsx=False): 52 | # Function exists to prevent side-effects after monckey patching with import 53 | import agateexcel # noqa 54 | if xlsx: 55 | return agate.Table.from_xlsx(filepath, column_types=agate_tester()) 56 | return agate.Table.from_xls(filepath, column_types=agate_tester()) 57 | 58 | 59 | def to_sql(table, urlhash, storage): 60 | db_info = get_db_info(urlhash, storage=storage) 61 | table.to_sql(db_info['dsn'], db_info['db_name'], overwrite=True) 62 | 63 | 64 | def parse(filepath, urlhash, storage, encoding=None, sniff_limit=SNIFF_LIMIT): 65 | is_csv = False 66 | file_type = detect_type(filepath) 67 | if 'application/vnd.ms-excel' in file_type: 68 | table = from_excel(filepath) 69 | elif 'application/vnd.openxml' in file_type: 70 | table = 
from_excel(filepath, xlsx=True) 71 | elif any([supported in file_type for supported in CSV_FILETYPES]): 72 | encoding = detect_encoding(filepath) if not encoding else encoding 73 | table = from_csv(filepath, encoding=encoding, sniff_limit=sniff_limit) 74 | is_csv = True 75 | else: 76 | raise Exception(f'Unsupported file type {file_type}') 77 | to_sql(table, urlhash, storage) 78 | return is_csv 79 | -------------------------------------------------------------------------------- /benchmark/bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2022-11-03 3 | 4 | csvapi cli, no cache, pool_size=1, analysis=false 5 | -------------->Time execution : 17.989488124847412<-------------- 6 | -------------->Time execution : 24.22131085395813<-------------- 7 | -------------->Time execution : 21.933547019958496<-------------- 8 | 9 | csvapi cli, no cache, pool_size=1, analysis=yes 10 | -------------->Time execution : 28.727387189865112<-------------- 11 | -------------->Time execution : 27.748358964920044<-------------- 12 | -------------->Time execution : 22.15376091003418<-------------- 13 | 14 | csvapi cli, no cache, pool_size=0, analysis=yes 15 | -------------->Time execution : 27.46714496612549<-------------- 16 | -------------->Time execution : 28.398924112319946<-------------- 17 | -------------->Time execution : 25.25711679458618<-------------- 18 | 19 | hypercorn -w3, no cache, pool_size=1, analysis=yes 20 | -------------->Time execution : 17.33577609062195<-------------- 21 | -------------->Time execution : 27.747673988342285<-------------- 22 | -------------->Time execution : 19.758486032485962<-------------- 23 | 24 | hypercorn -w3, no cache, pool_size=0, analysis=yes 25 | -------------->Time execution : 23.761262893676758<-------------- 26 | -------------->Time execution : 18.91990613937378<-------------- 27 | -------------->Time execution : 31.557281017303467<-------------- 28 | -------------->Time execution : 31.700807809829712<-------------- 29 | -------------->Time execution : 32.8078031539917<-------------- 30 | """ 31 | 32 | import aiohttp 33 | import asyncio 34 | import time 35 | 36 | ANALYSIS = True 37 | 38 | URLS_APIFY = [ 39 | 'http://localhost:8001/apify?url=http://datanova.legroupe.laposte.fr/explore/dataset/laposte_poincont2/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true', 40 | 'http://localhost:8001/apify?url=https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true', 41 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_10.csv', 42 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_100.csv', 43 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_1000.csv', 44 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_10000.csv' 45 | ] 46 | 47 | 48 | async def fetch_apify(session, url): 49 | if ANALYSIS: 50 | url = url.replace('apify?', 'apify?analysis=yes&') 51 | async with session.get(url) as response: 52 | res = await response.json() 53 | return res['endpoint'] 54 | 55 | 56 | async def fetch_api(session, url): 57 | async with session.get(url) as response: 58 | return await response.text() 59 | 60 | 61 | async def main(): 62 | start = time.time() 63 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(force_close=True)) as session: 64 | 
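# Convert all benchmark URLs through /apify concurrently (with analysis=yes when ANALYSIS
# is set), then hit each resulting /api endpoint 20 times to measure read throughput.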
apify_requests = [fetch_apify(session, url) for url in URLS_APIFY] 65 | endpoints = await asyncio.gather(*apify_requests) 66 | api_requests = list() 67 | for endpoint in endpoints: 68 | for _ in range(20): 69 | api_requests.append(asyncio.ensure_future(fetch_api(session, endpoint))) 70 | await asyncio.gather(*api_requests) 71 | end = time.time() 72 | print(f"-------------->Time execution : {end - start}<--------------") 73 | 74 | 75 | if __name__ == '__main__': 76 | test = asyncio.run(main()) 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Current (in progress) 4 | 5 | - Nothing yet 6 | 7 | ## 2.2.0 (2022-11-04) 8 | 9 | - Remove profile endpoint, disable thread pool for profiling [#135](https://github.com/etalab/csvapi/pull/135) 10 | - Fix tests by using a custom docker image [#135](https://github.com/etalab/csvapi/pull/135) 11 | 12 | ## 2.1.1 (2022-10-25) 13 | 14 | * Fix bugs [#126](https://github.com/etalab/csvapi/pull/126) with json files 15 | 16 | ## 2.1.0 (2022-10-13) 17 | 18 | * Fix bugs [#110](https://github.com/etalab/csvapi/pull/110) and [#111](https://github.com/etalab/csvapi/pull/111) 19 | * Add endpoint API greater_than or less_than int or float value [#109](https://github.com/etalab/csvapi/pull/109) 20 | * Update version csv-detective [#119](https://github.com/etalab/csvapi/pull/119) 21 | 22 | ## 2.0.0 (2022-09-15) 23 | 24 | - [BREAKING] Migrate to python >= 3.9 [#104](https://github.com/etalab/csvapi/pull/104) 25 | - Migrate to poetry [#104](https://github.com/etalab/csvapi/pull/104) 26 | - Enrich sqlite dbs with metadata extracted from csv-detective and pandas profiling [#104](https://github.com/etalab/csvapi/pull/104) 27 | - Enrich apify api with possibility to analyse resource [#104](https://github.com/etalab/csvapi/pull/104) 28 | 29 | ## 1.2.1 (2021-04-29) 30 | 31 | - Upgrade raven to sentry-sdk (a bit dirty so far) 32 | 33 | ## 1.2.0 (2021-04-29) 34 | 35 | - Add profiling support [#77](https://github.com/etalab/csvapi/pull/77) 36 | - Fix bug in filters w/ blanks in column names [#77](https://github.com/etalab/csvapi/pull/77) 37 | 38 | ## 1.1.0 (2021-03-23) 39 | 40 | - Use aiosqlite [#76](https://github.com/etalab/csvapi/pull/76) 41 | 42 | ## 1.0.6 (2020-12-14) 43 | 44 | - Better parsing fallback [#71](https://github.com/etalab/csvapi/pull/71) 45 | 46 | ## 1.0.5 (2020-11-17) 47 | 48 | - Parsing view now raises exception on http error response codes [#69](https://github.com/etalab/csvapi/pull/69) 49 | 50 | ## 1.0.4 (2020-10-26) 51 | 52 | - Protect custom type testers against None values [#66](https://github.com/etalab/csvapi/pull/66) 53 | - Fix xlsx file support [#67](https://github.com/etalab/csvapi/pull/67) 54 | 55 | ## 1.0.3 (2020-03-04) 56 | 57 | - Fix packaging problem 58 | 59 | ## 1.0.2 (2020-03-04) 60 | 61 | - Fix XLS parsing [#60](https://github.com/etalab/csvapi/pull/60) 62 | 63 | ## 1.0.1 (2020-01-03) 64 | 65 | - Fix aiohttp import [#52](https://github.com/etalab/csvapi/pull/52) 66 | 67 | ## 1.0.0 (2020-01-03) 68 | 69 | - Add filters support [#50](https://github.com/etalab/csvapi/pull/50) 70 | - Replace requests by aiohttp for asynchronous http requests. Also replace every format() string to use only f"strings. 
[#46](https://github.com/etalab/csvapi/pull/46) 71 | 72 | ## 0.1.0 (2019-09-06) 73 | 74 | - Upgrade to Quart-0.9.1 :warning: requires python-3.7 [#21](https://github.com/opendatateam/csvapi/pull/21) 75 | - Parse hours, SIREN and SIRET as text [#42](https://github.com/opendatateam/csvapi/pull/42) 76 | 77 | ## 0.0.9 (2019-01-18) 78 | 79 | - Upgrade to Quart-0.6.6 and hypercorn-0.4.6 [#16](https://github.com/opendatateam/csvapi/pull/16) 80 | 81 | ## 0.0.8 (2018-10-04) 82 | 83 | - Try to parse CSV w/o sniffing (excel dialect) after sniffing if it fails 84 | 85 | ## 0.0.7 (2018-09-17) 86 | 87 | - `MAX_FILE_SIZE` config variable [#13](https://github.com/opendatateam/csvapi/pull/13) 88 | - Add filter by referrer feature (REFERRERS_FILTER) [#14](https://github.com/opendatateam/csvapi/pull/14) 89 | 90 | ## 0.0.6 (2018-09-10) 91 | 92 | - Compute the total number of rows in a table [#12](https://github.com/opendatateam/csvapi/pull/12) 93 | 94 | ## 0.0.5 (2018-09-10) 95 | 96 | - Make CSV sniff limit a config variable and raise the default value [#11](https://github.com/opendatateam/csvapi/pull/11) 97 | - Properly handle not found (404) errors 98 | 99 | ## 0.0.4 (2018-09-04) 100 | 101 | - FORCE_SSL config variable 102 | 103 | ## 0.0.3 (2018-08-31) 104 | 105 | - Sentry support via SENTRY_DSN config variable 106 | 107 | ## 0.0.2 (2018-08-30) 108 | 109 | - CSVAPI_CONFIG_FILE env var support 110 | 111 | ## 0.0.1 (2018-08-30) 112 | 113 | - Initial version 114 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2.1 3 | 4 | parameters: 5 | docker-image: 6 | type: string 7 | default: "etalab/csvapi-circle" 8 | python-module: 9 | type: string 10 | default: "csvapi" 11 | publish-branch: 12 | type: string 13 | default: "master" 14 | cache-prefix: 15 | type: string 16 | default: "py-cache-v2" 17 | 18 | jobs: 19 | install: 20 | docker: 21 | - image: << pipeline.parameters.docker-image >> 22 | steps: 23 | - checkout 24 | - run: 25 | name: Get the base reference branch 26 | command: export BASE_BRANCH=$(base_branch) 27 | - restore_cache: 28 | keys: 29 | - << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ checksum "poetry.lock" }} 30 | - << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ .Branch }} 31 | - << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ .Environment.BASE_BRANCH }} 32 | - run: 33 | name: Install python dependencies 34 | command: | 35 | poetry self update 36 | poetry config virtualenvs.in-project true 37 | poetry install 38 | - save_cache: 39 | key: << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ checksum "poetry.lock" }} 40 | paths: 41 | - .venv 42 | - save_cache: 43 | key: << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ .Branch }} 44 | paths: 45 | - .venv 46 | - persist_to_workspace: 47 | root: . 48 | paths: 49 | - . 50 | 51 | lint: 52 | docker: 53 | - image: << pipeline.parameters.docker-image >> 54 | steps: 55 | - attach_workspace: 56 | at: . 57 | - run: 58 | name: Lint code 59 | command: poetry run flake8 << pipeline.parameters.python-module >> 60 | 61 | tests: 62 | docker: 63 | - image: << pipeline.parameters.docker-image >> 64 | steps: 65 | - attach_workspace: 66 | at: . 
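# Reuse the workspace prepared by the install job, then run pytest and publish
# JUnit XML results so CircleCI can display the test summary.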
67 | - run: 68 | name: Run tests 69 | command: | 70 | poetry run pytest --junitxml=reports/python/tests.xml -p no:sugar --color=yes 71 | - store_test_results: 72 | path: reports/python 73 | 74 | build: 75 | docker: 76 | - image: << pipeline.parameters.docker-image >> 77 | steps: 78 | - attach_workspace: 79 | at: . 80 | - run: 81 | name: Build a distributable package 82 | command: | 83 | # Build a wheel release 84 | if [[ $CIRCLE_TAG ]]; then 85 | # This is a tagged release, version has been handled upstream 86 | poetry build 87 | else 88 | # Relies on a dev version like "1.2.1.dev" by default 89 | poetry version $(poetry version -s)$CIRCLE_BUILD_NUM 90 | poetry build 91 | fi 92 | - store_artifacts: 93 | path: dist 94 | - persist_to_workspace: 95 | root: . 96 | paths: 97 | - . 98 | 99 | publish: 100 | docker: 101 | - image: << pipeline.parameters.docker-image >> 102 | steps: 103 | - attach_workspace: 104 | at: . 105 | - deploy: 106 | name: Publish on PyPI 107 | command: | 108 | poetry publish --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}" --no-interaction 109 | 110 | workflows: 111 | version: 2 112 | build: 113 | jobs: 114 | - install: 115 | filters: 116 | tags: 117 | only: /v[0-9]+(\.[0-9]+)*/ 118 | - lint: 119 | requires: 120 | - install 121 | filters: 122 | tags: 123 | only: /v[0-9]+(\.[0-9]+)*/ 124 | - tests: 125 | requires: 126 | - install 127 | filters: 128 | tags: 129 | only: /v[0-9]+(\.[0-9]+)*/ 130 | - build: 131 | requires: 132 | - tests 133 | - lint 134 | filters: 135 | tags: 136 | only: /v[0-9]+(\.[0-9]+)*/ 137 | - publish: 138 | requires: 139 | - build 140 | filters: 141 | branches: 142 | only: 143 | - << pipeline.parameters.publish-branch >> 144 | - /[0-9]+(\.[0-9]+)+/ 145 | tags: 146 | only: /v[0-9]+(\.[0-9]+)*/ 147 | context: org-global 148 | -------------------------------------------------------------------------------- /profiling-minimal.yml: -------------------------------------------------------------------------------- 1 | # Title of the document 2 | title: "Pandas Profiling Report" 3 | 4 | # Metadata 5 | dataset: 6 | description: "" 7 | creator: "" 8 | author: "Etalab" 9 | copyright_holder: "" 10 | copyright_year: "" 11 | url: "" 12 | 13 | variables: 14 | descriptions: {} 15 | 16 | # infer dtypes 17 | infer_dtypes: True 18 | 19 | # Show the description at each variable (in addition to the overview tab) 20 | show_variable_description: True 21 | 22 | # Number of workers (0=multiprocessing.cpu_count()) 23 | pool_size: 0 24 | 25 | # Show the progress bar 26 | progress_bar: True 27 | 28 | # Per variable type description settings 29 | vars: 30 | num: 31 | quantiles: 32 | - 0.05 33 | - 0.25 34 | - 0.5 35 | - 0.75 36 | - 0.95 37 | skewness_threshold: 20 38 | low_categorical_threshold: 5 39 | # Set to zero to disable 40 | chi_squared_threshold: 0.0 41 | cat: 42 | length: False 43 | characters: False 44 | words: False 45 | cardinality_threshold: 50 46 | n_obs: 5 47 | # Set to zero to disable 48 | chi_squared_threshold: 0.0 49 | coerce_str_to_date: False 50 | redact: False 51 | bool: 52 | n_obs: 3 53 | # string to boolean mappings pairs (true, false) 54 | mappings: 55 | - ["t", "f"] 56 | - ["yes", "no"] 57 | - ["y", "n"] 58 | - ["true", "false"] 59 | path: 60 | active: False 61 | file: 62 | active: False 63 | image: 64 | active: False 65 | exif: False 66 | hash: False 67 | url: 68 | active: False 69 | 70 | 71 | # Sort the variables. 
Possible values: ascending, descending or None (leaves original sorting) 72 | sort: None 73 | 74 | # which diagrams to show 75 | missing_diagrams: 76 | bar: False 77 | matrix: False 78 | heatmap: False 79 | dendrogram: False 80 | 81 | correlations: 82 | pearson: 83 | calculate: False 84 | warn_high_correlations: True 85 | threshold: 0.9 86 | spearman: 87 | calculate: False 88 | warn_high_correlations: False 89 | threshold: 0.9 90 | kendall: 91 | calculate: False 92 | warn_high_correlations: False 93 | threshold: 0.9 94 | phi_k: 95 | calculate: False 96 | warn_high_correlations: False 97 | threshold: 0.9 98 | cramers: 99 | calculate: False 100 | warn_high_correlations: True 101 | threshold: 0.9 102 | 103 | 104 | # Bivariate / Pairwise relations 105 | interactions: 106 | targets: [] 107 | continuous: False 108 | 109 | # For categorical 110 | categorical_maximum_correlation_distinct: 100 111 | 112 | # Plot-specific settings 113 | plot: 114 | # Image format (svg or png) 115 | image_format: "svg" 116 | dpi: 800 117 | 118 | scatter_threshold: 1000 119 | 120 | correlation: 121 | cmap: 'RdBu' 122 | bad: '#000000' 123 | 124 | missing: 125 | cmap: 'RdBu' 126 | # Force labels when there are > 50 variables 127 | # https://github.com/ResidentMario/missingno/issues/93#issuecomment-513322615 128 | force_labels: True 129 | 130 | pie: 131 | # display a pie chart if the number of distinct values is smaller or equal (set to 0 to disable) 132 | max_unique: 0 133 | 134 | histogram: 135 | x_axis_labels: True 136 | 137 | # Number of bins (set to 0 to automatically detect the bin size) 138 | bins: 50 139 | 140 | # Maximum number of bins (when bins=0) 141 | max_bins: 250 142 | 143 | # The number of observations to show 144 | n_obs_unique: 5 145 | n_extreme_obs: 5 146 | n_freq_table_max: 10 147 | 148 | # Use `deep` flag for memory_usage 149 | memory_deep: False 150 | 151 | # Configuration related to the duplicates 152 | duplicates: 153 | head: 0 154 | 155 | # Configuration related to the samples area 156 | samples: 157 | head: 0 158 | tail: 0 159 | random: 0 160 | 161 | # Configuration related to the rejection of variables 162 | reject_variables: True 163 | 164 | # When in a Jupyter notebook 165 | notebook: 166 | iframe: 167 | height: '800px' 168 | width: '100%' 169 | # or 'src' 170 | attribute: 'srcdoc' 171 | 172 | html: 173 | # Minify the html 174 | minify_html: True 175 | 176 | # Offline support 177 | use_local_assets: True 178 | 179 | # If True, single file, else directory with assets 180 | inline: True 181 | 182 | # Show navbar 183 | navbar_show: True 184 | 185 | # For internal use 186 | file_name: None 187 | 188 | # Styling options for the HTML report 189 | style: 190 | theme: None 191 | logo: "" 192 | primary_color: "#337ab7" 193 | full_width: False 194 | -------------------------------------------------------------------------------- /csvapi/parseview.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import aiohttp 5 | import validators 6 | import pandas as pd 7 | 8 | from quart import request, jsonify, current_app as app 9 | from quart.views import MethodView 10 | 11 | from csvapi.errors import APIError 12 | from csvapi.parser import parse 13 | from csvapi.profiling import CSVAPIProfileReport 14 | from csvapi.utils import ( 15 | already_exists, 16 | get_hash, 17 | check_csv_detective_report_structure, 18 | check_profile_report_structure, 19 | create_connection, 20 | enrich_db_with_metadata 21 | ) 22 | 23 | from 
csv_detective.explore_csv import routine 24 | 25 | 26 | class ParseView(MethodView): 27 | 28 | @staticmethod 29 | async def do_parse( 30 | url, 31 | urlhash, 32 | encoding, 33 | storage, 34 | logger, 35 | sniff_limit, 36 | max_file_size, 37 | analysis=None 38 | ): 39 | logger.debug('* do_parse %s (%s)', urlhash, url) 40 | tmp = tempfile.NamedTemporaryFile(delete=False) 41 | chunk_count = 0 42 | chunk_size = 1024 43 | try: 44 | async with aiohttp.ClientSession(raise_for_status=True) as session: 45 | async with session.get(url) as resp: 46 | while True: 47 | chunk = await resp.content.read(chunk_size) 48 | if chunk_count * chunk_size > max_file_size: 49 | tmp.close() 50 | raise Exception('File too big (max size is %s bytes)' % max_file_size) 51 | if not chunk: 52 | break 53 | tmp.write(chunk) 54 | chunk_count += 1 55 | tmp.close() 56 | 57 | logger.debug('* Downloaded %s', urlhash) 58 | logger.debug('* Parsing %s...', urlhash) 59 | is_csv = parse( 60 | tmp.name, 61 | urlhash, 62 | storage, 63 | encoding=encoding, 64 | sniff_limit=sniff_limit 65 | ) 66 | 67 | if is_csv and analysis and analysis == 'yes': 68 | csv_detective_report = routine(tmp.name) 69 | 70 | if not check_csv_detective_report_structure(csv_detective_report): 71 | logger.error( 72 | "csvdetective report malformed" 73 | ) 74 | return 75 | 76 | profile_report = await CSVAPIProfileReport().get_minimal_profile(urlhash) 77 | 78 | if not check_profile_report_structure(profile_report): 79 | logger.error( 80 | "pandas profiling report malformed" 81 | ) 82 | return 83 | 84 | enrich_db_with_metadata( 85 | urlhash, 86 | csv_detective_report, 87 | profile_report, 88 | None, 89 | None 90 | ) 91 | 92 | if not is_csv and analysis and analysis == 'yes': 93 | conn = create_connection(f"{app.config['DB_ROOT_DIR']}/{urlhash}.db") 94 | general_infos = [ 95 | { 96 | 'filetype': 'excel' 97 | } 98 | ] 99 | df = pd.DataFrame(general_infos) 100 | df.to_sql('general_infos', con=conn, if_exists='replace', index=False) 101 | 102 | logger.debug('* Parsed %s', urlhash) 103 | finally: 104 | logger.debug('Removing tmp file: %s', tmp.name) 105 | os.unlink(tmp.name) 106 | 107 | async def get(self): 108 | app.logger.debug('* Starting ParseView.get') 109 | url = request.args.get('url') 110 | encoding = request.args.get('encoding') 111 | if not url: 112 | raise APIError('Missing url query string variable.', status=400) 113 | if not validators.url(url): 114 | raise APIError('Malformed url parameter.', status=400) 115 | urlhash = get_hash(url) 116 | analysis = request.args.get('analysis') 117 | if not await already_exists(urlhash, analysis): 118 | try: 119 | storage = app.config['DB_ROOT_DIR'] 120 | await self.do_parse(url=url, 121 | urlhash=urlhash, 122 | encoding=encoding, 123 | storage=storage, 124 | logger=app.logger, 125 | sniff_limit=app.config.get('CSV_SNIFF_LIMIT'), 126 | max_file_size=app.config.get('MAX_FILE_SIZE'), 127 | analysis=analysis) 128 | except Exception as e: 129 | raise APIError('Error parsing CSV: %s' % e) 130 | else: 131 | app.logger.info(f"{urlhash}.db already exists, skipping parse.") 132 | scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme 133 | return jsonify({ 134 | 'ok': True, 135 | 'endpoint': f"{scheme}://{request.host}/api/{urlhash}", 136 | }) 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # csvapi 2 | 3 | "Instantly" publish an API for a CSV hosted anywhere on the internet. 
Also supports Excel files. 4 | 5 | This tool is used by [data.gouv.fr](https://www.data.gouv.fr) to show a preview of hosted CSV and XLS files. 6 | 7 | ## Installation 8 | 9 | Requires Python 3.9+ and a Unix OS with the `file` command available. 10 | 11 | ```shell 12 | python3 -m venv pyenv && . pyenv/bin/activate 13 | pip install csvapi 14 | ``` 15 | 16 | For development: 17 | 18 | ```shell 19 | poetry install 20 | ``` 21 | 22 | ## Quickstart 23 | 24 | ```shell 25 | poetry run csvapi serve -h 0.0.0.0 -p 8000 26 | ``` 27 | 28 | ## Command line options 29 | 30 | ```shell 31 | $ poetry run csvapi serve --help 32 | Usage: csvapi serve [OPTIONS] 33 | 34 | Options: 35 | --ssl-key TEXT Path to SSL key 36 | --ssl-cert TEXT Path to SSL certificate 37 | --cache / --no-cache Do not parse CSV again if DB already exists 38 | --reload Automatically reload if code change detected 39 | --debug Enable debug mode - useful for development 40 | -p, --port INTEGER port for server, defaults to 8001 41 | -h, --host TEXT host for server, defaults to 127.0.0.1 42 | --dbs DIRECTORY Where to store sqlite DBs 43 | --help Show this message and exit. 44 | ``` 45 | 46 | ## Deploy 47 | 48 | With SSL, using [Hypercorn](https://pgjones.gitlab.io/hypercorn/): 49 | 50 | ```shell 51 | hypercorn csvapi.webservice:app -b 0.0.0.0:443 --keyfile key.pem --ca-certs cert.pem 52 | ``` 53 | 54 | See [the documentation](https://pgjones.gitlab.io/hypercorn/usage.html) for more options. 55 | 56 | You can use the environment variable `CSVAPI_CONFIG_FILE` to point to a custom configuration file. 57 | 58 | ## API usage 59 | 60 | ### Conversion 61 | 62 | `/apify?url=http://somewhere.com/a/file.csv` 63 | 64 | This converts a CSV to an SQLite database (w/ `agate`) and returns the following response: 65 | 66 | ```json 67 | {"ok": true, "endpoint": "http://localhost:8001/api/cde857960e8dc24c9cbcced673b496bb"} 68 | ``` 69 | 70 | ### Parameters 71 | 72 | Some parameters can be used in the query string. 73 | 74 | #### `encoding` 75 | 76 | **default**: _automatic detection_ 77 | 78 | You can force an encoding (e.g. `utf-8`) using this parameter, instead of relying on the automatic detection. 79 | 80 | 81 | ### Data API 82 | 83 | This is the `endpoint` attribute of the previous response. 84 | 85 | `/api/` 86 | 87 | This queries a previously converted API file and returns the first 100 rows like this: 88 | 89 | ```json 90 | { 91 | "ok": true, 92 | "rows": [[], []], 93 | "columns": [], 94 | "query_ms": 1 95 | } 96 | ``` 97 | 98 | ### Parameters 99 | 100 | Some parameters can be used in the query string. 101 | 102 | #### `_size` 103 | 104 | **default**: `100` 105 | 106 | This will limit the query to a certain number of rows. For instance to get only 250 rows: 107 | 108 | `/api/?_size=250` 109 | 110 | #### `_sort` and `_sort_desc` 111 | 112 | Use those to sort by a column. `sort` will sort by ascending order, `sort_desc` by descending order. 113 | 114 | `/api/?_sort=` 115 | 116 | #### `_offset` 117 | 118 | Use this to add on offset. Combined with `_size` it allows pagination. 119 | 120 | `/api/?_size=1&_offset=1` 121 | 122 | #### `_shape` 123 | 124 | **default**: `lists` 125 | 126 | The `_shape` argument is used to specify the format output of the json. 
It can take the value `objects` to get an array of objects instead of an array of arrays: 127 | 128 | `/api/?_shape=objects` 129 | 130 | For instance, instead of returning: 131 | 132 | ```json 133 | { 134 | "ok": true, 135 | "query_ms": 0.4799365997, 136 | "rows": [ 137 | [1, "Justice", "0101", 57663310], 138 | [2, "Justice", "0101", 2255129], 139 | [3, "Justice", "0101", 36290] 140 | ], 141 | "columns": ["rowid", "Mission", "Programme", "Consommation de CP"] 142 | } 143 | ``` 144 | 145 | It will return: 146 | 147 | ```json 148 | { 149 | "ok": true, 150 | "query_ms": 2.681016922, 151 | "rows": [ 152 | { 153 | "rowid": 1, 154 | "Mission": "Justice", 155 | "Programme": "0101", 156 | "Consommation de CP": 57663310 157 | }, 158 | { 159 | "rowid": 2, 160 | "Mission": "Justice", 161 | "Programme": "0101", 162 | "Consommation de CP": 2255129 163 | }, 164 | { 165 | "rowid": 3, 166 | "Mission": "Justice", 167 | "Programme": "0101", 168 | "Consommation de CP": 36290 169 | }], 170 | "columns": ["rowid", "Mission", "Programme", "Consommation de CP"] 171 | } 172 | ``` 173 | 174 | #### `_rowid` 175 | 176 | **default**: `show` 177 | 178 | The `_rowid` argument is used to display or hide rowids in the returned data. Use `_rowid=hide` to hide. 179 | 180 | `/api/?_shape=objects&_rowid=hide` 181 | 182 | ```json 183 | { 184 | "ok": true, 185 | "query_ms": 2.681016922, 186 | "rows": [ 187 | { 188 | "Mission": "Justice", 189 | "Programme": "0101", 190 | "Consommation de CP": 57663310 191 | }, 192 | { 193 | "Mission": "Justice", 194 | "Programme": "0101", 195 | "Consommation de CP": 2255129 196 | }, 197 | { 198 | "Mission": "Justice", 199 | "Programme": "0101", 200 | "Consommation de CP": 36290 201 | }], 202 | "columns": ["Mission", "Programme", "Consommation de CP"] 203 | } 204 | ``` 205 | 206 | #### `_total` 207 | 208 | **default**: `show` 209 | 210 | The `_total` argument is used to display or hide the total number of rows (independent of pagination) in the returned data. Use `_total=hide` to hide. 211 | 212 | ```json 213 | { 214 | "ok": true, 215 | "query_ms": 2.681016922, 216 | "rows": [ 217 | { 218 | "Mission": "Justice", 219 | "Programme": "0101", 220 | "Consommation de CP": 57663310 221 | }, 222 | { 223 | "Mission": "Justice", 224 | "Programme": "0101", 225 | "Consommation de CP": 2255129 226 | }, 227 | { 228 | "Mission": "Justice", 229 | "Programme": "0101", 230 | "Consommation de CP": 36290 231 | }], 232 | "columns": ["Mission", "Programme", "Consommation de CP"], 233 | "total": 3 234 | } 235 | ``` 236 | 237 | #### Column based filters 238 | 239 | By adding `{column}__{comparator}={value}` to the query string, you can filter the results based on the following criterions: 240 | - `{column}` must be a valid column in your CSV 241 | - `{comparator}` is `exact` (SQL `= {value}`) or `contains` (SQL `LIKE %{value}%`) 242 | - `{value}` is the value you're filtering the column against 243 | 244 | You can add multiple filters, they will be joined with a `AND` at the SQL level. 245 | 246 | ## Credits 247 | 248 | Inspired by the excellent [Datasette](https://github.com/simonw/datasette). 
249 | -------------------------------------------------------------------------------- /csvapi/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from pathlib import Path 4 | 5 | from quart import current_app as app 6 | 7 | import sqlite3 8 | from datetime import datetime 9 | import pandas as pd 10 | 11 | executor = None 12 | 13 | 14 | def get_db_info(urlhash, storage=None): 15 | if app: 16 | # app.config not thread safe, sometimes we need to pass storage directly 17 | db_storage = storage or app.config['DB_ROOT_DIR'] 18 | 19 | db_path = f"{db_storage}/{urlhash}.db" 20 | return { 21 | 'dsn': f"sqlite:///{db_path}", 22 | 'db_name': urlhash, 23 | 'table_name': urlhash, 24 | 'db_path': db_path, 25 | } 26 | 27 | 28 | def get_hash(to_hash): 29 | return get_hash_bytes(to_hash.encode('utf-8')) 30 | 31 | 32 | def get_hash_bytes(to_hash): 33 | return hashlib.md5(to_hash).hexdigest() 34 | 35 | 36 | async def already_exists(urlhash, analysis=None): 37 | ''' 38 | Check if db exist. If analysis is requested, we check if general_infos table exist. 39 | If not, we bypass cache and do a new download of file to analyse it with pp and csv-detective. 40 | ''' 41 | cache_enabled = app.config.get('CSV_CACHE_ENABLED') 42 | if not cache_enabled: 43 | return False 44 | 45 | db_exist = Path(get_db_info(urlhash)['db_path']).exists() 46 | 47 | if not analysis or analysis != 'yes': 48 | return db_exist 49 | else: 50 | conn = create_connection(get_db_info(urlhash)['db_path']) 51 | cur = conn.cursor() 52 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'general_infos\'' 53 | cur.execute(sql) 54 | rows = cur.fetchall() 55 | if rows[0][0] != 0: 56 | return True 57 | else: 58 | return False 59 | 60 | 61 | def create_connection(db_file): 62 | conn = None 63 | conn = sqlite3.connect(db_file) 64 | return conn 65 | 66 | 67 | def keys_exists(element, *keys): 68 | ''' 69 | Check if *keys (nested) exists in `element` (dict). 
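e.g. keys_exists(report, 'table', 'n') is True only if report['table']['n'] is reachable
without raising a KeyError.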
70 | ''' 71 | if not isinstance(element, dict): 72 | raise AttributeError('keys_exists() expects dict as first argument.') 73 | if len(keys) == 0: 74 | raise AttributeError('keys_exists() expects at least two arguments, one given.') 75 | _element = element 76 | for key in keys: 77 | try: 78 | _element = _element[key] 79 | except KeyError: 80 | return False 81 | return True 82 | 83 | 84 | def check_csv_detective_report_structure(report): 85 | if (report is not None) and \ 86 | (keys_exists(report, "columns")) and \ 87 | (keys_exists(report, "encoding")) and \ 88 | (keys_exists(report, "separator")) and \ 89 | (keys_exists(report, "header_row_idx")): 90 | 91 | for item in report['columns']: 92 | if (not keys_exists(report, "columns", item, "python_type")) | \ 93 | (not keys_exists(report, "columns", item, "format")): 94 | return False 95 | return True 96 | else: 97 | return False 98 | 99 | 100 | def check_profile_report_structure(report): 101 | if (report is not None) and \ 102 | (keys_exists(report, "table", "n")) and \ 103 | (keys_exists(report, "table", "n_var")) and \ 104 | (keys_exists(report, "table", "n_cells_missing")) and \ 105 | (keys_exists(report, "table", "n_vars_with_missing")) and \ 106 | (keys_exists(report, "table", "n_vars_all_missing")) and \ 107 | (keys_exists(report, "table", "n_cells_missing")) and \ 108 | (keys_exists(report, "variables")): 109 | 110 | for item in report['variables']: 111 | if (not keys_exists(report, "variables", item, "n_distinct")) | \ 112 | (not keys_exists(report, "variables", item, "is_unique")) | \ 113 | (not keys_exists(report, "variables", item, "n_unique")) | \ 114 | (not keys_exists(report, "variables", item, "type")) | \ 115 | (not keys_exists(report, "variables", item, "n_missing")) | \ 116 | (not keys_exists(report, "variables", item, "count")): 117 | return False 118 | return True 119 | else: 120 | return False 121 | 122 | 123 | def df_to_sql(obj, conn, name): 124 | df = pd.DataFrame(obj) 125 | if df.shape[0] > 0: 126 | df.to_sql(name, con=conn, if_exists='replace', index=False) 127 | 128 | 129 | def enrich_db_with_metadata(urlhash, csv_detective_report, profile_report, dataset_id, key): 130 | # Save to sql 131 | conn = create_connection(app.config['DB_ROOT_DIR'] + '/' + urlhash + '.db') 132 | 133 | general_infos = [ 134 | { 135 | 'encoding': csv_detective_report['encoding'], 136 | 'separator': csv_detective_report['separator'], 137 | 'header_row_idx': csv_detective_report['header_row_idx'], 138 | 'total_lines': profile_report['table']['n'], 139 | 'nb_columns': profile_report['table']['n_var'], 140 | 'nb_cells_missing': profile_report['table']['n_cells_missing'], 141 | 'nb_vars_with_missing': profile_report['table']['n_vars_with_missing'], 142 | 'nb_vars_all_missing': profile_report['table']['n_vars_all_missing'], 143 | 'date_last_check': datetime.today().strftime('%Y-%m-%d'), 144 | 'dataset_id': dataset_id, 145 | 'resource_id': key, 146 | 'filetype': 'csv' 147 | } 148 | ] 149 | df = pd.DataFrame(general_infos) 150 | df.to_sql('general_infos', con=conn, if_exists='replace', index=False) 151 | 152 | columns_infos = [] 153 | categorical_infos = [] 154 | top_infos = [] 155 | numeric_infos = [] 156 | numeric_plot_infos = [] 157 | 158 | for col in profile_report['variables']: 159 | column_info = {} 160 | column_info['name'] = col 161 | column_info['nb_distinct'] = profile_report['variables'][col]['n_distinct'] 162 | column_info['is_unique'] = profile_report['variables'][col]['is_unique'] 163 | column_info['nb_unique'] = 
profile_report['variables'][col]['n_unique'] 164 | column_info['type'] = profile_report['variables'][col]['type'] 165 | column_info['nb_missing'] = profile_report['variables'][col]['n_missing'] 166 | column_info['count'] = profile_report['variables'][col]['count'] 167 | column_info['format'] = 'unknown' 168 | 169 | if col in csv_detective_report['columns']: 170 | column_info['format'] = csv_detective_report['columns'][col]['format'] 171 | if csv_detective_report['columns'][col]['format'] in [ 172 | 'siren', 173 | 'siret', 174 | 'code_postal', 175 | 'code_commune_insee', 176 | 'code_departement', 177 | 'code_region', 178 | 'tel_fr']: 179 | column_info['type'] = 'Categorical' 180 | columns_infos.append(column_info) 181 | 182 | if (column_info['type'] == 'Categorical') & \ 183 | (len(profile_report['variables'][col]['value_counts_without_nan']) < 10): 184 | for cat in profile_report['variables'][col]['value_counts_without_nan']: 185 | categorical_info = {} 186 | categorical_info['column'] = col 187 | categorical_info['value'] = cat 188 | categorical_info['nb'] = profile_report['variables'][col]['value_counts_without_nan'][cat] 189 | categorical_infos.append(categorical_info) 190 | 191 | if column_info['type'] == 'Numeric': 192 | numeric_info = {} 193 | numeric_info['column'] = col 194 | numeric_info['mean'] = profile_report['variables'][col]['mean'] 195 | numeric_info['std'] = profile_report['variables'][col]['std'] 196 | numeric_info['min'] = profile_report['variables'][col]['min'] 197 | numeric_info['max'] = profile_report['variables'][col]['max'] 198 | numeric_infos.append(numeric_info) 199 | for i in range(len(profile_report['variables'][col]['histogram']['bin_edges'])): 200 | numeric_plot_info = {} 201 | numeric_plot_info['column'] = col 202 | numeric_plot_info['value'] = profile_report['variables'][col]['histogram']['bin_edges'][i] 203 | numeric_plot_info['type'] = 'bin_edges' 204 | numeric_plot_infos.append(numeric_plot_info) 205 | 206 | for i in range(len(profile_report['variables'][col]['histogram']['counts'])): 207 | numeric_plot_info = {} 208 | numeric_plot_info['column'] = col 209 | numeric_plot_info['value'] = profile_report['variables'][col]['histogram']['counts'][i] 210 | numeric_plot_info['type'] = 'counts' 211 | numeric_plot_infos.append(numeric_plot_info) 212 | 213 | cpt = 0 214 | for top in profile_report['variables'][col]['value_counts_without_nan']: 215 | if (cpt < 10): 216 | top_info = {} 217 | top_info['column'] = col 218 | top_info['value'] = top 219 | top_info['nb'] = profile_report['variables'][col]['value_counts_without_nan'][top] 220 | top_infos.append(top_info) 221 | cpt = cpt + 1 222 | 223 | df_to_sql(columns_infos, conn, 'columns_infos') 224 | df_to_sql(categorical_infos, conn, 'categorical_infos') 225 | df_to_sql(top_infos, conn, 'top_infos') 226 | df_to_sql(numeric_infos, conn, 'numeric_infos') 227 | df_to_sql(numeric_plot_infos, conn, 'numeric_plot_infos') 228 | 229 | conn.commit() 230 | -------------------------------------------------------------------------------- /csvapi/tableview.py: -------------------------------------------------------------------------------- 1 | import aiosqlite 2 | import sqlite3 3 | import time 4 | 5 | from contextlib import asynccontextmanager 6 | from pathlib import Path 7 | 8 | from quart import request, jsonify, current_app as app 9 | from quart.views import MethodView 10 | from slugify import slugify 11 | 12 | from csvapi.errors import APIError 13 | from csvapi.utils import get_db_info 14 | 15 | ROWS_LIMIT = 100 16 | 
SQL_TIME_LIMIT_MS = 1000 17 | DEFAULT_SHAPE = 'lists' 18 | 19 | 20 | def prepare_connection(conn): 21 | # conn.row_factory = sqlite3.Row 22 | conn.text_factory = lambda x: str(x, 'utf-8', 'replace') 23 | 24 | 25 | @asynccontextmanager 26 | async def sqlite_timelimit(conn, ms): 27 | deadline = time.time() + (ms / 1000) 28 | # n is the number of SQLite virtual machine instructions that will be 29 | # executed between each check. It's hard to know what to pick here. 30 | # After some experimentation, I've decided to go with 1000 by default and 31 | # 1 for time limits that are less than 50ms 32 | n = 1000 33 | if ms < 50: 34 | n = 1 35 | 36 | def handler(): 37 | if time.time() >= deadline: 38 | return 1 39 | 40 | await conn.set_progress_handler(handler, n) 41 | yield 42 | await conn.set_progress_handler(None, n) 43 | 44 | 45 | class TableView(MethodView): 46 | 47 | async def execute(self, sql, db_info, params=None): 48 | """Executes sql against db_name in a thread""" 49 | dsn = 'file:{}?immutable=1'.format(db_info['db_path']) 50 | # specify uri=True to make sure `file:xxx` is supported, 51 | # however the backend sqlite is configured (eg default MacOS) 52 | async with aiosqlite.connect(dsn, uri=True) as conn: 53 | conn.text_factory = lambda x: str(x, 'utf-8', 'replace') 54 | # this will raise 55 | # {"details": "interrupted", 56 | # "error": "Error selecting data",} 57 | async with sqlite_timelimit(conn, SQL_TIME_LIMIT_MS): 58 | try: 59 | async with conn.execute(sql, params or {}) as cursor: 60 | rows = await cursor.fetchall() 61 | except Exception: 62 | app.logger.error('ERROR: conn={}, sql = {}, params = {}'.format( 63 | conn, repr(sql), params 64 | )) 65 | raise 66 | return rows, cursor.description 67 | 68 | def add_filters_to_sql(self, sql, filters): 69 | wheres = [] 70 | params = {} 71 | for (f_key, f_value) in filters: 72 | comparator = f_key.split('__')[1] 73 | column = f_key.split('__')[0] 74 | normalized_column = slugify(column, separator='_') 75 | if comparator == 'exact': 76 | wheres.append(f"[{column}] = :filter_value_{normalized_column}") 77 | params[f'filter_value_{normalized_column}'] = f_value 78 | elif comparator == 'contains': 79 | wheres.append(f"[{column}] LIKE :filter_value_{normalized_column}") 80 | params[f'filter_value_{normalized_column}'] = f'%{f_value}%' 81 | elif comparator == 'less': 82 | try: 83 | float_value = float(f_value) 84 | except ValueError: 85 | raise APIError('Float value expected for less comparison.', status=400) 86 | wheres.append(f"[{column}] <= :filter_value_l_{normalized_column}") 87 | params[f'filter_value_l_{normalized_column}'] = float_value 88 | elif comparator == 'greater': 89 | try: 90 | float_value = float(f_value) 91 | except ValueError: 92 | raise APIError('Float value expected for greater comparison.', status=400) 93 | wheres.append(f"[{column}] >= :filter_value_gt_{normalized_column}") 94 | params[f'filter_value_gt_{normalized_column}'] = float_value 95 | else: 96 | app.logger.warning(f'Dropped unknown comparator in {f_key}') 97 | if wheres: 98 | sql += ' WHERE ' 99 | sql += ' AND '.join(wheres) 100 | return sql, params 101 | 102 | async def data(self, db_info, export=False): 103 | limit = request.args.get('_size', ROWS_LIMIT) if not export else -1 104 | rowid = not (request.args.get('_rowid') == 'hide') and not export 105 | total = not (request.args.get('_total') == 'hide') and not export 106 | sort = request.args.get('_sort') 107 | sort_desc = request.args.get('_sort_desc') 108 | offset = request.args.get('_offset') if not export else 
0 109 | 110 | # get filter arguments, like column__exact=xxx 111 | filters = [] 112 | for key, value in request.args.items(): 113 | if not key.startswith('_') and '__' in key: 114 | filters.append((key, value)) 115 | 116 | cols = 'rowid, *' if rowid else '*' 117 | sql = 'SELECT {} FROM [{}]'.format(cols, db_info['table_name']) 118 | sql, params = self.add_filters_to_sql(sql, filters) 119 | if sort: 120 | sql += f' ORDER BY [{sort}]' 121 | elif sort_desc: 122 | sql += f' ORDER BY [{sort_desc}] DESC' 123 | else: 124 | sql += ' ORDER BY rowid' 125 | sql += ' LIMIT :l' 126 | params['l'] = limit 127 | if offset: 128 | sql += ' OFFSET :o' 129 | params['o'] = offset 130 | rows, description = await self.execute( 131 | sql, db_info, params=params 132 | ) 133 | 134 | columns = [r[0] for r in description] 135 | 136 | if export: 137 | return columns, rows 138 | 139 | res = { 140 | 'columns': columns, 141 | 'rows': list(rows), 142 | } 143 | 144 | if total: 145 | sql = f"SELECT COUNT(*) FROM [{db_info['table_name']}]" 146 | sql, params = self.add_filters_to_sql(sql, filters) 147 | r, _ = await self.execute(sql, db_info, params=params) 148 | res['total'] = r[0][0] 149 | 150 | return res 151 | 152 | async def get(self, urlhash): 153 | db_info = get_db_info(urlhash) 154 | p = Path(db_info['db_path']) 155 | if not p.exists(): 156 | raise APIError('Database has probably been removed.', status=404) 157 | 158 | start = time.time() 159 | try: 160 | data = await self.data(db_info) 161 | except (sqlite3.OperationalError, sqlite3.IntegrityError) as e: 162 | raise APIError('Error selecting data', status=400, payload=dict(details=str(e))) 163 | end = time.time() 164 | 165 | _shape = request.args.get('_shape', DEFAULT_SHAPE) 166 | if _shape == 'objects': 167 | # Format data as an array of objects for the client 168 | rows = [] 169 | for row in data['rows']: 170 | rows.append(dict(zip(data['columns'], row))) 171 | elif _shape == 'lists': 172 | rows = data['rows'] 173 | else: 174 | raise APIError(f"Unknown _shape: {_shape}", status=400) 175 | 176 | general_infos = await self.general_infos(db_info) 177 | columns_infos = await self.columns_infos(db_info) 178 | 179 | res = { 180 | 'ok': True, 181 | 'query_ms': (end - start) * 1000, 182 | 'rows': rows, 183 | 'columns': data['columns'], 184 | 'general_infos': general_infos, 185 | 'columns_infos': columns_infos 186 | } 187 | if data.get('total'): 188 | res['total'] = data['total'] 189 | 190 | return jsonify(res) 191 | 192 | async def general_infos(self, db_info): 193 | params = {} 194 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'general_infos\'' 195 | rows, description = await self.execute( 196 | sql, db_info, params=params 197 | ) 198 | if rows[0][0] != 0: 199 | sql = 'SELECT * FROM general_infos' 200 | rows, description = await self.execute( 201 | sql, db_info, params=params 202 | ) 203 | columns = [r[0] for r in description] 204 | res = {} 205 | cpt = 0 206 | for col in columns: 207 | res[col] = rows[0][cpt] 208 | cpt = cpt + 1 209 | 210 | return res 211 | else: 212 | return {} 213 | 214 | async def columns_infos(self, db_info): 215 | params = {} 216 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'columns_infos\'' 217 | rows, description = await self.execute( 218 | sql, db_info, params=params 219 | ) 220 | if rows[0][0] != 0: 221 | sql = 'SELECT * FROM columns_infos' 222 | rows, description = await self.execute( 223 | sql, db_info, params=params 224 | ) 225 | columns = [r[0] for r in description] 226 | 227 | res = 
{} 228 | for row in rows: 229 | cpt = 1 230 | res[row[0]] = {} 231 | for col in columns[1:]: 232 | res[row[0]][col] = row[cpt] 233 | cpt = cpt + 1 234 | 235 | res = await self.top_and_categorical_infos(db_info, res, 'top_infos') 236 | res = await self.top_and_categorical_infos(db_info, res, 'categorical_infos') 237 | res = await self.numeric_infos(db_info, res) 238 | res = await self.numeric_plot_infos(db_info, res) 239 | return res 240 | else: 241 | return {} 242 | 243 | async def top_and_categorical_infos(self, db_info, res, table_name): 244 | params = {} 245 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'{}\''.format(table_name) 246 | rows, description = await self.execute( 247 | sql, db_info, params=params 248 | ) 249 | if rows[0][0] != 0: 250 | sql = 'SELECT * FROM {}'.format(table_name) 251 | rows, description = await self.execute( 252 | sql, db_info, params=params 253 | ) 254 | 255 | for row in rows: 256 | if table_name not in res[row[0]]: 257 | res[row[0]][table_name] = [] 258 | inter = {} 259 | inter['value'] = row[1] 260 | inter['count'] = row[2] 261 | res[row[0]][table_name].append(inter) 262 | 263 | return res 264 | else: 265 | for col in res: 266 | res[col][table_name] = {} 267 | return res 268 | 269 | async def numeric_infos(self, db_info, res): 270 | params = {} 271 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'numeric_infos\'' 272 | rows, description = await self.execute( 273 | sql, db_info, params=params 274 | ) 275 | if rows[0][0] != 0: 276 | sql = 'SELECT * FROM {}'.format('numeric_infos') 277 | rows, description = await self.execute( 278 | sql, db_info, params=params 279 | ) 280 | 281 | for row in rows: 282 | if 'numeric_infos' not in res[row[0]]: 283 | res[row[0]]['numeric_infos'] = {} 284 | 285 | res[row[0]]['numeric_infos']['mean'] = row[1] 286 | res[row[0]]['numeric_infos']['std'] = row[2] 287 | res[row[0]]['numeric_infos']['min'] = row[3] 288 | res[row[0]]['numeric_infos']['max'] = row[4] 289 | 290 | return res 291 | else: 292 | for col in res: 293 | res[col]['numeric_infos'] = {} 294 | return res 295 | 296 | async def numeric_plot_infos(self, db_info, res): 297 | params = {} 298 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'numeric_plot_infos\'' 299 | rows, description = await self.execute( 300 | sql, db_info, params=params 301 | ) 302 | if rows[0][0] != 0: 303 | sql = 'SELECT * FROM {}'.format('numeric_plot_infos') 304 | rows, description = await self.execute( 305 | sql, db_info, params=params 306 | ) 307 | 308 | for row in rows: 309 | if 'numeric_plot_infos' not in res[row[0]]: 310 | res[row[0]]['numeric_plot_infos'] = {} 311 | if 'counts' not in res[row[0]]['numeric_plot_infos']: 312 | res[row[0]]['numeric_plot_infos']['counts'] = [] 313 | if 'bin_edges' not in res[row[0]]['numeric_plot_infos']: 314 | res[row[0]]['numeric_plot_infos']['bin_edges'] = [] 315 | if row[2] == 'counts': 316 | res[row[0]]['numeric_plot_infos']['counts'].append(row[1]) 317 | if row[2] == 'bin_edges': 318 | res[row[0]]['numeric_plot_infos']['bin_edges'].append(row[1]) 319 | 320 | return res 321 | else: 322 | for col in res: 323 | res[col]['numeric_plot_infos'] = {} 324 | return res 325 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | from pathlib import Path 4 | 5 | import pytest 6 | import pytest_asyncio 7 | from aioresponses import 
aioresponses 8 | 9 | from csvapi.utils import get_hash 10 | from csvapi.webservice import app as csvapi_app 11 | 12 | MOCK_CSV_URL = 'http://domain.com/file.csv' 13 | MOCK_CSV_URL_FILTERS = 'http://domain.com/filters.csv' 14 | MOCK_CSV_HASH_FILTERS = get_hash(MOCK_CSV_URL_FILTERS) 15 | MOCK_CSV_HASH = get_hash(MOCK_CSV_URL) 16 | DB_ROOT_DIR = './tests/dbs' 17 | 18 | 19 | pytestmark = pytest.mark.asyncio 20 | 21 | 22 | @pytest.fixture 23 | def rmock(): 24 | with aioresponses() as m: 25 | yield m 26 | 27 | 28 | @pytest.fixture 29 | def app(): 30 | csvapi_app.config.update({ 31 | 'DB_ROOT_DIR': DB_ROOT_DIR, 32 | 'CSV_CACHE_ENABLED': False, 33 | }) 34 | yield csvapi_app 35 | [db.unlink() for db in Path(DB_ROOT_DIR).glob('*.db')] 36 | 37 | 38 | @pytest.fixture 39 | def client(app): 40 | yield app.test_client() 41 | 42 | 43 | @pytest.fixture 44 | def csv(): 45 | return '''col acol bcol c 46 | data à1data b1z 47 | data ª2data b2a 48 | ''' 49 | 50 | 51 | @pytest.fixture 52 | def csv_col_mismatch(): 53 | return '''col acol b 54 | data à1data b12 55 | data ª2data b24 56 | ''' 57 | 58 | 59 | @pytest.fixture 60 | def csv_hour(): 61 | return '''idhour 62 | a12:30 63 | b9:15 64 | c09:45 65 | ''' 66 | 67 | 68 | @pytest.fixture 69 | def csv_filters(): 70 | """ 71 | TODO: also test with unicode value in column name, but Quart 72 | test client currently fails 73 | """ 74 | return '''id,hour,value,another column 75 | first,12:30,1,value 76 | second,9:15,2,value 77 | third,09:45,3,value 78 | ''' 79 | 80 | 81 | @pytest.fixture 82 | def csv_siren_siret(): 83 | return """idsirensiret 84 | a13002526513002526500013 85 | b52281665152281665100056 86 | """ 87 | 88 | 89 | @pytest.fixture 90 | def csv_numeric(): 91 | return """idvalue 92 | a2 93 | b4 94 | c12 95 | """ 96 | 97 | 98 | @pytest.fixture 99 | def csv_top(): 100 | return """catvalue 101 | a15 102 | b13 103 | c11 104 | a9 105 | """ 106 | 107 | 108 | @pytest.fixture 109 | def csv_custom_types_double_cr(): 110 | """ 111 | This is clearly an invalid file (double CR) 112 | but it tests an interesting case: None values in 113 | columns detected as custom types. 114 | 115 | In this case we'd rather display empty lines and None 116 | values than break. 
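The extra carriage returns end up as empty records after parsing, so the expected
output mixes None-filled rows with the real data rows, as asserted in
test_apify_custom_types_double_cr below.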
117 | """ 118 | return """idsirensirettime\r\r 119 | a13002526a51300252650001312:30\r\r 120 | b5228166515228166510005615:50\r\r 121 | """ 122 | 123 | 124 | @pytest.fixture 125 | def one_line_json_file(): 126 | return '''{ "property1": 1, "property2": 2}''' 127 | 128 | 129 | def random_url(): 130 | return f"https://example.com/{uuid.uuid4()}.csv" 131 | 132 | 133 | @pytest_asyncio.fixture 134 | async def uploaded_csv(rmock, csv, client): 135 | content = csv.replace('', ';').encode('utf-8') 136 | rmock.get(MOCK_CSV_URL, body=content) 137 | await client.get(f"/apify?url={MOCK_CSV_URL}") 138 | 139 | 140 | async def test_apify_no_url(rmock, csv, client): 141 | res = await client.get('/apify') 142 | assert res.status_code == 400 143 | 144 | 145 | async def test_apify_wrong_url(rmock, csv, client): 146 | res = await client.get('/apify?url=notanurl') 147 | assert res.status_code == 400 148 | 149 | 150 | async def test_apify(rmock, csv, client): 151 | rmock.get(MOCK_CSV_URL, status=200, body=csv.encode('utf-8')) 152 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 153 | assert res.status_code == 200 154 | jsonres = await res.json 155 | assert jsonres['ok'] 156 | assert 'endpoint' in jsonres 157 | assert f"/api/{MOCK_CSV_HASH}" in jsonres['endpoint'] 158 | db_path = Path(DB_ROOT_DIR) / f"{MOCK_CSV_HASH}.db" 159 | assert db_path.exists() 160 | 161 | 162 | async def test_apify_not_found(rmock, csv, client): 163 | rmock.get(MOCK_CSV_URL, status=404) 164 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 165 | assert res.status_code == 500 166 | jsonres = await res.json 167 | assert not jsonres['ok'] 168 | assert jsonres['error'].startswith("Error parsing CSV: 404, message='Not Found'") 169 | 170 | 171 | async def test_apify_w_cache(app, rmock, csv, client): 172 | app.config.update({'CSV_CACHE_ENABLED': True}) 173 | rmock.get(MOCK_CSV_URL, body=csv.encode('utf-8')) 174 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 175 | assert res.status_code == 200 176 | jsonres = await res.json 177 | assert jsonres['ok'] 178 | assert 'endpoint' in jsonres 179 | assert f"/api/{MOCK_CSV_HASH}" in jsonres['endpoint'] 180 | db_path = Path(DB_ROOT_DIR) / f"{MOCK_CSV_HASH}.db" 181 | assert db_path.exists() 182 | app.config.update({'CSV_CACHE_ENABLED': False}) 183 | 184 | 185 | async def test_apify_col_mismatch(rmock, csv_col_mismatch, client): 186 | rmock.get(MOCK_CSV_URL, body=csv_col_mismatch.replace('', ';').encode('utf-8')) 187 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 188 | assert res.status_code == 200 189 | jsonres = await res.json 190 | assert jsonres['ok'] 191 | 192 | 193 | async def test_apify_hour_format(rmock, csv_hour, client): 194 | content = csv_hour.replace('', ';').encode('utf-8') 195 | url = random_url() 196 | rmock.get(url, body=content) 197 | await client.get(f"/apify?url={url}") 198 | res = await client.get(f"/api/{get_hash(url)}") 199 | assert res.status_code == 200 200 | jsonres = await res.json 201 | assert jsonres['columns'] == ['rowid', 'id', 'hour'] 202 | assert jsonres['total'] == 3 203 | assert jsonres['rows'] == [ 204 | [1, 'a', '12:30'], 205 | [2, 'b', '9:15'], 206 | [3, 'c', '09:45'], 207 | ] 208 | 209 | 210 | async def test_apify_siren_siret_format(rmock, csv_siren_siret, client): 211 | content = csv_siren_siret.replace('', ';').encode('utf-8') 212 | url = random_url() 213 | rmock.get(url, body=content) 214 | await client.get(f"/apify?url={url}") 215 | res = await client.get(f"/api/{get_hash(url)}") 216 | assert res.status_code == 200 217 | jsonres = await 
res.json 218 | assert jsonres['columns'] == ['rowid', 'id', 'siren', 'siret'] 219 | assert jsonres['total'] == 2 220 | assert jsonres['rows'] == [ 221 | [1, 'a', '130025265', '13002526500013'], 222 | [2, 'b', '522816651', '52281665100056'], 223 | ] 224 | 225 | 226 | async def test_apify_custom_types_double_cr(rmock, csv_custom_types_double_cr, client): 227 | content = csv_custom_types_double_cr.replace('', ';').encode('utf-8') 228 | url = random_url() 229 | rmock.get(url, body=content) 230 | await client.get(f"/apify?url={url}") 231 | res = await client.get(f"/api/{get_hash(url)}") 232 | assert res.status_code == 200 233 | jsonres = await res.json 234 | assert jsonres['columns'] == ['rowid', 'id', 'siren', 'siret', 'time'] 235 | assert jsonres['total'] == 5 236 | assert jsonres['rows'] == [ 237 | [1, None, None, None, None], 238 | [2, 'a', '13002526a5', '13002526500013', '12:30'], 239 | [3, None, None, None, None], 240 | [4, 'b', '522816651', '52281665100056', '15:50'], 241 | [5, None, None, None, None] 242 | ] 243 | 244 | 245 | @pytest.mark.parametrize('separator', [';', ',', '\t']) 246 | @pytest.mark.parametrize('encoding', ['utf-8', 'iso-8859-15', 'iso-8859-1']) 247 | async def test_api(client, rmock, csv, separator, encoding): 248 | content = csv.replace('', separator).encode(encoding) 249 | rmock.get(MOCK_CSV_URL, body=content) 250 | await client.get(f"/apify?url={MOCK_CSV_URL}") 251 | res = await client.get(f"/api/{MOCK_CSV_HASH}") 252 | assert res.status_code == 200 253 | jsonres = await res.json 254 | assert jsonres['columns'] == ['rowid', 'col a', 'col b', 'col c'] 255 | assert jsonres['total'] == 2 256 | assert jsonres['rows'] == [ 257 | [1, 'data à1', 'data b1', 'z'], 258 | [2, 'data ª2', 'data b2', 'a'], 259 | ] 260 | 261 | 262 | async def test_api_limit(client, rmock, uploaded_csv): 263 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_size=1") 264 | assert res.status_code == 200 265 | jsonres = await res.json 266 | assert len(jsonres['rows']) == 1 267 | assert jsonres['rows'] == [ 268 | [1, 'data à1', 'data b1', 'z'], 269 | ] 270 | 271 | 272 | async def test_api_limit_offset(client, rmock, uploaded_csv): 273 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_size=1&_offset=1") 274 | assert res.status_code == 200 275 | jsonres = await res.json 276 | assert len(jsonres['rows']) == 1 277 | assert jsonres['rows'] == [ 278 | [2, 'data ª2', 'data b2', 'a'], 279 | ] 280 | 281 | 282 | async def test_api_wrong_limit(client, rmock, uploaded_csv): 283 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_size=toto") 284 | assert res.status_code == 400 285 | 286 | 287 | async def test_api_wrong_shape(client, rmock, uploaded_csv): 288 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_shape=toto") 289 | assert res.status_code == 400 290 | 291 | 292 | async def test_api_objects_shape(client, rmock, uploaded_csv): 293 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_shape=objects") 294 | assert res.status_code == 200 295 | jsonres = await res.json 296 | assert jsonres['rows'] == [ 297 | { 298 | 'rowid': 1, 299 | 'col a': 'data à1', 300 | 'col b': 'data b1', 301 | 'col c': 'z', 302 | }, { 303 | 'rowid': 2, 304 | 'col a': 'data ª2', 305 | 'col b': 'data b2', 306 | 'col c': 'a', 307 | } 308 | ] 309 | 310 | 311 | async def test_api_objects_norowid(client, rmock, uploaded_csv): 312 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_shape=objects&_rowid=hide") 313 | assert res.status_code == 200 314 | jsonres = await res.json 315 | assert jsonres['rows'] == [ 316 | { 317 | 'col a': 'data à1', 318 | 'col b': 
'data b1', 319 | 'col c': 'z', 320 | }, { 321 | 'col a': 'data ª2', 322 | 'col b': 'data b2', 323 | 'col c': 'a', 324 | } 325 | ] 326 | 327 | 328 | async def test_api_objects_nototal(client, rmock, uploaded_csv): 329 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_total=hide") 330 | assert res.status_code == 200 331 | jsonres = await res.json 332 | assert jsonres.get('total') is None 333 | 334 | 335 | async def test_api_sort(client, rmock, uploaded_csv): 336 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_sort=col c") 337 | assert res.status_code == 200 338 | jsonres = await res.json 339 | assert jsonres['rows'] == [ 340 | [2, 'data ª2', 'data b2', 'a'], 341 | [1, 'data à1', 'data b1', 'z'], 342 | ] 343 | 344 | 345 | async def test_api_sort_desc(client, rmock, uploaded_csv): 346 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_sort_desc=col b") 347 | assert res.status_code == 200 348 | jsonres = await res.json 349 | assert jsonres['rows'] == [ 350 | [2, 'data ª2', 'data b2', 'a'], 351 | [1, 'data à1', 'data b1', 'z'], 352 | ] 353 | 354 | 355 | async def test_apify_file_too_big(app, client, rmock): 356 | original_max_file_size = app.config.get('MAX_FILE_SIZE') 357 | app.config.update({'MAX_FILE_SIZE': 1}) 358 | here = os.path.dirname(os.path.abspath(__file__)) 359 | content = open(f"{here}/samples/test.{'xls'}", 'rb') 360 | mock_url = MOCK_CSV_URL.replace('.csv', 'xls') 361 | rmock.get(mock_url, body=content.read()) 362 | content.close() 363 | res = await client.get(f"/apify?url={mock_url}") 364 | assert res.status_code == 500 365 | jsonres = await res.json 366 | assert 'File too big' in jsonres['error'] 367 | app.config.update({'MAX_FILE_SIZE': original_max_file_size}) 368 | 369 | 370 | @pytest.mark.parametrize('extension', ['xls', 'xlsx']) 371 | async def test_api_excel(client, rmock, extension): 372 | here = os.path.dirname(os.path.abspath(__file__)) 373 | content = open(f"{here}/samples/test.{extension}", 'rb') 374 | mock_url = MOCK_CSV_URL.replace('.csv', extension) 375 | mock_hash = get_hash(mock_url) 376 | rmock.get(mock_url, body=content.read()) 377 | content.close() 378 | await client.get(f"/apify?url={mock_url}") 379 | res = await client.get(f"/api/{mock_hash}") 380 | assert res.status_code == 200 381 | jsonres = await res.json 382 | assert jsonres['columns'] == ['rowid', 'col a', 'col b', 'col c'] 383 | assert jsonres['rows'] == [ 384 | [1, 'a1', 'b1', 'z'], 385 | [2, 'a2', 'b2', 'a'], 386 | ] 387 | 388 | 389 | async def test_api_filter_referrers(app, client): 390 | app.config.update({'REFERRERS_FILTER': ['toto.com']}) 391 | res = await client.get(f"/api/{'404'}") 392 | assert res.status_code == 403 393 | res = await client.get(f"/apify?url={'http://toto.com'}") 394 | assert res.status_code == 403 395 | res = await client.get(f"/api/{'404'}", headers={'Referer': 'http://next.toto.com'}) 396 | assert res.status_code == 404 397 | app.config.update({'REFERRERS_FILTER': None}) 398 | 399 | 400 | @pytest.mark.parametrize('csv_path', Path(__file__).parent.glob('samples/real_csv/*.csv')) 401 | async def test_real_csv_files(client, rmock, csv_path): 402 | with open(csv_path, 'rb') as content: 403 | rmock.get(MOCK_CSV_URL, body=content.read()) 404 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 405 | assert res.status_code == 200 406 | res = await client.get(f"/api/{MOCK_CSV_HASH}") 407 | # w/ no error and more than 1 column and row we should be OK 408 | assert res.status_code == 200 409 | jsonres = await res.json 410 | assert len(jsonres['columns']) > 1 411 | assert 
len(jsonres['rows']) > 1 412 | 413 | 414 | @pytest.mark.parametrize('xls_path', Path(__file__).parent.glob('samples/real_xls/*.xls*')) 415 | async def test_real_xls_files(client, rmock, xls_path): 416 | with open(xls_path, 'rb') as content: 417 | rmock.get(MOCK_CSV_URL, body=content.read()) 418 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 419 | assert res.status_code == 200 420 | res = await client.get(f"/api/{MOCK_CSV_HASH}") 421 | # w/ no error and more than 1 column and row we should be OK 422 | assert res.status_code == 200 423 | jsonres = await res.json 424 | assert len(jsonres['columns']) > 0 425 | assert len(jsonres['rows']) > 0 426 | 427 | 428 | @pytest_asyncio.fixture 429 | async def uploaded_csv_filters(rmock, csv_filters, client): 430 | content = csv_filters.encode('utf-8') 431 | rmock.get(MOCK_CSV_URL_FILTERS, body=content) 432 | await client.get(f"/apify?url={MOCK_CSV_URL_FILTERS}") 433 | 434 | 435 | async def test_api_filters_exact_hour(rmock, uploaded_csv_filters, client): 436 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?hour__exact=12:30") 437 | assert res.status_code == 200 438 | jsonres = await res.json 439 | assert jsonres['total'] == 1 440 | assert jsonres['rows'] == [ 441 | [1, 'first', '12:30', 1.0, 'value'], 442 | ] 443 | 444 | 445 | async def test_api_filters_contains_string(rmock, uploaded_csv_filters, client): 446 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?id__contains=fir") 447 | assert res.status_code == 200 448 | jsonres = await res.json 449 | assert jsonres['total'] == 1 450 | assert jsonres['rows'] == [ 451 | [1, 'first', '12:30', 1.0, 'value'], 452 | ] 453 | 454 | 455 | async def test_api_filters_contains_exact_int(rmock, uploaded_csv_filters, client): 456 | "NB: suboptimal API result, int value returns a float" 457 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?value__exact=1") 458 | assert res.status_code == 200 459 | jsonres = await res.json 460 | assert jsonres['total'] == 1 461 | assert jsonres['rows'] == [ 462 | [1, 'first', '12:30', 1.0, 'value'], 463 | ] 464 | 465 | 466 | async def test_api_filters_contains_exact_float(rmock, uploaded_csv_filters, client): 467 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?value__exact=1.0") 468 | assert res.status_code == 200 469 | jsonres = await res.json 470 | assert jsonres['total'] == 1 471 | assert jsonres['rows'] == [ 472 | [1, 'first', '12:30', 1.0, 'value'], 473 | ] 474 | 475 | 476 | async def test_api_and_filters(rmock, uploaded_csv_filters, client): 477 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?id__contains=fir&value__exact=1") 478 | assert res.status_code == 200 479 | jsonres = await res.json 480 | assert jsonres['total'] == 1 481 | assert jsonres['rows'] == [ 482 | [1, 'first', '12:30', 1.0, 'value'], 483 | ] 484 | 485 | 486 | async def test_api_filters_greater_float(rmock, csv_numeric, client): 487 | content = csv_numeric.replace('', ';').encode('utf-8') 488 | url = random_url() 489 | rmock.get(url, body=content) 490 | await client.get(f"/apify?url={url}") 491 | res = await client.get(f"/api/{get_hash(url)}?value__greater=10") 492 | assert res.status_code == 200 493 | jsonres = await res.json 494 | print(jsonres) 495 | assert jsonres['rows'] == [ 496 | [3, 'c', 12], 497 | ] 498 | 499 | 500 | async def test_api_filters_less_float(rmock, csv_numeric, client): 501 | content = csv_numeric.replace('', ';').encode('utf-8') 502 | url = random_url() 503 | rmock.get(url, body=content) 504 | await client.get(f"/apify?url={url}") 505 | res = await 
client.get(f"/api/{get_hash(url)}?value__less=3") 506 | assert res.status_code == 200 507 | jsonres = await res.json 508 | print(jsonres) 509 | assert jsonres['rows'] == [ 510 | [1, 'a', 2], 511 | ] 512 | 513 | 514 | async def test_api_filters_less_greater_float(rmock, csv_numeric, client): 515 | content = csv_numeric.replace('', ';').encode('utf-8') 516 | url = random_url() 517 | rmock.get(url, body=content) 518 | await client.get(f"/apify?url={url}") 519 | res = await client.get(f"/api/{get_hash(url)}?value__greater=3&value__less=10") 520 | assert res.status_code == 200 521 | jsonres = await res.json 522 | assert jsonres['rows'] == [ 523 | [2, 'b', 4], 524 | ] 525 | 526 | async def test_api_filters_less_greater_string_error(rmock, csv_numeric, client): 527 | content = csv_numeric.replace('', ';').encode('utf-8') 528 | url = random_url() 529 | rmock.get(url, body=content) 530 | await client.get(f"/apify?url={url}") 531 | res = await client.get(f"/api/{get_hash(url)}?value__greater=3&value__less=stan") 532 | assert res.status_code == 400 533 | jsonres = await res.json 534 | assert jsonres == {"error":"Float value expected for less comparison.", "error_id": None , "ok":False} 535 | 536 | 537 | async def test_api_filters_unnormalized_column(rmock, uploaded_csv_filters, client): 538 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?id__contains=fir&another column__contains=value") 539 | assert res.status_code == 200 540 | jsonres = await res.json 541 | assert jsonres['total'] == 1 542 | assert jsonres['rows'] == [ 543 | [1, 'first', '12:30', 1.0, 'value'], 544 | ] 545 | 546 | 547 | async def test_apify_analysed_format_response(rmock, csv_siren_siret, client): 548 | content = csv_siren_siret.replace('', ';').encode('utf-8') 549 | url = random_url() 550 | rmock.get(url, body=content) 551 | await client.get(f"/apify?url={url}&analysis=yes") 552 | res = await client.get(f"/api/{get_hash(url)}") 553 | assert res.status_code == 200 554 | jsonres = await res.json 555 | assert all(x in jsonres['columns_infos'] for x in ['id', 'siren', 'siret']) 556 | assert all(x in jsonres['general_infos'] for x in [ 557 | 'dataset_id', 558 | 'date_last_check', 559 | 'encoding', 560 | 'header_row_idx', 561 | 'nb_cells_missing', 562 | 'nb_columns', 563 | 'nb_vars_all_missing', 564 | 'nb_vars_with_missing', 565 | 'resource_id', 566 | 'separator', 567 | 'total_lines', 568 | 'filetype' 569 | ]) 570 | 571 | 572 | async def test_apify_analysed_csv_detective_check_format(rmock, csv_siren_siret, client): 573 | content = csv_siren_siret.replace('', ';').encode('utf-8') 574 | url = random_url() 575 | rmock.get(url, body=content) 576 | await client.get(f"/apify?url={url}&analysis=yes") 577 | res = await client.get(f"/api/{get_hash(url)}") 578 | assert res.status_code == 200 579 | jsonres = await res.json 580 | assert jsonres['columns_infos']['siren']['format'] == 'siren' 581 | assert jsonres['columns_infos']['siret']['format'] == 'siret' 582 | 583 | 584 | async def test_apify_analysed_pandas_profiling_check_numeric(rmock, csv_numeric, client): 585 | content = csv_numeric.replace('', ';').encode('utf-8') 586 | url = random_url() 587 | rmock.get(url, body=content) 588 | await client.get(f"/apify?url={url}&analysis=yes") 589 | res = await client.get(f"/api/{get_hash(url)}") 590 | assert res.status_code == 200 591 | jsonres = await res.json 592 | assert jsonres['columns_infos']['value']['numeric_infos']['max'] == 12 593 | assert jsonres['columns_infos']['value']['numeric_infos']['min'] == 2 594 | assert 
jsonres['columns_infos']['value']['numeric_infos']['mean'] == 6 595 | 596 | 597 | async def test_apify_analysed_pandas_profiling_check_top(rmock, csv_top, client): 598 | content = csv_top.replace('', ';').encode('utf-8') 599 | url = random_url() 600 | rmock.get(url, body=content) 601 | await client.get(f"/apify?url={url}&analysis=yes") 602 | res = await client.get(f"/api/{get_hash(url)}") 603 | assert res.status_code == 200 604 | jsonres = await res.json 605 | assert jsonres['columns_infos']['cat']['top_infos'][0]['value'] == 'a' 606 | 607 | 608 | async def test_apify_analysed_check_general_infos(rmock, csv_top, client): 609 | content = csv_top.replace('', ';').encode('utf-8') 610 | url = random_url() 611 | rmock.get(url, body=content) 612 | await client.get(f"/apify?url={url}&analysis=yes") 613 | res = await client.get(f"/api/{get_hash(url)}") 614 | assert res.status_code == 200 615 | jsonres = await res.json 616 | assert jsonres['general_infos']['nb_columns'] == 2 617 | assert jsonres['general_infos']['total_lines'] == 4 618 | assert jsonres['general_infos']['separator'] == ';' 619 | assert jsonres['general_infos']['header_row_idx'] == 0 620 | 621 | 622 | @pytest.mark.parametrize('extension', ['xls', 'xlsx']) 623 | async def test_no_analysis_when_excel(client, rmock, extension): 624 | here = os.path.dirname(os.path.abspath(__file__)) 625 | content = open(f"{here}/samples/test.{extension}", 'rb') 626 | mock_url = MOCK_CSV_URL.replace('.csv', extension) 627 | mock_hash = get_hash(mock_url) 628 | rmock.get(mock_url, body=content.read()) 629 | content.close() 630 | await client.get(f"/apify?url={mock_url}&analysis=yes") 631 | res = await client.get(f"/api/{mock_hash}") 632 | assert res.status_code == 200 633 | jsonres = await res.json 634 | print(jsonres) 635 | assert jsonres['columns'] == ['rowid', 'col a', 'col b', 'col c'] 636 | assert jsonres['general_infos'] == { 'filetype': 'excel' } 637 | assert jsonres['columns_infos'] == {} 638 | 639 | 640 | async def test_fail_one_line_json_file(rmock, one_line_json_file, client): 641 | content = one_line_json_file 642 | url = random_url() 643 | rmock.get(url, body=content) 644 | res = await client.get(f"/apify?url={url}&analysis=yes") 645 | assert res.status_code == 500 646 | --------------------------------------------------------------------------------
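To make the flow these tests cover easier to follow, here is a minimal client-side sketch. It is an illustration only: the base URL, port and example CSV URL are assumptions, and the requests library stands in for any HTTP client; the endpoints and query parameters mirror those exercised in tests/test_api.py.

    import hashlib
    import requests  # illustrative; any HTTP client would do

    BASE = 'http://localhost:8000'  # assumption: a locally running csvapi instance
    CSV_URL = 'https://example.com/data.csv'  # assumption: any reachable CSV file

    # Step 1: ask csvapi to download the CSV and convert it into a SQLite-backed table.
    requests.get(f'{BASE}/apify', params={'url': CSV_URL})

    # Step 2: query the converted table. The table id is the md5 of the source URL,
    # exactly as get_hash() computes it in csvapi/utils.py.
    urlhash = hashlib.md5(CSV_URL.encode('utf-8')).hexdigest()
    data = requests.get(f'{BASE}/api/{urlhash}', params={'_size': 10, '_shape': 'objects'}).json()
    rows = data['rows']  # with _shape=objects, a list of {'rowid': ..., '<column>': ...} dicts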