├── dbs └── .gitkeep ├── csvapi ├── __init__.py ├── errors.py ├── security.py ├── uploadview.py ├── exportview.py ├── profiling.py ├── cli.py ├── webservice.py ├── type_tester.py ├── parser.py ├── parseview.py ├── utils.py └── tableview.py ├── tests ├── dbs │ └── .gitkeep ├── samples │ ├── test.xls │ ├── test.xlsx │ └── real_xls │ │ ├── file_example_XLS_50.xls │ │ └── tourisme-handicap-etablissements-21022020.xlsx └── test_api.py ├── setup.cfg ├── .circleci ├── images │ └── csvapi-circle │ │ └── Dockerfile └── config.yml ├── .gitignore ├── bumpr.rc ├── config.py ├── LICENSE ├── benchmark ├── bench.js ├── legacy │ ├── bench-apify.sh │ └── bench-parser.py └── bench.py ├── pyproject.toml ├── CHANGELOG.md ├── profiling-minimal.yml └── README.md /dbs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /csvapi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/dbs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/samples/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/test.xls -------------------------------------------------------------------------------- /tests/samples/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/test.xlsx -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | 4 | [tool:pytest] 5 | filterwarnings = 6 | ignore::DeprecationWarning 7 | -------------------------------------------------------------------------------- /.circleci/images/csvapi-circle/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cimg/python:3.9 2 | 3 | RUN sudo apt-get update && sudo apt-get install -y file 4 | -------------------------------------------------------------------------------- /tests/samples/real_xls/file_example_XLS_50.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/real_xls/file_example_XLS_50.xls -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dbs/*.db 2 | profiles/*.html 3 | *.egg-info/ 4 | *.pyc 5 | build/ 6 | dist/ 7 | reports/ 8 | .vscode/ 9 | .pytest_cache/ 10 | -------------------------------------------------------------------------------- /tests/samples/real_xls/tourisme-handicap-etablissements-21022020.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/etalab/csvapi/HEAD/tests/samples/real_xls/tourisme-handicap-etablissements-21022020.xlsx -------------------------------------------------------------------------------- /csvapi/errors.py: -------------------------------------------------------------------------------- 1 | class APIError(Exception): 2 | status = 500 3 | 4 | 
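# Raised by the views (e.g. APIError('Missing file.', status=400)) and turned into a JSON
# error response by the APIError handler in webservice.py, which reads `status` and to_dict().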
def __init__(self, message, status=None, payload=None): 5 | super().__init__(message) 6 | self.message = message 7 | if status is not None: 8 | self.status = status 9 | self.payload = payload 10 | 11 | def to_dict(self): 12 | rv = dict(self.payload or ()) 13 | rv['error'] = self.message 14 | rv['ok'] = False 15 | return rv 16 | -------------------------------------------------------------------------------- /bumpr.rc: -------------------------------------------------------------------------------- 1 | [bumpr] 2 | file = pyproject.toml 3 | regex = version\s*=\s*"(?P.+?)" 4 | vcs = git 5 | commit = true 6 | tag = true 7 | tag_format = v{version} 8 | push = true 9 | clean = rm -rf *egg-info build dist 10 | tests = poetry run pytest tests 11 | publish = poetry build 12 | files = 13 | README.md 14 | 15 | [bump] 16 | unsuffix = true 17 | 18 | [prepare] 19 | part = patch 20 | suffix = dev 21 | 22 | [changelog] 23 | file = CHANGELOG.md 24 | bump = ## {version} ({date:%Y-%m-%d}) 25 | prepare = ## Current (in progress) 26 | separator = 27 | -------------------------------------------------------------------------------- /csvapi/security.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from quart import current_app as app, request, jsonify 4 | 5 | 6 | def filter_referrers(): 7 | filters = app.config.get('REFERRERS_FILTER') 8 | if not filters: 9 | return None 10 | referrer = request.referrer 11 | if referrer: 12 | parsed = urlparse(referrer) 13 | for filter in filters: 14 | if parsed.hostname.endswith(filter): 15 | return None 16 | return jsonify({ 17 | 'ok': False, 18 | 'error': 'Unauthorized', 19 | }), 403 20 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | DB_ROOT_DIR = './dbs' 2 | CSV_CACHE_ENABLED = True 3 | MAX_WORKERS = 3 4 | DEBUG = True 5 | SENTRY_DSN = None 6 | FORCE_SSL = False 7 | # In bytes, cf `sniff_limit` https://agate.readthedocs.io/en/1.6.1/api/table.html#agate.Table.from_csv 8 | CSV_SNIFF_LIMIT = 4096 * 2 9 | # In bytes, csvapi will stop downloading files if they reach this size 10 | # Default to 100 Mo 11 | MAX_FILE_SIZE = 1024 * 1024 * 100 12 | # Set this to an array of hosts to filter out calls by referer (403 returned if no match) 13 | # It will also match subdomains 14 | # e.g. REFERRERS_FILTER = ['data.gouv.fr'] will match 'demo.data.gouv.fr' 15 | REFERRERS_FILTER = None 16 | PANDAS_PROFILING_CONFIG_MIN = False 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Etalab 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /benchmark/bench.js: -------------------------------------------------------------------------------- 1 | import http from 'k6/http'; 2 | import { sleep } from 'k6'; 3 | import crypto from 'k6/crypto'; 4 | 5 | export const options = { 6 | vus: 10, 7 | iterations: 20, 8 | }; 9 | 10 | export default function () { 11 | var toParse = "https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c" 12 | var base = "https://csvapi.data.gouv.fr"; 13 | 14 | // change me to invalidate cache 15 | var rdm = "2"; 16 | let toApify = `${toParse}?ts=${rdm}` 17 | let hash = crypto.md5(toApify, 'hex'); 18 | console.log(hash); 19 | 20 | // apify 1 21 | var apify = `${base}/apify?url=${toApify}`; 22 | http.get(apify); 23 | 24 | // analyze 1 25 | var analyze = `${base}/apify?analysis=yes&url=${toApify}`; 26 | http.get(analyze); 27 | 28 | // make 10 requests 29 | for (let id = 1; id <= 10; id++) { 30 | http.get("https://csvapi.data.gouv.fr/api/26bdf0d090dfbaecbe213c6f551a46ac", { 31 | tags: { name: 'request' }, 32 | }); 33 | sleep(0.1); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "csvapi" 3 | version = "2.2.1.dev" 4 | description = "An instant JSON API for your CSV" 5 | authors = ["Opendatateam "] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9,<4" 10 | click_default_group = "~1.2.2" 11 | click = "~8.1.3" 12 | agate = "~1.6.3" 13 | agate-sql = "~0.5.8" 14 | aiohttp = "~3.8.1" 15 | validators = "~0.20.0" 16 | agate-excel = "~0.2.5" 17 | Quart = "~0.18.0" 18 | quart-cors = "~0.5.0" 19 | sentry-sdk = "~1.9.8" 20 | cchardet = "~2.1.7" 21 | python-stdnum = "~1.17" 22 | aiosqlite = "~0.17.0" 23 | pandas = "~1.4.4" 24 | pandas-profiling = "~3.2.0" 25 | requests = "~2.28.1" 26 | boto3 = "~1.24.66" 27 | csv-detective = "~0.4.6" 28 | 29 | [tool.poetry.dev-dependencies] 30 | aioresponses = "~0.7.3" 31 | pytest = "~7.1.3" 32 | pytest-asyncio = "~0.19.0" 33 | flake8 = "~5.0.4" 34 | pytest-cov = "~3.0.0" 35 | bumpr = "^0.3.8" 36 | 37 | [tool.poetry.scripts] 38 | csvapi = "csvapi.cli:cli" 39 | 40 | [build-system] 41 | requires = ["poetry-core>=1.0.0"] 42 | build-backend = "poetry.core.masonry.api" 43 | -------------------------------------------------------------------------------- /csvapi/uploadview.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tempfile import NamedTemporaryFile 3 | 4 | from quart import request, current_app as app, jsonify 5 | from quart.views import MethodView 6 | 7 | from csvapi.errors import APIError 8 | from csvapi.utils import get_hash_bytes, already_exists 9 | from csvapi.parser import parse 10 | 11 | 12 | class UploadView(MethodView): 13 | 14 | async def post(self): 15 | files = await request.files 16 | _file = files.get('file') or files.get('filepond') 17 | if not _file: 18 | 
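# No usable file part in the multipart form data: the endpoint accepts either a plain
# 'file' field or a 'filepond' field (presumably sent by a FilePond upload widget).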
raise APIError('Missing file.', status=400) 19 | content_hash = get_hash_bytes(_file.read()) 20 | _file.seek(0) 21 | if not already_exists(content_hash): 22 | storage = app.config['DB_ROOT_DIR'] 23 | sniff_limit = app.config.get('CSV_SNIFF_LIMIT') 24 | try: 25 | _tmpfile = NamedTemporaryFile(delete=False) 26 | _file.save(_tmpfile) 27 | _tmpfile.close() 28 | parse(_tmpfile.name, content_hash, storage, sniff_limit=sniff_limit) 29 | finally: 30 | os.unlink(_tmpfile.name) 31 | 32 | scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme 33 | return jsonify({ 34 | 'ok': True, 35 | 'endpoint': f"{scheme}://{request.host}/api/{content_hash}" 36 | }) 37 | -------------------------------------------------------------------------------- /csvapi/exportview.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sqlite3 3 | 4 | from io import StringIO 5 | from pathlib import Path 6 | 7 | from quart import make_response 8 | 9 | from csvapi.errors import APIError 10 | from csvapi.tableview import TableView 11 | from csvapi.utils import get_db_info 12 | 13 | 14 | class ExportView(TableView): 15 | 16 | async def get(self, urlhash): 17 | "This will inherit sorting and filtering from TableView" 18 | db_info = get_db_info(urlhash) 19 | p = Path(db_info['db_path']) 20 | if not p.exists(): 21 | raise APIError('Database has probably been removed.', status=404) 22 | 23 | try: 24 | columns, rows_iter = await self.data(db_info, export=True) 25 | except (sqlite3.OperationalError, sqlite3.IntegrityError) as e: 26 | raise APIError('Error selecting data', status=400, payload=dict(details=str(e))) 27 | 28 | def make_line(line_data): 29 | line = StringIO() 30 | writer = csv.writer(line) 31 | writer.writerow(line_data) 32 | line.seek(0) 33 | return line.read().encode() 34 | 35 | async def _make_response(): 36 | yield make_line(columns) 37 | for line in rows_iter: 38 | yield make_line(line) 39 | 40 | response = await make_response(_make_response()) 41 | response.mimetype = 'text/csv' 42 | response.headers['Content-Disposition'] = f'attachment; filename={urlhash}.csv' 43 | return response 44 | -------------------------------------------------------------------------------- /benchmark/legacy/bench-apify.sh: -------------------------------------------------------------------------------- 1 | wget http://localhost:8001/apify?url=http://datanova.legroupe.laposte.fr/explore/dataset/laposte_poincont2/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true & 2 | wget http://localhost:8001/apify?url=https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true & 3 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_10.xls 4 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_50.xls 5 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_100.xls 6 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_1000.xls 7 | wget http://localhost:8001/apify?url=http://file-examples.com/wp-content/uploads/2017/02/file_example_XLS_5000.xls 8 | wait 9 | rm apify\?url\=* 10 | 11 | 12 | # workers = 1, app.run : 0m31.249s 13 | # workers = 3, app.run : 0m18.498s 14 | # workers = 5, app.run : 0m34.485s 15 | # workers = 10, app.run : 0m40.397s 16 | 17 | # workers = 3, hypercorn 
w = 1 : 0m36.761s 18 | # workers = 3, hypercorn w = 3 : 0m16.607s 19 | # workers = 3, hypercorn w = 5 : 0m34.812s 20 | 21 | # workers = 1, hypercorn w = 1 : 0m40.030s 22 | # workers = 1, hypercorn w = 3 : 0m40.920s 23 | # workers = 1, hypercorn w = 5 : 0m42.833s 24 | 25 | # workers = 5, hypercorn w = 5 : 0m16.097s 26 | 27 | # no shared executor, app.run : 0m35.871s 28 | # no shared executor, hypercorn w = 3 : 0m15.767s 29 | -------------------------------------------------------------------------------- /csvapi/profiling.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | import sqlite3 5 | 6 | from pandas_profiling import ProfileReport 7 | 8 | from csvapi.errors import APIError 9 | from csvapi.utils import get_db_info 10 | 11 | import json 12 | 13 | 14 | class CSVAPIProfileReport: 15 | 16 | def get_dataframe(self, db_info): 17 | dsn = 'file:{}?immutable=1'.format(db_info['db_path']) 18 | conn = sqlite3.connect(dsn, uri=True) 19 | sql = 'SELECT * FROM [{}]'.format(db_info['table_name']) 20 | df = pd.read_sql_query(sql, con=conn) 21 | return df 22 | 23 | async def get_minimal_profile(self, urlhash: str) -> dict: 24 | db_info = get_db_info(urlhash) 25 | p = Path(db_info['db_path']) 26 | if not p.exists(): 27 | raise APIError('Database has probably been removed or does not exist yet.', status=404) 28 | 29 | try: 30 | df = self.get_dataframe(db_info) 31 | profile = ProfileReport( 32 | df, minimal=True, 33 | vars=dict(num={"low_categorical_threshold": 0}), 34 | plot=dict(histogram={"bins": 10}), 35 | # this disables the ThreadPoolExecutor in pandas-profiling 36 | # remove it or set it to 0 to use the number of CPUs a pool size 37 | pool_size=1, 38 | progress_bar=False, 39 | ) 40 | profile_report = json.loads(profile.to_json()) 41 | return profile_report 42 | except (sqlite3.OperationalError, sqlite3.IntegrityError) as e: 43 | raise APIError('Error selecting data', status=400, payload=dict(details=str(e))) 44 | -------------------------------------------------------------------------------- /csvapi/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import ssl 3 | from click_default_group import DefaultGroup 4 | 5 | from csvapi.webservice import app 6 | 7 | RESPONSE_TIMEOUT = 5 * 60 # in seconds 8 | 9 | 10 | @click.group(cls=DefaultGroup, default='serve', default_if_no_args=True) 11 | @click.version_option() 12 | def cli(): 13 | """ 14 | csvapi! 
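Instantly publish a JSON API for a remote CSV or Excel file, backed by SQLite.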
15 | """ 16 | 17 | 18 | @click.option('--dbs', default='./dbs', 19 | type=click.Path(exists=True, file_okay=False), 20 | help='Where to store sqlite DBs') 21 | @click.option('-h', '--host', default='127.0.0.1', 22 | help='host for server, defaults to 127.0.0.1') 23 | @click.option('-p', '--port', default=8001, 24 | help='port for server, defaults to 8001') 25 | @click.option('--debug', is_flag=True, 26 | help='Enable debug mode - useful for development') 27 | @click.option('--reload', is_flag=True, 28 | help='Automatically reload if code change detected') 29 | @click.option('--cache/--no-cache', default=True, 30 | help='Do not parse CSV again if DB already exists') 31 | @click.option('--ssl-cert', default=None, 32 | help='Path to SSL certificate') 33 | @click.option('--ssl-key', default=None, 34 | help='Path to SSL key') 35 | @cli.command() 36 | def serve(dbs, host, port, debug, reload, cache, ssl_cert, ssl_key): 37 | ssl_context = None 38 | if ssl_cert and ssl_key: 39 | ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) 40 | ssl_context.load_cert_chain(certfile=ssl_cert, keyfile=ssl_key) 41 | app.config.update({ 42 | 'DB_ROOT_DIR': dbs, 43 | 'CSV_CACHE_ENABLED': cache, 44 | 'DEBUG': debug, 45 | 'RESPONSE_TIMEOUT': RESPONSE_TIMEOUT, 46 | }) 47 | app.run(host=host, port=port, debug=debug, use_reloader=reload, ssl=ssl_context) 48 | -------------------------------------------------------------------------------- /benchmark/legacy/bench-parser.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import warnings 3 | from pathlib import Path 4 | 5 | import click 6 | 7 | from csvapi.parser import detect_encoding 8 | from csvapi.parser import from_csv 9 | from csvapi.parser import detect_type, CSV_FILETYPES 10 | 11 | SNIFF_LIMIT = 4096 12 | FILES_DIR = "/Users/alexandre/Developer/Etalab/decapode/data/downloaded" 13 | 14 | 15 | @click.command() 16 | @click.option('--filename', prompt='Error file name') 17 | def run(filename): 18 | filename = filename if filename.endswith(".csv") else f"{filename}.csv" 19 | parsed = 0 20 | warning = 0 21 | not_csv = 0 22 | errors = [] 23 | 24 | for filepath in Path(FILES_DIR).glob("*.csv"): 25 | parsed += 1 26 | file_type = detect_type(filepath) 27 | if not any([supported in file_type for supported in CSV_FILETYPES]): 28 | # print(f"Not a CSV through magic number", file_type.strip()) 29 | not_csv += 1 30 | continue 31 | encoding = detect_encoding(filepath) 32 | try: 33 | with warnings.catch_warnings(record=True) as w: 34 | table = from_csv(filepath, encoding=encoding, sniff_limit=SNIFF_LIMIT) 35 | if any(["Column" in _w.message.__str__() for _w in w]): 36 | warning += 1 37 | except Exception as e: 38 | print('-----', filepath) 39 | print("ERROR", e) 40 | errors.append({ 41 | "filepath": filepath, 42 | "error": e.__str__() 43 | }) 44 | 45 | print(f"Errors: {len(errors)}/{parsed} ({round(len(errors) / parsed * 100, 2)}%)") 46 | print(f"Column warnings: {warning}/{parsed} ({round(warning / parsed * 100, 2)}%)") 47 | print(f"Not CSV (magic): {not_csv}/{parsed} ({round(not_csv / parsed * 100, 2)}%)") 48 | 49 | if not errors: 50 | return 51 | 52 | with open(filename, 'w') as ofile: 53 | writer = csv.DictWriter(ofile, fieldnames=errors[0].keys()) 54 | writer.writeheader() 55 | writer.writerows(errors) 56 | 57 | if __name__ == '__main__': 58 | run() 59 | -------------------------------------------------------------------------------- /csvapi/webservice.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | from quart import Quart, jsonify 5 | from quart_cors import cors 6 | from werkzeug.exceptions import NotFound 7 | 8 | from csvapi.errors import APIError 9 | from csvapi.tableview import TableView 10 | from csvapi.exportview import ExportView 11 | from csvapi.uploadview import UploadView 12 | from csvapi.parseview import ParseView 13 | from csvapi.security import filter_referrers 14 | 15 | app = Quart(__name__) 16 | app = cors(app, allow_origin='*') 17 | 18 | app.add_url_rule('/api/', view_func=TableView.as_view('table')) 19 | app.add_url_rule('/api//export', view_func=ExportView.as_view('export')) 20 | app.add_url_rule('/apify', view_func=ParseView.as_view('parse')) 21 | app.add_url_rule('/upload', view_func=UploadView.as_view('upload')) 22 | app.before_request(filter_referrers) 23 | 24 | 25 | conffile = os.environ.get('CSVAPI_CONFIG_FILE') or '../config.py' 26 | app.config.from_pyfile(conffile) 27 | 28 | 29 | def handle_and_print_error(error): 30 | sentry_id = None 31 | if app.config.get('SENTRY_DSN'): 32 | import sentry_sdk 33 | with sentry_sdk.push_scope() as scope: 34 | sentry_sdk.init( 35 | app.config['SENTRY_DSN'], 36 | traces_sample_rate=1.0 37 | ) 38 | scope.set_extra('debug', False) 39 | from sentry_sdk import capture_exception 40 | sentry_id = capture_exception(error) 41 | traceback.print_exc() 42 | return sentry_id 43 | 44 | 45 | @app.errorhandler(NotFound) 46 | def handle_not_found(error): 47 | response = jsonify({ 48 | 'ok': False, 49 | 'error': 'Not found', 50 | }) 51 | response.status_code = 404 52 | return response 53 | 54 | 55 | @app.errorhandler(APIError) 56 | def handle_api_error(error): 57 | error_id = handle_and_print_error(error) 58 | data = error.to_dict() 59 | app.logger.error(f"{data.get('error')}: {data.get('details', '')}") 60 | data['error_id'] = error_id 61 | response = jsonify(data) 62 | response.status_code = error.status 63 | return response 64 | 65 | 66 | @app.errorhandler(Exception) 67 | def handle_exceptions(error): 68 | """Serialize all errors to API""" 69 | error_id = handle_and_print_error(error) 70 | response = jsonify(error=str(error), error_id=error_id, ok=False) 71 | return response, 500 72 | -------------------------------------------------------------------------------- /csvapi/type_tester.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from agate.data_types.base import DataType 4 | from agate.data_types.boolean import Boolean 5 | from agate.data_types.date import Date 6 | from agate.data_types.date_time import DateTime 7 | from agate.data_types.number import Number 8 | from agate.data_types.text import Text 9 | from agate.data_types.time_delta import TimeDelta 10 | from agate.exceptions import CastError 11 | from agate.type_tester import TypeTester 12 | 13 | from agatesql import table as agatesqltable 14 | 15 | from sqlalchemy.types import VARCHAR 16 | 17 | from stdnum.fr.siren import is_valid as is_valid_siren 18 | from stdnum.fr.siret import is_valid as is_valid_siret 19 | 20 | 21 | class Time(DataType): 22 | # Detect an hour minute string. 23 | # Examples: 12:20, 9:50, 23:30 24 | def __init__(self, **kwargs): 25 | super(Time, self).__init__(**kwargs) 26 | 27 | def cast(self, d): 28 | if d is None: 29 | return d 30 | if re.match(r"^(?:[01]\d|2[0-3]|\d):[0-5]\d$", str(d)): 31 | return Text().cast(d) 32 | raise CastError('Can not parse value "%s" as time.' 
% d) 33 | 34 | 35 | class SirenSiret(DataType): 36 | # Detect a SIREN or SIRET number 37 | def __init__(self): 38 | super(SirenSiret, self).__init__() 39 | 40 | def cast(self, d): 41 | if d is None: 42 | return d 43 | if is_valid_siret(d) or is_valid_siren(d): 44 | return Text().cast(d) 45 | raise CastError('Can not parse value "%s" as a SIREN or SIRET.' % d) 46 | 47 | 48 | # agatesql needs to know the SQL equivalent of a type. 49 | # Tell agatesql how our custom types should be converted in SQL. 50 | # 51 | # Reference: 52 | # https://github.com/wireservice/agate-sql/blob/7466073d81289323851c21817ea33170e36ce2a5/agatesql/table.py#L21-L28 53 | agatesqltable.SQL_TYPE_MAP[Time] = VARCHAR 54 | agatesqltable.SQL_TYPE_MAP[SirenSiret] = VARCHAR 55 | 56 | 57 | def agate_tester(): 58 | # Override the original list of type checkers present in agate 59 | # to detect types. 60 | # 61 | # Original list here: 62 | # https://github.com/wireservice/agate/blob/e3078dca8b3566e8408e65981f79918c2f36f9fe/agate/type_tester.py#L64-L71 63 | return TypeTester( 64 | types=[ 65 | Boolean(), 66 | SirenSiret(), 67 | Number(), 68 | Time(), 69 | TimeDelta(), 70 | Date(), 71 | DateTime(), 72 | Text(), 73 | ] 74 | ) 75 | -------------------------------------------------------------------------------- /csvapi/parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import agate 4 | import cchardet as chardet 5 | 6 | from csvapi.utils import get_db_info 7 | from csvapi.type_tester import agate_tester 8 | import logging 9 | 10 | logging.captureWarnings(True) 11 | logging.getLogger("py.warnings").setLevel(logging.ERROR) 12 | 13 | SNIFF_LIMIT = 4096 14 | CSV_FILETYPES = ('text/plain', 'application/csv', 'text/csv') 15 | 16 | 17 | def detect_type(filepath): 18 | with os.popen(f'file {filepath} -b --mime-type') as proc: 19 | return proc.read().lower() 20 | 21 | 22 | def detect_encoding(filepath): 23 | with open(filepath, 'rb') as f: 24 | return chardet.detect(f.read()).get('encoding') 25 | 26 | 27 | def from_csv(filepath, encoding='utf-8', sniff_limit=SNIFF_LIMIT): 28 | """Try first w/ sniffing and then w/o sniffing if it fails, 29 | and then again by forcing ';' delimiter w/o sniffing""" 30 | kwargs = { 31 | 'sniff_limit': sniff_limit, 32 | 'encoding': encoding, 33 | 'column_types': agate_tester() 34 | } 35 | 36 | with open(filepath, 'rb') as fp: 37 | if len(fp.readlines()) < 2: 38 | raise ValueError 39 | 40 | try: 41 | return agate.Table.from_csv(filepath, **kwargs) 42 | except ValueError: 43 | try: 44 | kwargs.pop('sniff_limit') 45 | return agate.Table.from_csv(filepath, **kwargs) 46 | except ValueError: 47 | kwargs['delimiter'] = ';' 48 | return agate.Table.from_csv(filepath, **kwargs) 49 | 50 | 51 | def from_excel(filepath, xlsx=False): 52 | # Function exists to prevent side-effects after monckey patching with import 53 | import agateexcel # noqa 54 | if xlsx: 55 | return agate.Table.from_xlsx(filepath, column_types=agate_tester()) 56 | return agate.Table.from_xls(filepath, column_types=agate_tester()) 57 | 58 | 59 | def to_sql(table, urlhash, storage): 60 | db_info = get_db_info(urlhash, storage=storage) 61 | table.to_sql(db_info['dsn'], db_info['db_name'], overwrite=True) 62 | 63 | 64 | def parse(filepath, urlhash, storage, encoding=None, sniff_limit=SNIFF_LIMIT): 65 | is_csv = False 66 | file_type = detect_type(filepath) 67 | if 'application/vnd.ms-excel' in file_type: 68 | table = from_excel(filepath) 69 | elif 'application/vnd.openxml' in file_type: 70 | table = 
from_excel(filepath, xlsx=True) 71 | elif any([supported in file_type for supported in CSV_FILETYPES]): 72 | encoding = detect_encoding(filepath) if not encoding else encoding 73 | table = from_csv(filepath, encoding=encoding, sniff_limit=sniff_limit) 74 | is_csv = True 75 | else: 76 | raise Exception(f'Unsupported file type {file_type}') 77 | to_sql(table, urlhash, storage) 78 | return is_csv 79 | -------------------------------------------------------------------------------- /benchmark/bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2022-11-03 3 | 4 | csvapi cli, no cache, pool_size=1, analysis=false 5 | -------------->Time execution : 17.989488124847412<-------------- 6 | -------------->Time execution : 24.22131085395813<-------------- 7 | -------------->Time execution : 21.933547019958496<-------------- 8 | 9 | csvapi cli, no cache, pool_size=1, analysis=yes 10 | -------------->Time execution : 28.727387189865112<-------------- 11 | -------------->Time execution : 27.748358964920044<-------------- 12 | -------------->Time execution : 22.15376091003418<-------------- 13 | 14 | csvapi cli, no cache, pool_size=0, analysis=yes 15 | -------------->Time execution : 27.46714496612549<-------------- 16 | -------------->Time execution : 28.398924112319946<-------------- 17 | -------------->Time execution : 25.25711679458618<-------------- 18 | 19 | hypercorn -w3, no cache, pool_size=1, analysis=yes 20 | -------------->Time execution : 17.33577609062195<-------------- 21 | -------------->Time execution : 27.747673988342285<-------------- 22 | -------------->Time execution : 19.758486032485962<-------------- 23 | 24 | hypercorn -w3, no cache, pool_size=0, analysis=yes 25 | -------------->Time execution : 23.761262893676758<-------------- 26 | -------------->Time execution : 18.91990613937378<-------------- 27 | -------------->Time execution : 31.557281017303467<-------------- 28 | -------------->Time execution : 31.700807809829712<-------------- 29 | -------------->Time execution : 32.8078031539917<-------------- 30 | """ 31 | 32 | import aiohttp 33 | import asyncio 34 | import time 35 | 36 | ANALYSIS = True 37 | 38 | URLS_APIFY = [ 39 | 'http://localhost:8001/apify?url=http://datanova.legroupe.laposte.fr/explore/dataset/laposte_poincont2/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true', 40 | 'http://localhost:8001/apify?url=https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true', 41 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_10.csv', 42 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_100.csv', 43 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_1000.csv', 44 | 'http://localhost:8001/apify?url=https://people.sc.fsu.edu/~jburkardt/data/csv/snakes_count_10000.csv' 45 | ] 46 | 47 | 48 | async def fetch_apify(session, url): 49 | if ANALYSIS: 50 | url = url.replace('apify?', 'apify?analysis=yes&') 51 | async with session.get(url) as response: 52 | res = await response.json() 53 | return res['endpoint'] 54 | 55 | 56 | async def fetch_api(session, url): 57 | async with session.get(url) as response: 58 | return await response.text() 59 | 60 | 61 | async def main(): 62 | start = time.time() 63 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(force_close=True)) as session: 64 | 
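# Convert all benchmark URLs through /apify concurrently (with analysis=yes when ANALYSIS
# is set), then hit each resulting /api endpoint 20 times to measure read throughput.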
apify_requests = [fetch_apify(session, url) for url in URLS_APIFY] 65 | endpoints = await asyncio.gather(*apify_requests) 66 | api_requests = list() 67 | for endpoint in endpoints: 68 | for _ in range(20): 69 | api_requests.append(asyncio.ensure_future(fetch_api(session, endpoint))) 70 | await asyncio.gather(*api_requests) 71 | end = time.time() 72 | print(f"-------------->Time execution : {end - start}<--------------") 73 | 74 | 75 | if __name__ == '__main__': 76 | test = asyncio.run(main()) 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Current (in progress) 4 | 5 | - Nothing yet 6 | 7 | ## 2.2.0 (2022-11-04) 8 | 9 | - Remove profile endpoint, disable thread pool for profiling [#135](https://github.com/etalab/csvapi/pull/135) 10 | - Fix tests by using a custom docker image [#135](https://github.com/etalab/csvapi/pull/135) 11 | 12 | ## 2.1.1 (2022-10-25) 13 | 14 | * Fix bugs [#126](https://github.com/etalab/csvapi/pull/126) with json files 15 | 16 | ## 2.1.0 (2022-10-13) 17 | 18 | * Fix bugs [#110](https://github.com/etalab/csvapi/pull/110) and [#111](https://github.com/etalab/csvapi/pull/111) 19 | * Add endpoint API greater_than or less_than int or float value [#109](https://github.com/etalab/csvapi/pull/109) 20 | * Update version csv-detective [#119](https://github.com/etalab/csvapi/pull/119) 21 | 22 | ## 2.0.0 (2022-09-15) 23 | 24 | - [BREAKING] Migrate to python >= 3.9 [#104](https://github.com/etalab/csvapi/pull/104) 25 | - Migrate to poetry [#104](https://github.com/etalab/csvapi/pull/104) 26 | - Enrich sqlite dbs with metadata extracted from csv-detective and pandas profiling [#104](https://github.com/etalab/csvapi/pull/104) 27 | - Enrich apify api with possibility to analyse resource [#104](https://github.com/etalab/csvapi/pull/104) 28 | 29 | ## 1.2.1 (2021-04-29) 30 | 31 | - Upgrade raven to sentry-sdk (a bit dirty so far) 32 | 33 | ## 1.2.0 (2021-04-29) 34 | 35 | - Add profiling support [#77](https://github.com/etalab/csvapi/pull/77) 36 | - Fix bug in filters w/ blanks in column names [#77](https://github.com/etalab/csvapi/pull/77) 37 | 38 | ## 1.1.0 (2021-03-23) 39 | 40 | - Use aiosqlite [#76](https://github.com/etalab/csvapi/pull/76) 41 | 42 | ## 1.0.6 (2020-12-14) 43 | 44 | - Better parsing fallback [#71](https://github.com/etalab/csvapi/pull/71) 45 | 46 | ## 1.0.5 (2020-11-17) 47 | 48 | - Parsing view now raises exception on http error response codes [#69](https://github.com/etalab/csvapi/pull/69) 49 | 50 | ## 1.0.4 (2020-10-26) 51 | 52 | - Protect custom type testers against None values [#66](https://github.com/etalab/csvapi/pull/66) 53 | - Fix xlsx file support [#67](https://github.com/etalab/csvapi/pull/67) 54 | 55 | ## 1.0.3 (2020-03-04) 56 | 57 | - Fix packaging problem 58 | 59 | ## 1.0.2 (2020-03-04) 60 | 61 | - Fix XLS parsing [#60](https://github.com/etalab/csvapi/pull/60) 62 | 63 | ## 1.0.1 (2020-01-03) 64 | 65 | - Fix aiohttp import [#52](https://github.com/etalab/csvapi/pull/52) 66 | 67 | ## 1.0.0 (2020-01-03) 68 | 69 | - Add filters support [#50](https://github.com/etalab/csvapi/pull/50) 70 | - Replace requests by aiohttp for asynchronous http requests. Also replace every format() string to use only f"strings. 
[#46](https://github.com/etalab/csvapi/pull/46) 71 | 72 | ## 0.1.0 (2019-09-06) 73 | 74 | - Upgrade to Quart-0.9.1 :warning: requires python-3.7 [#21](https://github.com/opendatateam/csvapi/pull/21) 75 | - Parse hours, SIREN and SIRET as text [#42](https://github.com/opendatateam/csvapi/pull/42) 76 | 77 | ## 0.0.9 (2019-01-18) 78 | 79 | - Upgrade to Quart-0.6.6 and hypercorn-0.4.6 [#16](https://github.com/opendatateam/csvapi/pull/16) 80 | 81 | ## 0.0.8 (2018-10-04) 82 | 83 | - Try to parse CSV w/o sniffing (excel dialect) after sniffing if it fails 84 | 85 | ## 0.0.7 (2018-09-17) 86 | 87 | - `MAX_FILE_SIZE` config variable [#13](https://github.com/opendatateam/csvapi/pull/13) 88 | - Add filter by referrer feature (REFERRERS_FILTER) [#14](https://github.com/opendatateam/csvapi/pull/14) 89 | 90 | ## 0.0.6 (2018-09-10) 91 | 92 | - Compute the total number of rows in a table [#12](https://github.com/opendatateam/csvapi/pull/12) 93 | 94 | ## 0.0.5 (2018-09-10) 95 | 96 | - Make CSV sniff limit a config variable and raise the default value [#11](https://github.com/opendatateam/csvapi/pull/11) 97 | - Properly handle not found (404) errors 98 | 99 | ## 0.0.4 (2018-09-04) 100 | 101 | - FORCE_SSL config variable 102 | 103 | ## 0.0.3 (2018-08-31) 104 | 105 | - Sentry support via SENTRY_DSN config variable 106 | 107 | ## 0.0.2 (2018-08-30) 108 | 109 | - CSVAPI_CONFIG_FILE env var support 110 | 111 | ## 0.0.1 (2018-08-30) 112 | 113 | - Initial version 114 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2.1 3 | 4 | parameters: 5 | docker-image: 6 | type: string 7 | default: "etalab/csvapi-circle" 8 | python-module: 9 | type: string 10 | default: "csvapi" 11 | publish-branch: 12 | type: string 13 | default: "master" 14 | cache-prefix: 15 | type: string 16 | default: "py-cache-v2" 17 | 18 | jobs: 19 | install: 20 | docker: 21 | - image: << pipeline.parameters.docker-image >> 22 | steps: 23 | - checkout 24 | - run: 25 | name: Get the base reference branch 26 | command: export BASE_BRANCH=$(base_branch) 27 | - restore_cache: 28 | keys: 29 | - << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ checksum "poetry.lock" }} 30 | - << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ .Branch }} 31 | - << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ .Environment.BASE_BRANCH }} 32 | - run: 33 | name: Install python dependencies 34 | command: | 35 | poetry self update 36 | poetry config virtualenvs.in-project true 37 | poetry install 38 | - save_cache: 39 | key: << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ checksum "poetry.lock" }} 40 | paths: 41 | - .venv 42 | - save_cache: 43 | key: << pipeline.parameters.cache-prefix >>-{{ arch }}-{{ .Branch }} 44 | paths: 45 | - .venv 46 | - persist_to_workspace: 47 | root: . 48 | paths: 49 | - . 50 | 51 | lint: 52 | docker: 53 | - image: << pipeline.parameters.docker-image >> 54 | steps: 55 | - attach_workspace: 56 | at: . 57 | - run: 58 | name: Lint code 59 | command: poetry run flake8 << pipeline.parameters.python-module >> 60 | 61 | tests: 62 | docker: 63 | - image: << pipeline.parameters.docker-image >> 64 | steps: 65 | - attach_workspace: 66 | at: . 
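# Reuse the workspace prepared by the install job, then run pytest and publish
# JUnit XML results so CircleCI can display the test summary.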
67 | - run: 68 | name: Run tests 69 | command: | 70 | poetry run pytest --junitxml=reports/python/tests.xml -p no:sugar --color=yes 71 | - store_test_results: 72 | path: reports/python 73 | 74 | build: 75 | docker: 76 | - image: << pipeline.parameters.docker-image >> 77 | steps: 78 | - attach_workspace: 79 | at: . 80 | - run: 81 | name: Build a distributable package 82 | command: | 83 | # Build a wheel release 84 | if [[ $CIRCLE_TAG ]]; then 85 | # This is a tagged release, version has been handled upstream 86 | poetry build 87 | else 88 | # Relies on a dev version like "1.2.1.dev" by default 89 | poetry version $(poetry version -s)$CIRCLE_BUILD_NUM 90 | poetry build 91 | fi 92 | - store_artifacts: 93 | path: dist 94 | - persist_to_workspace: 95 | root: . 96 | paths: 97 | - . 98 | 99 | publish: 100 | docker: 101 | - image: << pipeline.parameters.docker-image >> 102 | steps: 103 | - attach_workspace: 104 | at: . 105 | - deploy: 106 | name: Publish on PyPI 107 | command: | 108 | poetry publish --username "${PYPI_USERNAME}" --password "${PYPI_PASSWORD}" --no-interaction 109 | 110 | workflows: 111 | version: 2 112 | build: 113 | jobs: 114 | - install: 115 | filters: 116 | tags: 117 | only: /v[0-9]+(\.[0-9]+)*/ 118 | - lint: 119 | requires: 120 | - install 121 | filters: 122 | tags: 123 | only: /v[0-9]+(\.[0-9]+)*/ 124 | - tests: 125 | requires: 126 | - install 127 | filters: 128 | tags: 129 | only: /v[0-9]+(\.[0-9]+)*/ 130 | - build: 131 | requires: 132 | - tests 133 | - lint 134 | filters: 135 | tags: 136 | only: /v[0-9]+(\.[0-9]+)*/ 137 | - publish: 138 | requires: 139 | - build 140 | filters: 141 | branches: 142 | only: 143 | - << pipeline.parameters.publish-branch >> 144 | - /[0-9]+(\.[0-9]+)+/ 145 | tags: 146 | only: /v[0-9]+(\.[0-9]+)*/ 147 | context: org-global 148 | -------------------------------------------------------------------------------- /profiling-minimal.yml: -------------------------------------------------------------------------------- 1 | # Title of the document 2 | title: "Pandas Profiling Report" 3 | 4 | # Metadata 5 | dataset: 6 | description: "" 7 | creator: "" 8 | author: "Etalab" 9 | copyright_holder: "" 10 | copyright_year: "" 11 | url: "" 12 | 13 | variables: 14 | descriptions: {} 15 | 16 | # infer dtypes 17 | infer_dtypes: True 18 | 19 | # Show the description at each variable (in addition to the overview tab) 20 | show_variable_description: True 21 | 22 | # Number of workers (0=multiprocessing.cpu_count()) 23 | pool_size: 0 24 | 25 | # Show the progress bar 26 | progress_bar: True 27 | 28 | # Per variable type description settings 29 | vars: 30 | num: 31 | quantiles: 32 | - 0.05 33 | - 0.25 34 | - 0.5 35 | - 0.75 36 | - 0.95 37 | skewness_threshold: 20 38 | low_categorical_threshold: 5 39 | # Set to zero to disable 40 | chi_squared_threshold: 0.0 41 | cat: 42 | length: False 43 | characters: False 44 | words: False 45 | cardinality_threshold: 50 46 | n_obs: 5 47 | # Set to zero to disable 48 | chi_squared_threshold: 0.0 49 | coerce_str_to_date: False 50 | redact: False 51 | bool: 52 | n_obs: 3 53 | # string to boolean mappings pairs (true, false) 54 | mappings: 55 | - ["t", "f"] 56 | - ["yes", "no"] 57 | - ["y", "n"] 58 | - ["true", "false"] 59 | path: 60 | active: False 61 | file: 62 | active: False 63 | image: 64 | active: False 65 | exif: False 66 | hash: False 67 | url: 68 | active: False 69 | 70 | 71 | # Sort the variables. 
Possible values: ascending, descending or None (leaves original sorting) 72 | sort: None 73 | 74 | # which diagrams to show 75 | missing_diagrams: 76 | bar: False 77 | matrix: False 78 | heatmap: False 79 | dendrogram: False 80 | 81 | correlations: 82 | pearson: 83 | calculate: False 84 | warn_high_correlations: True 85 | threshold: 0.9 86 | spearman: 87 | calculate: False 88 | warn_high_correlations: False 89 | threshold: 0.9 90 | kendall: 91 | calculate: False 92 | warn_high_correlations: False 93 | threshold: 0.9 94 | phi_k: 95 | calculate: False 96 | warn_high_correlations: False 97 | threshold: 0.9 98 | cramers: 99 | calculate: False 100 | warn_high_correlations: True 101 | threshold: 0.9 102 | 103 | 104 | # Bivariate / Pairwise relations 105 | interactions: 106 | targets: [] 107 | continuous: False 108 | 109 | # For categorical 110 | categorical_maximum_correlation_distinct: 100 111 | 112 | # Plot-specific settings 113 | plot: 114 | # Image format (svg or png) 115 | image_format: "svg" 116 | dpi: 800 117 | 118 | scatter_threshold: 1000 119 | 120 | correlation: 121 | cmap: 'RdBu' 122 | bad: '#000000' 123 | 124 | missing: 125 | cmap: 'RdBu' 126 | # Force labels when there are > 50 variables 127 | # https://github.com/ResidentMario/missingno/issues/93#issuecomment-513322615 128 | force_labels: True 129 | 130 | pie: 131 | # display a pie chart if the number of distinct values is smaller or equal (set to 0 to disable) 132 | max_unique: 0 133 | 134 | histogram: 135 | x_axis_labels: True 136 | 137 | # Number of bins (set to 0 to automatically detect the bin size) 138 | bins: 50 139 | 140 | # Maximum number of bins (when bins=0) 141 | max_bins: 250 142 | 143 | # The number of observations to show 144 | n_obs_unique: 5 145 | n_extreme_obs: 5 146 | n_freq_table_max: 10 147 | 148 | # Use `deep` flag for memory_usage 149 | memory_deep: False 150 | 151 | # Configuration related to the duplicates 152 | duplicates: 153 | head: 0 154 | 155 | # Configuration related to the samples area 156 | samples: 157 | head: 0 158 | tail: 0 159 | random: 0 160 | 161 | # Configuration related to the rejection of variables 162 | reject_variables: True 163 | 164 | # When in a Jupyter notebook 165 | notebook: 166 | iframe: 167 | height: '800px' 168 | width: '100%' 169 | # or 'src' 170 | attribute: 'srcdoc' 171 | 172 | html: 173 | # Minify the html 174 | minify_html: True 175 | 176 | # Offline support 177 | use_local_assets: True 178 | 179 | # If True, single file, else directory with assets 180 | inline: True 181 | 182 | # Show navbar 183 | navbar_show: True 184 | 185 | # For internal use 186 | file_name: None 187 | 188 | # Styling options for the HTML report 189 | style: 190 | theme: None 191 | logo: "" 192 | primary_color: "#337ab7" 193 | full_width: False 194 | -------------------------------------------------------------------------------- /csvapi/parseview.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import aiohttp 5 | import validators 6 | import pandas as pd 7 | 8 | from quart import request, jsonify, current_app as app 9 | from quart.views import MethodView 10 | 11 | from csvapi.errors import APIError 12 | from csvapi.parser import parse 13 | from csvapi.profiling import CSVAPIProfileReport 14 | from csvapi.utils import ( 15 | already_exists, 16 | get_hash, 17 | check_csv_detective_report_structure, 18 | check_profile_report_structure, 19 | create_connection, 20 | enrich_db_with_metadata 21 | ) 22 | 23 | from 
csv_detective.explore_csv import routine 24 | 25 | 26 | class ParseView(MethodView): 27 | 28 | @staticmethod 29 | async def do_parse( 30 | url, 31 | urlhash, 32 | encoding, 33 | storage, 34 | logger, 35 | sniff_limit, 36 | max_file_size, 37 | analysis=None 38 | ): 39 | logger.debug('* do_parse %s (%s)', urlhash, url) 40 | tmp = tempfile.NamedTemporaryFile(delete=False) 41 | chunk_count = 0 42 | chunk_size = 1024 43 | try: 44 | async with aiohttp.ClientSession(raise_for_status=True) as session: 45 | async with session.get(url) as resp: 46 | while True: 47 | chunk = await resp.content.read(chunk_size) 48 | if chunk_count * chunk_size > max_file_size: 49 | tmp.close() 50 | raise Exception('File too big (max size is %s bytes)' % max_file_size) 51 | if not chunk: 52 | break 53 | tmp.write(chunk) 54 | chunk_count += 1 55 | tmp.close() 56 | 57 | logger.debug('* Downloaded %s', urlhash) 58 | logger.debug('* Parsing %s...', urlhash) 59 | is_csv = parse( 60 | tmp.name, 61 | urlhash, 62 | storage, 63 | encoding=encoding, 64 | sniff_limit=sniff_limit 65 | ) 66 | 67 | if is_csv and analysis and analysis == 'yes': 68 | csv_detective_report = routine(tmp.name) 69 | 70 | if not check_csv_detective_report_structure(csv_detective_report): 71 | logger.error( 72 | "csvdetective report malformed" 73 | ) 74 | return 75 | 76 | profile_report = await CSVAPIProfileReport().get_minimal_profile(urlhash) 77 | 78 | if not check_profile_report_structure(profile_report): 79 | logger.error( 80 | "pandas profiling report malformed" 81 | ) 82 | return 83 | 84 | enrich_db_with_metadata( 85 | urlhash, 86 | csv_detective_report, 87 | profile_report, 88 | None, 89 | None 90 | ) 91 | 92 | if not is_csv and analysis and analysis == 'yes': 93 | conn = create_connection(f"{app.config['DB_ROOT_DIR']}/{urlhash}.db") 94 | general_infos = [ 95 | { 96 | 'filetype': 'excel' 97 | } 98 | ] 99 | df = pd.DataFrame(general_infos) 100 | df.to_sql('general_infos', con=conn, if_exists='replace', index=False) 101 | 102 | logger.debug('* Parsed %s', urlhash) 103 | finally: 104 | logger.debug('Removing tmp file: %s', tmp.name) 105 | os.unlink(tmp.name) 106 | 107 | async def get(self): 108 | app.logger.debug('* Starting ParseView.get') 109 | url = request.args.get('url') 110 | encoding = request.args.get('encoding') 111 | if not url: 112 | raise APIError('Missing url query string variable.', status=400) 113 | if not validators.url(url): 114 | raise APIError('Malformed url parameter.', status=400) 115 | urlhash = get_hash(url) 116 | analysis = request.args.get('analysis') 117 | if not await already_exists(urlhash, analysis): 118 | try: 119 | storage = app.config['DB_ROOT_DIR'] 120 | await self.do_parse(url=url, 121 | urlhash=urlhash, 122 | encoding=encoding, 123 | storage=storage, 124 | logger=app.logger, 125 | sniff_limit=app.config.get('CSV_SNIFF_LIMIT'), 126 | max_file_size=app.config.get('MAX_FILE_SIZE'), 127 | analysis=analysis) 128 | except Exception as e: 129 | raise APIError('Error parsing CSV: %s' % e) 130 | else: 131 | app.logger.info(f"{urlhash}.db already exists, skipping parse.") 132 | scheme = 'https' if app.config.get('FORCE_SSL') else request.scheme 133 | return jsonify({ 134 | 'ok': True, 135 | 'endpoint': f"{scheme}://{request.host}/api/{urlhash}", 136 | }) 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # csvapi 2 | 3 | "Instantly" publish an API for a CSV hosted anywhere on the internet. 
Also supports Excel files. 4 | 5 | This tool is used by [data.gouv.fr](https://www.data.gouv.fr) to show a preview of hosted CSV and XLS files. 6 | 7 | ## Installation 8 | 9 | Requires Python 3.9+ and a Unix OS with the `file` command available. 10 | 11 | ```shell 12 | python3 -m venv pyenv && . pyenv/bin/activate 13 | pip install csvapi 14 | ``` 15 | 16 | For development: 17 | 18 | ```shell 19 | poetry install 20 | ``` 21 | 22 | ## Quickstart 23 | 24 | ```shell 25 | poetry run csvapi serve -h 0.0.0.0 -p 8000 26 | ``` 27 | 28 | ## Command line options 29 | 30 | ```shell 31 | $ poetry run csvapi serve --help 32 | Usage: csvapi serve [OPTIONS] 33 | 34 | Options: 35 | --ssl-key TEXT Path to SSL key 36 | --ssl-cert TEXT Path to SSL certificate 37 | --cache / --no-cache Do not parse CSV again if DB already exists 38 | --reload Automatically reload if code change detected 39 | --debug Enable debug mode - useful for development 40 | -p, --port INTEGER port for server, defaults to 8001 41 | -h, --host TEXT host for server, defaults to 127.0.0.1 42 | --dbs DIRECTORY Where to store sqlite DBs 43 | --help Show this message and exit. 44 | ``` 45 | 46 | ## Deploy 47 | 48 | With SSL, using [Hypercorn](https://pgjones.gitlab.io/hypercorn/): 49 | 50 | ```shell 51 | hypercorn csvapi.webservice:app -b 0.0.0.0:443 --keyfile key.pem --ca-certs cert.pem 52 | ``` 53 | 54 | See [the documentation](https://pgjones.gitlab.io/hypercorn/usage.html) for more options. 55 | 56 | You can use the environment variable `CSVAPI_CONFIG_FILE` to point to a custom configuration file. 57 | 58 | ## API usage 59 | 60 | ### Conversion 61 | 62 | `/apify?url=http://somewhere.com/a/file.csv` 63 | 64 | This converts a CSV to an SQLite database (w/ `agate`) and returns the following response: 65 | 66 | ```json 67 | {"ok": true, "endpoint": "http://localhost:8001/api/cde857960e8dc24c9cbcced673b496bb"} 68 | ``` 69 | 70 | ### Parameters 71 | 72 | Some parameters can be used in the query string. 73 | 74 | #### `encoding` 75 | 76 | **default**: _automatic detection_ 77 | 78 | You can force an encoding (e.g. `utf-8`) using this parameter, instead of relying on the automatic detection. 79 | 80 | 81 | ### Data API 82 | 83 | This is the `endpoint` attribute of the previous response. 84 | 85 | `/api/` 86 | 87 | This queries a previously converted API file and returns the first 100 rows like this: 88 | 89 | ```json 90 | { 91 | "ok": true, 92 | "rows": [[], []], 93 | "columns": [], 94 | "query_ms": 1 95 | } 96 | ``` 97 | 98 | ### Parameters 99 | 100 | Some parameters can be used in the query string. 101 | 102 | #### `_size` 103 | 104 | **default**: `100` 105 | 106 | This will limit the query to a certain number of rows. For instance to get only 250 rows: 107 | 108 | `/api/?_size=250` 109 | 110 | #### `_sort` and `_sort_desc` 111 | 112 | Use those to sort by a column. `sort` will sort by ascending order, `sort_desc` by descending order. 113 | 114 | `/api/?_sort=` 115 | 116 | #### `_offset` 117 | 118 | Use this to add on offset. Combined with `_size` it allows pagination. 119 | 120 | `/api/?_size=1&_offset=1` 121 | 122 | #### `_shape` 123 | 124 | **default**: `lists` 125 | 126 | The `_shape` argument is used to specify the format output of the json. 
It can take the value `objects` to get an array of objects instead of an array of arrays: 127 | 128 | `/api/?_shape=objects` 129 | 130 | For instance, instead of returning: 131 | 132 | ```json 133 | { 134 | "ok": true, 135 | "query_ms": 0.4799365997, 136 | "rows": [ 137 | [1, "Justice", "0101", 57663310], 138 | [2, "Justice", "0101", 2255129], 139 | [3, "Justice", "0101", 36290] 140 | ], 141 | "columns": ["rowid", "Mission", "Programme", "Consommation de CP"] 142 | } 143 | ``` 144 | 145 | It will return: 146 | 147 | ```json 148 | { 149 | "ok": true, 150 | "query_ms": 2.681016922, 151 | "rows": [ 152 | { 153 | "rowid": 1, 154 | "Mission": "Justice", 155 | "Programme": "0101", 156 | "Consommation de CP": 57663310 157 | }, 158 | { 159 | "rowid": 2, 160 | "Mission": "Justice", 161 | "Programme": "0101", 162 | "Consommation de CP": 2255129 163 | }, 164 | { 165 | "rowid": 3, 166 | "Mission": "Justice", 167 | "Programme": "0101", 168 | "Consommation de CP": 36290 169 | }], 170 | "columns": ["rowid", "Mission", "Programme", "Consommation de CP"] 171 | } 172 | ``` 173 | 174 | #### `_rowid` 175 | 176 | **default**: `show` 177 | 178 | The `_rowid` argument is used to display or hide rowids in the returned data. Use `_rowid=hide` to hide. 179 | 180 | `/api/?_shape=objects&_rowid=hide` 181 | 182 | ```json 183 | { 184 | "ok": true, 185 | "query_ms": 2.681016922, 186 | "rows": [ 187 | { 188 | "Mission": "Justice", 189 | "Programme": "0101", 190 | "Consommation de CP": 57663310 191 | }, 192 | { 193 | "Mission": "Justice", 194 | "Programme": "0101", 195 | "Consommation de CP": 2255129 196 | }, 197 | { 198 | "Mission": "Justice", 199 | "Programme": "0101", 200 | "Consommation de CP": 36290 201 | }], 202 | "columns": ["Mission", "Programme", "Consommation de CP"] 203 | } 204 | ``` 205 | 206 | #### `_total` 207 | 208 | **default**: `show` 209 | 210 | The `_total` argument is used to display or hide the total number of rows (independent of pagination) in the returned data. Use `_total=hide` to hide. 211 | 212 | ```json 213 | { 214 | "ok": true, 215 | "query_ms": 2.681016922, 216 | "rows": [ 217 | { 218 | "Mission": "Justice", 219 | "Programme": "0101", 220 | "Consommation de CP": 57663310 221 | }, 222 | { 223 | "Mission": "Justice", 224 | "Programme": "0101", 225 | "Consommation de CP": 2255129 226 | }, 227 | { 228 | "Mission": "Justice", 229 | "Programme": "0101", 230 | "Consommation de CP": 36290 231 | }], 232 | "columns": ["Mission", "Programme", "Consommation de CP"], 233 | "total": 3 234 | } 235 | ``` 236 | 237 | #### Column based filters 238 | 239 | By adding `{column}__{comparator}={value}` to the query string, you can filter the results based on the following criterions: 240 | - `{column}` must be a valid column in your CSV 241 | - `{comparator}` is `exact` (SQL `= {value}`) or `contains` (SQL `LIKE %{value}%`) 242 | - `{value}` is the value you're filtering the column against 243 | 244 | You can add multiple filters, they will be joined with a `AND` at the SQL level. 245 | 246 | ## Credits 247 | 248 | Inspired by the excellent [Datasette](https://github.com/simonw/datasette). 
249 | -------------------------------------------------------------------------------- /csvapi/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from pathlib import Path 4 | 5 | from quart import current_app as app 6 | 7 | import sqlite3 8 | from datetime import datetime 9 | import pandas as pd 10 | 11 | executor = None 12 | 13 | 14 | def get_db_info(urlhash, storage=None): 15 | if app: 16 | # app.config not thread safe, sometimes we need to pass storage directly 17 | db_storage = storage or app.config['DB_ROOT_DIR'] 18 | 19 | db_path = f"{db_storage}/{urlhash}.db" 20 | return { 21 | 'dsn': f"sqlite:///{db_path}", 22 | 'db_name': urlhash, 23 | 'table_name': urlhash, 24 | 'db_path': db_path, 25 | } 26 | 27 | 28 | def get_hash(to_hash): 29 | return get_hash_bytes(to_hash.encode('utf-8')) 30 | 31 | 32 | def get_hash_bytes(to_hash): 33 | return hashlib.md5(to_hash).hexdigest() 34 | 35 | 36 | async def already_exists(urlhash, analysis=None): 37 | ''' 38 | Check if db exist. If analysis is requested, we check if general_infos table exist. 39 | If not, we bypass cache and do a new download of file to analyse it with pp and csv-detective. 40 | ''' 41 | cache_enabled = app.config.get('CSV_CACHE_ENABLED') 42 | if not cache_enabled: 43 | return False 44 | 45 | db_exist = Path(get_db_info(urlhash)['db_path']).exists() 46 | 47 | if not analysis or analysis != 'yes': 48 | return db_exist 49 | else: 50 | conn = create_connection(get_db_info(urlhash)['db_path']) 51 | cur = conn.cursor() 52 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'general_infos\'' 53 | cur.execute(sql) 54 | rows = cur.fetchall() 55 | if rows[0][0] != 0: 56 | return True 57 | else: 58 | return False 59 | 60 | 61 | def create_connection(db_file): 62 | conn = None 63 | conn = sqlite3.connect(db_file) 64 | return conn 65 | 66 | 67 | def keys_exists(element, *keys): 68 | ''' 69 | Check if *keys (nested) exists in `element` (dict). 
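e.g. keys_exists(report, 'table', 'n') is True only if report['table']['n'] is reachable
without raising a KeyError.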
70 | ''' 71 | if not isinstance(element, dict): 72 | raise AttributeError('keys_exists() expects dict as first argument.') 73 | if len(keys) == 0: 74 | raise AttributeError('keys_exists() expects at least two arguments, one given.') 75 | _element = element 76 | for key in keys: 77 | try: 78 | _element = _element[key] 79 | except KeyError: 80 | return False 81 | return True 82 | 83 | 84 | def check_csv_detective_report_structure(report): 85 | if (report is not None) and \ 86 | (keys_exists(report, "columns")) and \ 87 | (keys_exists(report, "encoding")) and \ 88 | (keys_exists(report, "separator")) and \ 89 | (keys_exists(report, "header_row_idx")): 90 | 91 | for item in report['columns']: 92 | if (not keys_exists(report, "columns", item, "python_type")) | \ 93 | (not keys_exists(report, "columns", item, "format")): 94 | return False 95 | return True 96 | else: 97 | return False 98 | 99 | 100 | def check_profile_report_structure(report): 101 | if (report is not None) and \ 102 | (keys_exists(report, "table", "n")) and \ 103 | (keys_exists(report, "table", "n_var")) and \ 104 | (keys_exists(report, "table", "n_cells_missing")) and \ 105 | (keys_exists(report, "table", "n_vars_with_missing")) and \ 106 | (keys_exists(report, "table", "n_vars_all_missing")) and \ 107 | (keys_exists(report, "table", "n_cells_missing")) and \ 108 | (keys_exists(report, "variables")): 109 | 110 | for item in report['variables']: 111 | if (not keys_exists(report, "variables", item, "n_distinct")) | \ 112 | (not keys_exists(report, "variables", item, "is_unique")) | \ 113 | (not keys_exists(report, "variables", item, "n_unique")) | \ 114 | (not keys_exists(report, "variables", item, "type")) | \ 115 | (not keys_exists(report, "variables", item, "n_missing")) | \ 116 | (not keys_exists(report, "variables", item, "count")): 117 | return False 118 | return True 119 | else: 120 | return False 121 | 122 | 123 | def df_to_sql(obj, conn, name): 124 | df = pd.DataFrame(obj) 125 | if df.shape[0] > 0: 126 | df.to_sql(name, con=conn, if_exists='replace', index=False) 127 | 128 | 129 | def enrich_db_with_metadata(urlhash, csv_detective_report, profile_report, dataset_id, key): 130 | # Save to sql 131 | conn = create_connection(app.config['DB_ROOT_DIR'] + '/' + urlhash + '.db') 132 | 133 | general_infos = [ 134 | { 135 | 'encoding': csv_detective_report['encoding'], 136 | 'separator': csv_detective_report['separator'], 137 | 'header_row_idx': csv_detective_report['header_row_idx'], 138 | 'total_lines': profile_report['table']['n'], 139 | 'nb_columns': profile_report['table']['n_var'], 140 | 'nb_cells_missing': profile_report['table']['n_cells_missing'], 141 | 'nb_vars_with_missing': profile_report['table']['n_vars_with_missing'], 142 | 'nb_vars_all_missing': profile_report['table']['n_vars_all_missing'], 143 | 'date_last_check': datetime.today().strftime('%Y-%m-%d'), 144 | 'dataset_id': dataset_id, 145 | 'resource_id': key, 146 | 'filetype': 'csv' 147 | } 148 | ] 149 | df = pd.DataFrame(general_infos) 150 | df.to_sql('general_infos', con=conn, if_exists='replace', index=False) 151 | 152 | columns_infos = [] 153 | categorical_infos = [] 154 | top_infos = [] 155 | numeric_infos = [] 156 | numeric_plot_infos = [] 157 | 158 | for col in profile_report['variables']: 159 | column_info = {} 160 | column_info['name'] = col 161 | column_info['nb_distinct'] = profile_report['variables'][col]['n_distinct'] 162 | column_info['is_unique'] = profile_report['variables'][col]['is_unique'] 163 | column_info['nb_unique'] = 
profile_report['variables'][col]['n_unique'] 164 | column_info['type'] = profile_report['variables'][col]['type'] 165 | column_info['nb_missing'] = profile_report['variables'][col]['n_missing'] 166 | column_info['count'] = profile_report['variables'][col]['count'] 167 | column_info['format'] = 'unknown' 168 | 169 | if col in csv_detective_report['columns']: 170 | column_info['format'] = csv_detective_report['columns'][col]['format'] 171 | if csv_detective_report['columns'][col]['format'] in [ 172 | 'siren', 173 | 'siret', 174 | 'code_postal', 175 | 'code_commune_insee', 176 | 'code_departement', 177 | 'code_region', 178 | 'tel_fr']: 179 | column_info['type'] = 'Categorical' 180 | columns_infos.append(column_info) 181 | 182 | if (column_info['type'] == 'Categorical') & \ 183 | (len(profile_report['variables'][col]['value_counts_without_nan']) < 10): 184 | for cat in profile_report['variables'][col]['value_counts_without_nan']: 185 | categorical_info = {} 186 | categorical_info['column'] = col 187 | categorical_info['value'] = cat 188 | categorical_info['nb'] = profile_report['variables'][col]['value_counts_without_nan'][cat] 189 | categorical_infos.append(categorical_info) 190 | 191 | if column_info['type'] == 'Numeric': 192 | numeric_info = {} 193 | numeric_info['column'] = col 194 | numeric_info['mean'] = profile_report['variables'][col]['mean'] 195 | numeric_info['std'] = profile_report['variables'][col]['std'] 196 | numeric_info['min'] = profile_report['variables'][col]['min'] 197 | numeric_info['max'] = profile_report['variables'][col]['max'] 198 | numeric_infos.append(numeric_info) 199 | for i in range(len(profile_report['variables'][col]['histogram']['bin_edges'])): 200 | numeric_plot_info = {} 201 | numeric_plot_info['column'] = col 202 | numeric_plot_info['value'] = profile_report['variables'][col]['histogram']['bin_edges'][i] 203 | numeric_plot_info['type'] = 'bin_edges' 204 | numeric_plot_infos.append(numeric_plot_info) 205 | 206 | for i in range(len(profile_report['variables'][col]['histogram']['counts'])): 207 | numeric_plot_info = {} 208 | numeric_plot_info['column'] = col 209 | numeric_plot_info['value'] = profile_report['variables'][col]['histogram']['counts'][i] 210 | numeric_plot_info['type'] = 'counts' 211 | numeric_plot_infos.append(numeric_plot_info) 212 | 213 | cpt = 0 214 | for top in profile_report['variables'][col]['value_counts_without_nan']: 215 | if (cpt < 10): 216 | top_info = {} 217 | top_info['column'] = col 218 | top_info['value'] = top 219 | top_info['nb'] = profile_report['variables'][col]['value_counts_without_nan'][top] 220 | top_infos.append(top_info) 221 | cpt = cpt + 1 222 | 223 | df_to_sql(columns_infos, conn, 'columns_infos') 224 | df_to_sql(categorical_infos, conn, 'categorical_infos') 225 | df_to_sql(top_infos, conn, 'top_infos') 226 | df_to_sql(numeric_infos, conn, 'numeric_infos') 227 | df_to_sql(numeric_plot_infos, conn, 'numeric_plot_infos') 228 | 229 | conn.commit() 230 | -------------------------------------------------------------------------------- /csvapi/tableview.py: -------------------------------------------------------------------------------- 1 | import aiosqlite 2 | import sqlite3 3 | import time 4 | 5 | from contextlib import asynccontextmanager 6 | from pathlib import Path 7 | 8 | from quart import request, jsonify, current_app as app 9 | from quart.views import MethodView 10 | from slugify import slugify 11 | 12 | from csvapi.errors import APIError 13 | from csvapi.utils import get_db_info 14 | 15 | ROWS_LIMIT = 100 16 | 
SQL_TIME_LIMIT_MS = 1000 17 | DEFAULT_SHAPE = 'lists' 18 | 19 | 20 | def prepare_connection(conn): 21 | # conn.row_factory = sqlite3.Row 22 | conn.text_factory = lambda x: str(x, 'utf-8', 'replace') 23 | 24 | 25 | @asynccontextmanager 26 | async def sqlite_timelimit(conn, ms): 27 | deadline = time.time() + (ms / 1000) 28 | # n is the number of SQLite virtual machine instructions that will be 29 | # executed between each check. It's hard to know what to pick here. 30 | # After some experimentation, I've decided to go with 1000 by default and 31 | # 1 for time limits that are less than 50ms 32 | n = 1000 33 | if ms < 50: 34 | n = 1 35 | 36 | def handler(): 37 | if time.time() >= deadline: 38 | return 1 39 | 40 | await conn.set_progress_handler(handler, n) 41 | yield 42 | await conn.set_progress_handler(None, n) 43 | 44 | 45 | class TableView(MethodView): 46 | 47 | async def execute(self, sql, db_info, params=None): 48 | """Executes sql against db_name in a thread""" 49 | dsn = 'file:{}?immutable=1'.format(db_info['db_path']) 50 | # specify uri=True to make sure `file:xxx` is supported, 51 | # however the backend sqlite is configured (eg default MacOS) 52 | async with aiosqlite.connect(dsn, uri=True) as conn: 53 | conn.text_factory = lambda x: str(x, 'utf-8', 'replace') 54 | # this will raise 55 | # {"details": "interrupted", 56 | # "error": "Error selecting data",} 57 | async with sqlite_timelimit(conn, SQL_TIME_LIMIT_MS): 58 | try: 59 | async with conn.execute(sql, params or {}) as cursor: 60 | rows = await cursor.fetchall() 61 | except Exception: 62 | app.logger.error('ERROR: conn={}, sql = {}, params = {}'.format( 63 | conn, repr(sql), params 64 | )) 65 | raise 66 | return rows, cursor.description 67 | 68 | def add_filters_to_sql(self, sql, filters): 69 | wheres = [] 70 | params = {} 71 | for (f_key, f_value) in filters: 72 | comparator = f_key.split('__')[1] 73 | column = f_key.split('__')[0] 74 | normalized_column = slugify(column, separator='_') 75 | if comparator == 'exact': 76 | wheres.append(f"[{column}] = :filter_value_{normalized_column}") 77 | params[f'filter_value_{normalized_column}'] = f_value 78 | elif comparator == 'contains': 79 | wheres.append(f"[{column}] LIKE :filter_value_{normalized_column}") 80 | params[f'filter_value_{normalized_column}'] = f'%{f_value}%' 81 | elif comparator == 'less': 82 | try: 83 | float_value = float(f_value) 84 | except ValueError: 85 | raise APIError('Float value expected for less comparison.', status=400) 86 | wheres.append(f"[{column}] <= :filter_value_l_{normalized_column}") 87 | params[f'filter_value_l_{normalized_column}'] = float_value 88 | elif comparator == 'greater': 89 | try: 90 | float_value = float(f_value) 91 | except ValueError: 92 | raise APIError('Float value expected for greater comparison.', status=400) 93 | wheres.append(f"[{column}] >= :filter_value_gt_{normalized_column}") 94 | params[f'filter_value_gt_{normalized_column}'] = float_value 95 | else: 96 | app.logger.warning(f'Dropped unknown comparator in {f_key}') 97 | if wheres: 98 | sql += ' WHERE ' 99 | sql += ' AND '.join(wheres) 100 | return sql, params 101 | 102 | async def data(self, db_info, export=False): 103 | limit = request.args.get('_size', ROWS_LIMIT) if not export else -1 104 | rowid = not (request.args.get('_rowid') == 'hide') and not export 105 | total = not (request.args.get('_total') == 'hide') and not export 106 | sort = request.args.get('_sort') 107 | sort_desc = request.args.get('_sort_desc') 108 | offset = request.args.get('_offset') if not export else 
0 109 | 110 | # get filter arguments, like column__exact=xxx 111 | filters = [] 112 | for key, value in request.args.items(): 113 | if not key.startswith('_') and '__' in key: 114 | filters.append((key, value)) 115 | 116 | cols = 'rowid, *' if rowid else '*' 117 | sql = 'SELECT {} FROM [{}]'.format(cols, db_info['table_name']) 118 | sql, params = self.add_filters_to_sql(sql, filters) 119 | if sort: 120 | sql += f' ORDER BY [{sort}]' 121 | elif sort_desc: 122 | sql += f' ORDER BY [{sort_desc}] DESC' 123 | else: 124 | sql += ' ORDER BY rowid' 125 | sql += ' LIMIT :l' 126 | params['l'] = limit 127 | if offset: 128 | sql += ' OFFSET :o' 129 | params['o'] = offset 130 | rows, description = await self.execute( 131 | sql, db_info, params=params 132 | ) 133 | 134 | columns = [r[0] for r in description] 135 | 136 | if export: 137 | return columns, rows 138 | 139 | res = { 140 | 'columns': columns, 141 | 'rows': list(rows), 142 | } 143 | 144 | if total: 145 | sql = f"SELECT COUNT(*) FROM [{db_info['table_name']}]" 146 | sql, params = self.add_filters_to_sql(sql, filters) 147 | r, _ = await self.execute(sql, db_info, params=params) 148 | res['total'] = r[0][0] 149 | 150 | return res 151 | 152 | async def get(self, urlhash): 153 | db_info = get_db_info(urlhash) 154 | p = Path(db_info['db_path']) 155 | if not p.exists(): 156 | raise APIError('Database has probably been removed.', status=404) 157 | 158 | start = time.time() 159 | try: 160 | data = await self.data(db_info) 161 | except (sqlite3.OperationalError, sqlite3.IntegrityError) as e: 162 | raise APIError('Error selecting data', status=400, payload=dict(details=str(e))) 163 | end = time.time() 164 | 165 | _shape = request.args.get('_shape', DEFAULT_SHAPE) 166 | if _shape == 'objects': 167 | # Format data as an array of objects for the client 168 | rows = [] 169 | for row in data['rows']: 170 | rows.append(dict(zip(data['columns'], row))) 171 | elif _shape == 'lists': 172 | rows = data['rows'] 173 | else: 174 | raise APIError(f"Unknown _shape: {_shape}", status=400) 175 | 176 | general_infos = await self.general_infos(db_info) 177 | columns_infos = await self.columns_infos(db_info) 178 | 179 | res = { 180 | 'ok': True, 181 | 'query_ms': (end - start) * 1000, 182 | 'rows': rows, 183 | 'columns': data['columns'], 184 | 'general_infos': general_infos, 185 | 'columns_infos': columns_infos 186 | } 187 | if data.get('total'): 188 | res['total'] = data['total'] 189 | 190 | return jsonify(res) 191 | 192 | async def general_infos(self, db_info): 193 | params = {} 194 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'general_infos\'' 195 | rows, description = await self.execute( 196 | sql, db_info, params=params 197 | ) 198 | if rows[0][0] != 0: 199 | sql = 'SELECT * FROM general_infos' 200 | rows, description = await self.execute( 201 | sql, db_info, params=params 202 | ) 203 | columns = [r[0] for r in description] 204 | res = {} 205 | cpt = 0 206 | for col in columns: 207 | res[col] = rows[0][cpt] 208 | cpt = cpt + 1 209 | 210 | return res 211 | else: 212 | return {} 213 | 214 | async def columns_infos(self, db_info): 215 | params = {} 216 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'columns_infos\'' 217 | rows, description = await self.execute( 218 | sql, db_info, params=params 219 | ) 220 | if rows[0][0] != 0: 221 | sql = 'SELECT * FROM columns_infos' 222 | rows, description = await self.execute( 223 | sql, db_info, params=params 224 | ) 225 | columns = [r[0] for r in description] 226 | 227 | res = 
{} 228 | for row in rows: 229 | cpt = 1 230 | res[row[0]] = {} 231 | for col in columns[1:]: 232 | res[row[0]][col] = row[cpt] 233 | cpt = cpt + 1 234 | 235 | res = await self.top_and_categorical_infos(db_info, res, 'top_infos') 236 | res = await self.top_and_categorical_infos(db_info, res, 'categorical_infos') 237 | res = await self.numeric_infos(db_info, res) 238 | res = await self.numeric_plot_infos(db_info, res) 239 | return res 240 | else: 241 | return {} 242 | 243 | async def top_and_categorical_infos(self, db_info, res, table_name): 244 | params = {} 245 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'{}\''.format(table_name) 246 | rows, description = await self.execute( 247 | sql, db_info, params=params 248 | ) 249 | if rows[0][0] != 0: 250 | sql = 'SELECT * FROM {}'.format(table_name) 251 | rows, description = await self.execute( 252 | sql, db_info, params=params 253 | ) 254 | 255 | for row in rows: 256 | if table_name not in res[row[0]]: 257 | res[row[0]][table_name] = [] 258 | inter = {} 259 | inter['value'] = row[1] 260 | inter['count'] = row[2] 261 | res[row[0]][table_name].append(inter) 262 | 263 | return res 264 | else: 265 | for col in res: 266 | res[col][table_name] = {} 267 | return res 268 | 269 | async def numeric_infos(self, db_info, res): 270 | params = {} 271 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'numeric_infos\'' 272 | rows, description = await self.execute( 273 | sql, db_info, params=params 274 | ) 275 | if rows[0][0] != 0: 276 | sql = 'SELECT * FROM {}'.format('numeric_infos') 277 | rows, description = await self.execute( 278 | sql, db_info, params=params 279 | ) 280 | 281 | for row in rows: 282 | if 'numeric_infos' not in res[row[0]]: 283 | res[row[0]]['numeric_infos'] = {} 284 | 285 | res[row[0]]['numeric_infos']['mean'] = row[1] 286 | res[row[0]]['numeric_infos']['std'] = row[2] 287 | res[row[0]]['numeric_infos']['min'] = row[3] 288 | res[row[0]]['numeric_infos']['max'] = row[4] 289 | 290 | return res 291 | else: 292 | for col in res: 293 | res[col]['numeric_infos'] = {} 294 | return res 295 | 296 | async def numeric_plot_infos(self, db_info, res): 297 | params = {} 298 | sql = 'SELECT count(*) FROM sqlite_master WHERE type=\'table\' AND name=\'numeric_plot_infos\'' 299 | rows, description = await self.execute( 300 | sql, db_info, params=params 301 | ) 302 | if rows[0][0] != 0: 303 | sql = 'SELECT * FROM {}'.format('numeric_plot_infos') 304 | rows, description = await self.execute( 305 | sql, db_info, params=params 306 | ) 307 | 308 | for row in rows: 309 | if 'numeric_plot_infos' not in res[row[0]]: 310 | res[row[0]]['numeric_plot_infos'] = {} 311 | if 'counts' not in res[row[0]]['numeric_plot_infos']: 312 | res[row[0]]['numeric_plot_infos']['counts'] = [] 313 | if 'bin_edges' not in res[row[0]]['numeric_plot_infos']: 314 | res[row[0]]['numeric_plot_infos']['bin_edges'] = [] 315 | if row[2] == 'counts': 316 | res[row[0]]['numeric_plot_infos']['counts'].append(row[1]) 317 | if row[2] == 'bin_edges': 318 | res[row[0]]['numeric_plot_infos']['bin_edges'].append(row[1]) 319 | 320 | return res 321 | else: 322 | for col in res: 323 | res[col]['numeric_plot_infos'] = {} 324 | return res 325 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | from pathlib import Path 4 | 5 | import pytest 6 | import pytest_asyncio 7 | from aioresponses import 
aioresponses 8 | 9 | from csvapi.utils import get_hash 10 | from csvapi.webservice import app as csvapi_app 11 | 12 | MOCK_CSV_URL = 'http://domain.com/file.csv' 13 | MOCK_CSV_URL_FILTERS = 'http://domain.com/filters.csv' 14 | MOCK_CSV_HASH_FILTERS = get_hash(MOCK_CSV_URL_FILTERS) 15 | MOCK_CSV_HASH = get_hash(MOCK_CSV_URL) 16 | DB_ROOT_DIR = './tests/dbs' 17 | 18 | 19 | pytestmark = pytest.mark.asyncio 20 | 21 | 22 | @pytest.fixture 23 | def rmock(): 24 | with aioresponses() as m: 25 | yield m 26 | 27 | 28 | @pytest.fixture 29 | def app(): 30 | csvapi_app.config.update({ 31 | 'DB_ROOT_DIR': DB_ROOT_DIR, 32 | 'CSV_CACHE_ENABLED': False, 33 | }) 34 | yield csvapi_app 35 | [db.unlink() for db in Path(DB_ROOT_DIR).glob('*.db')] 36 | 37 | 38 | @pytest.fixture 39 | def client(app): 40 | yield app.test_client() 41 | 42 | 43 | @pytest.fixture 44 | def csv(): 45 | return '''col acol bcol c 46 | data à1data b1z 47 | data ª2data b2a 48 | ''' 49 | 50 | 51 | @pytest.fixture 52 | def csv_col_mismatch(): 53 | return '''col acol b 54 | data à1data b12 55 | data ª2data b24 56 | ''' 57 | 58 | 59 | @pytest.fixture 60 | def csv_hour(): 61 | return '''idhour 62 | a12:30 63 | b9:15 64 | c09:45 65 | ''' 66 | 67 | 68 | @pytest.fixture 69 | def csv_filters(): 70 | """ 71 | TODO: also test with unicode value in column name, but Quart 72 | test client currently fails 73 | """ 74 | return '''id,hour,value,another column 75 | first,12:30,1,value 76 | second,9:15,2,value 77 | third,09:45,3,value 78 | ''' 79 | 80 | 81 | @pytest.fixture 82 | def csv_siren_siret(): 83 | return """idsirensiret 84 | a13002526513002526500013 85 | b52281665152281665100056 86 | """ 87 | 88 | 89 | @pytest.fixture 90 | def csv_numeric(): 91 | return """idvalue 92 | a2 93 | b4 94 | c12 95 | """ 96 | 97 | 98 | @pytest.fixture 99 | def csv_top(): 100 | return """catvalue 101 | a15 102 | b13 103 | c11 104 | a9 105 | """ 106 | 107 | 108 | @pytest.fixture 109 | def csv_custom_types_double_cr(): 110 | """ 111 | This is clearly an invalid file (double CR) 112 | but it tests an interesting case: None values in 113 | columns detected as custom types. 114 | 115 | In this case we'd rather display empty lines and None 116 | values than break. 
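The extra carriage returns end up as empty records after parsing, so the expected
output mixes None-filled rows with the real data rows, as asserted in
test_apify_custom_types_double_cr below.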
117 | """ 118 | return """idsirensirettime\r\r 119 | a13002526a51300252650001312:30\r\r 120 | b5228166515228166510005615:50\r\r 121 | """ 122 | 123 | 124 | @pytest.fixture 125 | def one_line_json_file(): 126 | return '''{ "property1": 1, "property2": 2}''' 127 | 128 | 129 | def random_url(): 130 | return f"https://example.com/{uuid.uuid4()}.csv" 131 | 132 | 133 | @pytest_asyncio.fixture 134 | async def uploaded_csv(rmock, csv, client): 135 | content = csv.replace('', ';').encode('utf-8') 136 | rmock.get(MOCK_CSV_URL, body=content) 137 | await client.get(f"/apify?url={MOCK_CSV_URL}") 138 | 139 | 140 | async def test_apify_no_url(rmock, csv, client): 141 | res = await client.get('/apify') 142 | assert res.status_code == 400 143 | 144 | 145 | async def test_apify_wrong_url(rmock, csv, client): 146 | res = await client.get('/apify?url=notanurl') 147 | assert res.status_code == 400 148 | 149 | 150 | async def test_apify(rmock, csv, client): 151 | rmock.get(MOCK_CSV_URL, status=200, body=csv.encode('utf-8')) 152 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 153 | assert res.status_code == 200 154 | jsonres = await res.json 155 | assert jsonres['ok'] 156 | assert 'endpoint' in jsonres 157 | assert f"/api/{MOCK_CSV_HASH}" in jsonres['endpoint'] 158 | db_path = Path(DB_ROOT_DIR) / f"{MOCK_CSV_HASH}.db" 159 | assert db_path.exists() 160 | 161 | 162 | async def test_apify_not_found(rmock, csv, client): 163 | rmock.get(MOCK_CSV_URL, status=404) 164 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 165 | assert res.status_code == 500 166 | jsonres = await res.json 167 | assert not jsonres['ok'] 168 | assert jsonres['error'].startswith("Error parsing CSV: 404, message='Not Found'") 169 | 170 | 171 | async def test_apify_w_cache(app, rmock, csv, client): 172 | app.config.update({'CSV_CACHE_ENABLED': True}) 173 | rmock.get(MOCK_CSV_URL, body=csv.encode('utf-8')) 174 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 175 | assert res.status_code == 200 176 | jsonres = await res.json 177 | assert jsonres['ok'] 178 | assert 'endpoint' in jsonres 179 | assert f"/api/{MOCK_CSV_HASH}" in jsonres['endpoint'] 180 | db_path = Path(DB_ROOT_DIR) / f"{MOCK_CSV_HASH}.db" 181 | assert db_path.exists() 182 | app.config.update({'CSV_CACHE_ENABLED': False}) 183 | 184 | 185 | async def test_apify_col_mismatch(rmock, csv_col_mismatch, client): 186 | rmock.get(MOCK_CSV_URL, body=csv_col_mismatch.replace('', ';').encode('utf-8')) 187 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 188 | assert res.status_code == 200 189 | jsonres = await res.json 190 | assert jsonres['ok'] 191 | 192 | 193 | async def test_apify_hour_format(rmock, csv_hour, client): 194 | content = csv_hour.replace('', ';').encode('utf-8') 195 | url = random_url() 196 | rmock.get(url, body=content) 197 | await client.get(f"/apify?url={url}") 198 | res = await client.get(f"/api/{get_hash(url)}") 199 | assert res.status_code == 200 200 | jsonres = await res.json 201 | assert jsonres['columns'] == ['rowid', 'id', 'hour'] 202 | assert jsonres['total'] == 3 203 | assert jsonres['rows'] == [ 204 | [1, 'a', '12:30'], 205 | [2, 'b', '9:15'], 206 | [3, 'c', '09:45'], 207 | ] 208 | 209 | 210 | async def test_apify_siren_siret_format(rmock, csv_siren_siret, client): 211 | content = csv_siren_siret.replace('', ';').encode('utf-8') 212 | url = random_url() 213 | rmock.get(url, body=content) 214 | await client.get(f"/apify?url={url}") 215 | res = await client.get(f"/api/{get_hash(url)}") 216 | assert res.status_code == 200 217 | jsonres = await 
res.json 218 | assert jsonres['columns'] == ['rowid', 'id', 'siren', 'siret'] 219 | assert jsonres['total'] == 2 220 | assert jsonres['rows'] == [ 221 | [1, 'a', '130025265', '13002526500013'], 222 | [2, 'b', '522816651', '52281665100056'], 223 | ] 224 | 225 | 226 | async def test_apify_custom_types_double_cr(rmock, csv_custom_types_double_cr, client): 227 | content = csv_custom_types_double_cr.replace('', ';').encode('utf-8') 228 | url = random_url() 229 | rmock.get(url, body=content) 230 | await client.get(f"/apify?url={url}") 231 | res = await client.get(f"/api/{get_hash(url)}") 232 | assert res.status_code == 200 233 | jsonres = await res.json 234 | assert jsonres['columns'] == ['rowid', 'id', 'siren', 'siret', 'time'] 235 | assert jsonres['total'] == 5 236 | assert jsonres['rows'] == [ 237 | [1, None, None, None, None], 238 | [2, 'a', '13002526a5', '13002526500013', '12:30'], 239 | [3, None, None, None, None], 240 | [4, 'b', '522816651', '52281665100056', '15:50'], 241 | [5, None, None, None, None] 242 | ] 243 | 244 | 245 | @pytest.mark.parametrize('separator', [';', ',', '\t']) 246 | @pytest.mark.parametrize('encoding', ['utf-8', 'iso-8859-15', 'iso-8859-1']) 247 | async def test_api(client, rmock, csv, separator, encoding): 248 | content = csv.replace('', separator).encode(encoding) 249 | rmock.get(MOCK_CSV_URL, body=content) 250 | await client.get(f"/apify?url={MOCK_CSV_URL}") 251 | res = await client.get(f"/api/{MOCK_CSV_HASH}") 252 | assert res.status_code == 200 253 | jsonres = await res.json 254 | assert jsonres['columns'] == ['rowid', 'col a', 'col b', 'col c'] 255 | assert jsonres['total'] == 2 256 | assert jsonres['rows'] == [ 257 | [1, 'data à1', 'data b1', 'z'], 258 | [2, 'data ª2', 'data b2', 'a'], 259 | ] 260 | 261 | 262 | async def test_api_limit(client, rmock, uploaded_csv): 263 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_size=1") 264 | assert res.status_code == 200 265 | jsonres = await res.json 266 | assert len(jsonres['rows']) == 1 267 | assert jsonres['rows'] == [ 268 | [1, 'data à1', 'data b1', 'z'], 269 | ] 270 | 271 | 272 | async def test_api_limit_offset(client, rmock, uploaded_csv): 273 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_size=1&_offset=1") 274 | assert res.status_code == 200 275 | jsonres = await res.json 276 | assert len(jsonres['rows']) == 1 277 | assert jsonres['rows'] == [ 278 | [2, 'data ª2', 'data b2', 'a'], 279 | ] 280 | 281 | 282 | async def test_api_wrong_limit(client, rmock, uploaded_csv): 283 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_size=toto") 284 | assert res.status_code == 400 285 | 286 | 287 | async def test_api_wrong_shape(client, rmock, uploaded_csv): 288 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_shape=toto") 289 | assert res.status_code == 400 290 | 291 | 292 | async def test_api_objects_shape(client, rmock, uploaded_csv): 293 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_shape=objects") 294 | assert res.status_code == 200 295 | jsonres = await res.json 296 | assert jsonres['rows'] == [ 297 | { 298 | 'rowid': 1, 299 | 'col a': 'data à1', 300 | 'col b': 'data b1', 301 | 'col c': 'z', 302 | }, { 303 | 'rowid': 2, 304 | 'col a': 'data ª2', 305 | 'col b': 'data b2', 306 | 'col c': 'a', 307 | } 308 | ] 309 | 310 | 311 | async def test_api_objects_norowid(client, rmock, uploaded_csv): 312 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_shape=objects&_rowid=hide") 313 | assert res.status_code == 200 314 | jsonres = await res.json 315 | assert jsonres['rows'] == [ 316 | { 317 | 'col a': 'data à1', 318 | 'col b': 
'data b1', 319 | 'col c': 'z', 320 | }, { 321 | 'col a': 'data ª2', 322 | 'col b': 'data b2', 323 | 'col c': 'a', 324 | } 325 | ] 326 | 327 | 328 | async def test_api_objects_nototal(client, rmock, uploaded_csv): 329 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_total=hide") 330 | assert res.status_code == 200 331 | jsonres = await res.json 332 | assert jsonres.get('total') is None 333 | 334 | 335 | async def test_api_sort(client, rmock, uploaded_csv): 336 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_sort=col c") 337 | assert res.status_code == 200 338 | jsonres = await res.json 339 | assert jsonres['rows'] == [ 340 | [2, 'data ª2', 'data b2', 'a'], 341 | [1, 'data à1', 'data b1', 'z'], 342 | ] 343 | 344 | 345 | async def test_api_sort_desc(client, rmock, uploaded_csv): 346 | res = await client.get(f"/api/{MOCK_CSV_HASH}?_sort_desc=col b") 347 | assert res.status_code == 200 348 | jsonres = await res.json 349 | assert jsonres['rows'] == [ 350 | [2, 'data ª2', 'data b2', 'a'], 351 | [1, 'data à1', 'data b1', 'z'], 352 | ] 353 | 354 | 355 | async def test_apify_file_too_big(app, client, rmock): 356 | original_max_file_size = app.config.get('MAX_FILE_SIZE') 357 | app.config.update({'MAX_FILE_SIZE': 1}) 358 | here = os.path.dirname(os.path.abspath(__file__)) 359 | content = open(f"{here}/samples/test.{'xls'}", 'rb') 360 | mock_url = MOCK_CSV_URL.replace('.csv', 'xls') 361 | rmock.get(mock_url, body=content.read()) 362 | content.close() 363 | res = await client.get(f"/apify?url={mock_url}") 364 | assert res.status_code == 500 365 | jsonres = await res.json 366 | assert 'File too big' in jsonres['error'] 367 | app.config.update({'MAX_FILE_SIZE': original_max_file_size}) 368 | 369 | 370 | @pytest.mark.parametrize('extension', ['xls', 'xlsx']) 371 | async def test_api_excel(client, rmock, extension): 372 | here = os.path.dirname(os.path.abspath(__file__)) 373 | content = open(f"{here}/samples/test.{extension}", 'rb') 374 | mock_url = MOCK_CSV_URL.replace('.csv', extension) 375 | mock_hash = get_hash(mock_url) 376 | rmock.get(mock_url, body=content.read()) 377 | content.close() 378 | await client.get(f"/apify?url={mock_url}") 379 | res = await client.get(f"/api/{mock_hash}") 380 | assert res.status_code == 200 381 | jsonres = await res.json 382 | assert jsonres['columns'] == ['rowid', 'col a', 'col b', 'col c'] 383 | assert jsonres['rows'] == [ 384 | [1, 'a1', 'b1', 'z'], 385 | [2, 'a2', 'b2', 'a'], 386 | ] 387 | 388 | 389 | async def test_api_filter_referrers(app, client): 390 | app.config.update({'REFERRERS_FILTER': ['toto.com']}) 391 | res = await client.get(f"/api/{'404'}") 392 | assert res.status_code == 403 393 | res = await client.get(f"/apify?url={'http://toto.com'}") 394 | assert res.status_code == 403 395 | res = await client.get(f"/api/{'404'}", headers={'Referer': 'http://next.toto.com'}) 396 | assert res.status_code == 404 397 | app.config.update({'REFERRERS_FILTER': None}) 398 | 399 | 400 | @pytest.mark.parametrize('csv_path', Path(__file__).parent.glob('samples/real_csv/*.csv')) 401 | async def test_real_csv_files(client, rmock, csv_path): 402 | with open(csv_path, 'rb') as content: 403 | rmock.get(MOCK_CSV_URL, body=content.read()) 404 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 405 | assert res.status_code == 200 406 | res = await client.get(f"/api/{MOCK_CSV_HASH}") 407 | # w/ no error and more than 1 column and row we should be OK 408 | assert res.status_code == 200 409 | jsonres = await res.json 410 | assert len(jsonres['columns']) > 1 411 | assert 
len(jsonres['rows']) > 1 412 | 413 | 414 | @pytest.mark.parametrize('xls_path', Path(__file__).parent.glob('samples/real_xls/*.xls*')) 415 | async def test_real_xls_files(client, rmock, xls_path): 416 | with open(xls_path, 'rb') as content: 417 | rmock.get(MOCK_CSV_URL, body=content.read()) 418 | res = await client.get(f"/apify?url={MOCK_CSV_URL}") 419 | assert res.status_code == 200 420 | res = await client.get(f"/api/{MOCK_CSV_HASH}") 421 | # w/ no error and more than 1 column and row we should be OK 422 | assert res.status_code == 200 423 | jsonres = await res.json 424 | assert len(jsonres['columns']) > 0 425 | assert len(jsonres['rows']) > 0 426 | 427 | 428 | @pytest_asyncio.fixture 429 | async def uploaded_csv_filters(rmock, csv_filters, client): 430 | content = csv_filters.encode('utf-8') 431 | rmock.get(MOCK_CSV_URL_FILTERS, body=content) 432 | await client.get(f"/apify?url={MOCK_CSV_URL_FILTERS}") 433 | 434 | 435 | async def test_api_filters_exact_hour(rmock, uploaded_csv_filters, client): 436 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?hour__exact=12:30") 437 | assert res.status_code == 200 438 | jsonres = await res.json 439 | assert jsonres['total'] == 1 440 | assert jsonres['rows'] == [ 441 | [1, 'first', '12:30', 1.0, 'value'], 442 | ] 443 | 444 | 445 | async def test_api_filters_contains_string(rmock, uploaded_csv_filters, client): 446 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?id__contains=fir") 447 | assert res.status_code == 200 448 | jsonres = await res.json 449 | assert jsonres['total'] == 1 450 | assert jsonres['rows'] == [ 451 | [1, 'first', '12:30', 1.0, 'value'], 452 | ] 453 | 454 | 455 | async def test_api_filters_contains_exact_int(rmock, uploaded_csv_filters, client): 456 | "NB: suboptimal API result, int value returns a float" 457 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?value__exact=1") 458 | assert res.status_code == 200 459 | jsonres = await res.json 460 | assert jsonres['total'] == 1 461 | assert jsonres['rows'] == [ 462 | [1, 'first', '12:30', 1.0, 'value'], 463 | ] 464 | 465 | 466 | async def test_api_filters_contains_exact_float(rmock, uploaded_csv_filters, client): 467 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?value__exact=1.0") 468 | assert res.status_code == 200 469 | jsonres = await res.json 470 | assert jsonres['total'] == 1 471 | assert jsonres['rows'] == [ 472 | [1, 'first', '12:30', 1.0, 'value'], 473 | ] 474 | 475 | 476 | async def test_api_and_filters(rmock, uploaded_csv_filters, client): 477 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?id__contains=fir&value__exact=1") 478 | assert res.status_code == 200 479 | jsonres = await res.json 480 | assert jsonres['total'] == 1 481 | assert jsonres['rows'] == [ 482 | [1, 'first', '12:30', 1.0, 'value'], 483 | ] 484 | 485 | 486 | async def test_api_filters_greater_float(rmock, csv_numeric, client): 487 | content = csv_numeric.replace('', ';').encode('utf-8') 488 | url = random_url() 489 | rmock.get(url, body=content) 490 | await client.get(f"/apify?url={url}") 491 | res = await client.get(f"/api/{get_hash(url)}?value__greater=10") 492 | assert res.status_code == 200 493 | jsonres = await res.json 494 | print(jsonres) 495 | assert jsonres['rows'] == [ 496 | [3, 'c', 12], 497 | ] 498 | 499 | 500 | async def test_api_filters_less_float(rmock, csv_numeric, client): 501 | content = csv_numeric.replace('', ';').encode('utf-8') 502 | url = random_url() 503 | rmock.get(url, body=content) 504 | await client.get(f"/apify?url={url}") 505 | res = await 
client.get(f"/api/{get_hash(url)}?value__less=3") 506 | assert res.status_code == 200 507 | jsonres = await res.json 508 | print(jsonres) 509 | assert jsonres['rows'] == [ 510 | [1, 'a', 2], 511 | ] 512 | 513 | 514 | async def test_api_filters_less_greater_float(rmock, csv_numeric, client): 515 | content = csv_numeric.replace('', ';').encode('utf-8') 516 | url = random_url() 517 | rmock.get(url, body=content) 518 | await client.get(f"/apify?url={url}") 519 | res = await client.get(f"/api/{get_hash(url)}?value__greater=3&value__less=10") 520 | assert res.status_code == 200 521 | jsonres = await res.json 522 | assert jsonres['rows'] == [ 523 | [2, 'b', 4], 524 | ] 525 | 526 | async def test_api_filters_less_greater_string_error(rmock, csv_numeric, client): 527 | content = csv_numeric.replace('', ';').encode('utf-8') 528 | url = random_url() 529 | rmock.get(url, body=content) 530 | await client.get(f"/apify?url={url}") 531 | res = await client.get(f"/api/{get_hash(url)}?value__greater=3&value__less=stan") 532 | assert res.status_code == 400 533 | jsonres = await res.json 534 | assert jsonres == {"error":"Float value expected for less comparison.", "error_id": None , "ok":False} 535 | 536 | 537 | async def test_api_filters_unnormalized_column(rmock, uploaded_csv_filters, client): 538 | res = await client.get(f"/api/{MOCK_CSV_HASH_FILTERS}?id__contains=fir&another column__contains=value") 539 | assert res.status_code == 200 540 | jsonres = await res.json 541 | assert jsonres['total'] == 1 542 | assert jsonres['rows'] == [ 543 | [1, 'first', '12:30', 1.0, 'value'], 544 | ] 545 | 546 | 547 | async def test_apify_analysed_format_response(rmock, csv_siren_siret, client): 548 | content = csv_siren_siret.replace('', ';').encode('utf-8') 549 | url = random_url() 550 | rmock.get(url, body=content) 551 | await client.get(f"/apify?url={url}&analysis=yes") 552 | res = await client.get(f"/api/{get_hash(url)}") 553 | assert res.status_code == 200 554 | jsonres = await res.json 555 | assert all(x in jsonres['columns_infos'] for x in ['id', 'siren', 'siret']) 556 | assert all(x in jsonres['general_infos'] for x in [ 557 | 'dataset_id', 558 | 'date_last_check', 559 | 'encoding', 560 | 'header_row_idx', 561 | 'nb_cells_missing', 562 | 'nb_columns', 563 | 'nb_vars_all_missing', 564 | 'nb_vars_with_missing', 565 | 'resource_id', 566 | 'separator', 567 | 'total_lines', 568 | 'filetype' 569 | ]) 570 | 571 | 572 | async def test_apify_analysed_csv_detective_check_format(rmock, csv_siren_siret, client): 573 | content = csv_siren_siret.replace('', ';').encode('utf-8') 574 | url = random_url() 575 | rmock.get(url, body=content) 576 | await client.get(f"/apify?url={url}&analysis=yes") 577 | res = await client.get(f"/api/{get_hash(url)}") 578 | assert res.status_code == 200 579 | jsonres = await res.json 580 | assert jsonres['columns_infos']['siren']['format'] == 'siren' 581 | assert jsonres['columns_infos']['siret']['format'] == 'siret' 582 | 583 | 584 | async def test_apify_analysed_pandas_profiling_check_numeric(rmock, csv_numeric, client): 585 | content = csv_numeric.replace('', ';').encode('utf-8') 586 | url = random_url() 587 | rmock.get(url, body=content) 588 | await client.get(f"/apify?url={url}&analysis=yes") 589 | res = await client.get(f"/api/{get_hash(url)}") 590 | assert res.status_code == 200 591 | jsonres = await res.json 592 | assert jsonres['columns_infos']['value']['numeric_infos']['max'] == 12 593 | assert jsonres['columns_infos']['value']['numeric_infos']['min'] == 2 594 | assert 
jsonres['columns_infos']['value']['numeric_infos']['mean'] == 6 595 | 596 | 597 | async def test_apify_analysed_pandas_profiling_check_top(rmock, csv_top, client): 598 | content = csv_top.replace('', ';').encode('utf-8') 599 | url = random_url() 600 | rmock.get(url, body=content) 601 | await client.get(f"/apify?url={url}&analysis=yes") 602 | res = await client.get(f"/api/{get_hash(url)}") 603 | assert res.status_code == 200 604 | jsonres = await res.json 605 | assert jsonres['columns_infos']['cat']['top_infos'][0]['value'] == 'a' 606 | 607 | 608 | async def test_apify_analysed_check_general_infos(rmock, csv_top, client): 609 | content = csv_top.replace('', ';').encode('utf-8') 610 | url = random_url() 611 | rmock.get(url, body=content) 612 | await client.get(f"/apify?url={url}&analysis=yes") 613 | res = await client.get(f"/api/{get_hash(url)}") 614 | assert res.status_code == 200 615 | jsonres = await res.json 616 | assert jsonres['general_infos']['nb_columns'] == 2 617 | assert jsonres['general_infos']['total_lines'] == 4 618 | assert jsonres['general_infos']['separator'] == ';' 619 | assert jsonres['general_infos']['header_row_idx'] == 0 620 | 621 | 622 | @pytest.mark.parametrize('extension', ['xls', 'xlsx']) 623 | async def test_no_analysis_when_excel(client, rmock, extension): 624 | here = os.path.dirname(os.path.abspath(__file__)) 625 | content = open(f"{here}/samples/test.{extension}", 'rb') 626 | mock_url = MOCK_CSV_URL.replace('.csv', extension) 627 | mock_hash = get_hash(mock_url) 628 | rmock.get(mock_url, body=content.read()) 629 | content.close() 630 | await client.get(f"/apify?url={mock_url}&analysis=yes") 631 | res = await client.get(f"/api/{mock_hash}") 632 | assert res.status_code == 200 633 | jsonres = await res.json 634 | print(jsonres) 635 | assert jsonres['columns'] == ['rowid', 'col a', 'col b', 'col c'] 636 | assert jsonres['general_infos'] == { 'filetype': 'excel' } 637 | assert jsonres['columns_infos'] == {} 638 | 639 | 640 | async def test_fail_one_line_json_file(rmock, one_line_json_file, client): 641 | content = one_line_json_file 642 | url = random_url() 643 | rmock.get(url, body=content) 644 | res = await client.get(f"/apify?url={url}&analysis=yes") 645 | assert res.status_code == 500 646 | --------------------------------------------------------------------------------
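To make the flow these tests cover easier to follow, here is a minimal client-side sketch. It is an illustration only: the base URL, port and example CSV URL are assumptions, and the requests library stands in for any HTTP client; the endpoints and query parameters mirror those exercised in tests/test_api.py.

    import hashlib
    import requests  # illustrative; any HTTP client would do

    BASE = 'http://localhost:8000'  # assumption: a locally running csvapi instance
    CSV_URL = 'https://example.com/data.csv'  # assumption: any reachable CSV file

    # Step 1: ask csvapi to download the CSV and convert it into a SQLite-backed table.
    requests.get(f'{BASE}/apify', params={'url': CSV_URL})

    # Step 2: query the converted table. The table id is the md5 of the source URL,
    # exactly as get_hash() computes it in csvapi/utils.py.
    urlhash = hashlib.md5(CSV_URL.encode('utf-8')).hexdigest()
    data = requests.get(f'{BASE}/api/{urlhash}', params={'_size': 10, '_shape': 'objects'}).json()
    rows = data['rows']  # with _shape=objects, a list of {'rowid': ..., '<column>': ...} dicts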