├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── new_feature.md └── workflows │ └── tests.yml ├── .gitignore ├── CHANGELOG.md ├── COPYING ├── Dockerfile ├── MANIFEST.in ├── PUBLIC_API.md ├── README.md ├── csvbase_client ├── VERSION ├── __init__.py ├── constants.py ├── exceptions.py ├── fsspec.py ├── internals │ ├── __init__.py │ ├── auth.py │ ├── cache.py │ ├── cli.py │ ├── config.py │ ├── dirs.py │ ├── http.py │ ├── value_objs.py │ └── version.py └── io.py ├── mypy.ini ├── pytest.ini ├── scripts ├── nuitka │ ├── Dockerfile.nuitka │ ├── WINDOWS.md │ ├── build.sh │ └── trampoline.py └── pypi │ └── build.sh ├── setup.py ├── test-requirements.txt ├── tests ├── __init__.py ├── cli │ ├── __init__.py │ ├── conftest.py │ ├── test_auth.py │ ├── test_cli_cache.py │ ├── test_cli_tables.py │ └── utils.py ├── conftest.py ├── fsspec │ ├── __init__.py │ ├── test_dask.py │ ├── test_duckdb.py │ ├── test_fsspec.py │ ├── test_pandas.py │ └── test_polars.py ├── init-schemas.sql ├── requests_adapter.py ├── test_cache.py ├── test_version.py ├── utils.py └── value_objs.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | csvbase_client.egg-info 2 | dist 3 | .git 4 | .github 5 | .mypy_cache 6 | .pytest_cache 7 | .ruff_cache 8 | setup.build 9 | tests 10 | .tox 11 | .venv 12 | *.csv 13 | *.parquet 14 | *.build 15 | *.dist 16 | *.onefile-build 17 | .pypi-venv -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: File a bug report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Description 11 | 12 | # Steps to reproduce 13 | 14 | 1. Do X 15 | 2. Then Y 16 | 3. 
And Z 17 | 18 | # Expected result 19 | 20 | An A 21 | 22 | # Actual result 23 | 24 | But in fact B 25 | 26 | # Additional details 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new_feature.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: User story 3 | about: A plan for implementing some new feature or change 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Brief overview 11 | 12 | AS A **[persona name]** 13 | 14 | I WANT **[some feature or change to be made]** 15 | 16 | SO THAT **[brief description of goal]** 17 | 18 | # Additional details 19 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - '*.x' 7 | paths-ignore: 8 | - 'docs/**' 9 | - '*.md' 10 | - '*.rst' 11 | pull_request: 12 | branches: 13 | - main 14 | - '*.x' 15 | paths-ignore: 16 | - 'docs/**' 17 | - '*.md' 18 | - '*.rst' 19 | jobs: 20 | tests: 21 | name: ${{ matrix.name }} 22 | runs-on: ${{ matrix.os }} 23 | env: 24 | PGPASSWORD: 'csvbase' 25 | services: 26 | postgres: 27 | image: postgres:13 28 | env: 29 | POSTGRES_USER: csvbase 30 | POSTGRES_PASSWORD: csvbase 31 | POSTGRES_DB: csvbase 32 | ports: 33 | - 5432:5432 34 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | include: 39 | - {name: Linux, python: '3.10', os: ubuntu-latest, tox: py310} 40 | # - {name: Windows, python: '3.11', os: windows-latest, tox: py311} 41 | # - {name: Mac, python: '3.11', os: macos-latest, tox: py311} 42 | # - {name: '3.12-dev', python: '3.12-dev', os: ubuntu-latest, tox: py312} 43 | # - {name: '3.10', python: '3.10', os: ubuntu-latest, tox: py310} 44 | # - {name: '3.9', 
python: '3.9', os: ubuntu-latest, tox: py39} 45 |       # - {name: '3.8', python: '3.8', os: ubuntu-latest, tox: py38} 46 |       # - {name: '3.7', python: '3.7', os: ubuntu-latest, tox: py37} 47 |       # - {name: 'PyPy', python: 'pypy-3.9', os: ubuntu-latest, tox: pypy39} 48 |       # - {name: 'Pallets Minimum Versions', python: '3.11', os: ubuntu-latest, tox: py311-min} 49 |       # - {name: 'Pallets Development Versions', python: '3.7', os: ubuntu-latest, tox: py37-dev} 50 |       # - {name: Typing, python: '3.11', os: ubuntu-latest, tox: typing} 51 |     steps: 52 |       - uses: actions/checkout@v4 53 |       - uses: actions/setup-python@v5 54 |         with: 55 |           python-version: ${{ matrix.python }} 56 |       - name: Install required ubuntu packages 57 |         run: sudo apt-get update && sudo apt-get install -y libsystemd-dev postgresql-client python3-dev 58 |       - name: Create schemas 59 |         run: psql -h localhost -d csvbase -U csvbase -f tests/init-schemas.sql 60 |       - name: Write csvbase (server) config file 61 |         run: | 62 |           cat <<EOF > ~/.csvbase.toml 63 |           secret_key = "snakeoil" 64 |           db_url = "postgresql+psycopg2://csvbase:csvbase@localhost/csvbase" 65 |           EOF 66 |       - name: update pip 67 |         run: | 68 |           pip install -U wheel 69 |           pip install -U setuptools 70 |           python -m pip install -U pip 71 |       - run: pip install tox 72 |       - run: tox run -e ${{ matrix.tox }} 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | __pycache__ 3 | .tox 4 | dist 5 | .venv 6 | *.build 7 | *.dist 8 | *.onefile-build 9 | .pypi-venv -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to `csvbase-client` will be documented in this file. This 4 | project adheres to [Semantic Versioning](https://semver.org/). 
5 | 6 | To understand what falls under the category of the public API and what lies 7 | outside it, please consult [PUBLIC_API.md](PUBLIC_API.md). Changes that do not 8 | impact the public API usually will not lead to a version change and might not 9 | be mentioned here. 10 | 11 | ## Unreleased 12 | 13 | ## [0.1.1] - 2024-04-10 14 | 15 | ### Added 16 | 17 | - Python 3.12 support 18 | 19 | ### Fixed 20 | 21 | - Adding missing dependency on importlib_resources 22 | 23 | ## [0.1.0] - 2024-04-09 24 | 25 | ### Added 26 | - A working fsspec implementation 27 | - Pandas, Dask and Polars actively tested and supported 28 | - A way to inspect (and clear) the cache - `csvbase-client cache --help` 29 | 30 | ### Removed 31 | - Most of the pre-fsspec code is gone 32 | 33 | ## [0.0.1] - 2023-08-30 34 | 35 | ### Added 36 | - Initial implementation of csvbase-client. 37 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. 
Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. 
This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 
90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 
121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. 
You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 
184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 
217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1.4 2 | FROM python:3.11-slim-bookworm as builder 3 | RUN apt-get update && apt-get -y upgrade 4 | RUN pip install build 5 | COPY ./ ./ 6 | RUN python -m build . 7 | 8 | FROM python:3.11-slim-bookworm 9 | COPY --from=builder dist/*.whl . 
10 | RUN python -m pip --no-cache-dir install *.whl 11 | ENTRYPOINT ["csvbase-client"] 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include csvbase_client/VERSION 2 | global-exclude *.pyc -------------------------------------------------------------------------------- /PUBLIC_API.md: -------------------------------------------------------------------------------- 1 | # Public API of csvbase-client 2 | 3 | ## Specifically included 4 | 5 | - The cli commands 6 | - The functionality of the `csvbase_client.fsspec` module 7 | 8 | ## Specifically excluded 9 | 10 | - The contents of the `csvbase_client.internals` module 11 | - Textual output (ie: not csv or parquet) of cli commands like `csvbase-client 12 | table show` 13 | 14 | ## Otherwise 15 | 16 | Anything not noted here should be considered excluded. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <div align="center">
2 | csvbase logo 3 |
4 | 5 | # csvbase-client 6 | 7 | The command line client and pandas integration for [csvbase](https://csvbase.com/). 8 | 9 | ## Status 10 | 11 | Semantic versioning is followed, see the [changelog](https://github.com/calpaterson/csvbase-client/blob/main/CHANGELOG.md). 12 | 13 | ## Usage 14 | 15 | ### Get a table 16 | 17 | In pandas: 18 | 19 | ```python 20 | >>> import pandas as pd 21 | >>> pd.read_csv("csvbase://meripaterson/stock-exchanges") 22 | csvbase_row_id Continent Country Name MIC Last changed 23 | 0 1 Africa Lesotho HYBSE NaN 2019-03-25 24 | 1 2 Asia Kazakhstan Astana International Financial Centre AIXK 2018-11-18 25 | 2 3 Africa South Africa ZAR X ZARX 2018-11-18 26 | 3 4 South America Argentina Bolsas y Mercados Argentinos NaN 2018-04-02 27 | 4 5 North America United States of America Delaware Board of Trade NaN 2018-04-02 28 | .. ... ... ... ... ... ... 29 | 246 247 North America United States of America Long-Term Stock Exchange LTSE 2020-09-14 30 | 247 248 North America United States of America Miami International Securities Exchange MIHI 2020-09-24 31 | 248 249 North America United States of America Members' Exchange NaN 2020-09-24 32 | 249 250 Africa Zimbabwe Victoria Falls Stock Exchange NaN 2020-11-01 33 | 250 251 Asia China Beijing Stock Exchange NaN 2021-12-27 34 | 35 | [251 rows x 6 columns] 36 | ``` 37 | 38 | From the command line 39 | 40 | ```bash 41 | csvbase-client table get meripaterson/stock-exchanges 42 | csvbase_row_id,Continent,Country,Name,MIC,Last changed 43 | 1,Africa,Lesotho,HYBSE,,2019-03-25 44 | 2,Asia,Kazakhstan,Astana International Financial Centre,AIXK,2018-11-18 45 | 3,Africa,South Africa,ZAR X,ZARX,2018-11-18 46 | [ full file omitted ] 47 | ``` 48 | 49 | ### Set (aka "upsert") a table: 50 | 51 | ```bash 52 | csvbase-client table set meripaterson/stock-exchanges stock-exchanges.csv 53 | ``` 54 | 55 | Nothing is output upon success and exit code is 0. 
56 | 57 | ## Installing 58 | 59 | ### Executable 60 | 61 | Download these from the github [release page](https://github.com/calpaterson/csvbase-client/releases/). 62 | 63 | ### Pip + PyPI 64 | 65 | ```bash 66 | pip install csvbase-client 67 | ``` 68 | 69 | ### Docker 70 | 71 | ```bash 72 | docker pull calpaterson/csvbase-client 73 | ``` 74 | 75 | Then when you run: 76 | 77 | ```bash 78 | # mount your own xdg-cache directory as a volume inside the container 79 | docker run -v "${XDG_CACHE_HOME:-$HOME/.cache}":/root/.cache calpaterson/csvbase-client 80 | ``` 81 | -------------------------------------------------------------------------------- /csvbase_client/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.1 -------------------------------------------------------------------------------- /csvbase_client/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /csvbase_client/constants.py: -------------------------------------------------------------------------------- 1 | CSVBASE_DOT_COM = "https://csvbase.com" 2 | -------------------------------------------------------------------------------- /csvbase_client/exceptions.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def http_error_to_user_message(ref: str, response: requests.Response) -> str: 5 | """Convert http responses into user-visible error messages""" 6 | if response.status_code == 404: 7 | return f"Table not found: {ref}" 8 | else: 9 | return f"Unknown error (HTTP status code: {response.status_code})" 10 | 11 | 12 | class CSVBaseException(Exception): 13 | """Catch all exception for errors. 14 | 15 | Eventually this will be subclassed to allow for people to determine which 16 | error has occurred, but not just yet. 
17 | 18 | """ 19 | -------------------------------------------------------------------------------- /csvbase_client/fsspec.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Dict, Optional, IO, Iterator 3 | import shutil 4 | from logging import getLogger 5 | from urllib.parse import urljoin 6 | from threading import Lock 7 | import contextlib 8 | 9 | import requests 10 | from pyappcache.keys import Key 11 | from pyappcache.fs import FilesystemCache 12 | from fsspec.spec import AbstractFileSystem, AbstractBufferedFile 13 | 14 | from .io import rewind 15 | from .internals.cache import get_fs_cache, get_last_etag, set_etag, RepKey 16 | from .internals.value_objs import Auth, ContentType 17 | from .internals.auth import get_auth 18 | from .internals.http import get_http_sesh, HTTP_TIMEOUT 19 | from .constants import CSVBASE_DOT_COM 20 | from .exceptions import http_error_to_user_message, CSVBaseException 21 | 22 | logger = getLogger(__name__) 23 | 24 | 25 | def get_rep( 26 | http_sesh: requests.Session, 27 | cache: FilesystemCache, 28 | base_url: str, 29 | ref: str, 30 | content_type: ContentType, 31 | auth: Optional[Auth] = None, 32 | ) -> IO[bytes]: 33 | headers = {"Accept": content_type.mimetype()} 34 | if auth is not None: 35 | headers["Authorization"] = auth.as_basic_auth() 36 | url = url_for_rep(base_url, ref, content_type) 37 | etag = get_last_etag(cache, base_url, ref, content_type) 38 | rep_key: Key[IO[bytes]] = RepKey(base_url, ref, content_type) 39 | 40 | if etag is not None: 41 | rep = cache.get(rep_key) 42 | if rep is not None: 43 | logger.debug("last known etag found: '%s' ('%s')", ref, etag) 44 | headers["If-None-Match"] = etag 45 | else: 46 | logger.debug("an etag is known but cache MISS: '%s'", ref) 47 | else: 48 | logger.debug("no etag known") 49 | 50 | response = http_sesh.get(url, headers=headers, stream=True, timeout=HTTP_TIMEOUT) 51 | logger.info("got response code: %d",
response.status_code) 52 | 53 | # make sure to log all 500s to make it clear a real error has occurred 54 | if response.status_code >= 500: 55 | logger.error("got status_code: %d, %s", response.status_code, response.content) 56 | 57 | # 400s and 500s are raised as exceptions 58 | if response.status_code >= 400: 59 | message = http_error_to_user_message(ref, response) 60 | raise CSVBaseException(message) 61 | 62 | if response.status_code == 304: 63 | # FIXME: a rejig is required here for type safety 64 | return rep # type: ignore 65 | else: 66 | etag = response.headers["ETag"] 67 | set_etag(cache, base_url, ref, content_type, etag) 68 | rep = io.BytesIO() 69 | with rewind(rep): 70 | shutil.copyfileobj(response.raw, rep) 71 | response.close() 72 | with rewind(rep): 73 | cache.set(rep_key, rep) 74 | 75 | return rep 76 | 77 | 78 | def send_rep( 79 | http_sesh: requests.Session, 80 | cache: FilesystemCache, 81 | base_url: str, 82 | ref: str, 83 | content_type: ContentType, 84 | rep: IO[bytes], 85 | auth: Optional[Auth] = None, 86 | ) -> None: 87 | headers = {"Content-Type": content_type.mimetype()} 88 | if auth is not None: 89 | headers["Authorization"] = auth.as_basic_auth() 90 | url = url_for_rep(base_url, ref, content_type) 91 | response = http_sesh.put(url, data=rep, headers=headers, timeout=HTTP_TIMEOUT) 92 | 93 | # FIXME: this needs bringing into line with the get_rep code 94 | try: 95 | response.raise_for_status() 96 | except Exception: 97 | logger.error("got status code %d from csvbase server, body: %s", response.status_code, response.content) 98 | raise 99 | 100 | 101 | def url_for_rep(base_url: str, ref: str, content_type: ContentType) -> str: 102 | url = urljoin(base_url, ref) 103 | return url 104 | 105 | 106 | class CSVBaseFileSystem(AbstractFileSystem): 107 | def __init__(self, *args, **kwargs): 108 | kwargs["use_listings_cache"] = False 109 | self._base_url = CSVBASE_DOT_COM 110 | self._cache_lock = Lock() 111 | 112 | super().__init__(*args, **kwargs) 113 
| 114 | def fsid(self): 115 | raise NotImplementedError 116 | 117 | def _open( 118 | self, 119 | path, 120 | mode="rb", 121 | block_size=None, 122 | autocommit=True, 123 | cache_options=None, 124 | **kwargs 125 | ): 126 | f = CSVBaseFile(self, path, mode) 127 | return f 128 | 129 | def created(self, path): 130 | raise NotImplementedError 131 | 132 | def modified(self, path): 133 | raise NotImplementedError 134 | 135 | def cp_file(self, path1, path2, **kwargs): 136 | raise NotImplementedError 137 | 138 | def touch(self, path, truncate=True, **kwargs): 139 | # This will never be implemented 140 | raise NotImplementedError 141 | 142 | def ls(self, path, detail: bool = True): 143 | # FIXME: need a way to list a users' tables 144 | return [] 145 | 146 | def _rm(self, path): 147 | raise NotImplementedError 148 | 149 | def info(self, path: str) -> Dict: 150 | # FIXME: implement a proper HEAD method for csvbase, avoid fetching the 151 | # whole file: https://github.com/calpaterson/csvbase/issues/71 152 | with self.open(path, "rb") as table_f: 153 | size = table_f.size 154 | return { 155 | "name": path, 156 | "size": size, 157 | "type": "file" if "/" in path else "directory", 158 | } 159 | 160 | def _get_rep(self, ref: str, content_type: ContentType) -> IO[bytes]: 161 | _http_sesh = get_http_sesh() 162 | with self._get_fs_cache() as cache: 163 | return get_rep( 164 | _http_sesh, 165 | cache, 166 | self._base_url, 167 | ref, 168 | content_type, 169 | self._get_auth(), 170 | ) 171 | 172 | def _send_rep(self, ref: str, content_type: ContentType, rep: IO[bytes]) -> None: 173 | _http_sesh = get_http_sesh() 174 | with self._get_fs_cache() as cache: 175 | send_rep( 176 | _http_sesh, 177 | cache, 178 | self._base_url, 179 | ref, 180 | content_type, 181 | rep, 182 | self._get_auth(), 183 | ) 184 | 185 | def _get_auth(self) -> Optional[Auth]: 186 | # this can't be done on an instance level for testing reasons - fsspec 187 | # appears to re-use instances 188 | return get_auth() 189 | 
190 | @contextlib.contextmanager 191 | def _get_fs_cache(self) -> Iterator[FilesystemCache]: 192 | # Dask requires these fsspec objects to be thread safe. However the 193 | # FilesystemCache was not designed with that in mind. As a temporary 194 | # solution, lock it. 195 | with self._cache_lock: 196 | yield get_fs_cache() 197 | 198 | 199 | class CSVBaseFile(AbstractBufferedFile): 200 | def __init__(self, fs: CSVBaseFileSystem, path, mode, **kwargs) -> None: 201 | self.fs = fs 202 | self.path = path 203 | self._staging_buffer = io.BytesIO() 204 | # this is necessary because we have no way to get size of the file 205 | if mode == "rb": 206 | with rewind(self._staging_buffer): 207 | shutil.copyfileobj( 208 | fs._get_rep(path, ContentType.CSV), self._staging_buffer 209 | ) 210 | size = self._staging_buffer.tell() 211 | else: 212 | size = 0 213 | 214 | # currently this value is used only for test multi-chunk uploads 215 | self._chunk_count = 0 216 | 217 | super().__init__(fs, path, mode, size=size, cache_type="none", **kwargs) 218 | 219 | def _fetch_range(self, start: int, end: int) -> bytes: 220 | self._staging_buffer.seek(start) 221 | count = end - start 222 | return self._staging_buffer.read(count) 223 | 224 | def _initiate_upload(self) -> None: 225 | # FIXME: possibly truncate the staging buffer 226 | pass 227 | 228 | def _upload_chunk(self, final=False) -> None: 229 | # we send it all in one go at the moment 230 | self.buffer.seek(0) 231 | shutil.copyfileobj(self.buffer, self._staging_buffer) 232 | self._chunk_count += 1 233 | if final: 234 | self._staging_buffer.seek(0) 235 | self.fs._send_rep(self.path, ContentType.CSV, self._staging_buffer) 236 | -------------------------------------------------------------------------------- /csvbase_client/internals/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/csvbase_client/internals/auth.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import Optional 3 | import netrc 4 | 5 | from .value_objs import Auth 6 | 7 | logger = getLogger(__name__) 8 | 9 | 10 | def _get_auth(host) -> Optional[Auth]: 11 | try: 12 | netrc_f = netrc.netrc() 13 | except FileNotFoundError: 14 | logger.info("netrc is absent") 15 | return None 16 | except netrc.NetrcParseError as e: 17 | logger.warning("unable to parse netrc file: %s", e) 18 | return None 19 | netrc_triple = netrc_f.authenticators(host) 20 | if netrc_triple is None: 21 | logger.info("nothing found in netrc for host: %s", host) 22 | return None 23 | else: 24 | username, _, api_key = netrc_triple 25 | # FIXME: we should be validating, and logging and whatever 26 | if api_key is None: 27 | raise ValueError("empty api key in ~/.netrc!") 28 | logger.info("using api-key for %s, found in netrc", username) 29 | return Auth(username, api_key) 30 | 31 | 32 | def get_auth(host: str = "csvbase.com") -> Optional[Auth]: 33 | return _get_auth(host) 34 | -------------------------------------------------------------------------------- /csvbase_client/internals/cache.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List, Optional, Iterator 3 | from contextlib import closing 4 | from dataclasses import dataclass 5 | from datetime import datetime, timezone 6 | 7 | from pyappcache.keys import BaseKey 8 | from pyappcache.fs import FilesystemCache 9 | from pyappcache.serialisation import BinaryFileSerialiser 10 | 11 | from .dirs import dirs 12 | from .value_objs import ContentType 13 | from ..constants import CSVBASE_DOT_COM 14 | 15 | ETAG_DDL2 = """ 16 | CREATE TABLE IF NOT EXISTS etags ( 17 | base_url NOT NULL, 18 | ref NOT NULL, 19 | content_type NOT NULL, 20 | etag NOT NULL, 21 | PRIMARY KEY (base_url, ref, content_type) 22 | );
23 | """ 24 | 25 | SET_ETAG_DML2 = """ 26 | INSERT OR REPLACE INTO etags 27 | (base_url, ref, content_type, etag) 28 | VALUES 29 | (?, ?, ?, ?); 30 | """ 31 | 32 | GET_ETAG_DQL2 = """ 33 | SELECT etag FROM etags 34 | WHERE base_url = ? 35 | AND ref = ? 36 | AND content_type = ?; 37 | """ 38 | 39 | GET_CACHE_ENTRIES_DQL = """ 40 | SELECT 41 | e.base_url, 42 | e.ref, 43 | e.content_type, 44 | e.etag, 45 | p.last_read, 46 | p.size 47 | FROM 48 | etags AS e 49 | LEFT JOIN pyappcache AS p ON p.key = CASE WHEN base_url = 'https://csvbase.com' THEN 50 | 'v0/' || e.ref || '.csv' 51 | ELSE 52 | 'v0/' || e.base_url || e.ref || '.csv' 53 | END; 54 | """ 55 | 56 | 57 | class RepKey(BaseKey): 58 | def __init__(self, base_url: str, ref: str, content_type: ContentType): 59 | self.base_url = base_url 60 | self.ref = ref 61 | self.content_type = content_type 62 | 63 | def cache_key_segments(self) -> List[str]: 64 | segs = [] 65 | if self.base_url != CSVBASE_DOT_COM: 66 | segs.append(self.base_url) 67 | segs.extend([self.ref + self.content_type.file_extension()]) 68 | return segs 69 | 70 | 71 | def cache_path() -> Path: 72 | return Path(dirs.user_cache_dir) 73 | 74 | 75 | def get_fs_cache(path: Optional[Path] = None) -> FilesystemCache: 76 | fs_cache = FilesystemCache(path or cache_path()) 77 | # FIXME: this prefix should go at some point 78 | fs_cache.prefix = "v0" 79 | fs_cache.serialiser = BinaryFileSerialiser() 80 | ensure_etag_table(fs_cache) 81 | return fs_cache 82 | 83 | 84 | def ensure_etag_table(fs_cache) -> None: 85 | with closing(fs_cache.metadata_conn.cursor()) as cursor: 86 | cursor.execute(ETAG_DDL2) 87 | 88 | 89 | def get_last_etag( 90 | cache: FilesystemCache, base_url: str, ref: str, content_type: ContentType 91 | ) -> Optional[str]: 92 | with closing(cache.metadata_conn.cursor()) as cursor: 93 | cursor.execute(GET_ETAG_DQL2, (base_url, ref, content_type.mimetype())) 94 | rv = cursor.fetchone() 95 | if rv is not None: 96 | return rv[0] 97 | else: 98 | return None 99 
| 100 | 101 | def set_etag( 102 | cache: FilesystemCache, 103 | base_url: str, 104 | ref: str, 105 | content_type: ContentType, 106 | etag: str, 107 | ) -> None: 108 | with closing(cache.metadata_conn.cursor()) as cursor: 109 | cursor.execute(SET_ETAG_DML2, (base_url, ref, content_type.mimetype(), etag)) 110 | cache.metadata_conn.commit() 111 | 112 | 113 | @dataclass 114 | class CacheEntry: 115 | """Value object for cache_contents""" 116 | 117 | base_url: str 118 | ref: str 119 | content_type: ContentType 120 | etag: str 121 | last_read: datetime 122 | size_bytes: int 123 | 124 | def etag_prefix(self) -> str: 125 | """Return a short prefix of the etag (minus the w/ bit) in order to 126 | make it legible in the UI. 127 | 128 | """ 129 | return self.etag[4:14] 130 | 131 | 132 | def cache_contents(fs_cache: FilesystemCache) -> Iterator[CacheEntry]: 133 | """Returns metadata on each cache entry""" 134 | with closing(fs_cache.metadata_conn.cursor()) as cursor: 135 | cursor.execute(GET_CACHE_ENTRIES_DQL) 136 | while (row := cursor.fetchone()) is not None: 137 | ce = CacheEntry( 138 | base_url=row[0], 139 | ref=row[1], 140 | content_type=ContentType.from_mimetype(row[2]), 141 | etag=row[3], 142 | last_read=datetime.fromisoformat(row[4]).replace(tzinfo=timezone.utc), 143 | size_bytes=row[5], 144 | ) 145 | yield ce 146 | 147 | 148 | ## old code: 149 | 150 | # import sqlite3 151 | # import shutil 152 | # from contextlib import closing 153 | # from logging import getLogger 154 | # from pathlib import Path 155 | # from typing import List, Optional, IO, Any, Dict, Iterable, Tuple 156 | # from io import BytesIO 157 | # from urllib.parse import urljoin 158 | # from datetime import datetime, timezone 159 | 160 | # from pyappcache.keys import BaseKey, Key 161 | # from pyappcache.sqlite_lru import SqliteCache 162 | # from pyappcache.serialisation import Serialiser 163 | 164 | # from .config import Config 165 | # from .http import get_http_sesh 166 | # from .value_objs import Auth, 
ContentType 167 | 168 | # logger = getLogger(__name__) 169 | 170 | # HTTP_TIMEOUTS = (5, 60) 171 | 172 | # ETAGS_DDL = """ 173 | # CREATE TABLE IF NOT EXISTS etags ( 174 | # ref text, 175 | # etag text NOT NULL, 176 | # content_type text NOT NULL, 177 | # last_modified text NOT NULL, 178 | # PRIMARY KEY (ref, content_type) 179 | # ); 180 | # """ 181 | 182 | # GET_ETAG_DQL = """ 183 | # SELECT 184 | # etag 185 | # FROM 186 | # etags 187 | # WHERE 188 | # ref = ? 189 | # AND 190 | # content_type = ? 191 | # """ 192 | 193 | # # 194 | # SET_ETAG_DML = """ 195 | # INSERT INTO etags (ref, etag, content_type, last_modified) 196 | # VALUES (?, ?, ?, ?) 197 | # ON CONFLICT(ref, content_type) DO UPDATE SET 198 | # etag = excluded.etag, 199 | # last_modified = excluded.last_modified; 200 | 201 | # """ 202 | 203 | # GET_CACHE_INFO_DQL = """ 204 | # SELECT 205 | # ref, etag, content_type, last_modified 206 | # FROM etags 207 | # JOIN 208 | # pyappcache 209 | # ON key = 'pyappcache/' || etag 210 | # """ 211 | 212 | 213 | # class ETagKey(BaseKey): 214 | # def __init__(self, ref): 215 | # self.ref = ref 216 | 217 | # def cache_key_segments(self) -> List[str]: 218 | # return [self.ref] # FIXME: correct this in the docs 219 | 220 | # def should_compress(self, python_obj, as_bytes) -> bool: 221 | # # csv files are highly compressible 222 | # return True 223 | 224 | 225 | # class TableCache: 226 | # """A read-through cache of tables.""" 227 | 228 | # def __init__(self, config: Config) -> None: 229 | # cache_db_path = cache_path() 230 | # logger.info("cache db path = %s", cache_db_path) 231 | # cache_db_path.parent.mkdir(parents=True, exist_ok=True) 232 | # self._sqlite_conn = sqlite3.connect(cache_db_path) 233 | # self._create_etags_table() 234 | # self._lru_cache = SqliteCache(max_size=100, connection=self._sqlite_conn) 235 | # self._lru_cache.prefix = ( 236 | # "pyappcache" # FIXME: necessary because of a bug upstream 237 | # ) 238 | # self._http_client = get_http_sesh() 239 | 240 | 
# def base_url(self) -> str: 241 | # return "https://csvbase.com/" 242 | 243 | # def check_creds(self, config: Config) -> bool: 244 | # if config.username is None or config.api_key is None: 245 | # return False 246 | # response = self._http_client.get(urljoin(self.base_url(), config.username)) 247 | # if 400 <= response.status_code < 500: 248 | # return False 249 | # elif 200 <= response.status_code < 300: 250 | # return True 251 | # else: 252 | # response.raise_for_status() 253 | # # this (should be) unreachable but typechecker doesn't know that 254 | # return False 255 | 256 | # def get_table( 257 | # self, ref: str, auth: Optional[Auth] = None, force_miss: bool = False 258 | # ) -> IO[bytes]: 259 | # content_type = ContentType.CSV 260 | # headers = {"Accept": content_type.mimetype()} 261 | # if auth is not None: 262 | # headers["Authorization"] = auth.as_basic_auth() 263 | # url = self._build_url_for_table_ref(ref, content_type) 264 | # canon_url = self._build_url_for_table_ref(ref) 265 | # etag = self._get_etag(canon_url) 266 | # if etag is not None: 267 | # logger.debug("etag found: %s", etag) 268 | # key: Key[IO[bytes]] = ETagKey(etag) 269 | # table = self._lru_cache.get(key) 270 | # if table is not None: 271 | # logger.debug("cache HIT: %s", ref) 272 | # if force_miss: 273 | # logger.info("cache HIT but forcing MISS") 274 | # else: 275 | # headers["If-None-Match"] = etag 276 | # else: 277 | # logger.debug("etag known but cache MISS: %s", ref) 278 | 279 | # response = self._http_client.get(url, headers=headers, stream=True) 280 | 281 | # if response.status_code >= 400: 282 | # logger.error( 283 | # "got status_code: %d, %s", response.status_code, response.content 284 | # ) 285 | 286 | # response.raise_for_status() 287 | 288 | # if response.status_code == 304: 289 | # logger.debug("server says cache still valid") 290 | # # typechecker thinks this is still optional but it can't be if we 291 | # # got here 292 | # return table # type: ignore 293 | # else: 294 | 
# received_etag = response.headers["ETag"] 295 | # received_etag_key = ETagKey(received_etag) 296 | # self._set_etag(canon_url, received_etag, ContentType.CSV) 297 | # buf: BytesIO = BytesIO() 298 | # shutil.copyfileobj(response.raw, buf) 299 | # response.close() 300 | # buf.seek(0) 301 | # self._lru_cache.set(received_etag_key, buf) 302 | # buf.seek(0) 303 | # return buf 304 | 305 | # def metadata(self, ref: str, auth: Optional[Auth] = None) -> Dict[str, Any]: 306 | # # FIXME: This should somehow use the caching layer, the stuff in 307 | # # get_table should be pulled out and made generic somehow 308 | # headers = {"Accept": "application/json"} 309 | # if auth is not None: 310 | # headers["Authorization"] = auth.as_basic_auth() 311 | # response = self._http_client.get( 312 | # self._build_url_for_table_ref(ref), headers=headers 313 | # ) 314 | 315 | # response.raise_for_status() 316 | # return response.json() 317 | 318 | # def set_table( 319 | # self, ref: str, file_obj: IO[str], auth: Optional[Auth] = None 320 | # ) -> None: 321 | # headers = {"Content-Type": "text/csv"} 322 | # if auth is not None: 323 | # headers["Authorization"] = auth.as_basic_auth() 324 | # url = self._build_url_for_table_ref(ref) 325 | # response = self._http_client.put(url, data=file_obj, headers=headers) 326 | # response.raise_for_status() 327 | 328 | # def _build_url_for_table_ref( 329 | # self, ref: str, content_type: Optional[ContentType] = None 330 | # ) -> str: 331 | # url = urljoin(self.base_url(), ref) 332 | # if content_type is not None: 333 | # url += content_type.file_extension() 334 | # return url 335 | 336 | # def _get_etag(self, ref: str) -> Optional[str]: 337 | # # FIXME: This still isn't quite right as etags need to take account of 338 | # # Vary, but we aren't. 
339 | # with closing(self._sqlite_conn.cursor()) as cursor: 340 | # cursor.execute(GET_ETAG_DQL, (ref, "text/csv")) 341 | # rs = cursor.fetchone() 342 | # if rs is not None: 343 | # return rs[0] 344 | # else: 345 | # return None 346 | 347 | # def _set_etag(self, url: str, etag: str, content_type: ContentType) -> None: 348 | # with closing(self._sqlite_conn.cursor()) as cursor: 349 | # cursor.execute( 350 | # SET_ETAG_DML, 351 | # ( 352 | # url, 353 | # etag, 354 | # content_type.mimetype(), 355 | # datetime.now(timezone.utc).isoformat(), 356 | # ), 357 | # ) 358 | # self._sqlite_conn.commit() 359 | 360 | # def _create_etags_table(self): 361 | # with closing(self._sqlite_conn.cursor()) as cursor: 362 | # cursor.execute(ETAGS_DDL) 363 | # self._sqlite_conn.commit() 364 | 365 | # def entries(self) -> Iterable[Tuple[str, str, ContentType, datetime]]: 366 | # def parse_row( 367 | # url, etag, content_type_str, last_modified_str 368 | # ) -> Tuple[str, str, ContentType, datetime]: 369 | # content_type = ContentType.from_mimetype(content_type_str) 370 | # last_modified = datetime.fromisoformat(last_modified_str) 371 | # return (url, etag, content_type, last_modified) 372 | 373 | # with closing(self._sqlite_conn.cursor()) as cursor: 374 | # cursor.execute(GET_CACHE_INFO_DQL) 375 | # yield from (parse_row(*row) for row in cursor) 376 | 377 | # def clear(self) -> None: 378 | # if cache_path().exists(): 379 | # cache_path().unlink() 380 | 381 | 382 | # class StreamSerialiser(Serialiser): 383 | # def dump(self, obj: Any) -> IO[bytes]: 384 | # return obj 385 | 386 | # def load(self, data: IO[bytes]) -> Any: 387 | # return data 388 | -------------------------------------------------------------------------------- /csvbase_client/internals/cli.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | from logging import DEBUG, basicConfig, WARNING 4 | from typing import IO 5 | 6 | import humanize 7 | import fsspec 8 
import click
from rich.console import Console as RichConsole
from rich.table import Table as RichTable

from .config import config_path
from .version import get_version
from .cache import cache_path, get_fs_cache, cache_contents
from ..exceptions import CSVBaseException


@click.group("csvbase-client")
@click.version_option(version=get_version())
@click.option("--verbose", is_flag=True, help="Enable more verbose output (to stderr).")
def cli(verbose: bool):
    """A cli client for csvbase."""
    # quiet (WARNING) by default; --verbose opts into DEBUG-level logging
    basicConfig(
        level=DEBUG if verbose else WARNING,
        stream=sys.stderr,
        format="%(levelname)s: %(message)s",
    )


@cli.group("table", help="Interact with tables")
def table(): ...


@cli.command()
def info():
    """Show the configuration file location, and the contents"""
    for label, path in (("config", config_path()), ("cache", cache_path())):
        exist_str = "" if path.exists() else " (does not exist)"
        click.echo(f"{label} path: {path}{exist_str}")


@cli.group(help="Manage the local cache")
def cache(): ...
@cache.command("show", help="Show cache location and contents")
def cache_show() -> None:
    """Render the cache's entries as a rich table on stdout."""
    table = RichTable(
        title="csvbase-client cache", caption=f"Cache path: {cache_path()}"
    )
    table.add_column("Ref")
    table.add_column("ETag prefix")
    table.add_column("Last read")
    table.add_column("Size")

    for ce in cache_contents(get_fs_cache()):
        # for now, only some of the CacheEntry data is surfaced
        table.add_row(
            ce.ref,
            ce.etag_prefix(),
            humanize.naturaltime(ce.last_read),
            humanize.naturalsize(ce.size_bytes, gnu=True),
        )

    console = RichConsole()
    console.print(table)


@cache.command("clear", help="Wipe the cache")
def clear() -> None:
    """Delete every entry in the local cache."""
    fs_cache = get_fs_cache()
    fs_cache.clear()
    # FIXME: it should be pyappcache that does this:
    for path in fs_cache.directory.glob("*.csv"):
        path.unlink()


@table.command(help="Get a table.")
@click.argument("ref")
@click.option(
    "--force-cache-miss",
    is_flag=True,
    default=False,
    help="Always download the table again, even if it hasn't changed",
)
def get(ref: str, force_cache_miss: bool):
    """Fetch the table REF ("user/table") and write it to stdout as CSV.

    Exits with status 1 (and a message on stderr) on API errors.
    """
    # NOTE(review): force_cache_miss is parsed but not forwarded anywhere;
    # presumably it should influence the caching layer - confirm intent.
    fs = fsspec.filesystem("csvbase")
    try:
        table_buf = fs.open(ref, "r")
    except CSVBaseException as e:
        error_console = RichConsole(stderr=True, style="bold red")
        error_console.print(str(e))
        sys.exit(1)
    # previously leaked: ensure the remote handle is closed after copying
    with table_buf:
        shutil.copyfileobj(table_buf, sys.stdout)


@table.command("set", help="Create or upsert a table.")
@click.argument("ref")
@click.argument("file", type=click.File("rb"))
def set(ref: str, file: IO[bytes]) -> None:
    """Upload FILE as the table REF, creating the table if necessary."""
    # click.File("rb") yields a binary handle, so the parameter is
    # IO[bytes] (was mis-annotated IO[str])
    fs = fsspec.filesystem("csvbase")
    with fs.open(ref, "wb") as table_buf:
        shutil.copyfileobj(file, table_buf)


# NOTE: This is for convenience only, the cli is actually called by setup.py
# entry_points
if __name__ == "__main__":
    cli()
def config_path() -> Path:
    """Return the path of the user's config.toml (which may not exist)."""
    return Path(dirs.user_config_dir) / "config.toml"


@functools.lru_cache(1)
def get_config() -> Config:
    """Load the config file, falling back to DEFAULT_CONFIG if absent.

    The parsed config is cached for the lifetime of the process.
    """
    path = config_path()
    if not path.exists():
        logger.info(
            "config file does not exist, returning default config: %s", DEFAULT_CONFIG
        )
        return DEFAULT_CONFIG
    # use a context manager: `toml.load(path.open())` leaked the file handle
    with path.open() as config_file:
        parsed = toml.load(config_file)
    return Config(
        base_url=parsed.get("base_url", DEFAULT_CONFIG.base_url),
        username=parsed.get("username", DEFAULT_CONFIG.username),
        api_key=parsed.get("api_key", DEFAULT_CONFIG.api_key),
    )


def write_config(config: Config) -> None:
    """Write `config` to config_path(), omitting values equal to defaults."""
    path = config_path()
    if not path.exists():
        path.parent.mkdir(exist_ok=True, parents=True)

    # only persist fields that differ from DEFAULT_CONFIG so the file stays
    # minimal and defaults can change in future releases
    writeout_dict = {}
    for field in fields(Config):
        default_value = getattr(DEFAULT_CONFIG, field.name)
        our_value = getattr(config, field.name)
        if our_value != default_value:
            writeout_dict[field.name] = our_value

    with path.open("w", encoding="utf-8") as config_buffer:
        toml.dump(writeout_dict, config_buffer)
@dataclass
class Auth:
    """Credentials for HTTP Basic auth against a csvbase server."""

    username: str
    api_key: str

    def as_basic_auth(self) -> str:
        """Return a value suitable for an HTTP `Authorization` header."""
        raw = ":".join([self.username, self.api_key]).encode("utf-8")
        return "Basic " + b64encode(raw).decode("utf-8")


@enum.unique
class ContentType(enum.Enum):
    """The wire formats the csvbase API speaks."""

    PARQUET = 1
    CSV = 2
    JSON = 3

    def mimetype(self) -> str:
        """Return the mimetype string for this content type."""
        return MIMETYPE_MAP[self]

    @staticmethod
    def from_mimetype(mimetype) -> "ContentType":
        """Look up a ContentType from its mimetype string."""
        return BACKWARD_MIMETYPE_MAP[mimetype]

    def file_extension(self) -> str:
        """Return the conventional file extension (with leading dot)."""
        return FILE_EXTENSION_MAP[self]


MIMETYPE_MAP = {
    ContentType.PARQUET: "application/parquet",  # unofficial, but convenient
    ContentType.CSV: "text/csv",
    ContentType.JSON: "application/json",
}

BACKWARD_MIMETYPE_MAP = {mimetype: ct for ct, mimetype in MIMETYPE_MAP.items()}

FILE_EXTENSION_MAP = {
    ContentType.CSV: ".csv",
    ContentType.PARQUET: ".parquet",
    ContentType.JSON: ".json",
}
17 | """ 18 | 19 | def __init__(self, stream: Seekable) -> None: 20 | self.stream = stream 21 | 22 | def __enter__(self) -> None: 23 | pass 24 | 25 | def __exit__(self, exc_type, exc_value, traceback) -> None: 26 | self.stream.seek(0) 27 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | 4 | 5 | [tests.cli.requests_adapter] 6 | ignore_errors = True 7 | 8 | # in the client, we don't care about sqlalchemy types, as they are only used in 9 | # test code 10 | [mypy-sqlalchemy.*] 11 | ignore_missing_imports = True 12 | 13 | [mypy-setuptools.*] 14 | ignore_missing_imports = True 15 | 16 | [mypy-fsspec.*] 17 | ignore_missing_imports = True 18 | 19 | [mypy-exceptiongroup] 20 | ignore_missing_imports = True 21 | 22 | [mypy-importlib_metadata.*] 23 | ignore_missing_imports = True 24 | 25 | # The below seem to be needed due to of dependencies of dependencies 26 | [mypy-attr.*] 27 | ignore_missing_imports = True 28 | 29 | [mypy-pydantic.*] 30 | ignore_missing_imports = True -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests -------------------------------------------------------------------------------- /scripts/nuitka/Dockerfile.nuitka: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | RUN apt-get update && apt install -y build-essential patchelf python3 python3-dev python3-dev python3-pip python-is-python3 python3-venv 3 | RUN python3 -m pip install --upgrade pip setuptools wheel 4 | COPY ./ ./ 5 | 6 | # Use a venv to ensure that we're not building upon system python packages 7 | RUN python3 -m venv venv 8 | RUN . venv/bin/activate 9 | 10 | RUN python3 -m pip install -e . 
2. Make venv: `python -m venv windows-venv`
3. Activate it: `windows-venv\Scripts\activate`
-t csvbase-client-build:latest 10 | 11 | # Create a temporary container from the built image 12 | container_id=$(docker create csvbase-client-build:latest) 13 | 14 | # Copy the binary from the container to your local machine 15 | docker cp "$container_id":"csvbase-client" "dist/csvbase-client" 16 | 17 | # Clean up: remove the temporary container 18 | docker rm "$container_id" 19 | -------------------------------------------------------------------------------- /scripts/nuitka/trampoline.py: -------------------------------------------------------------------------------- 1 | """This script is the 'main' file for the binary. 2 | 3 | Nuitka starts here and crawls all the imports. 4 | 5 | """ 6 | 7 | from csvbase_client.internals.cli import cli 8 | 9 | cli() 10 | -------------------------------------------------------------------------------- /scripts/pypi/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 6 | repo_root="$script_dir/../.." 7 | cd $repo_root 8 | 9 | python3 -m venv .pypi-venv 10 | . 
.pypi-venv/bin/activate 11 | python3 -m pip install build==0.10.0 12 | python3 -m build 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from setuptools import setup, find_packages 4 | 5 | VERSION = ( 6 | (pathlib.Path(__file__).parent / "csvbase_client" / "VERSION").open().read().strip() 7 | ) 8 | 9 | with open("README.md", "r", encoding="utf-8") as fh: 10 | long_description = fh.read() 11 | 12 | # long agpl trove classifier that ruff doesn't like 13 | c = "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)" 14 | 15 | setup( 16 | name="csvbase-client", 17 | version=VERSION, 18 | author="Cal Paterson", 19 | author_email="cal@calpaterson.com", 20 | description="The command line client for csvbase", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | packages=find_packages(exclude=["tests.*", "tests"]), 24 | include_package_data=True, 25 | url="https://github.com/calpaterson/csvbase-client", 26 | keywords="csv data processing", 27 | package_data={"csvbase_client": ["VERSION"]}, 28 | install_requires=[ 29 | "click", 30 | "platformdirs", 31 | "pyappcache>=0.10.0", 32 | "requests", 33 | "toml", 34 | "importlib_resources", 35 | "fsspec", 36 | "rich", 37 | "humanize", 38 | ], 39 | classifiers=[ 40 | "Development Status :: 3 - Alpha", 41 | "Intended Audience :: Developers", 42 | c, 43 | "Programming Language :: Python :: 3", 44 | "Programming Language :: Python :: 3.8", 45 | "Programming Language :: Python :: 3.9", 46 | "Programming Language :: Python :: 3.10", 47 | "Programming Language :: Python :: 3.11", 48 | "Programming Language :: Python :: 3.12", 49 | "Topic :: Utilities", 50 | ], 51 | project_urls={ 52 | "Bug Tracker": "https://github.com/calpaterson/csvbase-client/issues", 53 | "Documentation": 
"https://github.com/calpaterson/csvbase-client/wiki", 54 | "Source Code": "https://github.com/calpaterson/csvbase-client", 55 | "Changelog": "https://github.com/calpaterson/csvbase-client/blob/main/CHANGELOG.md", 56 | }, 57 | entry_points={ 58 | "console_scripts": [ 59 | "csvbase-client=csvbase_client.internals.cli:cli", 60 | ], 61 | "fsspec.specs": [ 62 | "csvbase=csvbase_client.fsspec.CSVBaseFileSystem", 63 | ], 64 | }, 65 | ) 66 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | black 2 | bpython 3 | dask[dataframe] 4 | duckdb 5 | git+https://github.com/calpaterson/csvbase 6 | mypy 7 | pandas 8 | pandas-stubs 9 | polars 10 | pyarrow 11 | pytest 12 | ruff 13 | types-passlib 14 | types-requests 15 | types-setuptools 16 | types-toml 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/cli/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/cli/conftest.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from requests.auth import HTTPBasicAuth 4 | import pandas as pd 5 | from csvbase.streams import rewind 6 | from click.testing import CliRunner 7 | import pytest 8 | 9 | from ..utils import random_string 10 | from .utils import format_response_error 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def runner(): 15 | return CliRunner() 16 | 17 | 18 | @pytest.fixture() 19 | def test_table(test_user, http_sesh) -> str: 20 | table_name = random_string(prefix="cli-test-table-", n=10) 21 | df = 
pd.DataFrame({"a": [1, 2, 3]}) 22 | buf = BytesIO() 23 | with rewind(buf): 24 | df.to_csv(buf) 25 | resp = http_sesh.put( 26 | f"https://csvbase.com/{test_user.username}/{table_name}", 27 | data=buf, 28 | headers={"Content-Type": "text/csv"}, 29 | auth=HTTPBasicAuth(test_user.username, test_user.hex_api_key()), 30 | ) 31 | assert resp.status_code == 201, format_response_error(resp) 32 | return "/".join([test_user.username, table_name]) 33 | 34 | 35 | @pytest.fixture() 36 | def test_public_table(test_user, http_sesh) -> str: 37 | table_name = random_string(prefix="cli-test-table-", n=10) 38 | df = pd.DataFrame({"a": [1, 2, 3]}) 39 | buf = BytesIO() 40 | with rewind(buf): 41 | df.to_csv(buf) 42 | resp = http_sesh.put( 43 | f"https://csvbase.com/{test_user.username}/{table_name}", 44 | params={"public": "true"}, 45 | data=buf, 46 | headers={"Content-Type": "text/csv"}, 47 | auth=HTTPBasicAuth(test_user.username, test_user.hex_api_key()), 48 | ) 49 | assert resp.status_code == 201, format_response_error(resp) 50 | return "/".join([test_user.username, table_name]) 51 | -------------------------------------------------------------------------------- /tests/cli/test_auth.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import patch 3 | 4 | 5 | from csvbase_client.internals.auth import _get_auth 6 | 7 | 8 | def get_auth(host="csvbase.com"): 9 | # this is to work around the default mocking of _get_auth - _get_auth is 10 | # import here prior to the mocking time 11 | return _get_auth(host) 12 | 13 | 14 | def test_netrc(tmpdir): 15 | sample_netrc = """ 16 | machine csvbase.com 17 | login test 18 | password password 19 | """ 20 | netrc = tmpdir / ".netrc" 21 | netrc.write(sample_netrc) 22 | netrc.chmod(0o600) 23 | with patch.dict(os.environ, {"HOME": str(tmpdir)}): 24 | auth = get_auth() 25 | assert auth.username == "test" 26 | assert auth.api_key == "password" 27 | 28 | 29 | def 
test_netrc_absent(tmpdir): 30 | with patch.dict(os.environ, {"HOME": str(tmpdir)}): 31 | auth = get_auth() 32 | assert auth is None 33 | 34 | 35 | def test_netrc_too_permissive(tmpdir): 36 | netrc = tmpdir / ".netrc" 37 | sample_netrc = """ 38 | machine csvbase.com 39 | login test 40 | password password 41 | """ 42 | netrc = tmpdir / ".netrc" 43 | netrc.write(sample_netrc) 44 | with patch.dict(os.environ, {"HOME": str(tmpdir)}): 45 | auth = get_auth() 46 | assert auth is None 47 | -------------------------------------------------------------------------------- /tests/cli/test_cli_cache.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from csvbase_client.internals.cli import cli 4 | from csvbase_client.internals.cache import get_fs_cache, RepKey 5 | from csvbase_client.constants import CSVBASE_DOT_COM 6 | from csvbase_client.internals.value_objs import ContentType 7 | 8 | 9 | def test_cache_clear__removes_entries(runner): 10 | """Check that `csvbase-client cache clear` removes all entries""" 11 | key = RepKey(CSVBASE_DOT_COM, "test/test", ContentType.CSV) 12 | fs_cache = get_fs_cache() 13 | fs_cache.set(key, BytesIO(b"test")) 14 | 15 | # assert that that rep made it into the cache 16 | cache_dir = fs_cache.directory 17 | assert len(list(cache_dir.glob("*.csv"))) == 1 18 | 19 | result = runner.invoke(cli, ["cache", "clear"]) 20 | assert result.exit_code == 0 21 | 22 | # assert that that entry is gone 23 | assert len(list(cache_dir.glob("*.csv"))) == 0 24 | 25 | 26 | def test_cache__show_with_no_entries(runner): 27 | result = runner.invoke(cli, ["cache", "show"]) 28 | assert result.exit_code == 0 29 | -------------------------------------------------------------------------------- /tests/cli/test_cli_tables.py: -------------------------------------------------------------------------------- 1 | """Test getting and setting tables""" 2 | 3 | from io import BytesIO 4 | 5 | import pandas as pd 6 | from 
pandas.testing import assert_frame_equal 7 | 8 | from csvbase_client.internals.cli import cli 9 | import pytest 10 | 11 | from ..utils import random_string, mock_auth 12 | 13 | 14 | def test_get__while_anonymous(runner, test_user, test_public_table): 15 | """Test getting a table.""" 16 | result = runner.invoke(cli, ["table", "get", test_public_table]) 17 | assert result.exit_code == 0, result.stderr_bytes 18 | 19 | 20 | def test_get__while_authed(runner, test_user, test_table): 21 | """Test getting a table.""" 22 | with mock_auth(test_user.username, test_user.hex_api_key()): 23 | result = runner.invoke(cli, ["table", "get", test_table]) 24 | assert result.exit_code == 0, result.stderr_bytes 25 | 26 | 27 | def test_get__table_does_not_exist(runner, test_user): 28 | result = runner.invoke(cli, ["table", "get", f"{test_user}/fake"]) 29 | assert result.exit_code == 1, result.stderr_bytes 30 | assert "Table not found" in result.stdout 31 | 32 | 33 | @pytest.mark.xfail(reason="not implemented") 34 | def test_get__user_does_not_exist(runner, test_user): 35 | result = runner.invoke(cli, ["table", "get", f"{test_user}/fake"]) 36 | assert result.exit_code == 1, result.stderr_bytes 37 | assert result.stdout == "foobar" 38 | 39 | 40 | def test_set__to_create(runner, test_user, tmpdir): 41 | """Test setting a table.""" 42 | table_name = random_string() 43 | table_filepath = str(tmpdir / f"{table_name}.csv") 44 | df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) 45 | df.to_csv(table_filepath, index=False) 46 | 47 | ref = "/".join([test_user.username, table_name]) 48 | 49 | with mock_auth(test_user.username, test_user.hex_api_key()): 50 | # set the table 51 | result = runner.invoke(cli, ["table", "set", ref, table_filepath]) 52 | assert result.exit_code == 0, result.stderr_bytes 53 | 54 | # get it back and compare 55 | result = runner.invoke(cli, ["table", "get", ref]) 56 | assert result.exit_code == 0, result.stderr_bytes 57 | actual_df = 
def format_response_error(response) -> str:
    """Summarise an HTTP error response for use in an assertion message.

    Prefers the decoded JSON body when the server declared JSON, otherwise
    falls back to the raw response bytes.
    """
    content_type = response.headers.get("Content-Type")
    if content_type == "application/json":
        return response.json()
    return response.content
35 | 36 | There is not (and probably never will be) a way to create a user via the 37 | API, so call directly into svc to do this. 38 | 39 | """ 40 | crypt_context = CryptContext(["plaintext"]) 41 | username = random_string(prefix="cli-test-user-", n=20) 42 | password = "password" 43 | user = create_user(db_sesh, crypt_context, username, password) 44 | db_sesh.commit() 45 | return ExtendedUser( 46 | username=username, 47 | user_uuid=user.user_uuid, 48 | password=password, 49 | registered=user.registered, 50 | api_key=user.api_key, 51 | email=user.email, 52 | timezone=user.timezone, 53 | mailing_list=False, 54 | ) 55 | 56 | 57 | @pytest.fixture(scope="session") 58 | def csvbase_flask_app(): 59 | app = init_app() 60 | app.config["TESTING"] = True 61 | app.config["DEBUG"] = True 62 | return app 63 | 64 | 65 | @pytest.fixture() 66 | def csvbase_test_client(csvbase_flask_app): 67 | with csvbase_flask_app.test_client() as test_client: 68 | yield test_client 69 | 70 | 71 | @pytest.fixture() 72 | def flask_adapter(csvbase_test_client): 73 | return FlaskAdapter(csvbase_test_client) 74 | 75 | 76 | @pytest.fixture(autouse=True) 77 | def http_sesh(flask_adapter): 78 | """This fixture inserts our special requests->flask adapter.""" 79 | sesh = requests.Session() 80 | sesh.mount("https://csvbase.com", flask_adapter) 81 | with patch.object(http, "_get_http_sesh") as mock_get_sesh: 82 | mock_get_sesh.return_value = sesh 83 | yield sesh 84 | 85 | 86 | @pytest.fixture(autouse=True) 87 | def mock_cache(tmpdir): 88 | with patch.object(cache, "cache_path") as mocked_cache_path: 89 | mocked_cache_path.return_value = Path(str(tmpdir / "cache")) 90 | yield 91 | 92 | 93 | @pytest.fixture(autouse=True) 94 | def disable_auth_by_default(): 95 | """Disable auth by default. 96 | 97 | This ensures that any mocked setup is explicit (via mock_auth) and not 98 | accidentally falling back to the system .netrc. 
99 | """ 100 | with patch.object(auth, "_get_auth", return_value=None): 101 | yield 102 | -------------------------------------------------------------------------------- /tests/fsspec/__init__.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import fsspec 3 | 4 | duckdb.register_filesystem(fsspec.filesystem("csvbase")) 5 | -------------------------------------------------------------------------------- /tests/fsspec/test_dask.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import fsspec 3 | import dask.dataframe as dd 4 | from pandas.testing import assert_frame_equal 5 | 6 | from ..utils import random_dataframe, random_string, mock_auth 7 | 8 | 9 | def random_dask_dataframe(): 10 | return dd.from_pandas(random_dataframe()) 11 | 12 | 13 | def create_table(user, table_name: str, df: dd.DataFrame) -> None: 14 | """Creates a random table, returning the reference""" 15 | fs = fsspec.filesystem("csvbase") 16 | with mock_auth(user.username, user.hex_api_key()): 17 | with fs.open(f"{user.username}/{table_name}?public=true", "wb") as table_f: 18 | # dask doesn't have a way to write to a buffer, so go via pandas 19 | df.compute().to_csv(table_f, index=False) 20 | 21 | 22 | @pytest.mark.xfail(reason="dask does not get on with our flask injection stuff") 23 | def test_dask__read_happy(test_user, flask_adapter): 24 | original_df = random_dask_dataframe() 25 | table_name = random_string() 26 | create_table(test_user, table_name, original_df) 27 | 28 | expected_df = original_df.set_index("A").compute() 29 | actual_df = ( 30 | dd.read_csv(f"csvbase://{test_user.username}/{table_name}") 31 | .set_index("A") 32 | .compute() 33 | ) 34 | assert_frame_equal(expected_df, actual_df) 35 | -------------------------------------------------------------------------------- /tests/fsspec/test_duckdb.py: 
# --- tests/fsspec/test_duckdb.py ---
import duckdb
from pandas.testing import assert_frame_equal

from ..utils import random_dataframe, random_string, mock_auth


def random_duckdb_table():
    # random_df is referenced by name inside the SQL string: duckdb resolves
    # in-scope dataframes ("replacement scans"), hence the noqa
    random_df = random_dataframe()  # noqa: F841
    table_name = random_string()
    duckdb.sql(f"CREATE TABLE {table_name} AS SELECT * FROM random_df")
    return table_name


def create_table(user, table_name: str):
    with mock_auth(user.username, user.hex_api_key()):
        duckdb.sql(
            f"COPY {table_name} TO 'csvbase://{user.username}/{table_name}?public=true' (HEADER, DELIMITER ',')"
        )


def test_duckdb__read_happy_csv(test_user, flask_adapter):
    original_table_name = random_duckdb_table()
    create_table(test_user, original_table_name)

    actual_table_name = random_string()
    duckdb.sql(
        f"CREATE TABLE {actual_table_name} AS FROM read_csv('csvbase://{test_user.username}/{original_table_name}')"
    )

    expected_df = duckdb.sql(f"select * from {original_table_name}").df()
    actual_df = (
        duckdb.sql(f"select * from {actual_table_name}")
        .df()
        .drop(columns="csvbase_row_id")
    )

    assert_frame_equal(expected_df, actual_df)


# --- tests/fsspec/test_fsspec.py ---
"""These tests are for working with the fsspec layer directly."""

import fsspec
import pandas as pd
import pytest
import io
from typing import IO
import os
from pandas.testing import assert_frame_equal

from csvbase_client.exceptions import CSVBaseException

from csvbase_client.io import rewind
from ..utils import random_string, mock_auth, random_dataframe


def test_fsspec__happy(test_user, http_sesh):
    table_name = random_string(prefix="table-")
    initial_df = random_dataframe()

    with mock_auth(test_user.username, test_user.hex_api_key()):
        fs = fsspec.filesystem("csvbase")

        # upload a table
        with fs.open(f"{test_user.username}/{table_name}", "w") as table_f:
            initial_df.to_csv(table_f, index=False)

        # download a table
        with fs.open(f"{test_user.username}/{table_name}") as table_f:
            actual_df = pd.read_csv(table_f, index_col=0)

    expected_df = initial_df.assign(csvbase_row_id=range(1, 11)).set_index(
        "csvbase_row_id"
    )

    assert_frame_equal(expected_df, actual_df)


def test_fsspec__cache_hit(test_user, http_sesh, flask_adapter):
    table_name = random_string(prefix="table-")
    initial_df = pd.DataFrame({"string": [f"Hello, {n}" for n in range(10)]})

    with mock_auth(test_user.username, test_user.hex_api_key()):
        fs = fsspec.filesystem("csvbase")

        with fs.open(f"{test_user.username}/{table_name}", "w") as table_f:
            initial_df.to_csv(table_f, index=False)

        # cache filling request
        with fs.open(f"{test_user.username}/{table_name}") as table_f:
            table_f.read()

        # cache hit request
        with fs.open(f"{test_user.username}/{table_name}") as table_f:
            actual_df = pd.read_csv(table_f, index_col=0)

    expected_df = initial_df.assign(csvbase_row_id=range(1, 11)).set_index(
        "csvbase_row_id"
    )

    # check that the data is as expected
    assert_frame_equal(expected_df, actual_df)

    # check that a cache was used
    second_req, second_resp = flask_adapter.request_response_pairs[1]
    third_req, third_resp = flask_adapter.request_response_pairs[2]
    etag = second_resp.headers["ETag"]
    assert third_req.headers["If-None-Match"] == etag
    # BUG FIX: the original wrote `"ETag" not in third_resp` (membership on
    # the Response object iterates the body, so that was vacuously true) and
    # `third_resp.status_code == 304` as a bare, no-op comparison with no
    # `assert` (flake8-bugbear B015).  Both checks are now real.
    assert "ETag" not in third_resp.headers
    assert third_resp.status_code == 304

    # check that the layout of the cache is as expected
    with fs._get_fs_cache() as cache:
        cache_path = cache.directory / f"v0_{test_user.username}_{table_name}.csv"
        assert cache_path.exists()
        cached_df = pd.read_csv(cache_path, index_col=0)
        assert_frame_equal(expected_df, cached_df)


def test_fsspec__read_but_does_not_exist(test_user, http_sesh, flask_adapter):
    fs = fsspec.filesystem("csvbase")
    table_name = random_string(prefix="table-")
    with pytest.raises(CSVBaseException):
        with fs.open(f"{test_user.username}/{table_name}") as table_f:
            pd.read_csv(table_f.read())


def test_csvbase_file__upload_multi_chunk(test_user, http_sesh, flask_adapter):
    """Check that larger uploads (which are done via multiple calls to
    _upload_chunk send the whole file.

    Note that fsspec uses the terms "block" and "chunk" somewhat interchangeably.

    """
    table_name = random_string(prefix="table-")
    df = random_dataframe()
    table_buf = io.BytesIO()
    with rewind(table_buf):
        df.to_csv(table_buf, index=False)
    buf_size = _get_fh_size(table_buf)
    chunk_size = buf_size // 4

    with mock_auth(test_user.username, test_user.hex_api_key()):
        fs = fsspec.filesystem("csvbase")

        with fs.open(f"{test_user.username}/{table_name}", mode="wb") as table_f:
            # make the blocksize artificially low to exercise this
            # functionality
            table_f.blocksize = chunk_size

            while len(chunk := table_buf.read(chunk_size)) > 0:
                table_f.write(chunk)
            assert table_f._chunk_count == 4, "not a multi-chunk upload"

        with fs.open(f"{test_user.username}/{table_name}") as table_f:
            actual_df = pd.read_csv(table_f).drop(columns="csvbase_row_id")

    assert_frame_equal(df, actual_df)


# FIXME: this should be somewhere else.  Probably in pyappcache.
def _get_fh_size(fh: IO[bytes]) -> int:
    """Return the size of a seekable binary filehandle.

    The current position is saved and restored, so the caller's read cursor
    is unaffected.
    """
    pos = fh.tell()
    fh.seek(0, os.SEEK_END)
    size = fh.tell()
    fh.seek(pos)
    return size


# --- tests/fsspec/test_pandas.py ---
import fsspec
import pandas as pd
from pandas.testing import assert_frame_equal

from ..utils import random_dataframe, random_string, mock_auth


def create_table(user, table_name: str, df: pd.DataFrame) -> None:
    """Creates a random table, returning the reference"""
    fs = fsspec.filesystem("csvbase")
    with mock_auth(user.username, user.hex_api_key()):
        with fs.open(f"{user.username}/{table_name}?public=true", "wb") as table_f:
            df.to_csv(table_f, index=False)


def test_pandas__read_happy_csv(test_user, flask_adapter):
    """Read a CSV via pd.read_csv(csvbase://[...])"""
    original_df = random_dataframe()
    table_name = random_string()
    create_table(test_user, table_name, original_df)

    actual_df = pd.read_csv(
        f"csvbase://{test_user.username}/{table_name}", index_col=0
    ).set_index("A")
    expected_df = original_df.set_index("A")
    assert_frame_equal(expected_df, actual_df)


def test_pandas__read_happy_parquet(test_user, flask_adapter):
    """Read a Parquet file via pd.read_parquet(csvbase://[...].parquet)"""
    # (docstring fixed: the original said "via pd.read_csv" but the test
    # exercises pd.read_parquet)
    original_df = random_dataframe()
    table_name = random_string()
    create_table(test_user, table_name, original_df)

    actual_df = (
        pd.read_parquet(
            f"csvbase://{test_user.username}/{table_name}.parquet",
        )
        .drop(columns="csvbase_row_id")
        .set_index("A")
    )
    expected_df = original_df.set_index("A")
    assert_frame_equal(expected_df, actual_df)
# --- tests/fsspec/test_polars.py ---
import polars as pl

import fsspec
from polars.testing import assert_frame_equal

import pytest
from ..utils import random_dataframe, random_string, mock_auth


def random_polars_dataframe():
    """A random polars dataframe (built via a random pandas one)."""
    return pl.from_pandas(random_dataframe())


def create_table(user, table_name: str, df: pl.DataFrame) -> None:
    """Creates a random table, returning the reference"""
    fs = fsspec.filesystem("csvbase")
    with mock_auth(user.username, user.hex_api_key()):
        with fs.open(f"{user.username}/{table_name}?public=true", "wb") as table_f:
            df.write_csv(table_f)


def test_polars__read_happy_csv(test_user, flask_adapter):
    """Read a CSV via pl.read_csv(csvbase://[...])"""
    expected_df = random_polars_dataframe()
    table_name = random_string()
    create_table(test_user, table_name, expected_df)

    url = f"csvbase://{test_user.username}/{table_name}"
    # the server adds csvbase_row_id; drop it before comparing
    actual_df = pl.read_csv(url).drop("csvbase_row_id")
    assert_frame_equal(expected_df, actual_df)


@pytest.mark.xfail(
    reason="polars bug: https://github.com/pola-rs/polars/issues/16737", strict=True
)
def test_polars__read_happy_parquet(test_user, flask_adapter):
    """Read a Parquet file via pl.read_parquet(csvbase://[...].parquet)"""
    expected_df = random_polars_dataframe()
    table_name = random_string()
    create_table(test_user, table_name, expected_df)

    url = f"csvbase://{test_user.username}/{table_name}.parquet"
    actual_df = pl.read_parquet(url).drop("csvbase_row_id")
    assert_frame_equal(expected_df, actual_df)
1 | -- FIXME: vendored from csvbase. should be a manifest file 2 | -- This SQL script creates the two schemas that are used for it's own tables 3 | -- and the tables of users respectively 4 | CREATE SCHEMA IF NOT EXISTS metadata; 5 | CREATE SCHEMA IF NOT EXISTS userdata; 6 | -------------------------------------------------------------------------------- /tests/requests_adapter.py: -------------------------------------------------------------------------------- 1 | # Need to wait for a release of types-requests that includes this commit: 2 | # https://github.com/python/typeshed/commit/b69b17c3d8fd5b1f0cc8209b2f15e6b4b687a2ee 3 | # mypy: ignore-errors 4 | 5 | from base64 import b64encode 6 | from io import BytesIO 7 | 8 | import requests 9 | from requests.adapters import BaseAdapter 10 | 11 | 12 | def basic_auth(username, password) -> str: 13 | user_pass = f"{username}:{password}".encode("utf-8") 14 | encoded = b64encode(user_pass).decode("utf-8") 15 | return f"Basic {encoded}" 16 | 17 | 18 | class FlaskAdapter(BaseAdapter): 19 | """Adapts requests requests into requests against a flask app's test client.""" 20 | 21 | def __init__(self, test_client): 22 | self.test_client = test_client 23 | self.request_response_pairs = [] 24 | 25 | def send( 26 | self, 27 | request: requests.PreparedRequest, 28 | stream: bool, 29 | timeout, 30 | verify: bool, 31 | cert, 32 | proxies, 33 | ) -> requests.Response: 34 | flask_response = self.test_client.open( 35 | path=request.path_url, 36 | method=request.method, 37 | headers=dict(request.headers), 38 | data=request.body.read() if request.body is not None else None, 39 | ) 40 | response = requests.Response() 41 | response.status_code = flask_response.status_code 42 | response.raw = BytesIO(flask_response.data) 43 | response.url = request.path_url 44 | response.reason = flask_response.status[4:] 45 | response.headers = flask_response.headers 46 | self.request_response_pairs.append((request, response)) 47 | return response 48 | 
-------------------------------------------------------------------------------- /tests/test_cache.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from io import BytesIO 3 | 4 | from csvbase_client.constants import CSVBASE_DOT_COM 5 | from csvbase_client.internals.cache import get_fs_cache, RepKey 6 | from csvbase_client.internals.value_objs import ContentType 7 | from csvbase_client.io import rewind 8 | 9 | 10 | def test_fs_cache__getting_and_setting(tmpdir): 11 | cache = get_fs_cache(Path(str(tmpdir))) 12 | # FIXME: should include etag 13 | key = RepKey(CSVBASE_DOT_COM, "test/test", ContentType.CSV) 14 | filelike = BytesIO(b"rhubarb") 15 | 16 | # unset 17 | assert cache.get(key) is None 18 | 19 | # now set 20 | with rewind(filelike): 21 | cache.set(key, filelike) 22 | 23 | # check file is present where expected 24 | 25 | # assert that i can get it 26 | actual = cache.get(key) 27 | assert actual.read() == filelike.read() 28 | -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from csvbase_client.internals.version import ( 4 | get_version, 5 | get_softcoded_version, 6 | get_hardcoded_version, 7 | ) 8 | 9 | 10 | def test_version_is_valid(): 11 | VERSION_REGEX = re.compile(r"^\d+\.\d+\.\d+$") 12 | assert VERSION_REGEX.match(get_version()) 13 | 14 | 15 | def test_hardcoded_version_matches_softcoded(): 16 | assert get_softcoded_version() == get_hardcoded_version() 17 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import random 3 | import string 4 | import contextlib 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from csvbase_client.internals.value_objs import Auth 
10 | from csvbase_client.internals import auth as auth_module 11 | 12 | 13 | def random_string(prefix: str = "", n: int = 32) -> str: 14 | return prefix + "".join(random.choice(string.ascii_lowercase) for _ in range(n)) 15 | 16 | 17 | @contextlib.contextmanager 18 | def mock_auth(username: str, hex_api_key: str): 19 | auth = Auth(username, hex_api_key) 20 | with patch.object(auth_module, "_get_auth", return_value=auth): 21 | yield 22 | 23 | 24 | def random_dataframe(row_count=10) -> pd.DataFrame: 25 | rng = np.random.default_rng() 26 | df = pd.DataFrame(rng.integers(0, 100, size=(row_count, 4)), columns=list("ABCD")) 27 | return df 28 | -------------------------------------------------------------------------------- /tests/value_objs.py: -------------------------------------------------------------------------------- 1 | # COPIED: 2 | from base64 import b64encode 3 | from dataclasses import dataclass 4 | 5 | from csvbase.value_objs import User 6 | 7 | 8 | @dataclass 9 | class ExtendedUser(User): 10 | password: str 11 | 12 | def basic_auth(self) -> str: 13 | """The HTTP Basic Auth header value for this user""" 14 | hex_api_key = self.hex_api_key() 15 | user_pass = f"{self.username}:{hex_api_key}".encode("utf-8") 16 | encoded = b64encode(user_pass).decode("utf-8") 17 | return f"Basic {encoded}" 18 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py3{12,11,10,9,8} 4 | skipsdist = True 5 | skip_missing_interpreters = True 6 | [testenv] 7 | commands = 8 | pip install --upgrade setuptools pip wheel # necessary for python3.8 9 | pip install --upgrade git+https://github.com/calpaterson/csvbase.git#egg=csvbase 10 | python -m pip install -e . 11 | python -m pip install -r test-requirements.txt 12 | ruff check . 13 | black --check . 14 | mypy . 
15 | pytest --------------------------------------------------------------------------------