├── .gitignore ├── .travis.yml ├── AUTHORS.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── bin └── release_build ├── examples └── usgs │ ├── catalog │ └── earthquakes.json │ ├── config │ └── tap_config.json │ ├── custom_spec.json │ ├── sample_records.json │ ├── schema │ └── earthquakes.json │ └── today.sh ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tap_rest_api ├── __init__.py ├── default_spec.json ├── helper.py ├── schema.py └── sync.py └── tests ├── install_test.sh ├── test_usgs.py └── unit └── test_headers.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .env 3 | install_test 4 | 5 | # Distribution / packaging 6 | .Python 7 | *.pyc 8 | env/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | 24 | # Serverless directories 25 | .serverless 26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | dist: trusty 3 | 4 | language: python 5 | 6 | python: 7 | - "3.6" 8 | 9 | before_install: 10 | - export 11 | - ls /opt/python/ 12 | - /opt/python/3.6/bin/python --version 13 | 14 | install: 15 | - /opt/python/3.6/bin/python -m venv ./venv 16 | - source ./venv/bin/activate 17 | - which python 18 | - pip install --upgrade pip 19 | - pip install wheel 20 | - pip install --no-cache -e . 21 | - pip install -r requirements_dev.txt 22 | 23 | script: 24 | - which python 25 | - pytest -s tests 26 | # - tests/install_test.sh 27 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Daigo Tanaka 9 | 10 | Contributors 11 | ------------ 12 | 13 | * Louis Goddard (ltrgoddard) 14 | * Christian Gagnon (ReptilianBrain) 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 4 | 5 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 
6 | 7 | Examples of unacceptable behavior by participants include: 8 | 9 | * The use of sexualized language or imagery 10 | * Personal attacks 11 | * Trolling or insulting/derogatory comments 12 | * Public or private harassment 13 | * Publishing other's private information, such as physical or electronic addresses, without explicit permission 14 | * Other unethical or unprofessional conduct 15 | 16 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. 17 | 18 | This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. 21 | 22 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.2.0, available at https://www.contributor-covenant.org/version/1/2/0/code-of-conduct.html 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every little bit 4 | helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at https://github.com/anelendata/tap_rest_api 13 | 14 | If you are reporting a bug, please include: 15 | 16 | - Your operating system name and version. 17 | - Any details about your local setup that might be helpful in troubleshooting. 18 | - Detailed steps to reproduce the bug. 19 | 20 | ### Fix Bugs 21 | 22 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 23 | wanted" is open to whoever wants to implement it. 24 | 25 | ### Implement Features 26 | 27 | Look through the GitHub issues for features. Anything tagged with "enhancement" 28 | and "help wanted" is open to whoever wants to implement it. 29 | 30 | ### Write Documentation 31 | 32 | tap_rest_api could always use more documentation, whether as part of the 33 | official README.md, in docstrings, or even on the web in blog posts, 34 | articles, and such. 35 | 36 | ### Submit Feedback 37 | 38 | The best way to send feedback is to file an issue at https://github.com/anelendata/tap_rest_api/issues. 39 | 40 | If you are proposing a feature: 41 | 42 | - Explain in detail how it would work. 43 | - Keep the scope as narrow as possible, to make it easier to implement. 44 | - Remember that this is a volunteer-driven project, and that contributions 45 | are welcome :) 46 | 47 | ## Submit a pull request through the GitHub website. 48 | 49 | ### Pull Request Guidelines 50 | 51 | Before you submit a pull request, check that it meets these guidelines: 52 | 53 | 1. The pull request should include tests. 54 | 2. If the pull request adds functionality, the docs should be updated. 
Put 55 | your new functionality into a function with a docstring, and add the 56 | feature to the list in README.md. 57 | 58 | TODO: Set up Travis: 59 | 3. The pull request should work for Python 3.6, 3.7 and 3.8, and for PyPy. Check 60 | https://travis-ci.org/anelendata/tap_rest_api/pull_requests 61 | and make sure that the tests pass for all supported Python versions. 62 | 63 | ## Deploying 64 | 65 | A reminder for the maintainers on how to deploy. 66 | Make sure all your changes are committed (including an entry in HISTORY.md). 67 | Then run:: 68 | 69 | $ bump2version patch # possible: major / minor / patch 70 | $ git push 71 | $ git push --tags 72 | 73 | TODO: Set up Travis so it will then deploy to PyPI if tests pass. 74 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | ## History 2 | 3 | ### 0.2.9 (2024-07-16) 4 | 5 | - Config switch to drop unknown (sub-)properties. 6 | See https://github.com/anelendata/tap-rest-api#schema-validation-and-cleanups 7 | 8 | 9 | ### 0.2.8 (2023-10-20) 10 | 11 | - Multiple stream sync 12 | Support URLs and parameters per stream 13 | See https://github.com/anelendata/tap-rest-api#multiple-streams 14 | 15 | ### 0.2.7 (2022-03-11) 16 | 17 | - Use digest for dup check instead of raw record enhancement #24 18 | - Wrong params for singer.utils.backoff decorator bug #27 19 | 20 | ### 0.2.6 (2021-07-11) 21 | 22 | - Update getschema to 0.2.6 to fix a wrong rejection of null object when it's allowed. 23 | 24 | 25 | ### 0.2.5 (2021-06-04) 26 | 27 | - Update getschema to 0.2.5 to fix a bad null conversion 28 | 29 | ### 0.2.4 (2021-05-25) 30 | 31 | - fix: Infer schema mode produces null record that causes "CRITICAL list index out of range" (#16) 32 | 33 | ### 0.2.3 (2021-05-06) 34 | 35 | - fix: missing variable for max_page logging 36 | 37 | ### 0.2.2 (2021-05-03) 38 | 39 | - fix: end_datetime is not honored when timestamp_key is used #12 40 | 41 | ### 0.2.1 (2021-05-02) 42 | 43 | - doc: add missing release history entry 44 | 45 | ### 0.2.0 (2021-05-02) 46 | 47 | - feature: Set record_list_level and record_level, index_key, datetime_key, and timestamp_key with jsonpath. 48 | 49 | ### 0.1.3 (2020-12-22) 50 | 51 | - Bump getschema version to 0.1.2 so it allows empty object (dict) entries 52 | 53 | ### 0.1.2 (2020-12-05) 54 | 55 | - When filter_by_schema: true in config, clean the record and filter out 56 | invalid record against the schema. 57 | - Externalized json2schema.py as [getschema](https://pypi.org/project/getschema/) 58 | 59 | ### 0.1.1 (2020-11-08) 60 | 61 | - Custom header (See README.md) 62 | - Raise when an invalid type is set in schema 63 | - Treat numbers with leading zeros as string 64 | 65 | ### 0.1.0b2 (2020-08-12) 66 | 67 | Project description update only. 68 | 69 | ### 0.1.0b1 (2020-08-12) 70 | 71 | Change repository and command name to tap-rest-api (from underscore) 72 | 73 | ### 0.1.0b0 (2020-08-11) 74 | 75 | Beta release. The first pypi package build. 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include tap_rest_api/default_spec.json 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.com/anelendata/tap-rest-api.svg?branch=master)](https://travis-ci.com/anelendata/tap-rest-api) 2 | 3 | 💥 New in 0.2.0: Set record_list_level and record_level, index_key, datetime_key, and timestamp_key with jsonpath. 4 | 5 | # tap-rest-api 6 | 7 | A configurable REST API singer.io tap. 8 | 9 | ## What is it? 10 | 11 | tap-rest-api is a [Singer](https://singer.io) tap that produces JSON-formatted 12 | data following the [Singer spec](https://github.com/singer-io/getting-started). 13 | 14 | This tap: 15 | 16 | - Pulls JSON records from a REST API 17 | - Automatically infers the schema and generates the JSON schema and Singer catalog 18 | file. 19 | - Incrementally pulls data based on the input state. (singer.io bookmark specification) 20 | 21 | The stdout from this program is intended to be consumed by a singer.io target program, as in: 22 | 23 | ``` 24 | tap-rest-api | target-csv 25 | ``` 26 | 27 | ## How to use it 28 | 29 | Install: 30 | 31 | ``` 32 | pip install tap-rest-api 33 | ``` 34 | 35 | The following example is created using [USGS Earthquake Events data](https://earthquake.usgs.gov/fdsnws/event/1/). 36 | 37 | `curl https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2014-01-01&endtime=2014-01-02&minmagnitude=1` 38 | 39 | ``` 40 | { 41 | "type": "FeatureCollection", 42 | "features": [ 43 | { 44 | "geometry": { 45 | "type": "Point", 46 | "coordinates": [ 47 | -116.7776667, 48 | 33.6633333, 49 | 11.008 50 | ] 51 | }, 52 | "type": "Feature", 53 | "properties": { 54 | "rms": 0.09, 55 | "code": "11408890", 56 | "cdi": null, 57 | "sources": ",ci,", 58 | "nst": 39, 59 | "tz": -480, 60 | "title": "M 1.3 - 10km SSW of Idyllwild, CA", 61 | ... 62 | "mag": 1.29, 63 | ... 64 | "place": "10km SSW of Idyllwild, CA", 65 | "time": 1388620296020, 66 | "mmi": null 67 | }, 68 | "id": "ci11408890" 69 | }, 70 | ...
71 | ] 72 | } 73 | ``` 74 | [examples/usgs/sample_records.json](https://raw.githubusercontent.com/anelendata/tap-rest-api/master/examples/usgs/sample_records.json) 75 | 76 | In the following steps, we will attempt to extract the `properties` section of 77 | the record type `Feature` as a Singer record. 78 | 79 | ### Step 1: Default spec 80 | 81 | Anything defined here can be added to the tap configuration file or passed as a 82 | command-line argument: 83 | 84 | - [default_spec.json](https://github.com/anelendata/tap-rest-api/blob/master/tap_rest_api/default_spec.json) 85 | 86 | ### Step 2: [Optional] Create a custom spec for the config file: 87 | 88 | If you would like to define more configuration variables, create a spec file. 89 | Here is an 90 | [example](https://github.com/anelendata/tap-rest-api/blob/master/examples/usgs/custom_spec.json): 91 | ``` 92 | { 93 | "args": { 94 | "min_magnitude": 95 | { 96 | "type": "integer", 97 | "default": "0", 98 | "help": "Filter based on the minimum magnitude." 99 | } 100 | } 101 | } 102 | ``` 103 | 104 | Anything you define here overwrites 105 | [default_spec.json](https://github.com/anelendata/tap-rest-api/blob/master/tap_rest_api/default_spec.json). 106 | 107 | ### Step 3. Create the config file: 108 | 109 | **Please note: the jsonpath specification is supported in version 0.2.0 and later only.** 110 | 111 | Now create a config file. Note the difference between the spec file and the config file: 112 | the role of the spec file is to create or alter the config specs, and the role of 113 | the config file is to provide the values for the config variables. When a value 114 | is not specified in the config file, the default value defined in the spec 115 | file is used. 116 | 117 | [Example](https://github.com/anelendata/tap-rest-api/tree/master/examples/usgs/config/tap_config.json): 118 | 119 | ``` 120 | { 121 | "url":"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={start_datetime}&endtime={end_datetime}&minmagnitude={min_magnitude}&limit={items_per_page}&offset={current_offset}&eventtype=earthquake&orderby=time-asc", 122 | "record_list_level": "features[*]", 123 | "timestamp_key": "properties.time", 124 | "schema": "earthquakes", 125 | "items_per_page": 100, 126 | "offset_start": 1, 127 | "auth_method": "no_auth", 128 | "min_magnitude": 1 129 | } 130 | ``` 131 | 132 | Below are some key concepts in the configuration file. 133 | 134 | #### Parametric URL 135 | 136 | You can use `{}` notation to insert the values specified in the config into the URL. 137 | 138 | In addition to the config variables listed in 139 | [default_spec.json](https://github.com/anelendata/tap-rest-api/blob/master/tap_rest_api/default_spec.json) 140 | and the custom spec file, the URL can also contain parameters from the following run-time variables (a sketch of the substitution follows this list): 141 | 142 | - current_offset: Offset by the number of records to skip 143 | - current_page: The current page if the endpoint supports paging 144 | - last_update: The last retrieved value of the column specified by index_key, timestamp_key, or datetime_key 145 | (See next section) 146 |
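As an illustration of how the substitution can be thought of (a conceptual sketch only, not tap-rest-api's actual implementation; the `build_url` helper and the example endpoint are made up):

```python
# Conceptual sketch only -- not tap-rest-api's implementation.
# Shows how {placeholders} in the "url" config value could be resolved
# from static config values plus the run-time variables above.

def build_url(config, current_offset=0, current_page=0, last_update=None):
    params = dict(config)  # static values, e.g. items_per_page, start_datetime
    params.update({        # run-time variables maintained by the tap
        "current_offset": current_offset,
        "current_page": current_page,
        "last_update": last_update,
    })
    return config["url"].format(**params)


config = {
    "url": ("https://example.com/api?start={start_datetime}"
            "&limit={items_per_page}&offset={current_offset}"),
    "start_datetime": "2020-08-06",
    "items_per_page": 100,
}
print(build_url(config, current_offset=200))
# https://example.com/api?start=2020-08-06&limit=100&offset=200
```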
147 | #### timestamp_key, datetime_key, index_key 148 | 149 | If you want to use a timestamp, datetime, or index in the parameterized URL, or 150 | want to use a field of one of those types as a bookmark, one of timestamp_key, 151 | datetime_key, or index_key must be set to indicate which field in the record 152 | corresponds to that data type. 153 | 154 | - timestamp_key: POSIX timestamp 155 | - datetime_key: ISO 8601 formatted datetime (it can be truncated, for example to a date) 156 | It also works when the character between the date and time components is " " instead of "T". 157 | - index_key: A sequential index (integer or string) 158 | 159 | In the USGS example, the individual record contains the top-level objects `properties` 160 | and `geometry`. The timestamp key is `time`, defined under `properties`, so the config 161 | value `timestamp_key` is set to `properties.time`, following the 162 | [jsonpath](https://goessner.net/articles/JsonPath/) specification. 163 | 164 | When you specify timestamp_key, datetime_key, or index_key in the config, 165 | you also need to set start_timestamp, start_datetime, or start_index in the 166 | config or as a command-line argument. 167 | 168 | Optionally, you can set end_timestamp, end_datetime, or end_index so that 169 | the process stops once such a threshold is encountered, assuming the data 170 | is sorted by that field. 171 | 172 | For convenience, start/end_datetime (more human-readable) is also looked 173 | up when timestamp_key is set but start/end_timestamp is not. 174 | 175 | 176 | #### Multi-streams: timestamp_keys, datetime_keys, index_keys 177 | 178 | These dictionary values are used when you want to specify different bookmark types for each stream. 179 | 180 | ``` 181 | { 182 | ... 183 | "datetime_keys": { 184 | "some_stream": "modified_at" 185 | } } 186 | ``` 187 | 188 | #### Record list level and record level 189 | 190 | - record_list_level: 191 | Some APIs wrap a set of records under a property, while others respond with newline-separated JSON objects. 192 | For the former, we need to specify a key so the tap can find the list of records. 193 | The USGS earthquake response is a single JSON object example. The records are listed under the 194 | `features` object, so the config value `record_list_level` is set to the jsonpath `features[*]`. 195 | 196 | - record_level: 197 | Under the individual record, there may be another layer of properties that separates 198 | the data and the metadata, and we may only be interested in the former. If this is the case, 199 | we can specify record_level. In the USGS example, we can ignore the `geometry` object and output 200 | only the content of the `properties` object. Set a jsonpath as the `record_level` config value 201 | to achieve this: 202 | 203 | ``` 204 | { 205 | "url":"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={start_datetime}&endtime={end_datetime}&minmagnitude={min_magnitude}&limit={items_per_page}&offset={current_offset}&eventtype=earthquake&orderby=time-asc", 206 | "record_list_level": "features[*]", 207 | "record_level": "properties", 208 | "timestamp_key": "time", 209 | "schema": "earthquakes", 210 | "items_per_page": 100, 211 | "offset_start": 1, 212 | "auth_method": "no_auth", 213 | "min_magnitude": 1 214 | } 215 | ``` 216 | 217 | #### unnest 218 | 219 | When you want to flatten a nested record, the config below will grab record["some_nested_col"]["modified_at"] and put it in record["modified_at"]: 220 | 221 | ``` 222 | { 223 | ... 224 | "unnest": { 225 | "some_stream": [ 226 | { 227 | "path": "$.some_nested_col.modified_at", 228 | "target": "modified_at" 229 | } ], 230 | ... 231 | }, 232 | ``` 233 | 234 | Note: The schema and catalog must reflect the schema after unnesting. To aid this, infer_schema also applies this transformation before determining the schema. A minimal sketch of the transformation is shown below. 235 |
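For intuition, here is a minimal sketch of the unnest transformation (illustrative only; it assumes the `jsonpath-ng` package for the example, which is not necessarily what the tap uses internally):

```python
# Minimal sketch of the "unnest" transformation -- illustrative only.
from jsonpath_ng import parse  # assumed jsonpath library for this example

def unnest(record, rules):
    # rules: the list configured under "unnest" for one stream, e.g.
    # [{"path": "$.some_nested_col.modified_at", "target": "modified_at"}]
    for rule in rules:
        matches = parse(rule["path"]).find(record)
        if matches:
            record[rule["target"]] = matches[0].value
    return record


record = {"id": 1, "some_nested_col": {"modified_at": "2024-07-16T00:00:00Z"}}
rules = [{"path": "$.some_nested_col.modified_at", "target": "modified_at"}]
print(unnest(record, rules))
# {'id': 1, 'some_nested_col': {'modified_at': '2024-07-16T00:00:00Z'},
#  'modified_at': '2024-07-16T00:00:00Z'}
```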
236 | ### Step 4. Create schema and catalog files 237 | 238 | ``` 239 | $ tap-rest-api custom_spec.json --config config/tap_config.json --schema_dir ./schema --catalog_dir ./catalog --start_datetime="2020-08-06" --infer_schema 240 | ``` 241 | 242 | The schema and catalog files are created under the schema and catalog directories, respectively. 243 | 244 | Note: 245 | 246 | - If no customization is needed, you can omit the spec file (custom_spec.json) 247 | - `start_datetime` and `end_datetime` are copied to `start_timestamp` and `end_timestamp`. 248 | - `end_timestamp` and `end_datetime` are automatically set to the current UTC time if not present in the config file or command-line arguments. 249 | 250 | ### Step 5. Run the tap 251 | 252 | ``` 253 | $ tap-rest-api ./custom_spec.json --config config/tap_config.json --start_datetime="2020-08-06" --catalog ./catalog/earthquakes.json 254 | ``` 255 | 256 | ## Authentication 257 | 258 | The example above does not require login. tap-rest-api currently supports 259 | basic auth. If this is needed, add something like: 260 | 261 | ``` 262 | { 263 | "auth_method": "basic", 264 | "username": "my_username", 265 | "password": "my_password", 266 | ... 267 | } 268 | ``` 269 | 270 | Or add them on the command line: 271 | 272 | ``` 273 | tap-rest-api config/custom_spec.json --config config/tap_config.json --schema_dir ./config/schema --catalog ./config/catalog/some_catalog.json --start_datetime="2020-08-06" --username my_username --password my_password --auth_method basic 274 | ``` 275 | 276 | ## Custom http-headers 277 | 278 | In addition to the authentication method, you can specify HTTP request headers 279 | in the config file: 280 | 281 | Example: 282 | 283 | ``` 284 | ... 285 | "http_headers": 286 | { 287 | "User-Agent": "Mozilla/5.0 (Macintosh; scitylana.singer.io) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", 288 | "Content-type": "application/json", 289 | "Authorization": "Bearer " 290 | }, 291 | ... 292 | ``` 293 | 294 | Here is the default value: 295 | ``` 296 | { 297 | "User-Agent": "Mozilla/5.0 (Macintosh; scitylana.singer.io) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", 298 | "Content-type": "application/json" 299 | } 300 | ``` 301 | 302 | When you define the http_headers config value, the default value is nullified, 303 | so you should redefine "User-Agent" and "Content-type" if you need them. 304 | 305 | ## Multiple streams 306 | 307 | tap-rest-api supports settings for multiple streams. 308 | 309 | - `url` is set as a string and serves as the default value. 310 | - `urls` is a dictionary that overwrites the default `url` for the stream ID given as the dictionary key. 311 | - `{stream}` can be used as a parameter in the URL. 312 | - `timestamp_keys`, `datetime_keys`, and `index_keys` can be set as dictionaries. If a stream ID exists as a key in one of them, that entry is used. If not, the key falls back to the string-valued default, with priority timestamp_key > datetime_key > index_key. 313 | - `datetime_key`, `timestamp_key`, and `index_key` are set as strings and act as the default bookmark keys.
314 | - Active streams must be defined as a comma separated stream IDs either in the config file or in the command `--stream ` 315 | - Streams must be registered in catalog file with `selected: true` ([example](https://github.com/anelendata/tap-rest-api/blob/master/examples/usgs/catalog/earthquakes.json)) 316 | 317 | Here is an example for [Chargify API](https://developers.chargify.com/docs/api-docs) 318 | 319 | ``` 320 | { 321 | "url": "https://{{ subdomain }}.chargify.com/{stream}.json?direction=asc&per_page={items_per_page}&page={current_page_one_base}&date_field={datetime_key}&start_datetime={start_datetime}", 322 | "urls": { 323 | "events": "https://{{ subdomain }}.chargify.com/events.json?direction=asc&per_page={items_per_page}&page={current_page_one_base}&date_field=created_at&since_id={start_index}", 324 | "price_points": "https://{{ subdomain }}.chargify.com/products_price_points.json?direction=asc&per_page={items_per_page}&page={current_page_one_base}&filter[date_field]=updated_at&filter[start_datetime]={start_datetime}&filter[end_datetime]={end_datetime}", 325 | "segments": "https://{{ subdomain }}.chargify.com/components/{{ component_id }}/price_points/{{ price_point_id }}/segments.json?per_page={items_per_page}&page={current_page_one_base}", 326 | "statements": "https://{{ subdomain }}.chargify.com/statements.json?direction=asc&per_page={items_per_page}&page={current_page_one_base}&sort=created_at", 327 | "transactions": "https://{{ subdomain }}.chargify.com/transactions.json?direction=asc&per_page={items_per_page}&page={current_page_one_base}&since_id={start_index}&order_by=id", 328 | "customers_meta": "https://{{ subdomain }}.chargify.com/customers/metadata.json?direction=asc&date_field=updated_at&per_page={items_per_page}&page={current_page_one_base}&with_deleted=true&start_datetime={start_datetime}&end_datetime={end_datetime}", 329 | "subscriptions_meta": "https://{{ subdomain }}.chargify.com/subscriptions/metadata.json?direction=asc&date_field=updated_at&per_page={items_per_page}&page={current_page_one_base}&with_deleted=true&start_datetime={start_datetime}&end_datetime={end_datetime}" 330 | }, 331 | "streams": "components,coupons,customers,events,invoices,price_points,products,product_families,subscriptions,subscriptions_components,transactions", 332 | "auth_method": "basic", 333 | "username": "{{ api_key }}", 334 | "password": "x", 335 | "record_list_level": { 336 | "customers_meta": "$.metadata[*]", 337 | "invoices": "$.invoices[*]", 338 | "price_points": "$.price_points[*]", 339 | "segments": "$.segments[*]", 340 | "subscriptions_components": "$.subscriptions_components[*]", 341 | "subscriptions_meta": "$.metadata[*]" 342 | }, 343 | "record_level": { 344 | "components": "$.component", 345 | "coupons": "$.coupon", 346 | "customers": "$.customer", 347 | "events": "$.event", 348 | "product_families": "$.product_family", 349 | "products": "$.product", 350 | "statements": "$.statement", 351 | "subscriptions": "$.subscription", 352 | "transactions": "$.transaction" 353 | }, 354 | "datetime_key": { 355 | "components": "updated_at", 356 | "coupons": "updated_at", 357 | "customers": "updated_at", 358 | "invoices": "updated_at", 359 | "price_points": "updated_at", 360 | "product_families": "updated_at", 361 | "products": "updated_at", 362 | "subscriptions": "updated_at", 363 | "subscriptions_components": "updated_at" 364 | }, 365 | "index_key": { 366 | "events": "id", 367 | "transactions": "id", 368 | "segments": "id", 369 | "statements": "id", 370 | "customers_meta": "id", 371 
| "subscriptions_meta": "id" 372 | }, 373 | "items_per_page": 200 374 | } 375 | ``` 376 | 377 | ## State 378 | 379 | This tap emits [state](https://github.com/singer-io/getting-started/blob/master/docs/CONFIG_AND_STATE.md#state-file). 380 | The command also takes a state file input with `--state ` option. 381 | The tap itself does not output a state file. It anticipate the target program or a downstream process to fianlize the state safetly and produce a state file. 382 | 383 | ## Raw output mode 384 | 385 | If you want to use this tap outside Singer framework, set `--raw` in the 386 | commandline argument. Then the process write out the records as 387 | newline-separated JSON. 388 | 389 | A use case for this mode is when you expect the schema to change or inconsistent 390 | and you rather want to extract and clean up post-loading. 391 | ([Example](https://articles.anelen.co/elt-google-cloud-storage-bigquery/)) 392 | 393 | ## Schema validation and cleanups 394 | 395 | - on_invalid_property: Behavior when schema validation fails. 396 | - "raise": Raise exception 397 | - "null": Impute with null 398 | - "force" (default): Keep the record value as is (string). This may fail in the singer target. 399 | - drop_unknown_properties: If true, record will exclude unknown (sub-)properties before it's being written to stdout. Default is false. 400 | 401 | Config example to add them: 402 | ``` 403 | { 404 | ... 405 | "on_invalid_property": "force", 406 | "drop_unknown_properties": true, 407 | ... 408 | } 409 | ``` 410 | 411 | # About this project 412 | 413 | This project is developed by 414 | ANELEN and friends. Please check out the ANELEN's 415 | [open innovation philosophy and other projects](https://anelen.co/open-source.html) 416 | 417 | ![ANELEN](https://avatars.githubusercontent.com/u/13533307?s=400&u=a0d24a7330d55ce6db695c5572faf8f490c63898&v=4) 418 | --- 419 | 420 | Copyright © 2020~ Anelen Co., LLC 421 | -------------------------------------------------------------------------------- /bin/release_build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./tests/install_test.sh && 4 | 5 | python3 setup.py clean --all 6 | rm -fr dist 7 | rm -fr build 8 | rm -fr tap_bigquery.egg-info 9 | python3 setup.py sdist bdist_wheel 10 | 11 | # test 12 | # python3 -m twine upload --repository testpypi dist/* 13 | twine upload dist/* 14 | -------------------------------------------------------------------------------- /examples/usgs/catalog/earthquakes.json: -------------------------------------------------------------------------------- 1 | { 2 | "streams": [ 3 | { 4 | "stream": "earthquakes", 5 | "tap_stream_id": "earthquakes", 6 | "schema": { 7 | "type": "object", 8 | "properties": { 9 | "type": { 10 | "type": [ 11 | "null", 12 | "string" 13 | ] 14 | }, 15 | "properties": { 16 | "type": [ 17 | "null", 18 | "object" 19 | ], 20 | "properties": { 21 | "mag": { 22 | "type": [ 23 | "null", 24 | "number" 25 | ] 26 | }, 27 | "place": { 28 | "type": [ 29 | "null", 30 | "string" 31 | ] 32 | }, 33 | "time": { 34 | "type": [ 35 | "null", 36 | "integer" 37 | ] 38 | }, 39 | "updated": { 40 | "type": [ 41 | "null", 42 | "integer" 43 | ] 44 | }, 45 | "tz": { 46 | "type": [ 47 | "null", 48 | "string" 49 | ] 50 | }, 51 | "url": { 52 | "type": [ 53 | "null", 54 | "string" 55 | ] 56 | }, 57 | "detail": { 58 | "type": [ 59 | "null", 60 | "string" 61 | ] 62 | }, 63 | "felt": { 64 | "type": [ 65 | "null", 66 | "integer" 67 | ] 68 | }, 69 | "cdi": { 70 | "type": [ 71 | 
"null", 72 | "number" 73 | ] 74 | }, 75 | "mmi": { 76 | "type": [ 77 | "null", 78 | "number" 79 | ] 80 | }, 81 | "alert": { 82 | "type": [ 83 | "null", 84 | "string" 85 | ] 86 | }, 87 | "status": { 88 | "type": [ 89 | "null", 90 | "string" 91 | ] 92 | }, 93 | "tsunami": { 94 | "type": [ 95 | "null", 96 | "integer" 97 | ] 98 | }, 99 | "sig": { 100 | "type": [ 101 | "null", 102 | "integer" 103 | ] 104 | }, 105 | "net": { 106 | "type": [ 107 | "null", 108 | "string" 109 | ] 110 | }, 111 | "code": { 112 | "type": [ 113 | "null", 114 | "string" 115 | ] 116 | }, 117 | "ids": { 118 | "type": [ 119 | "null", 120 | "string" 121 | ] 122 | }, 123 | "sources": { 124 | "type": [ 125 | "null", 126 | "string" 127 | ] 128 | }, 129 | "types": { 130 | "type": [ 131 | "null", 132 | "string" 133 | ] 134 | }, 135 | "nst": { 136 | "type": [ 137 | "null", 138 | "integer" 139 | ] 140 | }, 141 | "dmin": { 142 | "type": [ 143 | "null", 144 | "number" 145 | ] 146 | }, 147 | "rms": { 148 | "type": [ 149 | "null", 150 | "number" 151 | ] 152 | }, 153 | "gap": { 154 | "type": [ 155 | "null", 156 | "number" 157 | ] 158 | }, 159 | "magType": { 160 | "type": [ 161 | "null", 162 | "string" 163 | ] 164 | }, 165 | "type": { 166 | "type": [ 167 | "null", 168 | "string" 169 | ] 170 | }, 171 | "title": { 172 | "type": [ 173 | "null", 174 | "string" 175 | ] 176 | } 177 | } 178 | }, 179 | "geometry": { 180 | "type": [ 181 | "null", 182 | "object" 183 | ], 184 | "properties": { 185 | "type": { 186 | "type": [ 187 | "null", 188 | "string" 189 | ] 190 | }, 191 | "coordinates": { 192 | "type": [ 193 | "null", 194 | "array" 195 | ], 196 | "items": { 197 | "type": [ 198 | "null", 199 | "number" 200 | ] 201 | } 202 | } 203 | } 204 | }, 205 | "id": { 206 | "type": [ 207 | "null", 208 | "string" 209 | ] 210 | }, 211 | "_sdc_extracted_at": { 212 | "type": [ 213 | "null", 214 | "string" 215 | ], 216 | "format": "date-time" 217 | }, 218 | "_sdc_batched_at": { 219 | "type": [ 220 | "null", 221 | "string" 222 | ], 223 | "format": "date-time" 224 | } 225 | }, 226 | "selected": true 227 | } 228 | } 229 | ] 230 | } -------------------------------------------------------------------------------- /examples/usgs/config/tap_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "url":"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={start_datetime}&endtime={end_datetime}&minmagnitude={min_magnitude}&limit={items_per_page}&offset={current_offset}&eventtype=earthquake&orderby=time-asc", 3 | "timestamp_key": "properties.time", 4 | "min_magnitude": 1, 5 | "schema": "earthquakes", 6 | "record_list_level": "features[*]", 7 | "items_per_page": 100, 8 | "offset_start": 1, 9 | "auth_method": "no_auth", 10 | "http_headers": { 11 | "User-Agent": "Mozilla/5.1 (Macintosh; scitylana.singer.io) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", 12 | "Content-type": "application/json" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /examples/usgs/custom_spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "args": { 3 | "min_magnitude": 4 | { 5 | "type": "integer", 6 | "default": "0", 7 | "help": "Filter based on the minimum magnitude." 
8 | } 9 | } 10 | } 11 | 12 | -------------------------------------------------------------------------------- /examples/usgs/sample_records.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "geometry": { 6 | "type": "Point", 7 | "coordinates": [ 8 | -116.7776667, 9 | 33.6633333, 10 | 11.008 11 | ] 12 | }, 13 | "type": "Feature", 14 | "properties": { 15 | "rms": 0.09, 16 | "code": "11408890", 17 | "cdi": null, 18 | "sources": ",ci,", 19 | "nst": 39, 20 | "tz": -480, 21 | "title": "M 1.3 - 10km SSW of Idyllwild, CA", 22 | "magType": "ml", 23 | "detail": "https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci11408890&format=geojson", 24 | "sig": 26, 25 | "net": "ci", 26 | "type": "earthquake", 27 | "status": "reviewed", 28 | "updated": 1457728844428, 29 | "felt": null, 30 | "alert": null, 31 | "dmin": 0.06729, 32 | "mag": 1.29, 33 | "gap": 51, 34 | "types": ",cap,focal-mechanism,general-link,geoserve,nearby-cities,origin,phase-data,scitech-link,", 35 | "url": "https://earthquake.usgs.gov/earthquakes/eventpage/ci11408890", 36 | "ids": ",ci11408890,", 37 | "tsunami": 0, 38 | "place": "10km SSW of Idyllwild, CA", 39 | "time": 1388620296020, 40 | "mmi": null 41 | }, 42 | "id": "ci11408890" 43 | }, 44 | { 45 | "geometry": { 46 | "type": "Point", 47 | "coordinates": [ 48 | -151.6459, 49 | 63.102, 50 | 14.1 51 | ] 52 | }, 53 | "type": "Feature", 54 | "properties": { 55 | "rms": 0.57, 56 | "code": "01421ig3u", 57 | "cdi": null, 58 | "sources": ",ak,ak,", 59 | "nst": null, 60 | "tz": -540, 61 | "title": "M 1.1 - 117km NW of Talkeetna, Alaska", 62 | "magType": "ml", 63 | "detail": "https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak01421ig3u&format=geojson", 64 | "sig": 19, 65 | "net": "ak", 66 | "type": "earthquake", 67 | "status": "reviewed", 68 | "updated": 1558392330681, 69 | "felt": null, 70 | "alert": null, 71 | "dmin": null, 72 | "mag": 1.1, 73 | "gap": null, 74 | "types": ",associate,geoserve,nearby-cities,origin,phase-data,tectonic-summary,", 75 | "url": "https://earthquake.usgs.gov/earthquakes/eventpage/ak01421ig3u", 76 | "ids": ",ak10992887,ak01421ig3u,", 77 | "tsunami": 0, 78 | "place": "117km NW of Talkeetna, Alaska", 79 | "time": 1388620046501, 80 | "mmi": null 81 | }, 82 | "id": "ak01421ig3u" 83 | }, 84 | { 85 | "geometry": { 86 | "type": "Point", 87 | "coordinates": [ 88 | -150.0165, 89 | 61.4581, 90 | 44.6 91 | ] 92 | }, 93 | "type": "Feature", 94 | "properties": { 95 | "rms": 0.47, 96 | "code": "01421i2zj", 97 | "cdi": null, 98 | "sources": ",ak,ak,", 99 | "nst": null, 100 | "tz": -540, 101 | "title": "M 1.2 - 6km SSW of Big Lake, Alaska", 102 | "magType": "ml", 103 | "detail": "https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak01421i2zj&format=geojson", 104 | "sig": 22, 105 | "net": "ak", 106 | "type": "earthquake", 107 | "status": "reviewed", 108 | "updated": 1558392330249, 109 | "felt": null, 110 | "alert": null, 111 | "dmin": null, 112 | "mag": 1.2, 113 | "gap": null, 114 | "types": ",associate,cap,geoserve,nearby-cities,origin,phase-data,tectonic-summary,", 115 | "url": "https://earthquake.usgs.gov/earthquakes/eventpage/ak01421i2zj", 116 | "ids": ",ak10934318,ak01421i2zj,", 117 | "tsunami": 0, 118 | "place": "6km SSW of Big Lake, Alaska", 119 | "time": 1388619956476, 120 | "mmi": null 121 | }, 122 | "id": "ak01421i2zj" 123 | } 124 | ], 125 | "metadata": { 126 | "status": 200, 127 | "count": 183, 128 | "title": "USGS Earthquakes", 129 | "url": 
"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2014-01-01&endtime=2014-01-02&minmagnitude=1", 130 | "generated": 1596830145000, 131 | "api": "1.10.3" 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /examples/usgs/schema/earthquakes.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "type": { 5 | "type": [ 6 | "null", 7 | "string" 8 | ] 9 | }, 10 | "properties": { 11 | "type": [ 12 | "null", 13 | "object" 14 | ], 15 | "properties": { 16 | "mag": { 17 | "type": [ 18 | "null", 19 | "number" 20 | ] 21 | }, 22 | "place": { 23 | "type": [ 24 | "null", 25 | "string" 26 | ] 27 | }, 28 | "time": { 29 | "type": [ 30 | "null", 31 | "integer" 32 | ] 33 | }, 34 | "updated": { 35 | "type": [ 36 | "null", 37 | "integer" 38 | ] 39 | }, 40 | "tz": { 41 | "type": [ 42 | "null", 43 | "string" 44 | ] 45 | }, 46 | "url": { 47 | "type": [ 48 | "null", 49 | "string" 50 | ] 51 | }, 52 | "detail": { 53 | "type": [ 54 | "null", 55 | "string" 56 | ] 57 | }, 58 | "felt": { 59 | "type": [ 60 | "null", 61 | "integer" 62 | ] 63 | }, 64 | "cdi": { 65 | "type": [ 66 | "null", 67 | "number" 68 | ] 69 | }, 70 | "mmi": { 71 | "type": [ 72 | "null", 73 | "number" 74 | ] 75 | }, 76 | "alert": { 77 | "type": [ 78 | "null", 79 | "string" 80 | ] 81 | }, 82 | "status": { 83 | "type": [ 84 | "null", 85 | "string" 86 | ] 87 | }, 88 | "tsunami": { 89 | "type": [ 90 | "null", 91 | "integer" 92 | ] 93 | }, 94 | "sig": { 95 | "type": [ 96 | "null", 97 | "integer" 98 | ] 99 | }, 100 | "net": { 101 | "type": [ 102 | "null", 103 | "string" 104 | ] 105 | }, 106 | "code": { 107 | "type": [ 108 | "null", 109 | "string" 110 | ] 111 | }, 112 | "ids": { 113 | "type": [ 114 | "null", 115 | "string" 116 | ] 117 | }, 118 | "sources": { 119 | "type": [ 120 | "null", 121 | "string" 122 | ] 123 | }, 124 | "types": { 125 | "type": [ 126 | "null", 127 | "string" 128 | ] 129 | }, 130 | "nst": { 131 | "type": [ 132 | "null", 133 | "integer" 134 | ] 135 | }, 136 | "dmin": { 137 | "type": [ 138 | "null", 139 | "number" 140 | ] 141 | }, 142 | "rms": { 143 | "type": [ 144 | "null", 145 | "number" 146 | ] 147 | }, 148 | "gap": { 149 | "type": [ 150 | "null", 151 | "number" 152 | ] 153 | }, 154 | "magType": { 155 | "type": [ 156 | "null", 157 | "string" 158 | ] 159 | }, 160 | "type": { 161 | "type": [ 162 | "null", 163 | "string" 164 | ] 165 | }, 166 | "title": { 167 | "type": [ 168 | "null", 169 | "string" 170 | ] 171 | } 172 | } 173 | }, 174 | "geometry": { 175 | "type": [ 176 | "null", 177 | "object" 178 | ], 179 | "properties": { 180 | "type": { 181 | "type": [ 182 | "null", 183 | "string" 184 | ] 185 | }, 186 | "coordinates": { 187 | "type": [ 188 | "null", 189 | "array" 190 | ], 191 | "items": { 192 | "type": [ 193 | "null", 194 | "number" 195 | ] 196 | } 197 | } 198 | } 199 | }, 200 | "id": { 201 | "type": [ 202 | "null", 203 | "string" 204 | ] 205 | }, 206 | "_sdc_extracted_at": { 207 | "type": [ 208 | "null", 209 | "string" 210 | ], 211 | "format": "date-time" 212 | }, 213 | "_sdc_batched_at": { 214 | "type": [ 215 | "null", 216 | "string" 217 | ], 218 | "format": "date-time" 219 | } 220 | } 221 | } -------------------------------------------------------------------------------- /examples/usgs/today.sh: -------------------------------------------------------------------------------- 1 | tap-rest-api ./custom_spec.json --config config/tap_config.json --schema_dir ./schema --catalog_dir 
./catalog --catalog ./catalog/earthquakes.json --start_datetime `date +%F` 2 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | bump2version>=0.5.11 2 | coverage>=4.5.4 3 | flake8>=3.7.8 4 | pip>=19.2.3 5 | pytest>=5.4.3 6 | Sphinx>=1.8.5 7 | tox>=3.14.0 8 | twine>=1.14.0 9 | watchdog>=0.9.0 10 | wheel>=0.33.6 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup 3 | 4 | VERSION = "0.2.9" 5 | 6 | with open("README.md", "r") as fh: 7 | long_description = fh.read() 8 | 9 | setup( 10 | name="tap-rest-api", 11 | version=VERSION, 12 | description="Singer.io tap for extracting data from any REST API ", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | author="Daigo Tanaka, Anelen Co., LLC", 16 | url="https://github.com/anelendata/tap-rest-api", 17 | 18 | classifiers=[ 19 | "Development Status :: 4 - Beta", 20 | "License :: OSI Approved :: Apache Software License", 21 | 22 | "Operating System :: MacOS :: MacOS X", 23 | "Operating System :: POSIX :: Linux", 24 | 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12" 29 | ], 30 | 31 | install_requires=[ 32 | "attrs>=18.1.0", 33 | "backoff==1.8.0", 34 | "getschema>=0.2.10", 35 | "jsonschema==2.6.0", 36 | "python-dateutil>=2.7.3", 37 | "requests>=2.20.0", 38 | "simplejson==3.11.1", 39 | "singer-python>=5.2.0", 40 | ], 41 | entry_points=""" 42 | [console_scripts] 43 | tap-rest-api=tap_rest_api:main 44 | """, 45 | packages=["tap_rest_api"], 46 | package_data={ 47 | # Use MANIFEST.ini 48 | }, 49 | include_package_data=True 50 | ) 51 | -------------------------------------------------------------------------------- /tap_rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse, logging, os, sys 3 | import simplejson as json 4 | import singer 5 | from singer import utils 6 | from singer.catalog import Catalog 7 | 8 | from .helper import Stream, get_abs_path 9 | from .sync import sync 10 | from .schema import discover, infer_schema 11 | 12 | LOG_LEVELS = { 13 | "DEBUG": logging.DEBUG, 14 | "INFO": logging.INFO, 15 | "WARNING": logging.WARNING, 16 | "ERROR": logging.ERROR, 17 | "CRITICAL": logging.CRITICAL, 18 | } 19 | LOGGER = singer.get_logger() 20 | 21 | SPEC_FILE = "./tap_rest_api_spec.json" 22 | SPEC = {} 23 | 24 | REQUIRED_CONFIG_KEYS = ["url"] 25 | CONFIG = {} 26 | ENDPOINTS = {} 27 | STREAMS = {} 28 | 29 | 30 | def str2bool(v): 31 | if isinstance(v, bool): 32 | return v 33 | if v.lower() in ("yes", "true", "t", "y", "1"): 34 | return True 35 | elif v.lower() in ("no", "false", "f", "n", "0"): 36 | return False 37 | else: 38 | raise argparse.ArgumentTypeError("Boolean value expected.") 39 | 40 | 41 | TYPES = { 42 | "string": str, 43 | "datetime": str, 44 | "integer": int, 45 | "boolean": str2bool 46 | } 47 | 48 | 49 | def parse_args(spec_file, 
required_config_keys): 50 | ''' This is to replace singer's default utils.parse_args() 51 | https://github.com/singer-io/singer-python/blob/master/singer/utils.py 52 | 53 | Parse standard command-line args. 54 | Parses the command-line arguments mentioned in the SPEC and the 55 | BEST_PRACTICES documents: 56 | -c,--config Config file 57 | -s,--state State file 58 | -d,--discover Run in discover mode 59 | --catalog Catalog file 60 | Returns the parsed args object from argparse. For each argument that 61 | point to JSON files (config, state, properties), we will automatically 62 | load and parse the JSON file. 63 | ''' 64 | # Read default spec file 65 | default_spec = {} 66 | default_spec_file = get_abs_path("default_spec.json") 67 | with open(default_spec_file, "r") as f: 68 | default_spec.update(json.load(f)) 69 | 70 | SPEC.update(default_spec) 71 | 72 | # Overwrite with the custom spec file 73 | custom_spec = {} 74 | 75 | if os.path.isfile(spec_file): 76 | with open(spec_file, "r") as f: 77 | custom_spec.update(json.load(f)) 78 | 79 | SPEC["application"] = custom_spec.get("application", SPEC["application"]) 80 | if custom_spec.get("args"): 81 | SPEC["args"].update(custom_spec.get("args")) 82 | 83 | parser = argparse.ArgumentParser(SPEC["application"]) 84 | 85 | if custom_spec: 86 | parser.add_argument("spec_file", type=str, help="Custom spec file") 87 | 88 | # Capture additional args 89 | for arg in SPEC["args"].keys(): 90 | type_list = SPEC["args"][arg]["type"] 91 | type_ = None 92 | if isinstance(type_list, list): 93 | for t in type_list: 94 | if t.lower() in ["object", "array"]: 95 | continue 96 | type_ = t 97 | else: 98 | type_ = type_list 99 | if not type_: 100 | raise Exception(f"Config spec exception at {arg}") 101 | parser.add_argument( 102 | "--" + arg, 103 | type=TYPES[type_], 104 | default=SPEC["args"][arg].get("default"), 105 | help=SPEC["args"][arg].get("help"), 106 | required=SPEC["args"][arg].get("required", False)) 107 | 108 | # Default singer arguments, commands, and required args 109 | parser.add_argument( 110 | '-c', '--config', 111 | help='Config file', 112 | required=True) 113 | 114 | parser.add_argument( 115 | '-s', '--state', 116 | help='State file') 117 | 118 | parser.add_argument( 119 | '--catalog', 120 | help='Catalog file') 121 | 122 | parser.add_argument( 123 | '-l', '--loglevel', 124 | help='Set log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)', 125 | default='INFO') 126 | 127 | # commands 128 | parser.add_argument( 129 | '-r', '--raw', 130 | action='store_true', 131 | help='Raw output at record level') 132 | 133 | parser.add_argument( 134 | '-d', '--discover', 135 | action='store_true', 136 | help='Do schema discovery') 137 | 138 | parser.add_argument( 139 | '-i', '--infer_schema', 140 | action='store_true', 141 | help='Do infer schema') 142 | 143 | args = parser.parse_args() 144 | 145 | if args.config: 146 | args.config = utils.load_json(args.config) 147 | if args.state: 148 | if os.path.exists(args.state): 149 | args.state = utils.load_json(args.state) 150 | else: 151 | LOGGER.warn(args.state + " was not found.") 152 | args.state = {} 153 | else: 154 | args.state = {} 155 | if args.catalog: 156 | if not os.path.isfile(args.catalog): 157 | raise Exception("Catalog file %s not found" % args.catalog) 158 | args.catalog = Catalog.load(args.catalog) 159 | 160 | utils.check_config(args.config, required_config_keys) 161 | 162 | return args 163 | 164 | 165 | @utils.handle_top_exception(LOGGER) 166 | def main(): 167 | """ 168 | Entry point of tap_rest_api 169 | """ 
170 |     spec_file = ""
171 |     if len(sys.argv) > 1:
172 |         spec_file = sys.argv[1]
173 | 
174 |     args = parse_args(spec_file, REQUIRED_CONFIG_KEYS)
175 | 
176 |     CONFIG.update(args.config)
177 | 
178 |     # Overwrite config specs with command-line args,
179 |     # but we want to skip the args unspecified by the user.
180 |     # The trick is to go back to sys.argv and find the args that begin with "--".
181 |     # We can do this because abbreviation of the args is not allowed.
182 |     args_dict = args.__dict__
183 |     for arg in args_dict.keys():
184 |         if "--" + arg not in sys.argv and CONFIG.get(arg) is not None:
185 |             continue
186 |         CONFIG[arg] = args_dict[arg]
187 | 
188 |     STATE = {}
189 | 
190 |     if args.loglevel:
191 |         log_level = LOG_LEVELS.get(args.loglevel.upper())
192 |         if not log_level:
193 |             raise Exception(f"Log level must be one of {','.join(LOG_LEVELS)}")
194 |         LOGGER.setLevel(log_level)
195 | 
196 | 
197 |     if CONFIG.get("streams"):
198 |         streams = CONFIG["streams"].split(",")
199 |     elif CONFIG.get("schema"):
200 |         streams = [CONFIG["schema"]]
201 |     else:
202 |         raise Exception("Config needs to specify the streams or schema variable.")
203 | 
204 |     for stream in streams:
205 |         stream = stream.strip()
206 |         STREAMS[stream] = Stream(stream, CONFIG)
207 | 
208 |     if args.state:
209 |         STATE.update(args.state)
210 |     LOGGER.debug("State read: %s" % STATE)
211 | 
212 |     if args.infer_schema:
213 |         infer_schema(CONFIG, STREAMS)
214 |     elif args.discover:
215 |         discover(CONFIG, STREAMS)
216 |     elif args.catalog:
217 |         sync(CONFIG, STREAMS, STATE, args.catalog, raw=args.raw)
218 |     else:
219 |         raise Exception("No streams were selected")
220 | 
221 | 
222 | if __name__ == "__main__":
223 |     main()
224 | 
--------------------------------------------------------------------------------
/tap_rest_api/default_spec.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "application": "tap_rest_api",
 3 |   "args":
 4 |   {
 5 |     "streams":
 6 |     {
 7 |       "type": "string",
 8 |       "default": null,
 9 |       "help": "comma-separated list of tap stream IDs"
10 |     },
11 |     "schema_dir":
12 |     {
13 |       "type": "string",
14 |       "default": "./schema",
15 |       "help": "Path to the schema directory"
16 |     },
17 |     "catalog_dir":
18 |     {
19 |       "type": "string",
20 |       "default": "./catalog",
21 |       "help": "Path to the catalog directory"
22 |     },
23 | 
24 |     "url":
25 |     {
26 |       "type": "string",
27 |       "default": null,
28 |       "help": "REST API endpoint with {params}. Required in config."
29 |     },
30 | 
31 |     "auth_method":
32 |     {
33 |       "type": "string",
34 |       "default": "no_auth",
35 |       "help": "HTTP request authentication method: no_auth, basic, or digest"
36 |     },
37 | 
38 |     "http_headers": {
39 |       "type": "string",
40 |       "default": null,
41 |       "help": "JSON-formatted string of HTTP request header key-value pairs" },
42 | 
43 |     "username":
44 |     {
45 |       "type": "string",
46 |       "default": null,
47 |       "help": "username used for authentication if applicable"
48 |     },
49 |     "password":
50 |     {
51 |       "type": "string",
52 |       "default": null,
53 |       "help": "password used for authentication if applicable"
54 |     },
55 | 
56 |     "offset_start":
57 |     {
58 |       "type": "integer",
59 |       "default": 0,
60 |       "help": "Specify the initial value of current_offset"
61 |     },
62 | 
63 |     "page_start":
64 |     {
65 |       "type": "integer",
66 |       "default": 0,
67 |       "help": "Specify the initial value of current_page"
68 |     },
69 | 
70 |     "timestamp_key":
71 |     {
72 |       "type": "string",
73 |       "default": null,
74 |       "help": "The default POSIX timestamp key(column) name when stream key is missing in timestamp_keys.
If this is not null, timestamp_key is ignored." 75 | }, 76 | "timestamp_keys": 77 | { 78 | "type": ["string", "object"], 79 | "default": null, 80 | "help": "POSIX timestamp key(column) name. If this is not null, timestamp_key is ignored. Use dictionary to specify per stream." 81 | }, 82 | "start_timestamp": 83 | { 84 | "type": "integer", 85 | "default": null, 86 | "help": "Start POSIX timestamp" 87 | }, 88 | "end_timestamp": 89 | { 90 | "type": "integer", 91 | "default": null, 92 | "help": "End POSIX timestamp. When this is set, tap only replicates the record with end_timestamp younger than end_timestamp. Once equal or greater than end_datetime is read, the process exits." 93 | }, 94 | 95 | "datetime_key": 96 | { 97 | "type": "string", 98 | "default": null, 99 | "help": "Default datetime key(column) name when stream key is missing from datetime_keys. If this is not null, timestamp_key is ignored." 100 | }, 101 | "datetime_keys": 102 | { 103 | "type": ["string", "object"], 104 | "default": null, 105 | "help": "Datetime key(column) name. If this is not null, timestamp_key is ignored. Use dictionary to specify per stream." 106 | }, 107 | "start_datetime": 108 | { 109 | "type": "datetime", 110 | "default": null, 111 | "help": "Start datetime in ISO 8601 format. As a convenience, this will be automatically converted to timestamp if datetime_key is null and timestamp_key is set and start_timestamp is null." 112 | }, 113 | "end_datetime": 114 | { 115 | "type": "datetime", 116 | "default": null, 117 | "help": "End datetime in ISO 8601 format. When this is set, tap only replicates the record with datetime younger than end_datetime. Once equal or greater than end_datetime is read, the process exits. As a convenience, this will be automatically converted to timestamp if datetime_key is null and timestamp_key is set and end_timestamp is null." 118 | }, 119 | "url_param_datetime_format": 120 | { 121 | "type": "string", 122 | "default": null, 123 | "help": "Datetime format (e.g. '%Y-%m-%d %H:%M:%S.%f'') for URL parameter. If this is set, start_datetime and end_datetime will be converted to this format. If not set, start_datetime and end_datetime will be converted to ISO 8601 format." 124 | }, 125 | "url_param_isoformat_sep": 126 | { 127 | "type": "string", 128 | "default": "T", 129 | "help": "Separator between date and time in ISO 8601 format" 130 | }, 131 | "url_param_isoformat_timespec": 132 | { 133 | "type": "string", 134 | "default": "auto", 135 | "help": "Timespec in ISO 8601 format" 136 | }, 137 | "url_param_isoformat_use_zulu": 138 | { 139 | "type": "boolean", 140 | "default": false, 141 | "help": "When UTC, replace +00:00 with Z" 142 | }, 143 | 144 | "index_key": 145 | { 146 | "type": "string", 147 | "default": null, 148 | "help": "Index key (column) name when disctionary stream key is missing in index_keys." 149 | }, 150 | "index_keys": 151 | { 152 | "type": ["string", "object"], 153 | "default": null, 154 | "help": "Index key (column) name. Use dictionary to specify per stream." 155 | }, 156 | 157 | "start_index": 158 | { 159 | "type": "integer", 160 | "default": 0, 161 | "help": "Starting index number" 162 | }, 163 | "end_index": 164 | { 165 | "type": "integer", 166 | "default": 0, 167 | "help": "When this is set, tap only replicates the record with index younger than end_index. Once equal or greater than end_index is read, the process exits." 
168 | }, 169 | 170 | "items_per_page": 171 | { 172 | "type": "integer", 173 | "default": 100, 174 | "help": "# of items per page if API supports paging" 175 | }, 176 | "assume_sorted": 177 | { 178 | "type": "boolean", 179 | "default": true, 180 | "help": "If true, trust the source data to be presorted by the index/timestamp/datetime keys. So it is safe to finish the replication once the last update index/timestamp/datetime passes the end." 181 | }, 182 | "max_page": 183 | { 184 | "type": "integer", 185 | "default": null, 186 | "help": "If set, stop polling after max_page" 187 | }, 188 | "filter_by_schema": 189 | { 190 | "type": "boolean", 191 | "default": true, 192 | "help": "Filter the records read from the source according to schema. Any fields not present in shema will be removed." 193 | }, 194 | 195 | "record_list_level": 196 | { 197 | "type": "string", 198 | "default": null, 199 | "help": "Set this like 'level_1,level_2...' if the target list is at raw_json_response[level_1][level_2]..." 200 | }, 201 | "record_level": 202 | { 203 | "type": "string", 204 | "default": null, 205 | "help": "Set this like 'level_a,level_b...' if the target object is at raw_individual_record[level_a][level_b]..." 206 | }, 207 | "unnest": 208 | { 209 | "type": ["string", "object"], 210 | "default": null, 211 | "help": "Set this like unnest: {'': [{'path': '$.some.json.path', 'target': 'top_level_col_name'}, ...], ...}" 212 | }, 213 | 214 | "on_invalid_property": 215 | { 216 | "type": "string", 217 | "default": "force", 218 | "help": "Behavior when the schema validation fails. 'raise': Raise exception. 'null': Impute with null. 'force' (default): Keep the record value as is (string). This may fail in the singer target." 219 | }, 220 | "drop_unknown_properties": 221 | { 222 | "type": "boolean", 223 | "default": false, 224 | "help": "If true, record will exclude unknown (sub-)properties before it's being written to stdout. Default is false." 
225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /tap_rest_api/helper.py: -------------------------------------------------------------------------------- 1 | import attr, backoff, dateutil, datetime, hashlib, os, requests 2 | import simplejson as json 3 | from urllib.parse import quote as urlquote 4 | from requests.auth import HTTPBasicAuth, HTTPDigestAuth 5 | from dateutil.tz import tzoffset 6 | 7 | import jsonpath_ng as jsonpath 8 | 9 | import singer 10 | from singer import utils 11 | import singer.metrics as metrics 12 | 13 | 14 | USER_AGENT = ("Mozilla/5.0 (Macintosh; scitylana.singer.io) " + 15 | "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 " + 16 | "Safari/537.36 ") 17 | LOGGER = singer.get_logger() 18 | 19 | 20 | # StitchData compatible timestamp meta data 21 | # https://www.stitchdata.com/docs/data-structure/system-tables-and-columns 22 | # The timestamp of the record extracted from the source 23 | EXTRACT_TIMESTAMP = "_sdc_extracted_at" 24 | # The timestamp of the record submit to the destination 25 | # (kept null at extraction) 26 | BATCH_TIMESTAMP = "_sdc_batched_at" 27 | 28 | 29 | @attr.s 30 | class Stream(object): 31 | tap_stream_id = attr.ib() 32 | kwargs = attr.ib() 33 | 34 | 35 | def get_abs_path(path): 36 | """Returns the absolute path""" 37 | return os.path.join(os.path.dirname(os.path.realpath(__file__)), path) 38 | 39 | 40 | def parse_datetime_tz(datetime_str, default_tz_offset=0): 41 | d = dateutil.parser.parse(datetime_str) 42 | if not d.tzinfo: 43 | d = d.replace(tzinfo=tzoffset(None, default_tz_offset)) 44 | return d 45 | 46 | 47 | def human_readable(bookmark_type, t): 48 | readable = t 49 | if t is not None and bookmark_type == "timestamp": 50 | try: 51 | ds = datetime.datetime.fromtimestamp(t) 52 | except: 53 | raise Exception("bookmark type is set to timestamp, but the value {t} isn't timestamp") 54 | readable = f"{str(t)} ({str(ds)})" 55 | return readable 56 | 57 | 58 | def _get_jsonpath(raw, path): 59 | jsonpath_expr = jsonpath.parse(path) 60 | record = [match.value for match in jsonpath_expr.find(raw)] 61 | return record 62 | 63 | 64 | def get_record(raw_item, record_level): 65 | """ 66 | Dig the items until the target schema 67 | """ 68 | if not record_level: 69 | return raw_item 70 | 71 | record = _get_jsonpath(raw_item, record_level) 72 | if len(record) != 1: 73 | raise Exception(f"jsonpath match records: {len(record)}, expected 1.") 74 | 75 | return record[0] 76 | 77 | 78 | def get_record_list(raw_data, record_list_level): 79 | """ 80 | Dig the raw data to the level that contains the list of the records 81 | """ 82 | if not record_list_level: 83 | return raw_data 84 | data = _get_jsonpath(raw_data, record_list_level) 85 | return data 86 | 87 | 88 | def unnest(data, json_path, target_col_name): 89 | obj = _get_jsonpath(data, json_path) 90 | if (obj): 91 | data[target_col_name] = obj[0] 92 | return data 93 | 94 | 95 | def get_bookmark_type_and_key(config, stream): 96 | """ 97 | If config value timestamp_key, datetime_key, or index_key is a dictionary 98 | and has value for the stream, it will be prioritized. 
99 | Otherwise, timestamp_key > datetime_key > index_key 100 | """ 101 | bm_type = None 102 | bm_key = None 103 | ts_key = config.get("timestamp_key") 104 | dt_key = config.get("datetime_key") 105 | i_key = config.get("index_key") 106 | ts_keys = config.get("timestamp_keys") 107 | dt_keys = config.get("datetime_keys") 108 | i_keys = config.get("index_keys") 109 | 110 | if isinstance(ts_keys, dict) and ts_keys.get(stream): 111 | return "timestamp", ts_keys.get(stream) 112 | if isinstance(dt_keys, dict) and dt_keys.get(stream): 113 | return "datetime", dt_keys.get(stream) 114 | if isinstance(i_keys, dict) and i_keys.get(stream): 115 | return "index", i_keys.get(stream) 116 | 117 | if ts_key: 118 | return "timestamp", ts_key 119 | if dt_key: 120 | return "datetime", dt_key 121 | if i_key: 122 | return "index", i_key 123 | 124 | raise KeyError("You need to set timestamp_key, datetime_key, or index_key") 125 | 126 | 127 | def get_streams_to_sync(streams, state): 128 | '''Get the streams to sync''' 129 | current_stream = singer.get_currently_syncing(state) 130 | result = dict(streams) 131 | 132 | if current_stream: 133 | for key in streams.keys(): 134 | if result[key].tap_stream_id != current_stream: 135 | result.pop(key, None) 136 | 137 | if not result: 138 | raise Exception("Unknown stream {} in state".format(current_stream)) 139 | 140 | return result 141 | 142 | 143 | def get_selected_streams(remaining_streams, annotated_schema): 144 | selected_streams = [] 145 | 146 | for key in remaining_streams.keys(): 147 | stream = remaining_streams[key] 148 | tap_stream_id = stream.tap_stream_id 149 | for stream_idx, annotated_stream in enumerate( 150 | annotated_schema.streams): 151 | if tap_stream_id == annotated_stream.tap_stream_id: 152 | schema = annotated_stream.schema 153 | if (hasattr(schema, "selected")) and (schema.selected is True): 154 | selected_streams.append(stream) 155 | 156 | return selected_streams 157 | 158 | def format_datetime( 159 | config: dict, 160 | dt: datetime.datetime, 161 | ): 162 | if config.get("url_param_datetime_format"): 163 | dt_str = dt.strftime(config["url_param_datetime_format"]) 164 | else: 165 | sep = config.get("url_param_isoformat_sep", "T") 166 | timespec = config.get("url_param_isoformat_timespec", "auto") 167 | dt_str = dt.isoformat(sep, timespec) 168 | if config.get("url_param_isoformat_use_zulu"): 169 | dt_str = dt_str.replace("+00:00", "Z") 170 | return dt_str 171 | 172 | def get_start(config, state, tap_stream_id, bookmark_key): 173 | """ 174 | state file, given by --state prioritizes over the start 175 | value given by config or args 176 | 177 | For human convenience, start_datetime (more human readable) is also looked 178 | up when timestamp_key is set but start_timestamp is not set. 
179 | """ 180 | current_bookmark = singer.get_bookmark(state, tap_stream_id, bookmark_key) 181 | bookmark_type, _ = get_bookmark_type_and_key(config, tap_stream_id) 182 | if current_bookmark is None: 183 | if bookmark_type == "timestamp": 184 | if (not config.get("start_timestamp") and 185 | not config.get("start_datetime")): 186 | raise KeyError("timestamp_key is set but neither " + 187 | "start_timestamp or start_datetime is set") 188 | current_bookmark = config.get("start_timestamp") 189 | if current_bookmark is None: 190 | current_bookmark = dateutil.parser.parse( 191 | config["start_datetime"]).timestamp() 192 | else: 193 | current_bookmark = get_float_timestamp(current_bookmark) 194 | elif bookmark_type == "datetime": 195 | if not config.get("start_datetime"): 196 | raise KeyError( 197 | "datetime_key is set but start_datetime is not set") 198 | current_bookmark = config.get("start_datetime") 199 | elif bookmark_type == "index": 200 | if config.get("start_index") is None: 201 | raise KeyError("index_key is set but start_index is not set") 202 | current_bookmark = config.get("start_index") 203 | 204 | return current_bookmark 205 | 206 | 207 | def get_end(config, tap_stream_id): 208 | """ 209 | For human convenience, end_datetime (more human readable) is also looked 210 | up when timestamp_key is set but end_timestamp is not set. 211 | """ 212 | bookmark_type, _ = get_bookmark_type_and_key(config, tap_stream_id) 213 | if bookmark_type == "timestamp": 214 | end_from_config = config.get("end_timestamp") 215 | if end_from_config is None: 216 | if config.get("end_datetime") is not None: 217 | end_from_config = dateutil.parser.parse( 218 | config["end_datetime"]).timestamp() 219 | else: 220 | end_from_config = datetime.datetime.now().timestamp() 221 | elif bookmark_type == "datetime": 222 | end_from_config = config.get("end_datetime") 223 | if not end_from_config: 224 | end_from_config = format_datetime(config, datetime.datetime.now()) 225 | elif bookmark_type == "index": 226 | end_from_config = config.get("end_index") 227 | return end_from_config 228 | 229 | 230 | def get_digest_from_record(record): 231 | digest = hashlib.md5( 232 | json.dumps(record, sort_keys=True).encode("utf-8") 233 | ).hexdigest() 234 | return digest 235 | 236 | 237 | def get_float_timestamp(ts): 238 | # Handle the data with sub-seconds converted to int 239 | ex_digits = len(str(int(ts))) - 10 240 | value = float(ts) / (pow(10, ex_digits)) 241 | return value 242 | 243 | 244 | def get_last_update(config, tap_stream_id, record, current): 245 | last_update = current 246 | bookmark_type, bookmark_key = get_bookmark_type_and_key(config, tap_stream_id) 247 | if bookmark_type == "timestamp": 248 | value = _get_jsonpath(record, bookmark_key)[0] 249 | if value: 250 | value = get_float_timestamp(value) 251 | if value > current: 252 | last_update = value 253 | else: 254 | KeyError("timestamp_key not found in the record") 255 | elif bookmark_type == "datetime": 256 | value = _get_jsonpath(record, bookmark_key)[0] 257 | if not value: 258 | KeyError("datetime_key not found in the record") 259 | 260 | record_datetime = parse_datetime_tz(value) 261 | current_datetime = parse_datetime_tz(current) 262 | 263 | if record_datetime > current_datetime: 264 | last_update = format_datetime(config, record_datetime) 265 | elif bookmark_type == "index": 266 | current_index = str(_get_jsonpath(record, bookmark_key)[0]) 267 | LOGGER.debug("Last update will be updated from %s to %s" % 268 | (last_update, current_index)) 269 | # When index is an 
integer, it's dangerous to compare 9 and 10 as 270 | # string for example. 271 | try: 272 | current_index = int(current_index) 273 | except ValueError: 274 | if type(last_update) == int: 275 | # When the index suddenly changes to str, fall back to string 276 | LOGGER.warning( 277 | "Previously index was throught to be integer. Now" + 278 | " it seems to be string type. %s %s" % 279 | (last_update, current_index)) 280 | last_update = str(last_update) 281 | if current_index and (not current or current_index > current): 282 | last_update = current_index 283 | else: 284 | KeyError("index_key not found in the record") 285 | else: 286 | raise KeyError( 287 | "Neither timestamp_key, datetime_key, or index_key is set") 288 | return last_update 289 | 290 | 291 | def get_init_endpoint_params(config, state, tap_stream_id): 292 | bookmark_type, bookmark_key = get_bookmark_type_and_key(config, tap_stream_id) 293 | params = dict(config) 294 | start = get_start(config, state, tap_stream_id, "last_update") 295 | end = get_end(config, tap_stream_id) 296 | if bookmark_type == "timestamp": 297 | start_datetime = format_datetime(config, datetime.datetime.fromtimestamp(start)) 298 | end_datetime = format_datetime(config, datetime.datetime.fromtimestamp(end)) 299 | params.update({ 300 | "start_timestamp": start, 301 | "end_timestamp": end, 302 | "start_datetime": start_datetime, 303 | "end_datetime": end_datetime, 304 | "start_date": start_datetime[0:10], 305 | "end_date": end_datetime[0:10], 306 | "timestamp_key": bookmark_key, 307 | }) 308 | elif bookmark_type == "datetime": 309 | params.update({ 310 | "start_timestamp": dateutil.parser.parse(start).timestamp(), 311 | "end_timestamp": dateutil.parser.parse(end).timestamp(), 312 | "start_datetime": start, 313 | "end_datetime": end, 314 | "start_date": start[0:10], 315 | "end_date": end[0:10], 316 | "datetime_key": bookmark_key, 317 | }) 318 | elif bookmark_type == "index": 319 | start_datetime = config.get("start_datetime") 320 | start_date = start_datetime[0:10] if start_datetime else None 321 | end_datetime = config.get("end_datetime") 322 | if not end_datetime: 323 | end_datetime = format_datetime(config, datetime.datetime.utcnow()) 324 | end_date = end_datetime[0:10] if end_datetime else None 325 | params.update({ 326 | "start_datetime": start_datetime, 327 | "end_datetime": end_datetime, 328 | "start_date": start_date, 329 | "end_date": end_date, 330 | "start_index": start, 331 | "end_index": end, 332 | "index_key": bookmark_key, 333 | }) 334 | 335 | params.update( 336 | { 337 | "stream": tap_stream_id, 338 | "current_page": config.get("page_start", 0), 339 | "current_offset": config.get("offset_start", 0), 340 | "last_update": start, 341 | } 342 | ) 343 | 344 | return params 345 | 346 | 347 | def get_http_headers(config=None): 348 | if not config or not config.get("http_headers"): 349 | return {"User-Agent": USER_AGENT, 350 | "Content-type": "application/json"} 351 | 352 | headers = config["http_headers"] 353 | if type(headers) == str: 354 | headers = json.loads(headers) 355 | LOGGER.debug(headers) 356 | return headers 357 | 358 | 359 | def get_endpoint(url_format, tap_stream_id, data): 360 | """ Get the full url for the endpoint including query 361 | 362 | In addition to data passed from config values, it will create "resource" 363 | that is derived from tap_stream_id. 364 | 365 | The special characters in query are quoted with "%XX" 366 | 367 | URL can be something like: 368 | https://api.example.com/1/{resource}? 
\ 369 | last_update_start={start_datetime}&last_update_end={end_datetime}& \ 370 | items_per_page={items_per_page}&page={current_page} 371 | """ 372 | params = dict() 373 | for key in data: 374 | params[key] = urlquote(str(data[key]).encode("utf-8")) 375 | params["resource"] = urlquote(str(tap_stream_id).encode("utf-8")) 376 | return url_format.format(**params) 377 | 378 | 379 | def _giveup(exc): 380 | return exc.response is not None \ 381 | and 400 <= exc.response.status_code < 500 \ 382 | and exc.response.status_code != 429 383 | 384 | 385 | @utils.backoff((requests.exceptions.RequestException,), _giveup) 386 | @utils.ratelimit(20, 1) 387 | def generate_request(stream_id, url, auth_method="no_auth", headers=None, 388 | username=None, password=None): 389 | """ 390 | url: URL with pre-encoded query. See get_endpoint() 391 | """ 392 | if not auth_method or auth_method == "no_auth": 393 | auth = None 394 | elif auth_method == "basic": 395 | auth = HTTPBasicAuth(username, password) 396 | elif auth_method == "digest": 397 | auth = HTTPDigestAuth(username, password) 398 | else: 399 | raise ValueError("Unknown auth method: " + auth_method) 400 | 401 | LOGGER.info("Using %s authentication method." % auth_method) 402 | 403 | headers = headers or get_http_headers() 404 | 405 | with metrics.http_request_timer(stream_id) as timer: 406 | resp = requests.get(url, 407 | headers=headers, 408 | auth=auth) 409 | timer.tags[metrics.Tag.http_status_code] = resp.status_code 410 | resp.raise_for_status() 411 | return resp.json() 412 | -------------------------------------------------------------------------------- /tap_rest_api/schema.py: -------------------------------------------------------------------------------- 1 | import dateutil, os, sys 2 | import simplejson as json 3 | import singer 4 | from singer import utils 5 | 6 | from .helper import (generate_request, get_endpoint, get_init_endpoint_params, 7 | get_record, get_record_list, get_http_headers, unnest, 8 | EXTRACT_TIMESTAMP, BATCH_TIMESTAMP) 9 | import getschema 10 | import jsonschema 11 | 12 | LOGGER = singer.get_logger() 13 | 14 | 15 | def validate(record, schema): 16 | try: 17 | jsonschema.validate(record, schema) 18 | except jsonschema.exceptions.ValidationError: 19 | return False 20 | return True 21 | 22 | 23 | def filter_record( 24 | row, 25 | schema, 26 | on_invalid_property="force", 27 | drop_unknown_properties=False): 28 | """ 29 | Parse the result into types 30 | """ 31 | try: 32 | cleaned = getschema.fix_type( 33 | row, 34 | schema, 35 | on_invalid_property=on_invalid_property, 36 | drop_unknown_properties=drop_unknown_properties, 37 | date_to_datetime=True, 38 | ) 39 | except Exception as e: 40 | LOGGER.debug(row) 41 | raise e 42 | return cleaned 43 | 44 | 45 | def load_schema(schema_dir, entity): 46 | '''Returns the schema for the specified source''' 47 | schema = utils.load_json(os.path.join(schema_dir, "{}.json".format(entity))) 48 | return schema 49 | 50 | 51 | def load_discovered_schema(schema_dir, stream): 52 | '''Attach inclusion automatic to each schema''' 53 | schema = load_schema(schema_dir, stream.tap_stream_id) 54 | for k in schema['properties']: 55 | schema['properties'][k]['inclusion'] = 'automatic' 56 | return schema 57 | 58 | 59 | def _discover_schemas(schema_dir, streams): 60 | '''Iterate through streams, push to an array and return''' 61 | result = {'streams': []} 62 | for key in streams.keys(): 63 | stream = streams[key] 64 | LOGGER.info('Loading schema for %s', stream.tap_stream_id) 65 | 
result['streams'].append({'stream': stream.tap_stream_id, 66 | 'tap_stream_id': stream.tap_stream_id, 67 | 'schema': load_discovered_schema(schema_dir, 68 | stream)}) 69 | return result 70 | 71 | 72 | def discover(config, streams): 73 | """ 74 | JSON dump the schemas to stdout 75 | """ 76 | LOGGER.info("Loading Schemas") 77 | json_str = _discover_schemas(config["schema_dir"], streams) 78 | json.dump(json_str, sys.stdout, indent=2) 79 | 80 | 81 | def infer_schema(config, streams, out_catalog=True, add_tstamp=True): 82 | """ 83 | Infer schema from the sample record list and write JSON schema and 84 | catalog files under schema directory and catalog directory. 85 | """ 86 | max_page = config.get("max_page") 87 | 88 | schemas = {} 89 | for stream in list(streams.keys()): 90 | tap_stream_id = streams[stream].tap_stream_id 91 | 92 | params = get_init_endpoint_params(config, {}, tap_stream_id) 93 | 94 | url = config.get("urls", {}).get(tap_stream_id, config["url"]) 95 | auth_method = config.get("auth_method", "basic") 96 | headers = get_http_headers(config) 97 | records = [] 98 | page_number = 0 99 | offset_number = 0 100 | while True: 101 | params.update({"current_page": page_number}) 102 | params.update({"current_page_one_base": page_number + 1}) 103 | params.update({"current_offset": offset_number}) 104 | 105 | endpoint = get_endpoint(url, tap_stream_id, params) 106 | LOGGER.info("GET %s", endpoint) 107 | data = generate_request(tap_stream_id, endpoint, auth_method, 108 | headers, 109 | config.get("username"), 110 | config.get("password")) 111 | 112 | # In case the record is not at the root level 113 | record_list_level = config.get("record_list_level") 114 | if isinstance(record_list_level, dict): 115 | record_list_level = record_list_level.get(stream) 116 | record_level = config.get("record_level") 117 | if isinstance(record_level, dict): 118 | record_level = record_level.get(stream) 119 | data = get_record_list(data, record_list_level) 120 | 121 | unnest_cols = config.get("unnest", {}).get(tap_stream_id, []) 122 | if unnest_cols: 123 | for i in range(0, len(data)): 124 | for u in unnest_cols: 125 | data[i] = unnest(data[i], u["path"], u["target"]) 126 | 127 | records += data 128 | 129 | # Exit conditions 130 | if len(data) < config["items_per_page"]: 131 | LOGGER.info(("Response is less than set item per page (%d)." + 132 | "Finishing the extraction") % 133 | config["items_per_page"]) 134 | break 135 | if max_page and page_number + 1 >= max_page: 136 | LOGGER.info("Max page %d reached. Finishing the extraction." 
% max_page) 137 | break 138 | 139 | page_number +=1 140 | offset_number += len(data) 141 | 142 | 143 | schema = getschema.infer_schema(records, record_level) 144 | 145 | if add_tstamp: 146 | timestamp_format = {"type": ["null", "string"], 147 | "format": "date-time"} 148 | schema["properties"][EXTRACT_TIMESTAMP] = timestamp_format 149 | schema["properties"][BATCH_TIMESTAMP] = timestamp_format 150 | 151 | if not os.path.exists(config["schema_dir"]): 152 | os.mkdir(config["schema_dir"]) 153 | 154 | schemas[tap_stream_id] = schema 155 | with open(os.path.join(config["schema_dir"], tap_stream_id + ".json"), 156 | "w") as f: 157 | json.dump(schema, f, indent=2) 158 | 159 | if not out_catalog: 160 | return 161 | 162 | catalog = {"streams": []} 163 | for stream in list(streams.keys()): 164 | tap_stream_id = streams[stream].tap_stream_id 165 | schema = schemas[tap_stream_id] 166 | schema["selected"] = True 167 | catalog["streams"].append({ 168 | "stream": tap_stream_id, 169 | "tap_stream_id": tap_stream_id, 170 | "schema": schema, 171 | }) 172 | 173 | if not os.path.exists(config["catalog_dir"]): 174 | os.mkdir(config["catalog_dir"]) 175 | 176 | with open(os.path.join(config["catalog_dir"], "catalog.json"), "w") as f: 177 | json.dump(catalog, f, indent=2) 178 | -------------------------------------------------------------------------------- /tap_rest_api/sync.py: -------------------------------------------------------------------------------- 1 | import datetime, sys, time 2 | import simplejson as json 3 | 4 | import singer 5 | import singer.metrics as metrics 6 | 7 | from .helper import ( 8 | generate_request, 9 | get_bookmark_type_and_key, 10 | get_end, 11 | get_endpoint, 12 | get_init_endpoint_params, 13 | get_last_update, 14 | get_float_timestamp, 15 | get_record, 16 | get_record_list, 17 | get_selected_streams, 18 | get_start, 19 | get_streams_to_sync, 20 | human_readable, 21 | get_http_headers, 22 | get_digest_from_record, 23 | unnest, 24 | EXTRACT_TIMESTAMP, 25 | ) 26 | from .schema import filter_record, load_schema, validate 27 | 28 | 29 | LOGGER = singer.get_logger() 30 | 31 | 32 | def sync_rows(config, state, tap_stream_id, key_properties=[], auth_method=None, 33 | max_page=None, assume_sorted=True, filter_by_schema=True, 34 | raw_output=False): 35 | """ 36 | - max_page: Force sync to end after max_page. Mostly used for debugging. 37 | - assume_sorted: Trust the data to be presorted by the 38 | index/timestamp/datetime keys 39 | so it is safe to finish the replication once the last 40 | update index/timestamp/datetime passes the end. 41 | """ 42 | schema = load_schema(config["schema_dir"], tap_stream_id) 43 | params = get_init_endpoint_params(config, state, tap_stream_id) 44 | 45 | dt_keys = config.get("datetime_keys") 46 | if isinstance(dt_keys, str): 47 | raise Exception(f"{tap_stream_id}: {dt_keys}, {config}") 48 | i_keys = config.get("index_keys") 49 | if isinstance(i_keys, str): 50 | raise Exception(f"{tap_stream_id}: {i_keys}, {config}") 51 | 52 | bookmark_type, _ = get_bookmark_type_and_key(config, tap_stream_id) 53 | 54 | on_invalid_property = config.get("on_invalid_property", "force") 55 | drop_unknown_properties = config.get("drop_unknown_properties", False) 56 | 57 | start = get_start(config, state, tap_stream_id, "last_update") 58 | end = get_end(config, tap_stream_id) 59 | 60 | headers = get_http_headers(config) 61 | 62 | if start is None: 63 | LOGGER.warning("None of timestamp_key, datetime_key, and index_key" + 64 | " are set in conifg. 
Bookmarking is not available.") 65 | 66 | start_str = human_readable(bookmark_type, start) 67 | end_str = human_readable(bookmark_type, end) 68 | # Log the conditions 69 | LOGGER.info("Stream %s has %s set starting %s and ending %s." % 70 | (tap_stream_id, bookmark_type, start_str, end_str)) 71 | # I trust you set URL format contains those params. The behavior depends 72 | # on the data source API's spec. 73 | # I will not filter out the records outside the boundary. Every record 74 | # received is will be written out. 75 | 76 | LOGGER.info("assume_sorted is set to %s" % assume_sorted) 77 | # I trust the data to be sorted by the index/timestamp/datetime keys. 78 | # So it is safe to finish the replication once the last 79 | # update index/timestamp/datetime passes the end. 80 | # When in doubt, set this to False. Always perform post-replication dedup. 81 | 82 | LOGGER.info("filter_by_schema is set to %s." % filter_by_schema) 83 | # The fields undefined/not-conforming to the schema will be written out. 84 | 85 | LOGGER.info("auth_method is set to %s" % auth_method) 86 | 87 | # Initialize the counters 88 | last_update = start 89 | next_last_update = None 90 | 91 | # Offset is the number of records (vs. page) 92 | offset_number = params.get("current_offset", 0) 93 | page_number = params.get("current_page", 0) 94 | 95 | # When we rely on index/datetime/timestamp to parse the next GET URL, 96 | # we will get the record we have already seen in the current process. 97 | # When we get last_record_extracted from state file, we can also 98 | # compare with the previous process to further avoiding duplicated 99 | # records in the target data store. 100 | prev_written_record = None 101 | last_record_extracted = singer.get_bookmark(state, tap_stream_id, 102 | "last_record_extracted") 103 | if last_record_extracted: 104 | prev_written_record = json.loads(last_record_extracted) 105 | 106 | # First writ out the schema 107 | if raw_output is False: 108 | singer.write_schema(tap_stream_id, schema, key_properties) 109 | 110 | # Fetch and iterate over to write the records 111 | with metrics.record_counter(tap_stream_id) as counter: 112 | while True: 113 | params.update({"current_page": page_number}) 114 | params.update({"current_page_one_base": page_number + 1}) 115 | params.update({"current_offset": offset_number}) 116 | params.update({"last_update": last_update}) 117 | 118 | url = config.get("urls", {}).get(tap_stream_id, config["url"]) 119 | endpoint = get_endpoint(url, tap_stream_id, params) 120 | LOGGER.info("GET %s", endpoint) 121 | 122 | rows = generate_request(tap_stream_id, endpoint, auth_method, 123 | headers, 124 | config.get("username"), 125 | config.get("password")) 126 | 127 | # In case the record is not at the root level 128 | record_list_level = config.get("record_list_level") 129 | if isinstance(record_list_level, dict): 130 | record_list_level = record_list_level.get(tap_stream_id) 131 | record_level = config.get("record_level") 132 | if isinstance(record_level, dict): 133 | record_level = record_level.get(tap_stream_id) 134 | 135 | rows = get_record_list(rows, record_list_level) 136 | 137 | LOGGER.info("Current page %d" % page_number) 138 | LOGGER.info("Current offset %d" % offset_number) 139 | 140 | for row in rows: 141 | record = get_record(row, record_level) 142 | 143 | unnest_cols = config.get("unnest", {}).get(tap_stream_id, []) 144 | for u in unnest_cols: 145 | record = unnest(record, u["path"], u["target"]) 146 | 147 | if filter_by_schema: 148 | record = filter_record( 149 | record, 
150 | schema, 151 | on_invalid_property=on_invalid_property, 152 | drop_unknown_properties=drop_unknown_properties, 153 | ) 154 | 155 | if not validate(record, schema): 156 | LOGGER.debug("Skipping the schema invalidated row %s" % record) 157 | continue 158 | 159 | # It's important to compare the record before adding 160 | # EXTRACT_TIMESTAMP 161 | digest = get_digest_from_record(record) 162 | digest_dict = {"digest": digest} 163 | # backward compatibility 164 | if (prev_written_record == record or 165 | prev_written_record == digest_dict): 166 | LOGGER.info( 167 | "Skipping the duplicated row with " 168 | f"digest {digest}" 169 | ) 170 | continue 171 | 172 | if EXTRACT_TIMESTAMP in schema["properties"].keys(): 173 | extract_tstamp = datetime.datetime.utcnow() 174 | extract_tstamp = extract_tstamp.replace( 175 | tzinfo=datetime.timezone.utc) 176 | record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat() 177 | 178 | try: 179 | next_last_update = get_last_update(config, tap_stream_id, record, last_update) 180 | except Exception as e: 181 | LOGGER.error(f"Error with the record:\n {row}\n message: {e}") 182 | raise 183 | 184 | if not end or next_last_update < end: 185 | if raw_output: 186 | sys.stdout.write(json.dumps(record) + "\n") 187 | else: 188 | singer.write_record(tap_stream_id, record) 189 | 190 | counter.increment() # Increment only when we write 191 | last_update = next_last_update 192 | 193 | # prev_written_record may be persisted for the next run. 194 | # EXTRACT_TIMESTAMP will be different. So popping it out 195 | # before storing. 196 | record.pop(EXTRACT_TIMESTAMP) 197 | digest = get_digest_from_record(record) 198 | prev_written_record = {"digest": digest} 199 | 200 | # Exit conditions 201 | if len(rows) < config["items_per_page"]: 202 | LOGGER.info(("Response is less than set item per page (%d)." + 203 | "Finishing the extraction") % 204 | config["items_per_page"]) 205 | break 206 | if max_page and page_number + 1 >= max_page: 207 | LOGGER.info("Max page %d reached. Finishing the extraction." % max_page) 208 | break 209 | if assume_sorted and end and (next_last_update and next_last_update >= end): 210 | LOGGER.info(("Record greater than %s and assume_sorted is" + 211 | " set. Finishing the extraction.") % end) 212 | break 213 | 214 | page_number +=1 215 | offset_number += len(rows) 216 | 217 | # If timestamp_key is not integerized, do so at millisecond level 218 | if bookmark_type == "timestamp" and len(str(int(last_update))) == 10: 219 | last_update = int(last_update * 1000) 220 | 221 | state = singer.write_bookmark(state, tap_stream_id, "last_update", 222 | last_update) 223 | if prev_written_record: 224 | state = singer.write_bookmark(state, tap_stream_id, 225 | "last_record_extracted", 226 | json.dumps(prev_written_record)) 227 | 228 | if raw_output == False: 229 | singer.write_state(state) 230 | 231 | return state 232 | 233 | 234 | def sync(config, streams, state, catalog, raw=False): 235 | """ 236 | Sync the streams that were selected 237 | 238 | - max_page: Stop after making this number of API call is made. 239 | - assume_sorted: Assume the data to be sorted and exit the process as soon 240 | as a record having greater than end index/datetime/timestamp is detected. 241 | - auth_method: HTTP auth method (basic, no_auth, digest) 242 | - filter_by_schema: When True, check the extracted records against the 243 | schema and undefined/unmatching fields won't be written out. 
244 | - raw: Output raw JSON records to stdout 245 | """ 246 | max_page = config.get("max_page") 247 | auth_method = config.get("auth_method", "basic") 248 | assume_sorted = config.get("assume_sorted", True) 249 | filter_by_schema = config.get("filter_by_schema", True) 250 | 251 | start_process_at = datetime.datetime.now() 252 | remaining_streams = get_streams_to_sync(streams, state) 253 | selected_streams = get_selected_streams(remaining_streams, catalog) 254 | 255 | if len(selected_streams) < 1: 256 | raise Exception("No Streams selected, please check that you have a " + 257 | "schema selected in your catalog") 258 | 259 | LOGGER.info("Starting sync. Will sync these streams: %s" % 260 | [stream.tap_stream_id for stream in selected_streams]) 261 | 262 | if not state.get("bookmarks"): 263 | state["bookmarks"] = {} 264 | for stream in selected_streams: 265 | dt_keys = config.get("datetime_keys") 266 | if isinstance(dt_keys, str): 267 | raise Exception(f"{stream.tap_stream_id}: {dt_keys}, {config}") 268 | i_keys = config.get("index_keys") 269 | if isinstance(i_keys, str): 270 | raise Exception(f"{stream.tap_stream_id}: {i_keys}, {config}") 271 | 272 | LOGGER.info("%s Start sync" % stream.tap_stream_id) 273 | 274 | current_state = dict(state) 275 | singer.set_currently_syncing(current_state, stream.tap_stream_id) 276 | if raw is False: 277 | singer.write_state(current_state) 278 | 279 | try: 280 | sync_rows( 281 | config, 282 | current_state, 283 | stream.tap_stream_id, 284 | max_page=max_page, 285 | auth_method=auth_method, 286 | assume_sorted=assume_sorted, 287 | raw_output=raw, 288 | filter_by_schema=filter_by_schema) 289 | except Exception as e: 290 | LOGGER.critical(e) 291 | raise e 292 | 293 | if not state["bookmarks"].get(stream.tap_stream_id): 294 | state["bookmarks"][stream.tap_stream_id] = current_state["bookmarks"][stream.tap_stream_id] 295 | else: 296 | state["bookmarks"][stream.tap_stream_id].update( 297 | current_state["bookmarks"][stream.tap_stream_id]) 298 | if raw is False: 299 | singer.write_state(state) 300 | 301 | bookmark_type, _ = get_bookmark_type_and_key(config, stream.tap_stream_id) 302 | last_update = state["bookmarks"][stream.tap_stream_id]["last_update"] 303 | if bookmark_type == "timestamp": 304 | last_update = str(last_update) + " (" + str( 305 | datetime.datetime.fromtimestamp(get_float_timestamp(last_update))) + ")" 306 | LOGGER.info("%s End sync" % stream.tap_stream_id) 307 | LOGGER.info("%s Last record's %s: %s" % 308 | (stream.tap_stream_id, bookmark_type, last_update)) 309 | 310 | end_process_at = datetime.datetime.now() 311 | LOGGER.info("Completed sync at %s" % str(end_process_at)) 312 | LOGGER.info("Process duration: " + str(end_process_at - start_process_at)) 313 | -------------------------------------------------------------------------------- /tests/install_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | APP=tap-rest-api 4 | 5 | PYTHON=/opt/python/3.6/bin/python 6 | if [ ! 
-e $PYTHON ]; then
 7 |     PYTHON=`which python3`
 8 | fi
 9 | echo $PYTHON
10 | 
11 | if [ -e ./install_test ]; then
12 |     rm -fr install_test
13 | fi
14 | 
15 | $PYTHON -m venv install_test
16 | source install_test/bin/activate;
17 | find $APP -name '__pycache__' | xargs rm -fr;
18 | python setup.py clean --all;
19 | rm -fr dist;
20 | rm -fr build;
21 | rm -fr $APP.egg-info;
22 | python setup.py install;
23 | 
24 | SITE_PKG_DIR="./install_test/lib/python3.6/site-packages"
25 | PKG_DIR=`ls $SITE_PKG_DIR | grep $APP`
26 | 
27 | # tree $SITE_PKG_DIR/$PKG_DIR/$APP
28 | DIFF=`diff --exclude=__pycache__ -r $SITE_PKG_DIR/$PKG_DIR/$APP ./$APP`
29 | if [ -z "$DIFF" ]
30 | then
31 |     echo "All files are included in the package.";
32 | else
33 |     echo "$DIFF"
34 |     echo "Check MANIFEST.in"
35 |     exit 1;
36 | fi
37 | 
38 | # Note: Don't insert spaces in the next line
39 | $APP&>install_test/msg
40 | CMD_OUT=`cat install_test/msg | grep "usage:"`
41 | if [ -z "$CMD_OUT" ]; then
42 |     cat install_test/msg
43 |     echo "$APP is not properly installed"
44 |     exit 1;
45 | else
46 |     echo "$APP command is returning the expected message."
47 | fi
48 | 
49 | deactivate
50 | echo "Install test finished successfully"
51 | 
--------------------------------------------------------------------------------
/tests/test_usgs.py:
--------------------------------------------------------------------------------
 1 | import datetime, os, tempfile
 2 | from tap_rest_api import sync
 3 | from tap_rest_api.schema import infer_schema
 4 | from tap_rest_api.helper import Stream
 5 | from singer import utils
 6 | from singer.catalog import Catalog
 7 | 
 8 | 
 9 | def _prep_config():
10 |     cwd, _ = os.path.split(__file__)
11 |     usgs_dir = os.path.join(cwd, "../examples/usgs")
12 |     config = utils.load_json(os.path.join(usgs_dir, "config/tap_config.json"))
13 |     config["schema_dir"] = os.path.join(usgs_dir, "schema")
14 |     config["catalog_dir"] = os.path.join(usgs_dir, "catalog")
15 |     catalog = Catalog.load(os.path.join(usgs_dir, config["catalog_dir"],
16 |                                         "earthquakes.json"))
17 |     config["start_datetime"] = (datetime.datetime.now() -
18 |                                 datetime.timedelta(hours=1)).isoformat()
19 |     streams = {}
20 |     streams["earthquakes"] = Stream("earthquakes", config)
21 |     return config, catalog, streams
22 | 
23 | 
24 | def test_infer_schema():
25 |     config, catalog, streams = _prep_config()
26 |     with tempfile.TemporaryDirectory() as build_dir:
27 |         config["schema_dir"] = build_dir
28 |         config["catalog_dir"] = build_dir
29 |         infer_schema(config, streams)
30 | 
31 | 
32 | def test_sync():
33 |     config, catalog, streams = _prep_config()
34 |     state = {}
35 |     # sync() reads auth_method from config, not as a keyword argument
36 |     config["auth_method"] = "no_auth"
37 |     sync(config, streams, state, catalog)
38 | 
--------------------------------------------------------------------------------
/tests/unit/test_headers.py:
--------------------------------------------------------------------------------
 1 | from tap_rest_api.helper import get_http_headers, USER_AGENT
 2 | 
 3 | 
 4 | DEFAULT_HEADERS = {"User-Agent": USER_AGENT,
 5 |                    "Content-type": "application/json"}
 6 | 
 7 | 
 8 | def test_default():
 9 |     h = get_http_headers()
10 |     assert h == DEFAULT_HEADERS
11 | 
12 | 
13 | def test_agent_overwrite():
14 |     ua = ("Mozilla/5.1 (Macintosh; scitylana.singer.io) " +
15 |           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 " +
16 |           "Safari/537.36 ")
17 |     config = {"http_headers": {"User-Agent": ua,
18 |                                "Content-type": "application/json",
19 |                                "Bearer": "xxxxyyyy"}}
20 | 
21 |     h = get_http_headers(config)
22 | 
23 |     assert h == config["http_headers"]
24 | 
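25 | 
26 | # Illustrative addition (not part of the original suite): get_http_headers()
27 | # also accepts http_headers as a JSON-formatted string and parses it with
28 | # json.loads, so a minimal sketch of that case could look like the following.
29 | def test_json_string_headers():
30 |     config = {"http_headers": '{"Bearer": "xxxxyyyy"}'}
31 |     h = get_http_headers(config)
32 |     assert h == {"Bearer": "xxxxyyyy"}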
--------------------------------------------------------------------------------