├── .dockerignore ├── .env.example ├── .github └── workflows │ └── build.yaml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── Dockerfile ├── LICENSE.md ├── README.md ├── assets ├── scrape_interval.png └── scraping_history.png ├── docker-compose.yaml ├── migrations ├── 20210722150010_initial_migration.down.sql ├── 20210722150010_initial_migration.up.sql ├── 20211111232248_scrape_resource_priority.down.sql ├── 20211111232248_scrape_resource_priority.up.sql ├── 20211121211206_scraped_at.down.sql ├── 20211121211206_scraped_at.up.sql ├── 20211211152424_unique_amqp_source.down.sql ├── 20211211152424_unique_amqp_source.up.sql ├── 20211211152751_populate_amqp_source.down.sql ├── 20211211152751_populate_amqp_source.up.sql ├── 20211221055954_official_source.down.sql ├── 20211221055954_official_source.up.sql ├── 20211221061158_official_source_nullable.down.sql └── 20211221061158_official_source_nullable.up.sql ├── sqlx-data.json └── src ├── api ├── mod.rs └── v1 │ ├── mod.rs │ ├── providers.rs │ └── stats.rs ├── db.rs ├── dispatcher ├── amqp.rs ├── discord.rs ├── dispatcher.rs └── mod.rs ├── lib.rs ├── main.rs ├── models.rs ├── request.rs ├── scheduler ├── mod.rs ├── priority.rs ├── rate_limiter.rs └── scheduler.rs ├── scraper ├── mod.rs ├── providers │ ├── mod.rs │ ├── pinterest.rs │ ├── providers.rs │ ├── twitter.rs │ ├── twitter_types.rs │ ├── united_cube.rs │ └── weverse.rs └── scraper.rs └── server.rs /.dockerignore: -------------------------------------------------------------------------------- 1 | target 2 | .vscode 3 | dist 4 | manifests 5 | Dockerfile 6 | docker-compose.yaml -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DATABASE_URL=postgres://postgres:password@localhost:5431/jiu 2 | USER_AGENT="Jiu Scraper (https://github.com/xetera/jiu)" 3 | WEVERSE_EMAIL= 4 | WEVERSE_PASSWORD= -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | 8 | jobs: 9 | docker: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up QEMU 14 | uses: docker/setup-qemu-action@v1 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v1 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v1 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 22 | - name: Cache Docker layers 23 | uses: actions/cache@v2 24 | with: 25 | path: /tmp/.buildx-cache 26 | key: ${{ runner.os }}-buildx-${{ github.sha }} 27 | restore-keys: | 28 | ${{ runner.os }}-buildx- 29 | - name: Build and push 30 | uses: docker/build-push-action@v2 31 | with: 32 | context: . 33 | push: true 34 | tags: xetera/jiu:latest 35 | cache-from: type=local,src=/tmp/.buildx-cache 36 | cache-to: type=local,dest=/tmp/.buildx-cache-new 37 | # This ugly bit is necessary if you don't want your cache to grow forever 38 | # till it hits GitHub's limit of 5GB. 
39 | # Temp fix 40 | # https://github.com/docker/build-push-action/issues/252 41 | # https://github.com/moby/buildkit/issues/1896 42 | - name: Move cache 43 | run: | 44 | rm -rf /tmp/.buildx-cache 45 | mv /tmp/.buildx-cache-new /tmp/.buildx-cache 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | **/*.rs.bk 3 | **/dist 4 | .vscode 5 | env.toml 6 | seed.sql 7 | .env 8 | .idea 9 | *.dump 10 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jiu" 3 | version = "0.1.1" 4 | authors = ["Xetera"] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | reqwest = { version = "0.11.4", features = ["json", "multipart"] } 9 | chrono = { version= "0.4.19", features = ["serde"]} 10 | async-trait = "0.1.50" 11 | async-recursion = "0.3.2" 12 | tokio = { version = "1.8.1", features = ["full"] } 13 | tokio-test = "0.4.2" 14 | serde = { version = "1.0.126", features = ["derive"] } 15 | serde_json = "1.0.59" 16 | url = "2.2.2" 17 | better-panic = "0.2.0" 18 | futures = "0.3.15" 19 | thiserror = "1.0.26" 20 | num-traits = "0.2.14" 21 | sqlx = { version = "0.5.5", features = ["runtime-tokio-rustls", "postgres", "chrono", "offline", "json", "bigdecimal"] } 22 | dotenv = "0.15.0" 23 | log = "0.4.0" 24 | env_logger = "0.8.4" 25 | anyhow = "1.0.42" 26 | strum = { version = "0.21.0", features = ["derive"] } 27 | strum_macros = "0.21.1" 28 | rsa = "0.4.0" 29 | sha-1 = "0.9.7" 30 | digest = "0.9.0" 31 | base64 = "0.13.0" 32 | rand = "0.8.4" 33 | regex = "1.5.4" 34 | lazy_static = "1.4.0" 35 | dyn-clone = "1.0.4" 36 | bimap = "0.6.1" 37 | governor = "0.3.2" 38 | nonzero_ext = "0.2.0" 39 | axum = "0.3.4" 40 | tower = "0.4.11" 41 | itertools = "0.10.1" 42 | parking_lot = "0.11.1" 43 | tokio-amqp = "1.0.0" 44 | lapin = "1.8.1" 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1.55.0-buster as builder 2 | 3 | # First build a dummy project with our dependencies to cache them in Docker 4 | WORKDIR /usr/src 5 | RUN cargo new --bin builder 6 | WORKDIR /usr/src/builder 7 | COPY ./Cargo.lock ./Cargo.lock 8 | COPY ./Cargo.toml ./Cargo.toml 9 | RUN --mount=type=cache,target=/usr/local/cargo/registry cargo build --release 10 | RUN rm src/*.rs 11 | 12 | # Now copy the sources and do the real build 13 | ADD src src 14 | ADD sqlx-data.json sqlx-data.json 15 | ENV SQLX_OFFLINE true 16 | 17 | RUN cargo build --release 18 | 19 | # Second stage putting the build result into a debian jessie-slim image 20 | FROM debian:buster-slim 21 | 22 | RUN apt-get update \ 23 | && apt-get install -y ca-certificates tzdata libc6 \ 24 | && rm -rf /var/lib/apt/lists/* 25 | ENV NAME=rust-docker 26 | ENV RUST_LOG=jiu=debug 27 | COPY --from=builder /usr/src/builder/target/release/jiu /usr/local/bin/jiu 28 | CMD ["/usr/local/bin/jiu"] 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | 6 | Scrape multiple media providers on a cron job and dispatch webhooks when changes are detected. 7 |
8 | 9 | ## Jiu 10 | 11 | Jiu is a multi-threaded media scraper capable of juggling thousands of endpoints from different providers with unique 12 | restrictions/requirements. 13 | 14 | It is built to fetch media posted on different sites in a slow, eventually consistent fashion, 15 | not for instant change detection. 16 | 17 | ## Providers 18 | 19 | A provider is the umbrella term that encapsulates all endpoints for a given domain. 20 | 21 | For example, https://weverse.io/bts/artist and https://weverse.io/dreamcatcher/artist are 2 endpoints under the Weverse 22 | provider. 23 | 24 | ### Supported providers 25 | 26 | * [Twitter](https://twitter.com/RBW_MAMAMOO) 27 | * [Pinterest Boards](https://www.pinterest.com/janairaoliveira314/handong) 28 | * [Weverse.io](https://weverse.io/dreamcatcher/feed) 29 | * [United Cube](https://www.united-cube.com/) 30 | 31 | ## Dynamic Priority & Tokens 32 | 33 | Dynamic priority is the main idea behind how Jiu can scrape many resources without getting rate limited. 34 | 35 | Unique endpoints that have more than 1 token are grouped by their provider type and get scheduled to be scraped at even 36 | intervals at the start of every day to avoid hammering APIs with requests. 37 | 38 | ![](./assets/scrape_interval.png) 39 | 40 | After each successful request, a 30-day sliding window of that endpoint's request history gets graded on a curve that 41 | determines how its priority should change based on how many new images were found in each request. 42 | 43 | ![](./assets/scraping_history.png) 44 | 45 | Pages that post at least one image regularly get assigned a higher priority, up to a maximum of 3 requests every 2 days. 46 | Pages that don't post anything sink down to a scrape schedule of once every 2 weeks. 47 | 48 | Results found on more recent dates contribute more to the priority than those found further back. This curve 49 | allows Jiu to match its request frequency to the changing posting schedule of the sites it's processing and avoid wasting 50 | requests on resources that are rarely updated. 51 | 52 | At the end of each day, every endpoint gets tokens added to it equal to its current priority, and those tokens are checked as a 53 | criterion when scheduling requests the next day. 54 | 55 | ## Authorization 56 | 57 | Anonymous requests are always preferred when possible. 58 | 59 | There is a customizable login flow for providers that require authorization, which allows logging into APIs after an 60 | authorization error and persists additional data (such as a JWT token) to be shared across each provider during the 61 | lifetime of the process. 62 | 63 | The login flow is reverse engineered for providers that don't have a public API. 64 | 65 | > Juggling multiple accounts per provider is currently not supported and probably won't be as long as your accounts aren't getting banned (and if they are, you're sending too many requests and need to increase your rate limits). 66 | 67 | Jiu will try its best to identify itself in its requests' `User-Agent` header, but will submit a fake UA for providers 68 | that gate posts behind a user-agent check, like Twitter. 69 | 70 | ## Proxies 71 | 72 | Proxies are not supported or needed. 73 | 74 | ## Webhooks 75 | 76 | Jiu is capable of sending webhooks to multiple destinations when an update for a provider is detected.
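As a rough sketch of the receiving side (this is not part of Jiu itself; the `/webhook` path, the port, and the crate choices are illustrative assumptions), a consumer only needs to accept a JSON `POST` matching the payload shown below. A minimal receiver using axum and serde, deserializing only a few of the fields, might look like this:

```rust
use axum::{routing::post, Json, Router};
use serde::Deserialize;

// Only a subset of Jiu's payload is declared here; serde ignores
// undeclared fields (account, metadata, images, ...) by default.
#[derive(Deserialize)]
struct Provider {
    #[serde(rename = "type")]
    kind: String,
    id: String,
}

#[derive(Deserialize)]
struct Post {
    unique_identifier: String,
    url: String,
}

#[derive(Deserialize)]
struct WebhookPayload {
    provider: Provider,
    posts: Vec<Post>,
}

async fn receive(Json(payload): Json<WebhookPayload>) -> &'static str {
    println!(
        "update from {} ({}): {} post(s)",
        payload.provider.kind,
        payload.provider.id,
        payload.posts.len()
    );
    for post in &payload.posts {
        println!("  {} -> {}", post.unique_identifier, post.url);
    }
    "ok"
}

#[tokio::main]
async fn main() {
    // The path and port are arbitrary; register whichever URL this
    // service is reachable at as a webhook destination in Jiu.
    let app = Router::new().route("/webhook", post(receive));
    axum::Server::bind(&"0.0.0.0:3000".parse().unwrap())
        .serve(app.into_make_service())
        .await
        .unwrap();
}
```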
77 | 78 | Although data about posts is aggregated into webhooks, it is not persisted to the database; storing it is the responsibility of the service receiving the events, and it is not relevant for image aggregation. 79 | 80 | ```json 81 | { 82 | "provider": { 83 | "type": "twitter.timeline", 84 | "id": "729935154290925570", 85 | "ephemeral": false 86 | }, 87 | "posts": [ 88 | { 89 | "unique_identifier": "1460196926796623873", 90 | "body": "[#가현] 삐뚤빼뚤 즐거운 라이브였다❣️ 다음 주에도 재밌는 시간 보내 보카?\n\n#드림캐쳐 #Dreamcatcher #4주_집콕_프로젝트 https://t.co/r1ImPUPKkv", 91 | "url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873", 92 | "post_date": null, 93 | "account": { 94 | "name":"드림캐쳐 Dreamcatcher", 95 | "avatar_url":"https://pbs.twimg.com/profile_images/1415983453200261124/4-viIm27_normal.jpg" 96 | }, 97 | "metadata": { 98 | "language": "ko", 99 | "like_count": 12474, 100 | "retweet_count": 2760 101 | }, 102 | "images": [ 103 | { 104 | "type": "Image", 105 | "media_url": "https://pbs.twimg.com/media/FEOpVKmagAELmzI.jpg", 106 | "reference_url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873/photo/1", 107 | "unique_identifier": "1460196885285994497", 108 | "metadata": { 109 | "width": 1128, 110 | "height": 1504 111 | } 112 | }, 113 | { 114 | "type": "Image", 115 | "media_url": "https://pbs.twimg.com/media/FEOpV2FaAAEG4zr.jpg", 116 | "reference_url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873/photo/2", 117 | "unique_identifier": "1460196896958709761", 118 | "metadata": { 119 | "width": 1128, 120 | "height": 1504 121 | } 122 | } 123 | ] 124 | } 125 | ] 126 | } 127 | ``` 128 | 129 | Every provider has its own `provider_metadata` field that _may_ contain extra information about the image or the post it 130 | was found under, but may also be missing. _Documentation WIP_ 131 | 132 | The `unique_identifier` field is unique **per provider** and not globally. 133 | 134 | The `ephemeral` field defines whether an image is only accessible for a short period after dispatch (for example, 135 | Instagram image links expire after some time). 136 | 137 | If a Discord webhook URL is detected, the payload is changed to allow Discord to display the images in the channel. 138 | 139 | There is currently no retry mechanism for webhooks that fail to deliver successfully. 140 | 141 | ## Endpoints 142 | 143 | Jiu runs a webserver on port 8080 to allow dynamically resolving new resources by URL and getting stats at runtime. 144 | 145 | - `POST /v1/provider` Create a new provider by resolving a URL to a resource 146 | - `DELETE /v1/provider` Delete an existing provider (sets it to `enabled=false`) 147 | - `GET /v1/schedule` Get the upcoming scheduled scrapes 148 | - `GET /v1/history` The list of the last 100 scraped endpoints 149 | - `GET /v1/stats` The stats of all the registered providers 150 | 151 | ## Jiu is **NOT**: 152 | 153 | * For bombarding sites like Twitter with requests to detect changes within seconds. 154 | * Capable of executing JavaScript with a headless browser. 155 | * Able to send requests to any social media site without explicit support. 156 | 157 | ## Jiu **IS**: 158 | 159 | * For slowly monitoring changes in different feeds over the course of multiple hours without abusing the provider. 160 | * Capable of adjusting the frequency of scrapes based on how frequently the source is updated. 161 | * Able to send webhooks or push to AMQP on discovery. 162 | * The lead singer of [Dreamcatcher](https://www.youtube.com/watch?v=1QD0FeZyDtQ). 163 | 164 | ## Usage 165 | 166 | 1.
Copy over `.env.example` to `.env` and fill out the relevant fields. 167 | 2. `docker-compose up -d jiu_db` to start Postgres. 168 | 3. `RUST_LOG=jiu cargo run` to start the crawler. 169 | 170 | To create a production-ready image, make sure to run `cargo sqlx prepare` before building if you modified any of the 171 | SQL queries. 172 | 173 | > If you would like to use this project, please change the `USER_AGENT` environment variable to identify your crawler accurately. 174 | 175 | Built for [kiyomi.io](https://github.com/xetera/kiyomi) 176 | -------------------------------------------------------------------------------- /assets/scrape_interval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xetera/jiu/e3a54c908f17359f1233b28b4bcdab31f5b249b8/assets/scrape_interval.png -------------------------------------------------------------------------------- /assets/scraping_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xetera/jiu/e3a54c908f17359f1233b28b4bcdab31f5b249b8/assets/scraping_history.png -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | networks: 4 | jiu_net: 5 | 6 | volumes: 7 | jiu_data: 8 | 9 | services: 10 | jiu_db: 11 | image: postgres:13 12 | container_name: jiu_db 13 | volumes: 14 | - jiu_data:/var/lib/postgresql/data 15 | environment: 16 | POSTGRES_USER: postgres 17 | POSTGRES_PASSWORD: password 18 | POSTGRES_DB: jiu 19 | networks: 20 | - jiu_net 21 | ports: 22 | - 5431:5432 23 | jiu: 24 | image: rust:1.55 25 | volumes: 26 | - ./:/app 27 | networks: 28 | - jiu_net 29 | environment: 30 | USER: xetera 31 | DATABASE_URL: postgres://postgres:password@jiu_db:5432/jiu -------------------------------------------------------------------------------- /migrations/20210722150010_initial_migration.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | DROP TABLE webhook_invocation; 3 | 4 | DROP TABLE webhook_source; 5 | 6 | DROP TABLE scrape_error; 7 | 8 | DROP TABLE media; 9 | 10 | DROP TABLE scrape_request; 11 | 12 | DROP TABLE scrape; 13 | 14 | DROP TABLE webhook; 15 | -------------------------------------------------------------------------------- /migrations/20210722150010_initial_migration.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | CREATE TABLE IF NOT EXISTS webhook( 3 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 4 | destination TEXT NOT NULL, 5 | created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(), 6 | updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(), 7 | -- extra data attached to a webhook invocation 8 | metadata JSONB 9 | ); 10 | 11 | CREATE TABLE IF NOT EXISTS provider_resource( 12 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 13 | -- This can be a FQDN or an identifier that maps to a unique API endpoint 14 | -- on the provider's end 15 | destination TEXT NOT NULL, 16 | name TEXT NOT NULL, 17 | enabled BOOLEAN DEFAULT True, 18 | -- the url for the scraped page 19 | url TEXT NOT NULL, 20 | priority INTEGER NOT NULL DEFAULT 5 CHECK(priority >= 1 AND priority <= 10), 21 | last_scrape TIMESTAMP WITHOUT TIME ZONE NULL, 22 | -- the date the last scrape was requested; this acts as a lock to
prevent resources from being accessed multiple times 23 | last_queue TIMESTAMP WITHOUT TIME ZONE NULL, 24 | created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(), 25 | UNIQUE(destination, name) 26 | ); 27 | 28 | CREATE TABLE IF NOT EXISTS scrape( 29 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 30 | provider_name TEXT, 31 | provider_destination TEXT, 32 | -- the priority this scrape was executed against 33 | priority INTEGER NOT NULL CHECK(priority >= 1 AND priority <= 10), 34 | FOREIGN KEY (provider_name, provider_destination) 35 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE 36 | ); 37 | 38 | -- each scrape can have more than one request 39 | CREATE TABLE IF NOT EXISTS scrape_request( 40 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 41 | scrape_id INTEGER REFERENCES scrape(id), 42 | page INTEGER NOT NULL DEFAULT 1, 43 | response_code INTEGER, 44 | -- how long did the response take in ms 45 | response_delay INTEGER, 46 | scraped_at TIMESTAMP WITHOUT TIME ZONE NOT NULL 47 | ); 48 | 49 | CREATE TABLE IF NOT EXISTS media( 50 | -- This is necessary when trying to sort media that were 51 | -- crawled at the same time 52 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 53 | provider_name TEXT, 54 | provider_destination TEXT, 55 | scrape_request_id INTEGER REFERENCES scrape_request(id) ON DELETE SET NULL, 56 | -- We are assuming there is only one type of url 57 | image_url TEXT NOT NULL UNIQUE, 58 | page_url TEXT NULL, 59 | reference_url TEXT NULL, 60 | -- a unique identifier that's specific to the provider 61 | unique_identifier TEXT NOT NULL, 62 | -- where the image is coming from 63 | -- could be null if the provider doesn't have the information 64 | posted_at TIMESTAMP WITHOUT TIME ZONE NULL, 65 | discovered_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, 66 | UNIQUE(unique_identifier, provider_name), 67 | FOREIGN KEY (provider_name, provider_destination) 68 | REFERENCES provider_resource(name, destination) ON UPDATE CASCADE ON DELETE SET NULL 69 | ); 70 | 71 | CREATE TABLE IF NOT EXISTS scrape_error( 72 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 73 | -- already declared in scrape_request 74 | -- response_code INTEGER, 75 | response_body TEXT NOT NULL DEFAULT '', 76 | response_code TEXT NOT NULL, 77 | message TEXT NULL, 78 | scrape_id INTEGER NOT NULL REFERENCES scrape(id) 79 | ); 80 | 81 | CREATE TABLE IF NOT EXISTS webhook_source( 82 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 83 | webhook_id INTEGER REFERENCES webhook(id), 84 | provider_name TEXT, 85 | provider_destination TEXT, 86 | FOREIGN KEY (provider_name, provider_destination) 87 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE 88 | ); 89 | 90 | CREATE TABLE IF NOT EXISTS webhook_invocation( 91 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 92 | scrape_id INTEGER /* NOT NULL */ REFERENCES scrape(id), 93 | webhook_id INTEGER /* NOT NULL */ REFERENCES webhook(id) ON DELETE SET NULL, 94 | response_code INTEGER, 95 | response_delay INTEGER, 96 | invoked_at TIMESTAMPTZ NOT NULL DEFAULT NOW() 97 | ); 98 | 99 | CREATE INDEX ON scrape (provider_destination, provider_name); 100 | -------------------------------------------------------------------------------- /migrations/20211111232248_scrape_resource_priority.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | ALTER TABLE provider_resource DROP COLUMN tokens; 3 | ALTER TABLE 
provider_resource ALTER COLUMN priority type integer; 4 | ALTER TABLE scrape ALTER COLUMN priority type integer; 5 | 6 | DROP TABLE amqp_source; -------------------------------------------------------------------------------- /migrations/20211111232248_scrape_resource_priority.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE provider_resource ADD COLUMN last_token_update TIMESTAMP WITHOUT TIME ZONE NULL; 3 | ALTER TABLE provider_resource ADD COLUMN tokens DECIMAL NOT NULL DEFAULT 1.0; 4 | ALTER TABLE provider_resource ALTER COLUMN priority type decimal; 5 | ALTER TABLE scrape ALTER COLUMN priority type decimal; 6 | ALTER TABLE scrape ALTER COLUMN priority set not null; 7 | ALTER TABLE scrape ALTER COLUMN priority set default 1.0; 8 | ALTER TABLE webhook ADD CONSTRAINT unique_destination UNIQUE(destination); 9 | ALTER TABLE provider_resource DROP CONSTRAINT provider_resource_priority_check; 10 | ALTER TABLE scrape DROP CONSTRAINT scrape_priority_check; 11 | ALTER TABLE webhook DROP COLUMN metadata; 12 | ALTER TABLE webhook_source ADD COLUMN metadata JSONB; 13 | ALTER TABLE provider_resource ADD COLUMN default_name TEXT; 14 | 15 | CREATE TABLE IF NOT EXISTS amqp_source( 16 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 17 | provider_name TEXT, 18 | provider_destination TEXT, 19 | metadata JSONB, 20 | FOREIGN KEY (provider_name, provider_destination) 21 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE 22 | ); 23 | -------------------------------------------------------------------------------- /migrations/20211121211206_scraped_at.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | -------------------------------------------------------------------------------- /migrations/20211121211206_scraped_at.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE scrape ADD COLUMN IF NOT EXISTS scraped_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(); 3 | -------------------------------------------------------------------------------- /migrations/20211211152424_unique_amqp_source.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | 3 | ALTER TABLE amqp_source DROP CONSTRAINT amqp_unique_providers; 4 | -------------------------------------------------------------------------------- /migrations/20211211152424_unique_amqp_source.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE amqp_source ADD CONSTRAINT amqp_unique_providers UNIQUE (provider_name, provider_destination); 3 | -------------------------------------------------------------------------------- /migrations/20211211152751_populate_amqp_source.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | -------------------------------------------------------------------------------- /migrations/20211211152751_populate_amqp_source.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | INSERT INTO amqp_source (provider_name, provider_destination, metadata) 3 | SELECT name, destination, '{}' from provider_resource 4 | ON CONFLICT DO NOTHING; 5 | 
-------------------------------------------------------------------------------- /migrations/20211221055954_official_source.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | ALTER TABLE provider_resource DROP COLUMN official; 3 | -------------------------------------------------------------------------------- /migrations/20211221055954_official_source.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE provider_resource ADD COLUMN official boolean default false; -------------------------------------------------------------------------------- /migrations/20211221061158_official_source_nullable.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | ALTER TABLE provider_resource ALTER COLUMN official DROP NOT NULL; 3 | -------------------------------------------------------------------------------- /migrations/20211221061158_official_source_nullable.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE provider_resource ALTER COLUMN official SET NOT NULL; -------------------------------------------------------------------------------- /src/api/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::db::Database; 2 | use axum::body::{Bytes, Full}; 3 | use axum::http::{Response, StatusCode}; 4 | use axum::response::IntoResponse; 5 | use axum::Json; 6 | use serde_json::json; 7 | use std::convert::Infallible; 8 | use std::sync::Arc; 9 | use crate::scraper::ProviderMap; 10 | 11 | pub mod v1; 12 | 13 | pub struct Context { 14 | pub db: Arc, 15 | pub providers: Arc 16 | } 17 | 18 | pub enum AppError { 19 | SomeError(anyhow::Error), 20 | SqlxError(sqlx::Error), 21 | } 22 | 23 | impl From for AppError { 24 | fn from(inner: anyhow::Error) -> Self { 25 | AppError::SomeError(inner) 26 | } 27 | } 28 | 29 | impl From for AppError { 30 | fn from(inner: sqlx::Error) -> Self { 31 | AppError::SqlxError(inner) 32 | } 33 | } 34 | 35 | impl IntoResponse for AppError { 36 | type Body = Full; 37 | type BodyError = Infallible; 38 | 39 | fn into_response(self) -> Response { 40 | let (status, error_message) = match self { 41 | AppError::SomeError(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()), 42 | AppError::SqlxError(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()), 43 | }; 44 | 45 | let body = Json(json!({ 46 | "error": error_message, 47 | })); 48 | 49 | (status, body).into_response() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/api/v1/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod stats; 2 | pub mod providers; 3 | 4 | pub use stats::*; 5 | -------------------------------------------------------------------------------- /src/api/v1/providers.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use axum::extract::Extension; 4 | use axum::Json; 5 | use log::{debug, error, info}; 6 | use serde::{Deserialize, Serialize}; 7 | use serde_json::{json, Map, Value}; 8 | 9 | use crate::api::{AppError, Context}; 10 | use crate::scraper::{CanonicalUrlResolution, ProviderFailure, WorkableDomain}; 11 | 12 | #[derive(Deserialize)] 13 | pub struct 
ProviderAdd { 14 | url: String, 15 | name: String, 16 | official: bool, 17 | metadata: Option, 18 | add_to_amqp: Option, 19 | } 20 | 21 | #[derive(Serialize)] 22 | pub enum ProviderAddResponse { 23 | InvalidUrl { 24 | url: String, 25 | }, 26 | ProviderExists { 27 | provider: String, 28 | destination: String, 29 | }, 30 | InternalError, 31 | NotImplemented, 32 | Success { 33 | provider: String, 34 | destination: String, 35 | }, 36 | } 37 | 38 | pub async fn v1_add_provider( 39 | Extension(state): Extension>, 40 | Json(input): Json, 41 | ) -> Result, AppError> { 42 | let result = state 43 | .providers 44 | .values() 45 | .find_map(|p| p.match_domain(&input.url).map(|res| (p, res))); 46 | let (provider, domain) = match result { 47 | Some((provider, domain)) => (provider, domain), 48 | None => { 49 | debug!("Url {} was not valid", input.url); 50 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url })); 51 | } 52 | }; 53 | let introspectable = match domain { 54 | WorkableDomain::ToCanonical(resource) => resource, 55 | _ => { 56 | debug!( 57 | "WorkableDomain {:?} from [{}] was not detected as Canonical", 58 | provider.id(), 59 | input.url 60 | ); 61 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url })); 62 | } 63 | }; 64 | let response = provider.introspect_resource(&introspectable).await; 65 | let destination = match response { 66 | Ok(CanonicalUrlResolution::Success { destination }) => destination, 67 | Ok(CanonicalUrlResolution::Fail(reason)) => { 68 | error!("{:?}", reason); 69 | return Ok(Json(ProviderAddResponse::InternalError)); 70 | } 71 | Ok(CanonicalUrlResolution::NotImplemented) => { 72 | return Ok(Json(ProviderAddResponse::NotImplemented)) 73 | } 74 | Err(ProviderFailure::Url) => { 75 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url })); 76 | } 77 | Err(other) => { 78 | return Ok(Json(ProviderAddResponse::InternalError)); 79 | } 80 | }; 81 | info!( 82 | "Successfully resolved [destination: {}] for [{:?}]", 83 | destination, 84 | provider.id() 85 | ); 86 | let provider_name = provider.id().to_string(); 87 | if let Err(sqlx::Error::Database(err)) = sqlx::query!( 88 | "INSERT INTO provider_resource (destination, name, default_name, official, url) VALUES 89 | ($1, $2, $3, $4, $5)", 90 | destination, 91 | provider_name, 92 | input.name, 93 | input.official, 94 | input.url 95 | ) 96 | .execute(&*state.db) 97 | .await 98 | { 99 | // the most disgusting way of checking for primary key conflicts 100 | if err.code().map(|code| code == "23505").unwrap_or(false) { 101 | return Ok(Json(ProviderAddResponse::ProviderExists { 102 | destination, 103 | provider: provider_name, 104 | })); 105 | } 106 | }; 107 | // there's a conflict 108 | // TODO: decouple this kiyomi-specific thing out? 
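// When requested, the block below also registers the resource as an AMQP source;
// the ON CONFLICT upsert keeps the stored metadata current for an existing
// (provider_name, provider_destination) pair.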
109 | if input.add_to_amqp.unwrap_or(false) { 110 | sqlx::query!( 111 | "INSERT INTO amqp_source (provider_name, provider_destination, metadata) 112 | VALUES ($1, $2, $3) 113 | ON CONFLICT(provider_name, provider_destination) DO UPDATE SET metadata = $3", 114 | provider_name, 115 | destination, 116 | input.metadata.unwrap_or(Value::Object(Map::new())) 117 | ) 118 | .fetch_optional(&*state.db) 119 | .await?; 120 | } 121 | Ok(Json(ProviderAddResponse::Success { 122 | destination, 123 | provider: provider_name, 124 | })) 125 | } 126 | 127 | #[derive(Deserialize)] 128 | pub struct ProviderDelete { 129 | name: String, 130 | destination: String, 131 | } 132 | 133 | #[derive(Serialize)] 134 | pub struct ProviderDeleteResponse { 135 | modified: bool, 136 | } 137 | 138 | pub async fn v1_delete_provider( 139 | Extension(state): Extension>, 140 | Json(input): Json, 141 | ) -> Result, AppError> { 142 | let result = sqlx::query!( 143 | "UPDATE provider_resource SET enabled = False WHERE name = $1 and destination = $2 RETURNING *", 144 | input.name, 145 | input.destination, 146 | ) 147 | .fetch_optional(&*state.db) 148 | .await?; 149 | return Ok(Json(ProviderDeleteResponse { 150 | modified: result.is_some(), 151 | })); 152 | } 153 | -------------------------------------------------------------------------------- /src/api/v1/stats.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use axum::extract::Extension; 4 | use axum::Json; 5 | use chrono::{DateTime, Duration, NaiveDateTime, Utc}; 6 | use num_traits::ToPrimitive; 7 | use serde::Serialize; 8 | use sqlx::types::BigDecimal; 9 | 10 | use crate::api::{AppError, Context}; 11 | use crate::models::ScrapeHistory; 12 | 13 | struct ScheduledProvider { 14 | id: i32, 15 | url: String, 16 | name: String, 17 | destination: String, 18 | priority: BigDecimal, 19 | tokens: BigDecimal, 20 | default_name: Option, 21 | last_queue: Option, 22 | metadata: Option, 23 | official: bool, 24 | } 25 | 26 | #[derive(Serialize)] 27 | pub struct ScheduledProviderRun { 28 | id: i32, 29 | provider: String, 30 | url: String, 31 | destination: String, 32 | wait_days: i16, 33 | metadata: Option, 34 | name: String, 35 | official: bool, 36 | } 37 | 38 | struct PreviousScrapeRow { 39 | id: i32, 40 | name: String, 41 | url: String, 42 | destination: String, 43 | date: Option, 44 | // last_post: Option, 45 | priority: BigDecimal, 46 | default_name: Option, 47 | official: bool, 48 | discovered_media: Option, 49 | } 50 | 51 | #[derive(Serialize)] 52 | pub struct PreviousScrape { 53 | id: i32, 54 | name: String, 55 | url: String, 56 | destination: String, 57 | // TODO: make this column not-null 58 | date: Option, 59 | // last_post: Option, 60 | // last_scrape: Option, 61 | // last_post: Option, 62 | priority: f32, 63 | default_name: Option, 64 | official: bool, 65 | discovered_media: i64, 66 | } 67 | 68 | #[derive(Serialize)] 69 | pub struct ScheduleResponse { 70 | pub history: Vec, 71 | pub scheduled: Vec, 72 | } 73 | 74 | pub async fn v1_scheduled_scrapes( 75 | Extension(state): Extension>, 76 | ) -> Result>, AppError> { 77 | let rows = sqlx::query_as!( 78 | ScheduledProvider, 79 | "SELECT pr.id, pr.official, pr.priority, pr.name, pr.destination, pr.url, pr.tokens, pr.last_queue, pr.default_name, ( 80 | SELECT metadata FROM amqp_source where provider_destination = pr.destination and provider_name = pr.name 81 | ) as metadata FROM provider_resource pr" 82 | ) 83 | .fetch_all(&*state.db) 84 | .await?; 85 | let (today, later): 
(Vec, Vec) = 86 | rows.into_iter().partition(|e| { 87 | let now = Utc::now().naive_utc(); 88 | // anything that was queued in the last 24 hours is already being scraped 89 | // it's not SUPER accurate since it's possible but 90 | // we only need a general idea, not precision 91 | e.last_queue 92 | .map(|last_queue| { 93 | let yesterday = now - Duration::hours(24); 94 | last_queue > yesterday 95 | }) 96 | .unwrap_or(false) 97 | }); 98 | let labeled = later 99 | .into_iter() 100 | .map(|row| { 101 | let wait_days = ((1f32 / (row.priority + row.tokens)) 102 | .to_f32() 103 | .unwrap_or(0f32)) 104 | .floor() as i16; 105 | ScheduledProviderRun { 106 | destination: row.destination, 107 | provider: row.name, 108 | id: row.id, 109 | url: row.url, 110 | official: row.official, 111 | wait_days, 112 | metadata: row.metadata, 113 | name: row.default_name.unwrap_or_default(), 114 | } 115 | }) 116 | .collect::>(); 117 | let mut scheduled = today 118 | .into_iter() 119 | .map(|t| ScheduledProviderRun { 120 | destination: t.destination, 121 | provider: t.name, 122 | official: t.official, 123 | id: t.id, 124 | url: t.url, 125 | wait_days: 0, 126 | metadata: t.metadata, 127 | name: t.default_name.unwrap_or_default(), 128 | }) 129 | .collect::>(); 130 | scheduled.extend(labeled); 131 | Ok(Json(scheduled)) 132 | } 133 | 134 | pub async fn v1_scrape_history( 135 | Extension(state): Extension>, 136 | ) -> Result>, AppError> { 137 | let previous = sqlx::query_as!( 138 | PreviousScrapeRow, 139 | "SELECT scrape.id, 140 | pr.url, 141 | pr.default_name, 142 | pr.official, 143 | pr.name, 144 | pr.destination, 145 | scrape.priority, 146 | scrape.scraped_at as date, 147 | COALESCE((SELECT COUNT(*) 148 | from media 149 | inner join public.scrape_request sr on sr.id = media.scrape_request_id 150 | inner join scrape s on s.id = sr.scrape_id 151 | where sr.scrape_id = scrape.id), 0) as discovered_media 152 | FROM scrape 153 | INNER JOIN provider_resource pr on pr.destination = scrape.provider_destination 154 | and scrape.provider_name = pr.name 155 | ORDER BY scrape.scraped_at desc 156 | LIMIT 100 157 | " 158 | ) 159 | .fetch_all(&*state.db) 160 | .await?; 161 | let history = previous 162 | .into_iter() 163 | .map(|row| PreviousScrape { 164 | id: row.id, 165 | name: row.name, 166 | url: row.url, 167 | date: row.date, 168 | destination: row.destination, 169 | // we shouldn't need this, but sqlx doesn't understand the semantics 170 | // of COALESCE for some reason 171 | discovered_media: row.discovered_media.unwrap_or(0), 172 | priority: row.priority.to_f32().unwrap(), 173 | default_name: row.default_name, 174 | official: row.official, 175 | }) 176 | .collect::>(); 177 | Ok(Json(history)) 178 | } 179 | 180 | #[derive(Serialize)] 181 | pub struct ProviderStat { 182 | name: String, 183 | destination: String, 184 | enabled: bool, 185 | url: String, 186 | priority: f32, 187 | tokens: f32, 188 | // TODO: why is this nullable? 
189 | created_at: Option, 190 | default_name: Option, 191 | official: bool, 192 | last_scrape: Option, 193 | last_post: Option, 194 | discovered_images: i64, 195 | scrape_count: i64, 196 | } 197 | 198 | #[derive(Serialize)] 199 | pub struct ProviderStatsResponse { 200 | stats: Vec, 201 | } 202 | 203 | pub async fn v1_provider_stats( 204 | Extension(state): Extension>, 205 | ) -> Result, AppError> { 206 | let stats = sqlx::query!( 207 | "SELECT pr.id, 208 | pr.name, 209 | pr.destination, 210 | pr.enabled, 211 | pr.url, 212 | pr.priority, 213 | pr.tokens, 214 | pr.created_at, 215 | pr.default_name, 216 | pr.official, 217 | (SELECT Max(sr.scraped_at) 218 | FROM scrape_request sr 219 | inner join scrape s on pr.destination = s.provider_destination) as last_scrape, 220 | (SELECT MAX(posted_at) 221 | FROM media 222 | INNER JOIN public.scrape_request s on s.id = media.scrape_request_id 223 | inner join scrape s2 on s2.id = s.scrape_id 224 | where s2.provider_destination = pr.destination 225 | and s2.provider_name = pr.name 226 | ) as last_post, 227 | (SELECT COUNT(s3.*) 228 | from media 229 | inner join public.scrape_request r on r.id = media.scrape_request_id 230 | inner join scrape s3 on s3.id = r.scrape_id 231 | where s3.provider_name = pr.name 232 | and s3.provider_destination = pr.destination 233 | ) as discovered_images, 234 | (SELECT COUNT(*) from scrape inner join scrape_request sr2 on scrape.id = sr2.scrape_id 235 | where scrape.provider_destination = pr.destination and scrape.provider_name = pr.name 236 | ) as scrape_count 237 | FROM provider_resource pr;" 238 | ) 239 | .fetch_all(&*state.db) 240 | .await?; 241 | let data = ProviderStatsResponse { 242 | stats: stats 243 | .iter() 244 | .map(|stat| ProviderStat { 245 | name: stat.name.clone(), 246 | destination: stat.destination.clone(), 247 | enabled: stat.enabled.unwrap_or(false), 248 | url: stat.url.clone(), 249 | priority: stat.priority.to_f32().unwrap_or(0f32), 250 | tokens: stat.tokens.to_f32().unwrap_or(0f32), 251 | created_at: stat.created_at, 252 | default_name: stat.default_name.clone(), 253 | official: stat.official, 254 | last_scrape: stat.last_scrape, 255 | last_post: stat.last_post, 256 | discovered_images: stat.discovered_images.unwrap_or(0), 257 | scrape_count: stat.scrape_count.unwrap_or(0), 258 | }) 259 | .collect::>(), 260 | }; 261 | Ok(Json(data)) 262 | } 263 | 264 | // (SELECT Max(scraped_at) FROM scrape_request sr where sr.scrape_id = scrape.id) as last_scrape, 265 | // (SELECT MAX(posted_at) FROM media 266 | // INNER JOIN public.scrape_request s on s.id = media.scrape_request_id 267 | // inner join scrape s2 on s2.id = s.scrape_id 268 | // where s2.id = scrape.id 269 | // ) as last_post 270 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::env; 3 | use std::iter::FromIterator; 4 | 5 | use anyhow::bail; 6 | use itertools::Itertools; 7 | use log::error; 8 | use sqlx::postgres::PgPoolOptions; 9 | use sqlx::{Error, Pool, Postgres}; 10 | 11 | use crate::dispatcher::dispatcher::WebhookInteraction; 12 | use crate::models::{ 13 | AMQPDestination, DatabaseWebhook, PendingProvider, ScrapeRequestMedia, ScrapeRequestWithMedia, 14 | }; 15 | use crate::request::HttpError; 16 | use crate::scraper::scraper::{Scrape, ScraperStep}; 17 | use crate::scraper::{ProviderFailure, ScopedProvider}; 18 | 19 | pub type Database = Pool; 20 | 21 | pub async fn connect() -> 
Result { 22 | Ok(PgPoolOptions::new() 23 | .max_connections(5) 24 | .connect(&env::var("DATABASE_URL").expect("No DATABASE_URL env")) 25 | .await?) 26 | } 27 | 28 | // Grab the latest N images from a relevant provider destination 29 | pub async fn latest_media_ids_from_provider( 30 | db: &Database, 31 | provider: &ScopedProvider, 32 | ) -> anyhow::Result> { 33 | let out = sqlx::query!( 34 | "SELECT unique_identifier FROM media 35 | WHERE provider_name = $1 AND provider_destination = $2 36 | order by id desc, discovered_at desc limit 100", 37 | provider.name.to_string(), 38 | provider.destination 39 | ) 40 | .map(|e| e.unique_identifier) 41 | .fetch_all(db) 42 | .await?; 43 | Ok(HashSet::from_iter(out.into_iter())) 44 | } 45 | 46 | pub async fn amqp_metadata( 47 | db: &Database, 48 | sp: &ScopedProvider, 49 | ) -> anyhow::Result> { 50 | let result = sqlx::query_as!( 51 | AMQPDestination, 52 | "SELECT id, metadata FROM amqp_source a WHERE a.provider_destination = $1 AND a.provider_name = $2 LIMIT 1", 53 | sp.destination, 54 | sp.name.to_string() 55 | ).fetch_one(db).await; 56 | match result { 57 | Ok(ok) => Ok(Some(ok)), 58 | Err(err) => match err { 59 | Error::RowNotFound => Ok(None), 60 | err_name => bail!(err_name), 61 | }, 62 | } 63 | } 64 | 65 | pub async fn webhooks_for_provider( 66 | db: &Database, 67 | provider_resolvable: &ScopedProvider, 68 | ) -> anyhow::Result> { 69 | Ok(sqlx::query_as!( 70 | DatabaseWebhook, 71 | "SELECT webhook.*, webhook_source.metadata FROM webhook 72 | JOIN webhook_source on webhook_source.webhook_id = webhook.id 73 | WHERE webhook_source.provider_destination = $1 AND webhook_source.provider_name = $2", 74 | provider_resolvable.destination, 75 | provider_resolvable.name.to_string() 76 | ) 77 | .fetch_all(db) 78 | .await?) 79 | } 80 | 81 | #[derive(Debug)] 82 | pub struct ProcessedScrape { 83 | scrape_id: i32, 84 | } 85 | 86 | /// Adds scrapes to the db. 
Reverses the scrape list as a side effect 87 | pub async fn process_scrape<'a>( 88 | db: &Database, 89 | scrape: &mut Scrape<'a>, 90 | pending: &PendingProvider, 91 | ) -> anyhow::Result { 92 | let mut tx = db.begin().await?; 93 | let out = sqlx::query!( 94 | "INSERT INTO scrape (provider_name, provider_destination, priority) VALUES ($1, $2, $3) returning id", 95 | scrape.provider.name.to_string(), 96 | scrape.provider.destination, 97 | pending.priority.level 98 | ) 99 | .fetch_one(&mut tx) 100 | .await?; 101 | // we don't really care about making sure this is completely correct 102 | sqlx::query!( 103 | "UPDATE provider_resource 104 | SET 105 | last_scrape = NOW(), 106 | tokens = tokens - 1 107 | WHERE name = $1 AND destination = $2 108 | RETURNING *", 109 | scrape.provider.name.to_string(), 110 | scrape.provider.destination, 111 | ) 112 | .fetch_one(db) 113 | .await?; 114 | let scrape_id = out.id; 115 | let requests = &mut scrape.requests; 116 | // we specifically need to reverse this list of requests/images 117 | // to make sure that the images that were first scraped get inserted 118 | // last with the highest id 119 | requests.reverse(); 120 | 121 | for (i, request) in requests.iter().enumerate() { 122 | match &request.step { 123 | ScraperStep::Data(provider_result) => { 124 | let response_code = provider_result.response_code.as_u16(); 125 | let scrape_request_row = sqlx::query!( 126 | "INSERT INTO scrape_request (scrape_id, response_code, response_delay, scraped_at, page) 127 | VALUES ($1, $2, $3, $4, $5) 128 | RETURNING id", 129 | scrape_id, 130 | response_code as u32, 131 | // unsafe downcast from u128? I hope the request doesn't take 2 billion milliseconds kekw 132 | provider_result.response_delay.as_millis() as u32, 133 | request.date, 134 | // pages are 1-indexed 135 | (i as i32) + 1 136 | ).fetch_one(&mut tx).await?; 137 | // we're not persisting post data, but that's ok 138 | let mut posts = provider_result.posts.clone(); 139 | posts.reverse(); 140 | for post in posts { 141 | let mut images = post.images.clone(); 142 | images.reverse(); 143 | for media in images.iter() { 144 | sqlx::query!( 145 | "INSERT INTO media ( 146 | provider_name, 147 | provider_destination, 148 | scrape_request_id, 149 | image_url, 150 | page_url, 151 | reference_url, 152 | unique_identifier, 153 | posted_at, 154 | discovered_at 155 | ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) 156 | ON CONFLICT (image_url) DO update set discovered_at = NOW() returning *", 157 | // sometimes we end up re-scraping the latest known images 158 | &scrape.provider.name.to_string(), 159 | &scrape.provider.destination, 160 | scrape_request_row.id, 161 | media.media_url, 162 | post.url, 163 | media.reference_url, 164 | media.unique_identifier, 165 | post.post_date, 166 | request.date 167 | ) 168 | .fetch_optional(&mut tx) 169 | .await?; 170 | } 171 | } 172 | } 173 | ScraperStep::Error(ProviderFailure::HttpError(error)) => { 174 | match &error { 175 | HttpError::ReqwestError(err) => { 176 | // we should not be getting request related errors, only response errors 177 | if err.is_request() { 178 | error!( 179 | "Got an error from a provider that was caused by a request\n{:?}", 180 | err.url() 181 | ); 182 | error!("{:?}", err); 183 | continue; 184 | } 185 | 186 | if let Some(status) = err.status() { 187 | sqlx::query!( 188 | "INSERT INTO scrape_error (scrape_id, response_code) 189 | VALUES ($1, $2)", 190 | scrape_id, 191 | status.as_u16() as i32 192 | ) 193 | .fetch_one(&mut tx) 194 | .await?; 195 | } else { 196 | error!("Got 
an unexpected error from a provider that doesn't have a status",); 197 | error!("{:?}", err); 198 | continue; 199 | } 200 | } 201 | HttpError::FailStatus(ctx) | HttpError::UnexpectedBody(ctx) => { 202 | sqlx::query!( 203 | "INSERT INTO scrape_error (scrape_id, response_code, response_body, message) 204 | VALUES ($1, $2, $3, $4) returning id", 205 | scrape_id, 206 | ctx.code.as_u16() as i32, 207 | ctx.body, 208 | ctx.message, 209 | ) 210 | .fetch_one(&mut tx) 211 | .await?; 212 | } 213 | } 214 | } 215 | ScraperStep::Error(ProviderFailure::Url) => { 216 | println!( 217 | "Could not formal url properly for {}: {}", 218 | scrape.provider.name.to_string(), 219 | scrape.provider.destination 220 | ); 221 | } 222 | _ => {} 223 | } 224 | } 225 | tx.commit().await?; 226 | Ok(ProcessedScrape { scrape_id: out.id }) 227 | } 228 | 229 | pub async fn submit_webhook_responses( 230 | db: &Database, 231 | processed_scrape: ProcessedScrape, 232 | interactions: Vec, 233 | ) -> anyhow::Result<()> { 234 | let mut tx = db.begin().await?; 235 | // can't commit the invocation if we don't have a response status 236 | for interaction in interactions { 237 | let response_time = interaction.response_time.as_millis() as i32; 238 | let response = interaction.response; 239 | let status = match response { 240 | Ok(res) => Some(res.status()), 241 | Err(HttpError::UnexpectedBody(err)) | Err(HttpError::FailStatus(err)) => Some(err.code), 242 | Err(HttpError::ReqwestError(err)) => { 243 | let out = err.status(); 244 | if out.is_none() { 245 | println!("Received a response without a status code"); 246 | eprintln!("{:?}", err); 247 | } 248 | out 249 | } 250 | }; 251 | if let Some(code) = status { 252 | sqlx::query!( 253 | "INSERT INTO webhook_invocation ( 254 | scrape_id, 255 | webhook_id, 256 | response_code, 257 | response_delay 258 | ) VALUES ($1, $2, $3, $4) RETURNING *", 259 | processed_scrape.scrape_id, 260 | interaction.webhook.id, 261 | code.as_u16() as i32, 262 | response_time 263 | ) 264 | .fetch_one(&mut tx) 265 | .await?; 266 | } else { 267 | println!( 268 | "Failed to persist webhook response from {}", 269 | interaction.webhook.destination 270 | ) 271 | } 272 | } 273 | tx.commit().await?; 274 | Ok(()) 275 | } 276 | 277 | pub async fn latest_requests( 278 | db: &Database, 279 | _only_with_media: bool, 280 | ) -> anyhow::Result> { 281 | let results = sqlx::query!( 282 | "select 283 | sr.id as scrape_request_id, 284 | s.id as scrape_id, 285 | pr.name, 286 | sr.response_delay, 287 | sr.response_code, 288 | sr.scraped_at, 289 | pr.url 290 | from scrape_request sr 291 | join scrape s 292 | on s.id = sr.scrape_id 293 | join provider_resource pr 294 | on pr.name = s.provider_name and pr.destination = s.provider_destination 295 | ORDER BY sr.scraped_at desc 296 | LIMIT 50", 297 | ) 298 | .fetch_all(db) 299 | .await?; 300 | let scrape_ids = results 301 | .iter() 302 | .unique_by(|rec| rec.scrape_id) 303 | .map(|rec| rec.scrape_id) 304 | .collect::>(); 305 | // we're using scrape_id and not scrape_request_id because users only care about individual scrapes and not requests 306 | let medias = sqlx::query!( 307 | "SELECT sr.scrape_id, scrape_request_id, page_url, image_url 308 | FROM media m 309 | join scrape_request sr 310 | on sr.id = m.scrape_request_id 311 | join scrape s 312 | on s.id = sr.scrape_id 313 | where s.id = ANY($1)", 314 | &scrape_ids 315 | ) 316 | .fetch_all(db) 317 | .await?; 318 | 319 | let media_map = medias 320 | .into_iter() 321 | .filter(|rec| rec.scrape_id.is_some() && rec.image_url.is_some()) 322 | 
.into_group_map_by(|rec| rec.scrape_id.unwrap()); 323 | 324 | let mut out: Vec = vec![]; 325 | for result in results { 326 | out.push(ScrapeRequestWithMedia { 327 | response_code: result.response_code, 328 | response_delay: result.response_delay, 329 | provider_name: result.name.clone(), 330 | url: result.url.clone(), 331 | date: result.scraped_at, 332 | media: media_map 333 | .get(&result.scrape_id) 334 | .unwrap_or(&vec![]) 335 | .iter() 336 | .filter_map(|m| { 337 | if m.scrape_id.unwrap() == result.scrape_id { 338 | Some(ScrapeRequestMedia { 339 | media_url: m.image_url.clone().unwrap(), 340 | page_url: m.page_url.clone().unwrap(), 341 | }) 342 | } else { 343 | None 344 | } 345 | }) 346 | .collect::>(), 347 | }) 348 | } 349 | Ok(out) 350 | } 351 | -------------------------------------------------------------------------------- /src/dispatcher/amqp.rs: -------------------------------------------------------------------------------- 1 | use lapin::options::{ 2 | BasicPublishOptions, ExchangeBindOptions, ExchangeDeclareOptions, QueueDeclareOptions, 3 | }; 4 | use lapin::types::FieldTable; 5 | use lapin::{ 6 | BasicProperties, Channel, Connection, ConnectionProperties, ExchangeKind, Result as LapinResult, 7 | }; 8 | use log::error; 9 | 10 | use crate::dispatcher::dispatcher::DispatchablePayload; 11 | 12 | pub struct AMQPDispatcher { 13 | channel: Channel, 14 | } 15 | 16 | const DIRECT_QUEUE_NAME: &str = "image_discovery"; 17 | 18 | impl AMQPDispatcher { 19 | pub async fn from_connection_string(url: &str) -> LapinResult { 20 | let conn = Connection::connect(url, ConnectionProperties::default()).await?; 21 | let channel = conn.create_channel().await?; 22 | channel 23 | .exchange_declare( 24 | DIRECT_QUEUE_NAME, 25 | ExchangeKind::Topic, 26 | ExchangeDeclareOptions { 27 | ..ExchangeDeclareOptions::default() 28 | }, 29 | FieldTable::default(), 30 | ) 31 | .await?; 32 | // technically we're only a publisher and shouldn't be 33 | // declaring a queue but whatever 34 | channel 35 | .queue_declare( 36 | DIRECT_QUEUE_NAME, 37 | QueueDeclareOptions { 38 | durable: true, 39 | ..QueueDeclareOptions::default() 40 | }, 41 | FieldTable::default(), 42 | ) 43 | .await?; 44 | channel 45 | .exchange_bind( 46 | DIRECT_QUEUE_NAME, 47 | DIRECT_QUEUE_NAME, 48 | DIRECT_QUEUE_NAME, 49 | ExchangeBindOptions::default(), 50 | FieldTable::default(), 51 | ) 52 | .await?; 53 | LapinResult::Ok(Self { channel }) 54 | } 55 | pub async fn publish(&self, payload: &DispatchablePayload) { 56 | match serde_json::to_vec(&payload) { 57 | Err(err) => { 58 | error!("Error serializing AMQP payload {:?}", err) 59 | } 60 | Ok(value) => { 61 | let result = self 62 | .channel 63 | .basic_publish( 64 | "", 65 | DIRECT_QUEUE_NAME, 66 | BasicPublishOptions::default(), 67 | value, 68 | BasicProperties::default(), 69 | ) 70 | .await; 71 | if let Err(e) = result { 72 | error!("Couldn't publish to AMQP {:?}", e) 73 | } 74 | } 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/dispatcher/discord.rs: -------------------------------------------------------------------------------- 1 | use serde::Serialize; 2 | 3 | #[derive(Debug, Serialize)] 4 | pub struct DiscordImage { 5 | pub url: String, 6 | } 7 | 8 | #[derive(Debug, Serialize)] 9 | pub struct DiscordEmbed { 10 | pub image: DiscordImage, 11 | } 12 | 13 | #[derive(Debug, Serialize)] 14 | pub struct DiscordPayload<'a> { 15 | pub username: &'a str, 16 | pub avatar_url: &'a str, 17 | pub content: String, // Vec, 18 | } 19 | 20 | pub fn 
is_discord_webhook_url(url: &str) -> bool { 21 | url.starts_with("https://discord.com/api/webhooks") 22 | } 23 | -------------------------------------------------------------------------------- /src/dispatcher/dispatcher.rs: -------------------------------------------------------------------------------- 1 | use futures::{stream, StreamExt}; 2 | use tokio::sync::Mutex; 3 | // use parking_lot::Mutex; 4 | use reqwest::{Client, Response}; 5 | use serde::Serialize; 6 | use std::{ 7 | sync::{Arc, RwLock}, 8 | time::Instant, 9 | }; 10 | 11 | use crate::{ 12 | dispatcher::{webhook_type, WebhookDestination}, 13 | models::DatabaseWebhook, 14 | request::{request_default_headers, HttpError}, 15 | scraper::{ 16 | scraper::{Scrape, ScraperStep}, 17 | AllProviders, ProviderPost, 18 | }, 19 | }; 20 | 21 | use super::super::scraper::Provider; 22 | 23 | pub struct WebhookDispatch { 24 | pub webhook: DatabaseWebhook, 25 | } 26 | 27 | #[derive(Debug)] 28 | pub struct WebhookInteraction { 29 | pub webhook: DatabaseWebhook, 30 | pub response: Result, 31 | pub response_time: std::time::Duration, 32 | } 33 | 34 | #[derive(Debug, Serialize, Clone)] 35 | pub struct DispatchablePayloadProviderInfo { 36 | #[serde(rename = "type")] 37 | pub _type: AllProviders, 38 | pub id: String, 39 | pub ephemeral: bool, 40 | pub official: bool, 41 | } 42 | 43 | #[derive(Debug, Serialize, Clone)] 44 | pub struct DispatchablePayload { 45 | pub provider: DispatchablePayloadProviderInfo, 46 | pub posts: Vec, 47 | pub metadata: Option, 48 | } 49 | 50 | impl DispatchablePayload { 51 | pub fn new( 52 | provider: &dyn Provider, 53 | scrape: &Scrape, 54 | metadata: Option, 55 | ) -> Self { 56 | let posts = scrape 57 | .requests 58 | .iter() 59 | .filter_map(|req| match &req.step { 60 | ScraperStep::Data(data) => Some(data), 61 | ScraperStep::Error(_) => None, 62 | }) 63 | .flat_map(|result| result.posts.clone()) 64 | .collect::>(); 65 | DispatchablePayload { 66 | provider: DispatchablePayloadProviderInfo { 67 | _type: scrape.provider.name, 68 | id: scrape.provider.destination.clone(), 69 | ephemeral: provider.ephemeral(), 70 | official: scrape.provider.official, 71 | }, 72 | posts, 73 | metadata, 74 | } 75 | } 76 | } 77 | 78 | const WEBHOOK_DISPATCH_CONCURRENCY_LIMIT: usize = 8; 79 | 80 | pub async fn dispatch_webhooks<'a>( 81 | // provider: &dyn Provider, 82 | // scrape: &Scrape<'a>, 83 | dispatch: Vec<(DatabaseWebhook, DispatchablePayload)>, 84 | ) -> Vec { 85 | let client = &Client::new(); 86 | // request results are not guaranteed to be in order 87 | let mut results: Vec = vec![]; 88 | 89 | let results_lock = Arc::new(Mutex::new(&mut results)); 90 | let iter = |(wh, payload): (DatabaseWebhook, DispatchablePayload)| { 91 | let f = results_lock.lock(); 92 | async move { 93 | let builder = client 94 | .post(&wh.destination) 95 | .headers(request_default_headers()); 96 | let instant = Instant::now(); 97 | if let WebhookDestination::Custom = webhook_type(&wh.destination) { 98 | let response = builder 99 | .json(&payload) 100 | .send() 101 | .await 102 | .map_err(HttpError::ReqwestError); 103 | let response_time = instant.elapsed(); 104 | f.await.push(WebhookInteraction { 105 | webhook: wh, 106 | response, 107 | response_time, 108 | }); 109 | } 110 | } 111 | }; 112 | 113 | stream::iter(dispatch) 114 | // sadly there's no `map_concurrent` for futures 115 | .for_each_concurrent(WEBHOOK_DISPATCH_CONCURRENCY_LIMIT, iter) 116 | .await; 117 | results 118 | } 119 | -------------------------------------------------------------------------------- 
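The `dispatch_webhooks` function above collects results behind an `Arc<tokio::sync::Mutex<..>>` because, as its comment notes, `futures` offers no `map_concurrent` combinator. A roughly equivalent pattern, shown here only as a stand-alone sketch rather than code from this crate, is to `map` each item into a future and drive the stream with `buffer_unordered`, which bounds concurrency and lets the results be collected without a lock; completion order is still not guaranteed. The `fan_out` helper and its parameter names are hypothetical.

use futures::{stream, StreamExt};

/// Hypothetical helper: run `handler` over `items` with at most `limit`
/// futures in flight, collecting results as they complete.
async fn fan_out<T, R, F, Fut>(items: Vec<T>, limit: usize, handler: F) -> Vec<R>
where
    F: FnMut(T) -> Fut,
    Fut: std::future::Future<Output = R>,
{
    stream::iter(items)
        .map(handler)
        .buffer_unordered(limit)
        .collect()
        .await
}

Applied to the dispatcher, each `(DatabaseWebhook, DispatchablePayload)` pair would map to a future producing a `WebhookInteraction`, making the mutex-guarded `Vec` unnecessary.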
/src/dispatcher/mod.rs: -------------------------------------------------------------------------------- 1 | use self::discord::is_discord_webhook_url; 2 | 3 | pub mod amqp; 4 | mod discord; 5 | pub mod dispatcher; 6 | 7 | pub enum WebhookDestination { 8 | #[deprecated] 9 | Discord, 10 | Custom, 11 | } 12 | 13 | pub fn webhook_type(url: &str) -> WebhookDestination { 14 | if is_discord_webhook_url(url) { 15 | WebhookDestination::Discord 16 | } else { 17 | WebhookDestination::Custom 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod db; 2 | pub mod dispatcher; 3 | pub mod models; 4 | pub mod request; 5 | pub mod scheduler; 6 | pub mod scraper; 7 | pub mod server; 8 | pub mod api; 9 | pub use dotenv::dotenv; 10 | pub use std::env; 11 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::{error::Error, sync::Arc, time::Duration}; 3 | 4 | use dotenv::dotenv; 5 | use futures::future::join_all; 6 | use log::{debug, error, info, trace}; 7 | use reqwest::Client; 8 | use sqlx::{Pool, Postgres}; 9 | 10 | use jiu::dispatcher::amqp::AMQPDispatcher; 11 | use jiu::dispatcher::dispatcher::{dispatch_webhooks, DispatchablePayload}; 12 | use jiu::server::run_server; 13 | use jiu::{ 14 | db::*, 15 | models::PendingProvider, 16 | scheduler::*, 17 | scraper::{get_provider_map, scraper::scrape, Provider, ProviderMap, ScrapeRequestInput}, 18 | }; 19 | 20 | struct Context { 21 | db: Arc>, 22 | amqp: Arc>, 23 | client: Arc, 24 | provider_map: Arc, 25 | } 26 | 27 | async fn iter( 28 | ctx: Arc, 29 | pending: &PendingProvider, 30 | provider: &dyn Provider, 31 | ) -> anyhow::Result<()> { 32 | let sp = pending.provider.clone(); 33 | let latest_data = latest_media_ids_from_provider(&ctx.db, &sp).await?; 34 | // there must be at least ONE data found if the scrape isn't the first ever one 35 | let is_first_scrape = latest_data.is_empty(); 36 | if is_first_scrape { 37 | trace!( 38 | "Scraping {}: {} for the first time ever", 39 | sp.name.to_string(), 40 | sp.destination 41 | ) 42 | } 43 | let step = ScrapeRequestInput { 44 | latest_data, 45 | is_first_scrape, 46 | default_name: pending.default_name.clone(), 47 | last_scrape: pending.last_scrape, 48 | }; 49 | let mut result = scrape(&sp, &*provider, &step).await?; 50 | 51 | let webhooks = webhooks_for_provider(&ctx.db, &sp).await?; 52 | let webhook_interactions = if result.discovered_new_images() { 53 | let dispatch = webhooks 54 | .into_iter() 55 | .map(|wh| { 56 | let payload = DispatchablePayload::new(&*provider, &result, wh.metadata.clone()); 57 | (wh, payload) 58 | }) 59 | .collect::>(); 60 | // we don't really care about the interactions in amqp since we have full 61 | // control of that environment anyways 62 | if let Some(amqp) = &*ctx.amqp { 63 | if let Ok(Some(amqp_d)) = amqp_metadata(&*ctx.db, &sp).await { 64 | let payload = DispatchablePayload::new(&*provider, &result, amqp_d.metadata); 65 | trace!("Publishing AMQP message for {}", &provider.id().to_string()); 66 | amqp.publish(&payload).await; 67 | } 68 | } 69 | Some(dispatch_webhooks(dispatch).await) 70 | } else { 71 | None 72 | }; 73 | // process scraping MUST come after dispatcher dispatching since it mutates the array by reversing it 74 | let processed_scrape = process_scrape(&ctx.db, &mut result, 
pending).await?; 75 | if webhook_interactions.is_some() { 76 | submit_webhook_responses(&ctx.db, processed_scrape, webhook_interactions.unwrap()).await? 77 | } 78 | Ok(()) 79 | } 80 | 81 | async fn job_loop(ctx: Arc) { 82 | let arc_db = Arc::clone(&ctx.db); 83 | trace!("Getting pending scrapes"); 84 | let pendings = match pending_scrapes(&arc_db).await { 85 | Err(error) => { 86 | println!("{:?}", error); 87 | return; 88 | } 89 | Ok(result) => result, 90 | }; 91 | trace!("Getting pending scrapes"); 92 | if let Some(err) = update_priorities(&arc_db, &pendings).await.err() { 93 | // should an error here be preventing the scrape? 94 | // Could end up spamming a provider if it's stuck at a high value 95 | error!("{:?}", err); 96 | }; 97 | trace!("Preparing to scrape {} pending providers", pendings.len()); 98 | 99 | let this_scrape = pendings.iter().map(Arc::new).map(|pending| async { 100 | let pp = pending; 101 | let sleep_time = pp.scrape_date; 102 | tokio::time::sleep(sleep_time).await; 103 | if let Err(err) = run(Arc::clone(&ctx), &pp, &ctx.provider_map).await { 104 | error!("{:?}", err); 105 | return; 106 | } 107 | debug!("Finished scraping {}", pp.provider.name.to_string()); 108 | }); 109 | join_all(this_scrape).await; 110 | } 111 | 112 | async fn run( 113 | ctx: Arc, 114 | pp: &PendingProvider, 115 | provider_map: &ProviderMap, 116 | ) -> Result<(), Box> { 117 | let provider = provider_map.get(&pp.provider.name).unwrap_or_else(|| { 118 | panic!( 119 | "Tried to get a provider that doesn't exist {}", 120 | &pp.provider, 121 | ) 122 | }); 123 | if let Err(err) = iter(Arc::clone(&ctx), pp, &**provider).await { 124 | eprintln!("{:?}", err); 125 | } 126 | Ok(()) 127 | } 128 | 129 | async fn setup() -> anyhow::Result<()> { 130 | info!("Starting JiU"); 131 | let client = Arc::new(Client::new()); 132 | let provider_map = Arc::new( 133 | get_provider_map(&Arc::clone(&client)) 134 | .await 135 | .expect("Could not successfully initialize a provider map"), 136 | ); 137 | let pm = Arc::clone(&provider_map); 138 | tokio::spawn(async move { 139 | match connect().await { 140 | Ok(db) => run_server(Arc::new(db), pm, 8080).await, 141 | Err(err) => { 142 | error!("{:?}", err) 143 | } 144 | } 145 | }); 146 | loop { 147 | let client = Arc::clone(&client); 148 | let provider_map = Arc::clone(&provider_map); 149 | info!("Starting job loop {}", SCHEDULER_END_MILLISECONDS); 150 | let data = match env::var("NO_WORKER") { 151 | Ok(_) => { 152 | info!("Not starting worker because 'NO_WORKER' environment was set"); 153 | tokio::task::spawn(async {}) 154 | } 155 | _ => tokio::task::spawn(async move { 156 | let db = match connect().await { 157 | Err(err) => { 158 | error!("{:?}", err); 159 | return; 160 | } 161 | Ok(db) => db, 162 | }; 163 | let db = Arc::new(db); 164 | let amqp = Arc::new(match env::var("AMQP_URL") { 165 | Ok(a) => Some(AMQPDispatcher::from_connection_string(&a).await.unwrap()), 166 | Err(_) => None, 167 | }); 168 | let ctx = Arc::new(Context { 169 | db: Arc::clone(&db), 170 | amqp: Arc::clone(&amqp), 171 | client: Arc::clone(&client), 172 | provider_map, 173 | }); 174 | info!("Starting requests for the day..."); 175 | job_loop(ctx).await; 176 | info!("Requests finished for the day..."); 177 | }), 178 | }; 179 | let delay = tokio::time::sleep(Duration::from_millis(SCHEDULER_END_MILLISECONDS)); 180 | if let (_, Err(join_err)) = tokio::join!(delay, data) { 181 | println!("{:?}", join_err) 182 | } 183 | info!("Finished job loop"); 184 | } 185 | } 186 | 187 | #[tokio::main] 188 | async fn main() { 189 | 
better_panic::install(); 190 | env_logger::init(); 191 | dotenv().ok(); 192 | 193 | info!("Running program"); 194 | if let Err(err) = setup().await { 195 | error!("{:?}", err); 196 | }; 197 | info!("Shutting down...") 198 | } 199 | -------------------------------------------------------------------------------- /src/models.rs: -------------------------------------------------------------------------------- 1 | use crate::{scheduler::Priority, scraper::ScopedProvider}; 2 | use chrono::NaiveDateTime; 3 | use serde::Serialize; 4 | use std::fmt::Display; 5 | use std::time::Duration; 6 | 7 | #[derive(Debug)] 8 | pub struct AMQPDestination { 9 | pub id: i32, 10 | pub metadata: Option, 11 | } 12 | 13 | #[derive(Debug)] 14 | pub struct DatabaseWebhook { 15 | pub id: i32, 16 | pub destination: String, 17 | pub created_at: NaiveDateTime, 18 | pub updated_at: NaiveDateTime, 19 | pub metadata: Option, 20 | } 21 | 22 | #[derive(Debug, Clone, Serialize)] 23 | pub struct ScrapeRequestMedia { 24 | pub media_url: String, 25 | pub page_url: String, 26 | } 27 | 28 | #[derive(Debug, Clone, Serialize)] 29 | pub struct ScrapeRequestWithMedia { 30 | pub provider_name: String, 31 | pub url: String, 32 | pub response_code: Option, 33 | pub response_delay: Option, 34 | pub date: NaiveDateTime, 35 | pub media: Vec, 36 | } 37 | 38 | #[derive(Debug)] 39 | pub struct DatabaseWebhookSource { 40 | pub id: i32, 41 | pub webhook_id: i32, 42 | pub provider_destination: String, 43 | } 44 | 45 | #[derive(Debug, Clone)] 46 | pub struct ScrapeHistory { 47 | pub priority: Priority, 48 | pub provider: ScopedProvider, 49 | pub date: NaiveDateTime, 50 | pub result_count: u32, 51 | } 52 | 53 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 54 | 55 | pub struct PendingProvider { 56 | pub id: i32, 57 | /// the name that is used if a more relevant name for posts cannot be found 58 | pub default_name: Option, 59 | pub priority: Priority, 60 | pub provider: ScopedProvider, 61 | pub scrape_date: Duration, 62 | pub last_scrape: Option, 63 | } 64 | 65 | impl Display for PendingProvider { 66 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 67 | f.write_str(&format!("{}", self.provider)) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/request.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::iter::FromIterator; 3 | 4 | use log::error; 5 | use reqwest; 6 | use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; 7 | use reqwest::{Response, StatusCode}; 8 | use serde::de::DeserializeOwned; 9 | use thiserror::Error; 10 | 11 | #[derive(Debug, Clone)] 12 | pub struct ResponseErrorContext { 13 | pub body: String, 14 | pub code: StatusCode, 15 | pub message: Option, 16 | } 17 | 18 | /// Wrapper for providing actual useful information about 19 | /// why responses failed since reqwest throws that information 20 | /// away when it encounters errors 21 | #[derive(Error, Debug)] 22 | pub enum HttpError { 23 | #[error("Failed response code {0:?}")] 24 | FailStatus(ResponseErrorContext), 25 | #[error("Unexpected body {0:?}")] 26 | UnexpectedBody(ResponseErrorContext), 27 | #[error("Request error")] 28 | ReqwestError(#[from] reqwest::Error), 29 | } 30 | 31 | pub async fn parse_successful_response( 32 | response: Response, 33 | ) -> Result { 34 | let response_code = response.status(); 35 | let url = response.url().clone(); 36 | let response_body = response.text().await?; 37 | if !response_code.is_success() { 38 | // 
sadly shitty reqwest doesn't give us the response body as 39 | // context when trying to handle invalid responses 40 | return Err(HttpError::FailStatus(ResponseErrorContext { 41 | body: response_body, 42 | code: response_code, 43 | message: None, 44 | })); 45 | } 46 | serde_json::from_str::(&response_body).map_err(|error| { 47 | error!("{:?}", error); 48 | error!("Failed to parse response from {}", url); 49 | HttpError::UnexpectedBody(ResponseErrorContext { 50 | body: response_body, 51 | code: response_code, 52 | message: Some(error.to_string()), 53 | }) 54 | }) 55 | } 56 | 57 | pub fn request_default_headers() -> HeaderMap { 58 | // TODO: change the user agent if the program has been forked to modify 59 | // important settings like request speed 60 | let user_agent: String = 61 | env::var("USER_AGENT").expect("Missing USER_AGENT environment variable"); 62 | HeaderMap::from_iter([( 63 | HeaderName::from_static("user-agent"), 64 | HeaderValue::from_str(&user_agent).unwrap(), 65 | )]) 66 | } 67 | -------------------------------------------------------------------------------- /src/scheduler/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod priority; 2 | pub use priority::*; 3 | pub mod scheduler; 4 | pub use scheduler::*; 5 | pub mod rate_limiter; 6 | pub use rate_limiter::*; 7 | 8 | const MIN_PRIORITY: f32 = 0.07; 9 | const MAX_PRIORITY: f32 = 1.75; 10 | -------------------------------------------------------------------------------- /src/scheduler/priority.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | 3 | use num_traits::FromPrimitive; 4 | use sqlx::types::BigDecimal; 5 | 6 | use crate::{models::ScrapeHistory, scheduler::MIN_PRIORITY}; 7 | 8 | use super::MAX_PRIORITY; 9 | 10 | #[derive(Debug)] 11 | pub struct InvalidPriority(f32); 12 | 13 | #[derive(Debug, PartialEq, Eq, PartialOrd, Clone, Hash)] 14 | pub struct Priority { 15 | pub level: BigDecimal, 16 | } 17 | 18 | impl From for Priority { 19 | fn from(level: f32) -> Self { 20 | Self { 21 | level: BigDecimal::from_f32(level).unwrap(), 22 | } 23 | } 24 | } 25 | 26 | impl Default for Priority { 27 | fn default() -> Self { 28 | Priority::unchecked_clamp(1f32) 29 | } 30 | } 31 | 32 | const MAX_RESULT_CONTRIBUTION: u32 = 3; 33 | 34 | impl Priority { 35 | /// Decide the next priority based on the the recent scrape history of the 36 | /// provider priority. 37 | pub fn next(&self, history: &[ScrapeHistory]) -> Self { 38 | if history.is_empty() { 39 | return Self { 40 | level: BigDecimal::from_f32(1f32).unwrap(), 41 | }; 42 | } 43 | let n = history.len() as i32; 44 | 45 | let raw_weights = (0i32..n).map(|x| (x - n - 1).pow(2)); 46 | let sum_raw_weight: i32 = raw_weights.clone().sum(); 47 | let weights = raw_weights.map(|x| x as f32 / sum_raw_weight as f32); 48 | let weight_sum = weights.clone().sum::(); 49 | let z = weights.zip(history); 50 | let raw_weighted_average: f32 = z 51 | .map(|(a, b)| (a * b.result_count.min(MAX_RESULT_CONTRIBUTION) as f32)) 52 | .sum(); 53 | 54 | let weighted_average: f32 = (raw_weighted_average * weight_sum) / weight_sum as f32; 55 | let scaled = weighted_average * (MAX_PRIORITY - MIN_PRIORITY) + MIN_PRIORITY; 56 | let level = scaled.clamp(MIN_PRIORITY, MAX_PRIORITY); 57 | 58 | // in some strange situations f32 is NaN. 59 | // These cases are normally handled at the top of the function but if not... 
we just default to 60 | // the existing thing 61 | let level = BigDecimal::from_f32(level).unwrap_or_else(|| self.level.clone()); 62 | Self { level } 63 | } 64 | pub fn unchecked_clamp(level: f32) -> Self { 65 | level 66 | .clamp(MIN_PRIORITY, MAX_PRIORITY) 67 | .try_into() 68 | // something has gone very wrong if the level is out of bounds 69 | .expect(&format!("{} is not a valid priority", level)) 70 | } 71 | } 72 | 73 | #[cfg(test)] 74 | mod tests { 75 | use chrono::NaiveDateTime; 76 | use num_traits::FromPrimitive; 77 | use sqlx::types::BigDecimal; 78 | 79 | use crate::{ 80 | models::ScrapeHistory, 81 | scheduler::{MAX_PRIORITY, MIN_PRIORITY}, 82 | scraper::ScopedProvider, 83 | }; 84 | 85 | use super::Priority; 86 | 87 | #[test] 88 | fn priority_check() { 89 | let prio = Priority::unchecked_clamp(0f32); 90 | let make_hist = |count| ScrapeHistory { 91 | date: NaiveDateTime::from_timestamp(0, 0), 92 | priority: prio.clone(), 93 | provider: ScopedProvider { 94 | destination: "".to_owned(), 95 | name: crate::scraper::AllProviders::PinterestBoardFeed, 96 | }, 97 | result_count: count, 98 | }; 99 | let hist = make_hist(1); 100 | let n = prio.next(&[hist.clone(), hist.clone(), hist.clone(), hist]); 101 | assert_eq!(n.level, BigDecimal::from_f32(MAX_PRIORITY).unwrap()); 102 | 103 | let n = prio.next(&(0..15).map(|_| make_hist(1)).collect::>()); 104 | assert_eq!(n.level, BigDecimal::from_f32(MAX_PRIORITY).unwrap()); 105 | 106 | let n = prio.next(&(0..15).map(|_| make_hist(0)).collect::>()); 107 | assert_eq!(n.level, BigDecimal::from_f32(MIN_PRIORITY).unwrap()) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/scheduler/rate_limiter.rs: -------------------------------------------------------------------------------- 1 | use governor::{ 2 | clock::QuantaClock, 3 | state::{DirectStateStore, InMemoryState, NotKeyed}, 4 | RateLimiter, 5 | }; 6 | 7 | /// Most providers use rate limiter at the domain level and not at the page level 8 | /// in order to prevent exceeding rate limits imposed by webservers 9 | pub type UnscopedLimiter = RateLimiter; 10 | 11 | /// Some providers can use rate limiting at the page level imposed by set limits of API keys 12 | #[allow(dead_code)] 13 | pub type ScopedLimiter = RateLimiter; 14 | 15 | /// Global rate limiting wrapper for limits imposed on individual providers being run concurrently 16 | pub struct GlobalRateLimiter(UnscopedLimiter); 17 | -------------------------------------------------------------------------------- /src/scheduler/scheduler.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Add; 2 | use std::time::Duration; 3 | use std::{collections::HashSet, convert::TryInto, hash::Hash, iter::FromIterator, str::FromStr}; 4 | 5 | use itertools::{unfold, Itertools}; 6 | use log::{debug, info}; 7 | use num_traits::cast::ToPrimitive; 8 | use rand::Rng; 9 | 10 | use crate::{ 11 | db::Database, 12 | models::{PendingProvider, ScrapeHistory}, 13 | scheduler::Priority, 14 | scraper::{AllProviders, ScopedProvider}, 15 | }; 16 | 17 | const MAX_TESTING_PROVIDERS: usize = 10; 18 | pub const SCHEDULER_START_MILLISECONDS: u64 = if cfg!(debug_assertions) { 19 | 1000 * 3 20 | } else { 21 | 1000 * 30 22 | }; 23 | 24 | // making life easier for testing. Could blow up in my face some day... 
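// Sketch for clarity, not part of the original scheduler.rs: the release-mode value
// of the constant declared just below, `8.64e7 as u64`, is one day expressed in
// milliseconds (24 * 60 * 60 * 1000 = 86_400_000), so the debug/release switch trades
// a full-day scheduling window for a ten-second one. The named constant here is a
// hypothetical restatement of that arithmetic and is not used by the crate.
#[allow(dead_code)]
const ONE_DAY_MILLISECONDS: u64 = 24 * 60 * 60 * 1000; // == 8.64e7 as u64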
25 | pub const SCHEDULER_END_MILLISECONDS: u64 = if cfg!(debug_assertions) { 26 | 1000 * 10 27 | } else { 28 | 8.64e7 as u64 29 | }; 30 | 31 | /// We only want to scrape one single endpoint at most 3 times a day 32 | const MAX_DAILY_SCRAPE_COUNT: i32 = 3; 33 | 34 | pub type RunningProviders = HashSet; 35 | 36 | /// Scheduled providers are ready to be processed 37 | #[derive(Debug)] 38 | pub struct ScheduledProviders(Vec); 39 | 40 | impl ScheduledProviders { 41 | pub fn providers(&self) -> &Vec { 42 | &self.0 43 | } 44 | pub fn len(&self) -> usize { 45 | self.0.len() 46 | } 47 | } 48 | 49 | /// Get a list of sorted scrapes that need to happen for that day 50 | pub async fn pending_scrapes(db: &Database) -> anyhow::Result> { 51 | // all future scrapes that are specifically grouped by their provider name first 52 | let potential_target_providers = sqlx::query!( 53 | "SELECT * FROM provider_resource pr 54 | WHERE pr.enabled AND pr.tokens >= 1 55 | ORDER BY pr.name DESC, pr.destination desc" 56 | ) 57 | .fetch_all(db) 58 | .await?; 59 | 60 | let groups = potential_target_providers 61 | .into_iter() 62 | .flat_map(|row| { 63 | let tokens = row.tokens.to_f32().unwrap().trunc() as i32; 64 | (0..tokens.min(MAX_DAILY_SCRAPE_COUNT)) 65 | .map(|_| { 66 | ( 67 | row.id, 68 | Priority::unchecked_clamp(row.priority.to_f32().unwrap()), 69 | ScopedProvider { 70 | destination: row.destination.clone(), 71 | name: AllProviders::from_str(&row.name).unwrap(), 72 | official: row.official, 73 | }, // last_scrape: row.last_scrape, 74 | row.last_scrape, 75 | row.default_name.clone(), 76 | ) 77 | }) 78 | .collect::>() 79 | }) 80 | .group_by(|p| p.2.name); 81 | 82 | let out: Vec = groups 83 | .into_iter() 84 | .flat_map(|(_, group)| { 85 | let endpoints = group.collect::>(); 86 | let maximized_endpoints = maximize_distance(&endpoints, quality_maxmindist); 87 | let dates = interpolate_dates( 88 | maximized_endpoints.len(), 89 | // We want to give the 90 | &Duration::from_millis(SCHEDULER_START_MILLISECONDS), 91 | // One day 92 | &Duration::from_millis(SCHEDULER_END_MILLISECONDS), 93 | ); 94 | maximized_endpoints 95 | .iter() 96 | .zip(dates) 97 | .map( 98 | |((id, priority, provider, last_scrape, default_name), scrape_date)| { 99 | PendingProvider { 100 | id: *id, 101 | priority: priority.clone(), 102 | provider: provider.clone(), 103 | scrape_date, 104 | last_scrape: *last_scrape, 105 | default_name: default_name.clone(), 106 | } 107 | }, 108 | ) 109 | .collect::>() 110 | }) 111 | .collect::>(); 112 | 113 | let original_length = out.len(); 114 | sqlx::query!( 115 | "UPDATE provider_resource SET last_queue = NOW() WHERE id = ANY($1)", 116 | &out.iter().map(|p| p.id).collect::>() 117 | ) 118 | .fetch_one(db) 119 | .await; 120 | let safe_providers = if cfg!(debug_assertions) { 121 | // making sure we don't blow things up in case we're running this in development with tons of 122 | // pending providers 123 | let slice_boundary = MAX_TESTING_PROVIDERS.min(out.len()); 124 | let result = Vec::from_iter(out[..slice_boundary].iter().map(|p| p.to_owned())); 125 | if result.len() != original_length { 126 | info!( 127 | "Debug mode truncated pending providers from {} to {}", 128 | result.len(), 129 | MAX_TESTING_PROVIDERS 130 | ); 131 | } 132 | result 133 | } else { 134 | out 135 | }; 136 | Ok(safe_providers) 137 | } 138 | 139 | /// Vec length is equal to the length of the items passed in 140 | fn interpolate_dates( 141 | item_count: usize, 142 | start_duration: &Duration, 143 | end_duration: &Duration, 144 | ) -> Vec { 145 | 
let duration = *end_duration - *start_duration; 146 | let initial_gap = duration.checked_div(item_count as u32 + 1).unwrap(); 147 | unfold(*start_duration, |duration| { 148 | let next = duration.add(initial_gap); 149 | *duration = next; 150 | Some(next) 151 | }) 152 | .by_ref() 153 | .take(item_count) 154 | .collect::>() 155 | } 156 | 157 | pub async fn update_priorities(db: &Database, sp: &[PendingProvider]) -> anyhow::Result<()> { 158 | let providers = sqlx::query!( 159 | "SELECT 160 | pr.id, 161 | pr.name, 162 | pr.destination, 163 | pr.official, 164 | s.priority as resource_priority, 165 | s.scraped_at, 166 | s.priority, 167 | (SELECT COUNT(*) 168 | FROM media m 169 | INNER JOIN scrape_request sr 170 | on sr.id = m.scrape_request_id 171 | where sr.scrape_id = s.id 172 | ) as discovery_count 173 | FROM provider_resource pr 174 | INNER JOIN LATERAL ( 175 | SELECT * 176 | FROM scrape s 177 | WHERE s.provider_name = pr.name 178 | AND s.provider_destination = pr.destination 179 | ORDER BY s.scraped_at desc, id 180 | LIMIT 30 181 | ) s on True 182 | WHERE pr.enabled AND pr.id = ANY($1) 183 | ORDER BY s.scraped_at desc", 184 | &sp.iter().map(|pp| pp.id).collect::>() 185 | ) 186 | .fetch_all(db) 187 | .await?; 188 | 189 | let groups = providers.iter().into_group_map_by(|row| { 190 | ( 191 | row.id, 192 | row.name.clone(), 193 | row.destination.clone(), 194 | row.priority.clone(), 195 | ) 196 | }); 197 | 198 | for ((id, name, destination, priority), rows) in groups { 199 | let histories = rows 200 | .into_iter() 201 | .filter(|&row| row.scraped_at.is_some()) 202 | .map(|row| ScrapeHistory { 203 | date: row.scraped_at.unwrap(), 204 | priority: Priority::unchecked_clamp(row.priority.to_f32().unwrap()), 205 | result_count: row.discovery_count.unwrap_or(0i64).try_into().unwrap(), 206 | provider: ScopedProvider { 207 | destination: destination.clone(), 208 | name: AllProviders::from_str(&name).unwrap(), 209 | official: row.official, 210 | }, 211 | }) 212 | .collect::>(); 213 | 214 | if !histories.is_empty() { 215 | let provider_priority = Priority::unchecked_clamp(priority.to_f32().unwrap()); 216 | let next_priority = provider_priority.next(&histories[..]); 217 | debug!( 218 | "Setting the next priority for [{}] from {} to {} because {:?}", 219 | &name, 220 | provider_priority.level.to_f32().unwrap_or(-1.0), 221 | next_priority.level.to_f32().unwrap_or(-1.0), 222 | &histories.iter().map(|h| h.result_count) 223 | ); 224 | // continue; 225 | sqlx::query!( 226 | "UPDATE provider_resource SET priority = $1 where id = $2 227 | AND last_token_update IS NOT NULL 228 | returning id", 229 | next_priority.level, 230 | id 231 | ) 232 | .fetch_optional(db) 233 | .await?; 234 | } 235 | } 236 | // return Ok(()); 237 | // Update tokens for all resources. 
This has to be run after priorities are 238 | // updated 239 | // We don't want to give any endpoint more than 4 tokens (in case something goes wrong) 240 | sqlx::query!( 241 | "UPDATE provider_resource 242 | SET 243 | tokens = LEAST(4, tokens + priority), 244 | last_token_update = NOW() 245 | WHERE enabled = True AND (last_token_update IS NULL OR last_token_update + interval '1 day' <= NOW())" 246 | ) 247 | .fetch_optional(db) 248 | .await?; 249 | Ok(()) 250 | } 251 | 252 | pub fn maximize_distance(items: &[T], quality: fn(&[T]) -> f32) -> Vec { 253 | let mut out = items.to_owned(); 254 | let mut no_improvement = 0; 255 | let mut best = 0f32; 256 | let mut rng = rand::thread_rng(); 257 | while no_improvement < 400 { 258 | let i = rng.gen_range(0..out.len()); 259 | let j = rng.gen_range(0..out.len()); 260 | let mut copy = out.clone(); 261 | copy.swap(i, j); 262 | let q = quality(©); 263 | if q > best { 264 | out = copy; 265 | best = q; 266 | no_improvement = 0; 267 | } else { 268 | no_improvement += 1; 269 | } 270 | } 271 | out 272 | } 273 | 274 | fn quality_maxmindist(items: &[T]) -> f32 { 275 | let mut s = 0f32; 276 | let uniq: HashSet<&T> = HashSet::from_iter(items); 277 | for item in uniq.into_iter() { 278 | let indices = (0..items.len()) 279 | .filter_map(|i| { 280 | if &items[i] == item { 281 | Some(i as i32) 282 | } else { 283 | None 284 | } 285 | }) 286 | .collect::>(); 287 | if indices.len() > 1 { 288 | let summed: f32 = (0..indices.len() - 1) 289 | .map(|i| 1f32 / (indices[i + 1] - indices[i]) as f32) 290 | .sum(); 291 | s += summed; 292 | } 293 | } 294 | 1f32 / s 295 | } 296 | 297 | #[cfg(test)] 298 | mod tests { 299 | use std::time::Duration; 300 | 301 | use crate::scheduler::scheduler::quality_maxmindist; 302 | 303 | use super::{interpolate_dates, maximize_distance}; 304 | 305 | #[test] 306 | fn spacing_test() { 307 | assert_eq!( 308 | maximize_distance(&[1, 1, 1, 2, 2], quality_maxmindist), 309 | &[1, 2, 1, 2, 1], 310 | ); 311 | } 312 | 313 | #[test] 314 | fn interpolate() { 315 | let out: Vec = 316 | interpolate_dates(3, &Duration::from_millis(0), &Duration::from_millis(3000)); 317 | let res: Vec = vec![ 318 | Duration::from_millis(750), 319 | Duration::from_millis(1500), 320 | Duration::from_millis(2250), 321 | ]; 322 | assert_eq!(out, res) 323 | } 324 | } 325 | -------------------------------------------------------------------------------- /src/scraper/mod.rs: -------------------------------------------------------------------------------- 1 | mod providers; 2 | pub use providers::*; 3 | pub mod scraper; 4 | -------------------------------------------------------------------------------- /src/scraper/providers/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::env; 3 | use std::fmt::Display; 4 | use std::iter::FromIterator; 5 | use std::sync::Arc; 6 | 7 | use futures::future::join_all; 8 | use reqwest::Client; 9 | use strum::IntoEnumIterator; 10 | 11 | pub use pinterest::*; 12 | pub use providers::*; 13 | pub use twitter::*; 14 | pub use united_cube::*; 15 | pub use weverse::*; 16 | 17 | pub mod pinterest; 18 | mod providers; 19 | pub mod twitter; 20 | mod twitter_types; 21 | pub mod united_cube; 22 | pub mod weverse; 23 | 24 | /// A scrape url is only transparently available to providers 25 | #[derive(Debug, Clone)] 26 | pub struct ScrapeUrl(pub String); 27 | 28 | #[derive(Debug, Copy, Clone)] 29 | pub struct PageSize(usize); 30 | 31 | /// Identifier for a specific section of a site 32 
| /// [name: pinterest.board_feed] 33 | /// [destination: ] 34 | #[derive(Debug, PartialEq, Eq, Hash, Clone)] 35 | pub struct ScopedProvider { 36 | pub name: AllProviders, 37 | pub destination: String, 38 | pub official: bool, 39 | } 40 | 41 | impl Display for ScopedProvider { 42 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 43 | f.write_str(&format!("{}:{}", self.name.to_string(), self.destination)) 44 | } 45 | } 46 | 47 | pub type ProviderMap = HashMap>; 48 | 49 | pub async fn get_provider_map(client: &Arc) -> anyhow::Result { 50 | let handles = AllProviders::iter().map(|provider_type| async move { 51 | let client = Arc::clone(client); 52 | let input = ProviderInput { client }; 53 | let provider: Box = match provider_type { 54 | AllProviders::PinterestBoardFeed => Box::new(PinterestBoardFeed::new(input)), 55 | AllProviders::WeverseArtistFeed => Box::new(WeverseArtistFeed::new(input)), 56 | AllProviders::UnitedCubeArtistFeed => Box::new(UnitedCubeArtistFeed::new(input)), 57 | AllProviders::TwitterTimeline => Box::new(TwitterTimeline::new(input)), 58 | }; 59 | // we should only initialize providers if NO_WORKER is not set 60 | // this is not typesafe so we should be careful to not try to do 61 | // authenticated requests when workers are off 62 | if let Err(_) = env::var("NO_WORKER") { 63 | provider.initialize().await; 64 | } 65 | (provider_type, provider) 66 | }); 67 | let results = join_all(handles).await; 68 | Ok(HashMap::from_iter(results)) 69 | } 70 | -------------------------------------------------------------------------------- /src/scraper/providers/pinterest.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, sync::Arc, time::Instant}; 2 | 3 | use async_trait::async_trait; 4 | use chrono::NaiveDateTime; 5 | use reqwest::Client; 6 | use serde::{Deserialize, Serialize}; 7 | use url::Url; 8 | 9 | use crate::{ 10 | request::{parse_successful_response, request_default_headers}, 11 | scheduler::UnscopedLimiter, 12 | scraper::providers::ProviderMediaType, 13 | }; 14 | 15 | use super::*; 16 | 17 | #[derive(Debug, Deserialize)] 18 | pub struct PinterestImage { 19 | pub width: u16, 20 | pub height: u16, 21 | pub url: String, 22 | } 23 | 24 | #[derive(Debug, Clone, Deserialize)] 25 | pub struct PinterestRichSummary { 26 | pub url: String, 27 | } 28 | 29 | #[derive(Debug, Clone, Deserialize)] 30 | pub struct PinterestPinner { 31 | pub full_name: String, 32 | // I don't know if these are really optional, but just to be safe 33 | pub image_xlarge_url: Option, 34 | } 35 | 36 | #[derive(Debug, Deserialize)] 37 | pub struct PinterestBoard { 38 | pub name: String, 39 | } 40 | 41 | #[derive(Debug, Deserialize)] 42 | pub struct PinterestImages { 43 | pub id: String, 44 | pub pinner: Option, 45 | pub board: Option, 46 | pub images: HashMap, 47 | pub rich_summary: Option, 48 | } 49 | 50 | #[derive(Debug, Deserialize)] 51 | pub struct PinterestResource { 52 | pub bookmark: Option, 53 | pub data: Vec, 54 | } 55 | 56 | #[derive(Debug, Deserialize)] 57 | pub struct PinterestResponse { 58 | pub resource_response: PinterestResource, 59 | } 60 | 61 | #[derive(Debug, Serialize)] 62 | struct PinterestRequestDictOptions<'a> { 63 | bookmarks: &'a Option>, 64 | board_url: &'a str, 65 | board_id: &'a str, 66 | // max accepted value by the API is 250 67 | page_size: usize, 68 | } 69 | 70 | #[derive(Debug, Serialize)] 71 | struct PinterestRequestDict<'a> { 72 | options: PinterestRequestDictOptions<'a>, 73 | } 74 | 75 | // 
#[derive(Clone)] 76 | pub struct PinterestBoardFeed { 77 | pub client: Arc, 78 | pub rate_limiter: UnscopedLimiter, 79 | } 80 | 81 | const PINTEREST_BOARD_SEPARATOR: &str = "|"; 82 | 83 | const URL_ROOT: &str = "https://www.pinterest.com/resource/BoardFeedResource/get"; 84 | 85 | #[allow(dead_code)] 86 | const MAXIMUM_PAGE_SIZE: usize = 200; 87 | 88 | /// pinterest uses a page size of 25 89 | #[allow(dead_code)] 90 | const PROVIDER_NATIVE_PAGE_SIZE: usize = 25; 91 | 92 | #[async_trait] 93 | impl RateLimitable for PinterestBoardFeed { 94 | async fn wait(&self, _key: &str) -> () { 95 | self.rate_limiter 96 | .until_ready_with_jitter(default_jitter()) 97 | .await; 98 | } 99 | } 100 | 101 | // PinterestBoard ids are made up of 2 pieces, board_url and board_id formatted in this way 102 | // "board_id|board_url" 103 | #[async_trait] 104 | impl Provider for PinterestBoardFeed { 105 | fn new(input: ProviderInput) -> Self 106 | where 107 | Self: Sized, 108 | { 109 | Self { 110 | client: Arc::clone(&input.client), 111 | rate_limiter: Self::rate_limiter(), 112 | } 113 | } 114 | fn id(&self) -> AllProviders { 115 | AllProviders::PinterestBoardFeed 116 | } 117 | fn max_page_size(&self) -> PageSize { 118 | PageSize(100) 119 | } 120 | 121 | fn default_page_size(&self) -> PageSize { 122 | PageSize(20) 123 | } 124 | 125 | fn from_provider_destination( 126 | &self, 127 | scrape_id: &str, 128 | page_size: PageSize, 129 | pagination: Option, 130 | ) -> Result { 131 | let (id, path) = scrape_id 132 | .split_once(PINTEREST_BOARD_SEPARATOR) 133 | .ok_or(ProviderFailure::Url)?; 134 | 135 | let data = PinterestRequestDict { 136 | options: PinterestRequestDictOptions { 137 | bookmarks: &pagination.map(|res| vec![res.next_page()]), 138 | board_id: id, 139 | board_url: path, 140 | page_size: page_size.0, 141 | }, 142 | }; 143 | let data_str = serde_json::to_string(&data) 144 | .ok() 145 | .ok_or(ProviderFailure::Url)?; 146 | 147 | let url = Url::parse_with_params(URL_ROOT, &[("source_url", path), ("data", &data_str)]) 148 | .ok() 149 | .ok_or(ProviderFailure::Url)?; 150 | Ok(ScrapeUrl(url.as_str().to_owned())) 151 | } 152 | async fn unfold(&self, state: ProviderState) -> Result { 153 | let instant = Instant::now(); 154 | let response = self 155 | .client 156 | .get(&state.url.0) 157 | .headers(request_default_headers()) 158 | .send() 159 | .await?; 160 | let response_delay = instant.elapsed(); 161 | 162 | let status = &response.status(); 163 | let response_json = parse_successful_response::(response).await?; 164 | let posts = response_json 165 | .resource_response 166 | .data 167 | .iter() 168 | .filter_map(|pin| { 169 | // I imagine every image has an "orig" size but we can't know for sure 170 | pin.images.get("orig").map(|elem| { 171 | ProviderPost { 172 | account: pin 173 | .pinner 174 | .clone() 175 | .map(|pinner| ProviderAccount { 176 | name: pinner.full_name, 177 | avatar_url: pinner.image_xlarge_url, 178 | }) 179 | .unwrap_or_default(), 180 | unique_identifier: pin.id.clone(), 181 | url: Some(format!("https://www.pinterest.com/pin/{}", pin.id)), 182 | post_date: None, 183 | // There might be a body here but I don't really care, it's pinterest 184 | body: None, 185 | images: vec![ProviderMedia { 186 | _type: ProviderMediaType::Image, 187 | media_url: elem.url.to_owned(), 188 | // yes, pinterest literally does not tell you when things were 189 | // pinned. 
It's so stupid 190 | reference_url: pin.rich_summary.clone().map(|sum| sum.url), 191 | unique_identifier: pin.id.to_owned(), 192 | metadata: None, 193 | }], 194 | metadata: None, 195 | } 196 | }) 197 | }) 198 | .collect::>(); 199 | 200 | let result = ProviderResult { 201 | posts, 202 | response_code: status.to_owned(), 203 | response_delay, 204 | }; 205 | 206 | let bookmark_option = response_json.resource_response.bookmark; 207 | // we receive a bookmark when there are more images to scrape 208 | Ok(match bookmark_option { 209 | Some(bookmark) => ProviderStep::Next(result, Pagination::NextCursor(bookmark)), 210 | None => ProviderStep::End(result), 211 | }) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /src/scraper/providers/providers.rs: -------------------------------------------------------------------------------- 1 | use std::convert::Infallible; 2 | use std::sync::Arc; 3 | use std::{collections::HashSet, ops::Add, time::Duration}; 4 | 5 | use async_trait::async_trait; 6 | use chrono::NaiveDateTime; 7 | use governor::{Jitter, Quota, RateLimiter}; 8 | use log::{debug, error, info}; 9 | use parking_lot::RwLock; 10 | use reqwest::{Client, Error, StatusCode}; 11 | use serde; 12 | use serde::{Deserialize, Serialize}; 13 | use strum_macros::{Display, ToString}; 14 | use strum_macros::{EnumIter, EnumString}; 15 | use thiserror::Error; 16 | use url::Url; 17 | 18 | use crate::request::HttpError; 19 | use crate::scheduler::UnscopedLimiter; 20 | use crate::scraper::providers::providers::DerivedProviderResource::Invalid; 21 | 22 | use super::{PageSize, ScrapeUrl}; 23 | 24 | #[derive(Debug, Clone, Serialize, Deserialize)] 25 | pub enum ProviderMediaType { 26 | Image, 27 | Video, 28 | } 29 | 30 | pub type SharedCredentials = Arc>>; 31 | 32 | /// Placeholder for images that may contain more metadata in the future? 
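// Sketch, not part of providers.rs: the `SharedCredentials` alias declared a few
// lines above wraps credentials in `Arc<parking_lot::RwLock<Option<..>>>`, so reads
// and writes are synchronous and any guard must be dropped before an `.await`. The
// alias and helper below use assumed type parameters for illustration only and are
// not the crate's exact signatures.
#[allow(dead_code)]
mod shared_credentials_sketch {
    use parking_lot::RwLock;
    use std::sync::Arc;

    type Shared<T> = Arc<RwLock<Option<T>>>;

    /// Hypothetical helper: publish a freshly obtained credential and return the
    /// previous one, mirroring how `attempt_first_login` overwrites the slot.
    fn rotate<T: Clone>(slot: &Shared<T>, fresh: T) -> Option<T> {
        let previous = slot.read().clone(); // read guard is dropped at the end of this statement
        *slot.write() = Some(fresh);        // visible to every clone of the Arc
        previous
    }
}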
33 | #[derive(Debug, Clone, Serialize, Deserialize)] 34 | pub struct ProviderMedia { 35 | #[serde(rename = "type")] 36 | pub _type: ProviderMediaType, 37 | pub media_url: String, 38 | // where the image is coming from 39 | pub reference_url: Option, 40 | pub unique_identifier: String, 41 | /// necessary for some providers like weverse which include additional 42 | /// metadata that are unique to the provider being scraped 43 | #[serde(skip_serializing_if = "Option::is_none")] 44 | pub metadata: Option, 45 | } 46 | 47 | #[derive(Debug, Clone, Serialize, Deserialize)] 48 | pub struct ProviderPost { 49 | pub account: ProviderAccount, 50 | pub unique_identifier: String, 51 | pub images: Vec, 52 | pub body: Option, 53 | pub url: Option, 54 | pub post_date: Option, 55 | /// necessary for some providers like weverse which include additional 56 | /// metadata that are unique to the provider being scraped 57 | #[serde(skip_serializing_if = "Option::is_none")] 58 | pub metadata: Option, 59 | } 60 | 61 | #[derive(Debug, Clone, Serialize, Deserialize)] 62 | pub struct ProviderAccount { 63 | pub name: String, 64 | pub avatar_url: Option, 65 | } 66 | 67 | impl Default for ProviderAccount { 68 | fn default() -> Self { 69 | Self { 70 | name: "Unknown user".to_owned(), 71 | avatar_url: None, 72 | } 73 | } 74 | } 75 | 76 | #[derive(Debug)] 77 | pub struct ProviderResult { 78 | pub posts: Vec, 79 | pub response_delay: Duration, 80 | pub response_code: StatusCode, 81 | } 82 | 83 | impl Add for ProviderResult { 84 | type Output = ProviderResult; 85 | fn add(self, rhs: ProviderResult) -> Self::Output { 86 | ProviderResult { 87 | response_code: rhs.response_code, 88 | response_delay: rhs.response_delay, 89 | posts: [self.posts, rhs.posts].concat(), 90 | } 91 | } 92 | } 93 | 94 | #[derive(Debug)] 95 | pub enum ProviderStep { 96 | Next(ProviderResult, Pagination), 97 | End(ProviderResult), 98 | // Provider exits gracefully 99 | NotInitialized, 100 | } 101 | 102 | #[derive(Error, Debug)] 103 | pub enum ProviderFailure { 104 | #[error("Error formatting URL")] 105 | Url, 106 | #[error("Failed to process response from request")] 107 | HttpError(HttpError), 108 | #[error("{0}")] 109 | Other(String), 110 | } 111 | 112 | impl From for ProviderFailure { 113 | fn from(err: reqwest::Error) -> Self { 114 | ProviderFailure::HttpError(HttpError::ReqwestError(err)) 115 | } 116 | } 117 | 118 | #[derive(Debug, Clone)] 119 | pub struct ProviderState { 120 | pub login_attempts: u32, 121 | pub id: String, 122 | pub default_name: Option, 123 | pub url: ScrapeUrl, 124 | pub pagination: Option, 125 | pub iteration: usize, 126 | } 127 | 128 | pub struct ScrapeRequestInput { 129 | pub latest_data: HashSet, 130 | pub default_name: Option, 131 | pub last_scrape: Option, 132 | pub is_first_scrape: bool, 133 | } 134 | 135 | impl From for ProviderFailure { 136 | fn from(err: HttpError) -> Self { 137 | Self::HttpError(err) 138 | } 139 | } 140 | 141 | pub enum CanonicalUrlResolution { 142 | Success { destination: String }, 143 | Fail(String), 144 | NotImplemented, 145 | } 146 | 147 | pub enum CredentialRefresh { 148 | Result(ProviderCredentials), 149 | TryLogin, 150 | Halt, 151 | } 152 | 153 | pub enum ProviderErrorHandle { 154 | RefreshToken(ProviderCredentials), 155 | Login, 156 | Halt, 157 | } 158 | 159 | #[derive(Debug, Clone)] 160 | pub enum Pagination { 161 | NextPage(i32), 162 | NextCursor(String), 163 | } 164 | 165 | impl Pagination { 166 | pub fn next_page(&self) -> String { 167 | match self { 168 | Pagination::NextPage(num) => 
num.to_string(), 169 | Pagination::NextCursor(cursor) => cursor.clone(), 170 | } 171 | } 172 | } 173 | 174 | impl ToString for Pagination { 175 | fn to_string(&self) -> String { 176 | self.next_page() 177 | } 178 | } 179 | 180 | #[async_trait] 181 | pub trait RateLimitable { 182 | /// The available quota for this provider 183 | fn quota() -> Quota 184 | where 185 | Self: Sized, 186 | { 187 | default_quota() 188 | } 189 | /// The default rate limiter implementation 190 | /// This currently only supports global rate limiters 191 | /// but may need to be changed to support local ones as well 192 | fn rate_limiter() -> UnscopedLimiter 193 | where 194 | Self: Sized, 195 | { 196 | RateLimiter::direct(Self::quota()) 197 | } 198 | /// Wait for next request if token is not available 199 | async fn wait(&self, key: &str) -> (); 200 | } 201 | 202 | pub fn default_quota() -> Quota { 203 | // fairly aggressive quota 204 | Quota::with_period(Duration::from_millis(3500u64)).unwrap() 205 | } 206 | 207 | const DEFAULT_WAIT_SECONDS: u64 = if cfg!(debug_assertions) { 2 } else { 8 }; 208 | 209 | pub fn default_jitter() -> Jitter { 210 | Jitter::up_to(Duration::from_secs(DEFAULT_WAIT_SECONDS)) 211 | } 212 | 213 | #[derive(Debug, Clone, Default)] 214 | pub struct ProviderCredentials { 215 | pub access_token: String, 216 | pub refresh_token: String, 217 | } 218 | 219 | pub struct BareProviderInput { 220 | pub client: Arc, 221 | } 222 | 223 | pub struct ProviderInput { 224 | pub client: Arc, 225 | } 226 | 227 | pub fn create_credentials() -> Arc>> { 228 | Arc::new(RwLock::new(None)) 229 | } 230 | 231 | /// Try to override the shared credentials after logging in one time 232 | pub async fn attempt_first_login( 233 | provider: &dyn Provider, 234 | credentials: &SharedCredentials, 235 | ) { 236 | let id = provider.id().to_string(); 237 | info!("Attempting login to {}", &id); 238 | let login = provider.login().await; 239 | let provider_creds = match login { 240 | Ok(login) => { 241 | info!("Logged in into {}", &id); 242 | login 243 | } 244 | Err(err) => { 245 | error!("Could not log into {}, leaving it uninitialized", &id); 246 | eprintln!("{:?}", err); 247 | return; 248 | } 249 | }; 250 | let mut writable = credentials.write(); 251 | *writable = Some(provider_creds); 252 | } 253 | 254 | pub enum DerivedProviderResource { 255 | Invalid, 256 | Error { reason: String }, 257 | Success { destination: String }, 258 | } 259 | 260 | pub struct IntrospectableResource(pub String); 261 | 262 | pub enum WorkableDomain { 263 | /// The url can be turned into a canonical URL using [`Provider::introspect_resource`] 264 | ToCanonical(IntrospectableResource), 265 | /// The url can be scraped for information 266 | ToResource(Url), 267 | } 268 | 269 | /// Providers represent a generic endpoint on a single platform that can be scraped 270 | /// with a unique identifier for each specific resource 271 | #[async_trait] 272 | pub trait Provider: Sync + Send + RateLimitable { 273 | fn new(input: ProviderInput) -> Self 274 | where 275 | Self: Sized; 276 | async fn initialize(&self) {} 277 | 278 | fn requires_auth(&self) -> bool { 279 | false 280 | } 281 | 282 | /// a string that uniquely identifies this provider 283 | fn id(&self) -> AllProviders; 284 | 285 | /// The maximum amount of items the provider can retrieve at one time 286 | /// This value is used whenever a provider is being scraped for the first time 287 | /// in order to quickly enumerate through all past data 288 | fn max_page_size(&self) -> PageSize; 289 | 290 | /// The default 
page size that is used when checking a provider that has 291 | /// already been scraped at least one time in the past 292 | fn default_page_size(&self) -> PageSize; 293 | 294 | /// The maximum number of times a resource can be paginated before exiting. 295 | /// This value is ignored if the context has no images aka the resource 296 | /// is being scraped for the first time 297 | fn max_pagination(&self) -> u16 { 298 | 5 299 | } 300 | 301 | /// The amount of delay between each pagination request. Initial request is not 302 | /// bound by this value 303 | fn scrape_delay(&self) -> Duration { 304 | Duration::from_secs(2) 305 | } 306 | 307 | /// Match the domain input into a pending action the provider can perform 308 | fn match_domain(&self, _url: &str) -> Option { 309 | None 310 | } 311 | 312 | /// Attempt to resolve the data required to construct a scrape destination given a canonical URL 313 | /// # Example 314 | /// introspectable: dreamcatcher 315 | /// Canonical URL: https://weverse.io/dreamcatcher/artist 316 | /// Result: Ok("14") 317 | async fn introspect_resource( 318 | &self, 319 | _introspectable: &IntrospectableResource, 320 | ) -> Result { 321 | Ok(CanonicalUrlResolution::NotImplemented) 322 | } 323 | 324 | /// Provider destination are any unique identifier a provider can try to resolve into an opaque [ScrapeUrl]. 325 | /// This method is called after every successful scrape to resolve the next page of media 326 | fn from_provider_destination( 327 | &self, 328 | id: &str, 329 | page_size: PageSize, 330 | pagination: Option, 331 | ) -> Result; 332 | /// Process a single iteration of the resource 333 | async fn unfold(&self, state: ProviderState) -> Result; 334 | 335 | /// Error handling branch that separates operational errors from authorization 336 | /// related error codes 337 | fn on_error(&self, _http_error: &HttpError) -> anyhow::Result { 338 | debug!( 339 | "{} ran into an unhandled error and is halting", 340 | self.id().to_string() 341 | ); 342 | Ok(ProviderErrorHandle::Halt) 343 | } 344 | 345 | async fn token_refresh( 346 | &self, 347 | _credentials: &ProviderCredentials, 348 | ) -> anyhow::Result { 349 | panic!( 350 | "{}'s on_error branch tried to refresh credentials but it doesn't implement a token refresh flow", 351 | self.id().to_string() 352 | ) 353 | } 354 | 355 | async fn login(&self) -> Result { 356 | panic!( 357 | "{} tried to login but it doesn't implement a login flow", 358 | self.id().to_string() 359 | ) 360 | } 361 | fn credentials(&self) -> SharedCredentials { 362 | panic!( 363 | "Tried to get credentials for {} which doesn't authorization", 364 | self.id().to_string() 365 | ) 366 | } 367 | 368 | fn max_login_attempts(&self) -> u32 { 369 | 3 370 | } 371 | 372 | /// Whether the URLs generated by this scraper expire after a short amount of duration 373 | fn ephemeral(&self) -> bool { 374 | false 375 | } 376 | } 377 | 378 | #[derive(Display, Debug, Hash, Copy, Clone, Serialize, EnumString, EnumIter, PartialEq, Eq)] 379 | pub enum AllProviders { 380 | #[strum(serialize = "pinterest.board_feed")] 381 | PinterestBoardFeed, 382 | #[strum(serialize = "weverse.artist_feed")] 383 | WeverseArtistFeed, 384 | #[strum(serialize = "united_cube.artist_feed")] 385 | UnitedCubeArtistFeed, 386 | #[strum(serialize = "twitter.timeline")] 387 | TwitterTimeline, 388 | } 389 | 390 | pub fn find_matching_domain(domains: &[&str], url: &str) -> Option { 391 | let parsed = Url::parse(url).ok()?; 392 | let dom = parsed.domain()?; 393 | if domains.contains(&dom) { 394 | 
Some(WorkableDomain::ToCanonical(IntrospectableResource( 395 | url.to_owned(), 396 | ))) 397 | } else { 398 | None 399 | } 400 | } 401 | 402 | pub struct UrlBuilder { 403 | pub params: Vec<(&'static str, String)>, 404 | } 405 | 406 | impl Default for UrlBuilder { 407 | fn default() -> Self { 408 | Self { params: vec![] } 409 | } 410 | } 411 | 412 | impl ToString for UrlBuilder { 413 | fn to_string(&self) -> String { 414 | todo!() 415 | } 416 | } 417 | 418 | impl UrlBuilder { 419 | pub fn from_queries(params: Vec<(&'static str, &'static str)>) -> Self { 420 | Self { 421 | params: params 422 | .into_iter() 423 | .map(|(key, value)| (key, value.to_owned())) 424 | .collect::>(), 425 | } 426 | } 427 | pub fn page_size(&mut self, key: &'static str, page_size: PageSize) -> &mut Self { 428 | self.params.push((key, page_size.0.to_string())); 429 | self 430 | } 431 | pub fn pagination(&mut self, key: &'static str, page_option: &Option) -> &mut Self { 432 | if let Some(page) = page_option { 433 | self.params.push((key, page.next_page())) 434 | } 435 | self 436 | } 437 | pub fn build(&self, base_url: &str) -> Result { 438 | url::Url::parse_with_params(base_url, self.params.iter()) 439 | .ok() 440 | .ok_or(ProviderFailure::Url) 441 | } 442 | pub fn build_scrape_url(self, base_url: &str) -> Result { 443 | let res = self.build(base_url)?; 444 | Ok(ScrapeUrl(res.as_str().to_owned())) 445 | } 446 | } 447 | -------------------------------------------------------------------------------- /src/scraper/providers/twitter.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Borrow; 2 | use std::env; 3 | use std::iter::FromIterator; 4 | use std::sync::Arc; 5 | use std::time::{Duration, Instant}; 6 | 7 | use anyhow::Error; 8 | use async_trait::async_trait; 9 | use chrono::NaiveDateTime; 10 | use chrono::{DateTime, FixedOffset, ParseResult}; 11 | use governor::Quota; 12 | use log::{debug, error, info, trace, warn}; 13 | use regex::Regex; 14 | use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; 15 | use reqwest::Client; 16 | use url::Url; 17 | 18 | use crate::request::{parse_successful_response, HttpError}; 19 | use crate::scheduler::UnscopedLimiter; 20 | use crate::scraper::providers::twitter_types::{ 21 | Entries, GuestTokenFetchResponse, Twitter, TwitterImageMetadata, TwitterPostMetadata, 22 | TwitterUserLookupResponse, Type, 23 | }; 24 | 25 | use super::*; 26 | 27 | fn twitter_type_to_provider(media_type: &Type) -> ProviderMediaType { 28 | match media_type { 29 | Type::AnimatedGif => ProviderMediaType::Image, 30 | Type::Photo => ProviderMediaType::Image, 31 | Type::Video => ProviderMediaType::Video, 32 | } 33 | } 34 | 35 | fn replace_twitter_string(s: &str) -> String { 36 | s.replace("\\/", "/") 37 | } 38 | 39 | fn parse_twitter_date(date_str: &str) -> ParseResult> { 40 | DateTime::parse_from_str(date_str, "%a %b %d %H:%M:%S %z %Y") 41 | } 42 | 43 | pub struct TwitterTimeline { 44 | pub guest_token: SharedCredentials, 45 | pub bearer_token: Option, 46 | pub client: Arc, 47 | pub rate_limiter: UnscopedLimiter, 48 | } 49 | 50 | const BASE_URL: &str = "https://twitter.com/"; 51 | /// I have no idea where this token is coming from... 
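/// It looks like the hard-coded bearer token bundled with Twitter's public web
/// client (treat that as an assumption, not something this repo verifies);
/// setting the TWITTER_BEARER_TOKEN environment variable lets the provider
/// skip this fallback entirely.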
52 | const MAGIC_BEARER_TOKEN: &str = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"; 53 | 54 | const USER_AGENT: &str = "HTC Mozilla/5.0 (Linux; Android 7.0; HTC 10 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.83 Mobile Safari/537.36"; 55 | 56 | #[async_trait] 57 | impl RateLimitable for TwitterTimeline { 58 | fn quota() -> Quota 59 | where 60 | Self: Sized, 61 | { 62 | default_quota() 63 | } 64 | async fn wait(&self, _key: &str) -> () { 65 | self.rate_limiter 66 | .until_ready_with_jitter(default_jitter()) 67 | .await 68 | } 69 | } 70 | 71 | #[async_trait] 72 | impl Provider for TwitterTimeline { 73 | fn id(&self) -> AllProviders { 74 | AllProviders::TwitterTimeline 75 | } 76 | fn new(input: ProviderInput) -> Self 77 | where 78 | Self: Sized, 79 | { 80 | Self { 81 | guest_token: create_credentials(), 82 | bearer_token: env::var("TWITTER_BEARER_TOKEN").ok(), 83 | client: Arc::clone(&input.client), 84 | rate_limiter: Self::rate_limiter(), 85 | } 86 | } 87 | 88 | async fn initialize(&self) -> () { 89 | attempt_first_login(self, &self.guest_token).await; 90 | } 91 | 92 | fn max_page_size(&self) -> PageSize { 93 | PageSize(100) 94 | } 95 | 96 | fn default_page_size(&self) -> PageSize { 97 | PageSize(20) 98 | } 99 | 100 | fn from_provider_destination( 101 | &self, 102 | id: &str, 103 | page_size: PageSize, 104 | pagination: Option, 105 | ) -> Result { 106 | let mut url_fragment = UrlBuilder::from_queries(vec![ 107 | ("include_profile_interstitial_type", "1"), 108 | // https://github.com/twintproject/twint/blob/master/twint/url.py 109 | // ("include_blocking", "1"), 110 | // ("include_blocked_by", "1"), 111 | // ("include_followed_by", "1"), 112 | // ("include_want_retweets", "1"), 113 | // ("include_mute_edge", "1"), 114 | // ("include_can_dm", "1"), 115 | // ("include_can_media_tag", "1"), 116 | // ("skip_status", "1"), 117 | // ("cards_platform", "Web - 12"), 118 | // ("include_cards", "1"), 119 | // ("include_ext_alt_text", "true"), 120 | // ("include_quote_count", "true"), 121 | // ("include_reply_count", "1"), 122 | ("tweet_mode", "extended"), 123 | ("include_entities", "true"), 124 | // ("include_user_entities", "true"), 125 | // ("include_ext_media_color", "true"), 126 | // ("include_ext_media_availability", "true"), 127 | // ("send_error_codes", "true"), 128 | // ("simple_quoted_tweet", "true"), 129 | // ("include_tweet_replies", "true"), 130 | ("ext", "mediaStats%2ChighlightedLabel"), 131 | ]); 132 | url_fragment.page_size("count", page_size); 133 | url_fragment.pagination("cursor", &pagination); 134 | let url = url_fragment.build_scrape_url(&format!( 135 | "https://api.twitter.com/2/timeline/profile/{}.json", 136 | id 137 | ))?; 138 | Ok(url) 139 | } 140 | 141 | fn max_pagination(&self) -> u16 { 142 | 3 143 | } 144 | 145 | async fn unfold(&self, state: ProviderState) -> Result { 146 | let credentials = self.guest_token.read().clone(); 147 | let token = match credentials { 148 | Some(token) => token, 149 | None => return Ok(ProviderStep::NotInitialized), 150 | }; 151 | let bearer = self.bearer_token.clone().map_or_else( 152 | || { 153 | warn!( 154 | "Using fallback bearer token. 
This will most likely get rate limited and fail" 155 | ); 156 | MAGIC_BEARER_TOKEN.to_owned() 157 | }, 158 | |token| format!("Bearer {}", &token), 159 | ); 160 | let instant = Instant::now(); 161 | 162 | let response = self 163 | .client 164 | .get(state.url.0) 165 | .headers(HeaderMap::from_iter([ 166 | ( 167 | HeaderName::from_static("user-agent"), 168 | //죄송합니다 169 | HeaderValue::from_static(USER_AGENT), 170 | ), 171 | ( 172 | HeaderName::from_static("authorization"), 173 | HeaderValue::from_str(&bearer).expect("Invalid bearer token format"), 174 | ), 175 | ( 176 | HeaderName::from_static("x-guest-token"), 177 | HeaderValue::from_str(&token.access_token) 178 | .expect("Invalid access token format"), 179 | ), 180 | ])) 181 | .send() 182 | .await?; 183 | let response_code = response.status(); 184 | let response_delay = instant.elapsed(); 185 | let response_json = parse_successful_response::(response).await?; 186 | // Twitter does some really interesting stuff with how they present API data 187 | let maybe_instruction = response_json 188 | .timeline 189 | .instructions 190 | .iter() 191 | .find_map(|instruction| instruction.get("addEntries")); 192 | let tweet_db = response_json.global_objects.tweets; 193 | let user_db = response_json.global_objects.users; 194 | let entries = match maybe_instruction { 195 | Some(Entries::AddEntries { entries }) => entries, 196 | _ => { 197 | return Err(ProviderFailure::Other( 198 | "Could not find an 'addEntries' in instructions".to_owned(), 199 | )) 200 | } 201 | }; 202 | let posts = entries 203 | .iter() 204 | .filter_map(|entry| { 205 | let sort_index = &entry.sort_index; 206 | if !entry.entry_id.starts_with("tweet-") { 207 | return None; 208 | } 209 | // a sort index corresponds to the id of the 210 | // the chances of this being undefined is basically non-existent but we should be safe 211 | let tweet = match tweet_db.get(sort_index) { 212 | None => { 213 | debug!( 214 | "Could not find the corresponding tweet id for {} in the tweet db", 215 | sort_index 216 | ); 217 | return None; 218 | } 219 | Some(t) => t, 220 | }; 221 | Some(tweet) 222 | }) 223 | .filter_map(|tweet| { 224 | let unique_identifier = tweet.id_str.clone(); 225 | let like_count = tweet.favorite_count; 226 | let retweet_count = tweet.retweet_count; 227 | let language = tweet.lang.clone(); 228 | let post_date = parse_twitter_date(&tweet.created_at) 229 | .ok() 230 | .map(|e| e.naive_utc()); 231 | let body = tweet.full_text.clone().map(|t| replace_twitter_string(&t)); 232 | tweet.entities.media.as_ref().map(|media| { 233 | let user_option = user_db.get(&tweet.user_id_str); 234 | let url = user_option.map(|user| { 235 | format!( 236 | "https://twitter.com/{}/status/{}", 237 | &user.screen_name, &unique_identifier 238 | ) 239 | }); 240 | ProviderPost { 241 | account: user_option 242 | .map(|user| ProviderAccount { 243 | name: user.name.clone(), 244 | avatar_url: user.profile_image_url_https.clone(), 245 | }) 246 | .unwrap_or_default(), 247 | unique_identifier, 248 | metadata: serde_json::to_value(TwitterPostMetadata { 249 | like_count, 250 | retweet_count, 251 | language, 252 | }) 253 | .ok(), 254 | url, 255 | post_date, 256 | images: media 257 | .iter() 258 | .map(|media| ProviderMedia { 259 | _type: twitter_type_to_provider(&media.media_type), 260 | unique_identifier: media.id_str.clone(), 261 | media_url: replace_twitter_string(&media.media_url_https), 262 | reference_url: Some(replace_twitter_string(&media.expanded_url)), 263 | metadata: serde_json::to_value(TwitterImageMetadata { 264 | 
height: media.original_info.height, 265 | width: media.original_info.width, 266 | }) 267 | .ok(), 268 | }) 269 | .collect::>(), 270 | body, 271 | } 272 | }) 273 | }) 274 | .collect::>(); 275 | 276 | let cursor_entry = &entries.last(); 277 | let cursor = 278 | cursor_entry.and_then(|c| c.content.operation.as_ref().map(|o| o.cursor.value.clone())); 279 | let result = ProviderResult { 280 | posts, 281 | response_code, 282 | response_delay, 283 | }; 284 | match cursor { 285 | Some(cursor) => Ok(ProviderStep::Next(result, Pagination::NextCursor(cursor))), 286 | None => Ok(ProviderStep::End(result)), 287 | } 288 | } 289 | 290 | fn match_domain(&self, url: &str) -> Option { 291 | find_matching_domain(&["twitter.com"], url) 292 | } 293 | 294 | /// https://twitter.com/:username -> ID 295 | async fn introspect_resource( 296 | &self, 297 | introspectable: &IntrospectableResource, 298 | ) -> Result { 299 | let bearer = match &self.bearer_token { 300 | None => return Ok(CanonicalUrlResolution::NotImplemented), 301 | Some(token) => token, 302 | }; 303 | let input = match Url::parse(&introspectable.0) { 304 | Err(e) => return Err(ProviderFailure::Url), 305 | Ok(e) => e, 306 | }; 307 | let username = input.path().trim_start_matches("/"); 308 | // self.guest_token 309 | let endpoint = format!("https://api.twitter.com/2/users/by/username/{}", username); 310 | let result = self 311 | .client 312 | .get(endpoint) 313 | .header("Authorization", format!("Bearer {}", bearer)) 314 | .send() 315 | .await? 316 | .json::() 317 | .await?; 318 | Ok(CanonicalUrlResolution::Success { 319 | destination: result.data.id, 320 | }) 321 | } 322 | 323 | async fn login(&self) -> Result { 324 | let headers = HeaderMap::from_iter([( 325 | HeaderName::from_static("user-agent"), 326 | HeaderValue::from_static(USER_AGENT), 327 | )]); 328 | let login = self 329 | .client 330 | .get(BASE_URL) 331 | .headers(headers.clone()) 332 | .send() 333 | .await?; 334 | let html = login.text().await?; 335 | // TODO: check cookie response here? 336 | // CONTEXT: https://github.com/JustAnotherArchivist/snscrape/blob/eee06d859338b184fc43f93e424ba70a0e9f4679/snscrape/modules/twitter.py#L231 337 | let regex = Regex::new(r#"gt=(.*?);"#).unwrap(); 338 | match regex.captures(&html) { 339 | Some(captures) => { 340 | let capture = captures.get(1).expect( 341 | "Couldn't match a guest token in the twitter homepage, the site was changed", 342 | ); 343 | Ok(ProviderCredentials { 344 | access_token: capture.as_str().to_owned(), 345 | refresh_token: "".to_owned(), 346 | }) 347 | } 348 | None => { 349 | info!( 350 | "Couldn't find a guest token in the homepage, attempting to fetch from the API" 351 | ); 352 | let bearer = self 353 | .bearer_token 354 | .clone() 355 | .unwrap_or(MAGIC_BEARER_TOKEN.to_owned()); 356 | let mut request_headers = headers.clone(); 357 | request_headers.append( 358 | HeaderName::from_static("authorization"), 359 | HeaderValue::from_str(&format!("Bearer {}", bearer)) 360 | .expect("Header value for authorization request could not be formatted"), 361 | ); 362 | let result = self 363 | .client 364 | .post("https://api.twitter.com/1.1/guest/activate.json") 365 | .headers(request_headers) 366 | .send() 367 | .await? 
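// Deserialize the guest token out of the activation response; unfold() later
// forwards it as the x-guest-token header on timeline requests.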
368 | .json::() 369 | .await?; 370 | let creds = ProviderCredentials { 371 | access_token: result.guest_token, 372 | refresh_token: "".to_owned(), 373 | }; 374 | Ok(creds) 375 | } 376 | } 377 | } 378 | 379 | fn on_error(&self, error: &HttpError) -> anyhow::Result { 380 | match error { 381 | HttpError::FailStatus(e) | HttpError::UnexpectedBody(e) => { 382 | if e.code == 403 { 383 | Ok(ProviderErrorHandle::Login) 384 | } else { 385 | // unknown error at this point 386 | error!("{:?}", e); 387 | Ok(ProviderErrorHandle::Halt) 388 | } 389 | } 390 | error => { 391 | error!("{:?}", error); 392 | Ok(ProviderErrorHandle::Halt) 393 | } 394 | } 395 | } 396 | fn credentials(&self) -> SharedCredentials { 397 | self.guest_token.clone() 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /src/scraper/providers/twitter_types.rs: -------------------------------------------------------------------------------- 1 | // Example code that deserializes and serializes the model. 2 | // extern crate serde; 3 | // #[macro_use] 4 | // extern crate serde_derive; 5 | // extern crate serde_json; 6 | // 7 | // use generated_module::[object Object]; 8 | // 9 | // fn main() { 10 | // let json = r#"{"answer": 42}"#; 11 | // let model: [object Object] = serde_json::from_str(&json).unwrap(); 12 | // } 13 | use serde::{Deserialize, Serialize}; 14 | use std::collections::HashMap; 15 | 16 | #[derive(Deserialize)] 17 | pub struct GuestTokenFetchResponse { 18 | pub(crate) guest_token: String, 19 | } 20 | 21 | #[derive(Deserialize)] 22 | pub struct TwitterUserLookup { 23 | pub(crate) id: String, 24 | } 25 | 26 | #[derive(Deserialize)] 27 | pub struct TwitterUserLookupResponse { 28 | pub(crate) data: TwitterUserLookup, 29 | } 30 | 31 | #[derive(Debug, Serialize, Deserialize)] 32 | pub struct TwitterPostMetadata { 33 | pub(crate) language: Option, 34 | pub(crate) like_count: Option, 35 | pub(crate) retweet_count: Option, 36 | } 37 | 38 | #[derive(Debug, Serialize, Deserialize)] 39 | pub struct TwitterImageMetadata { 40 | pub(crate) width: i64, 41 | pub(crate) height: i64, 42 | } 43 | 44 | #[derive(Debug, Serialize, Deserialize)] 45 | pub struct Twitter { 46 | #[serde(rename = "globalObjects")] 47 | pub(crate) global_objects: GlobalObjects, 48 | pub(crate) timeline: Timeline, 49 | } 50 | 51 | #[derive(Debug, Serialize, Deserialize)] 52 | pub struct GlobalObjects { 53 | pub(crate) tweets: HashMap, 54 | pub(crate) users: HashMap, 55 | } 56 | 57 | #[derive(Debug, Serialize, Deserialize)] 58 | pub struct TopicValue { 59 | pub(crate) id: String, 60 | pub(crate) name: String, 61 | pub(crate) following: bool, 62 | pub(crate) description: String, 63 | pub(crate) not_interested: bool, 64 | pub(crate) icon_url: String, 65 | } 66 | 67 | #[derive(Debug, Serialize, Deserialize)] 68 | pub struct TweetValue { 69 | pub(crate) created_at: String, 70 | pub(crate) id_str: String, 71 | pub(crate) full_text: Option, 72 | pub(crate) display_text_range: Vec, 73 | pub(crate) entities: TweetEntities, 74 | pub(crate) source: Option, 75 | pub(crate) user_id_str: String, 76 | pub(crate) retweeted_status_id_str: Option, 77 | pub(crate) retweet_count: Option, 78 | pub(crate) favorite_count: Option, 79 | pub(crate) conversation_id_str: Option, 80 | pub(crate) lang: Option, 81 | pub(crate) is_quote_status: Option, 82 | pub(crate) quoted_status_id_str: Option, 83 | pub(crate) quoted_status_permalink: Option, 84 | pub(crate) in_reply_to_status_id_str: Option, 85 | pub(crate) in_reply_to_user_id_str: Option, 86 | 
pub(crate) in_reply_to_screen_name: Option, 87 | pub(crate) extended_entities: Option, 88 | pub(crate) possibly_sensitive_editable: Option, 89 | pub(crate) self_thread: Option, 90 | } 91 | 92 | #[derive(Debug, Serialize, Deserialize)] 93 | pub struct TweetEntities { 94 | pub(crate) user_mentions: Option>, 95 | pub(crate) media: Option>, 96 | pub(crate) urls: Option>, 97 | pub(crate) hashtags: Option>, 98 | } 99 | 100 | #[derive(Debug, Serialize, Deserialize)] 101 | pub struct Hashtag { 102 | pub(crate) text: String, 103 | pub(crate) indices: Vec, 104 | } 105 | 106 | #[derive(Debug, Serialize, Deserialize)] 107 | pub struct EntitiesMedia { 108 | pub(crate) id_str: String, 109 | pub(crate) indices: Vec, 110 | pub(crate) media_url: String, 111 | pub(crate) media_url_https: String, 112 | pub(crate) url: String, 113 | pub(crate) display_url: String, 114 | pub(crate) expanded_url: String, 115 | #[serde(rename = "type")] 116 | pub(crate) media_type: Type, 117 | pub(crate) original_info: OriginalInfo, 118 | pub(crate) sizes: Sizes, 119 | pub(crate) media_key: Option, 120 | pub(crate) ext: Option, 121 | pub(crate) source_status_id_str: Option, 122 | pub(crate) source_user_id_str: Option, 123 | pub(crate) video_info: Option, 124 | } 125 | 126 | #[derive(Debug, Serialize, Deserialize)] 127 | pub struct PurpleExt { 128 | #[serde(rename = "mediaStats")] 129 | pub(crate) media_stats: PurpleMediaStats, 130 | } 131 | 132 | #[derive(Debug, Serialize, Deserialize)] 133 | pub struct PurpleMediaStats { 134 | pub(crate) r: REnum, 135 | pub(crate) ttl: i64, 136 | } 137 | 138 | #[derive(Debug, Serialize, Deserialize)] 139 | pub struct OriginalInfo { 140 | pub(crate) width: i64, 141 | pub(crate) height: i64, 142 | pub(crate) focus_rects: Option>, 143 | } 144 | 145 | #[derive(Debug, Serialize, Deserialize)] 146 | pub struct FocusRect { 147 | pub(crate) x: i64, 148 | pub(crate) y: i64, 149 | pub(crate) h: i64, 150 | pub(crate) w: i64, 151 | } 152 | 153 | #[derive(Debug, Serialize, Deserialize)] 154 | pub struct Sizes { 155 | pub(crate) small: Large, 156 | pub(crate) medium: Large, 157 | pub(crate) thumb: Large, 158 | pub(crate) large: Large, 159 | } 160 | 161 | #[derive(Debug, Serialize, Deserialize)] 162 | pub struct Large { 163 | pub(crate) w: i64, 164 | pub(crate) h: i64, 165 | pub(crate) resize: Resize, 166 | } 167 | 168 | #[derive(Debug, Serialize, Deserialize)] 169 | pub struct PurpleVideoInfo { 170 | pub(crate) aspect_ratio: Vec, 171 | pub(crate) variants: Vec, 172 | } 173 | 174 | #[derive(Debug, Serialize, Deserialize)] 175 | pub struct Variant { 176 | pub(crate) bitrate: Option, 177 | pub(crate) content_type: ContentType, 178 | pub(crate) url: String, 179 | } 180 | 181 | #[derive(Debug, Serialize, Deserialize)] 182 | pub struct UrlElement { 183 | pub(crate) url: String, 184 | pub(crate) expanded_url: String, 185 | pub(crate) display_url: String, 186 | pub(crate) indices: Vec, 187 | } 188 | 189 | #[derive(Debug, Serialize, Deserialize)] 190 | pub struct UserMention { 191 | pub(crate) screen_name: String, 192 | pub(crate) name: String, 193 | pub(crate) id_str: String, 194 | pub(crate) indices: Vec, 195 | } 196 | 197 | #[derive(Debug, Serialize, Deserialize)] 198 | pub struct ExtendedEntities { 199 | pub(crate) media: Vec, 200 | } 201 | 202 | #[derive(Debug, Serialize, Deserialize)] 203 | pub struct ExtendedEntitiesMedia { 204 | pub(crate) id_str: String, 205 | pub(crate) indices: Vec, 206 | pub(crate) media_url: String, 207 | pub(crate) media_url_https: String, 208 | pub(crate) url: String, 209 | pub(crate) 
display_url: String, 210 | pub(crate) expanded_url: String, 211 | #[serde(rename = "type")] 212 | pub(crate) media_type: Type, 213 | pub(crate) original_info: OriginalInfo, 214 | pub(crate) sizes: Sizes, 215 | pub(crate) media_key: Option, 216 | pub(crate) ext: Option, 217 | pub(crate) source_status_id_str: Option, 218 | pub(crate) source_user_id_str: Option, 219 | pub(crate) video_info: Option, 220 | pub(crate) additional_media_info: Option, 221 | } 222 | 223 | #[derive(Debug, Serialize, Deserialize)] 224 | pub struct AdditionalMediaInfo { 225 | pub(crate) monetizable: bool, 226 | } 227 | 228 | #[derive(Debug, Serialize, Deserialize)] 229 | pub struct FluffyExt { 230 | #[serde(rename = "mediaStats")] 231 | pub(crate) media_stats: FluffyMediaStats, 232 | } 233 | 234 | #[derive(Debug, Serialize, Deserialize)] 235 | pub struct FluffyMediaStats { 236 | pub(crate) r: RUnion, 237 | pub(crate) ttl: i64, 238 | } 239 | 240 | #[derive(Debug, Serialize, Deserialize)] 241 | pub struct RRClass { 242 | pub(crate) ok: Ok, 243 | } 244 | 245 | #[derive(Debug, Serialize, Deserialize)] 246 | pub struct Ok { 247 | #[serde(rename = "viewCount")] 248 | pub(crate) view_count: String, 249 | } 250 | 251 | #[derive(Debug, Serialize, Deserialize)] 252 | pub struct FluffyVideoInfo { 253 | pub(crate) aspect_ratio: Vec, 254 | pub(crate) duration_millis: Option, 255 | pub(crate) variants: Vec, 256 | } 257 | 258 | #[derive(Debug, Serialize, Deserialize)] 259 | pub struct QuotedStatusPermalink { 260 | pub(crate) url: String, 261 | pub(crate) expanded: String, 262 | pub(crate) display: String, 263 | } 264 | 265 | #[derive(Debug, Serialize, Deserialize)] 266 | pub struct SelfThread { 267 | pub(crate) id_str: String, 268 | } 269 | 270 | #[derive(Debug, Serialize, Deserialize)] 271 | pub struct User { 272 | pub(crate) id_str: String, 273 | pub(crate) name: String, 274 | pub(crate) screen_name: String, 275 | // pub(crate) location: String, 276 | // pub(crate) description: String, 277 | // pub(crate) url: Option, 278 | // pub(crate) entities: UserEntities, 279 | // pub(crate) followers_count: i64, 280 | // pub(crate) fast_followers_count: i64, 281 | // pub(crate) normal_followers_count: i64, 282 | // pub(crate) friends_count: i64, 283 | // pub(crate) listed_count: i64, 284 | // pub(crate) created_at: String, 285 | // pub(crate) favourites_count: i64, 286 | // pub(crate) geo_enabled: Option, 287 | // pub(crate) statuses_count: i64, 288 | // pub(crate) media_count: i64, 289 | pub(crate) profile_image_url_https: Option, 290 | // pub(crate) profile_banner_url: Option, 291 | // pub(crate) profile_image_extensions: ProfileExtensions, 292 | // pub(crate) profile_banner_extensions: Option, 293 | // pub(crate) profile_link_color: String, 294 | // pub(crate) pinned_tweet_ids: Vec, 295 | // pub(crate) pinned_tweet_ids_str: Vec, 296 | // pub(crate) has_custom_timelines: Option, 297 | // pub(crate) profile_interstitial_type: String, 298 | // pub(crate) has_extended_profile: Option, 299 | // pub(crate) default_profile: Option, 300 | // pub(crate) verified: Option, 301 | } 302 | 303 | #[derive(Debug, Serialize, Deserialize)] 304 | pub struct UserEntities { 305 | pub(crate) url: Option, 306 | pub(crate) description: Description, 307 | } 308 | 309 | #[derive(Debug, Serialize, Deserialize)] 310 | pub struct Description { 311 | pub(crate) urls: Option>, 312 | } 313 | 314 | #[derive(Debug, Serialize, Deserialize)] 315 | pub struct PurpleUrl { 316 | pub(crate) urls: Vec, 317 | } 318 | 319 | #[derive(Debug, Serialize, Deserialize)] 320 | pub struct 
ProfileExtensions { 321 | #[serde(rename = "mediaStats")] 322 | pub(crate) media_stats: ProfileImageExtensionsMediaStats, 323 | } 324 | 325 | #[derive(Debug, Serialize, Deserialize)] 326 | pub struct ProfileImageExtensionsMediaStats { 327 | pub(crate) r: MediaStatsRClass, 328 | pub(crate) ttl: i64, 329 | } 330 | 331 | #[derive(Debug, Serialize, Deserialize)] 332 | pub struct MediaStatsRClass { 333 | pub(crate) missing: Option, 334 | } 335 | 336 | #[derive(Debug, Serialize, Deserialize)] 337 | pub struct Timeline { 338 | pub(crate) id: String, 339 | pub(crate) instructions: Vec>, 340 | } 341 | 342 | #[derive(Debug, Serialize, Deserialize)] 343 | #[serde(untagged)] 344 | pub enum Entries { 345 | #[serde(rename = "addEntries")] 346 | AddEntries { 347 | entries: Vec, 348 | }, 349 | Other(serde_json::Value), 350 | } 351 | 352 | #[derive(Debug, Serialize, Deserialize)] 353 | pub struct AddEntries { 354 | pub(crate) entries: Vec, 355 | } 356 | 357 | #[derive(Debug, Serialize, Deserialize)] 358 | pub struct Entry { 359 | #[serde(rename = "entryId")] 360 | pub(crate) entry_id: String, 361 | #[serde(rename = "sortIndex")] 362 | pub(crate) sort_index: String, 363 | pub(crate) content: EntryContent, 364 | } 365 | 366 | #[derive(Debug, Serialize, Deserialize)] 367 | pub struct EntryContent { 368 | pub(crate) item: Option, 369 | #[serde(rename = "timelineModule")] 370 | pub(crate) timeline_module: Option, 371 | pub(crate) operation: Option, 372 | } 373 | 374 | #[derive(Debug, Serialize, Deserialize)] 375 | pub struct ContentItem { 376 | pub(crate) content: PurpleContent, 377 | } 378 | 379 | #[derive(Debug, Serialize, Deserialize)] 380 | pub struct PurpleContent { 381 | pub(crate) tweet: ContentTweet, 382 | } 383 | 384 | #[derive(Debug, Serialize, Deserialize)] 385 | pub struct ContentTweet { 386 | pub(crate) id: String, 387 | #[serde(rename = "displayType")] 388 | pub(crate) display_type: DisplayType, 389 | } 390 | 391 | #[derive(Debug, Serialize, Deserialize)] 392 | pub struct Operation { 393 | pub(crate) cursor: Cursor, 394 | } 395 | 396 | #[derive(Debug, Serialize, Deserialize)] 397 | pub struct Cursor { 398 | pub(crate) value: String, 399 | #[serde(rename = "cursorType")] 400 | pub(crate) cursor_type: String, 401 | #[serde(rename = "stopOnEmptyResponse")] 402 | pub(crate) stop_on_empty_response: Option, 403 | } 404 | 405 | #[derive(Debug, Serialize, Deserialize)] 406 | pub struct TimelineModule { 407 | pub(crate) items: Vec, 408 | #[serde(rename = "displayType")] 409 | pub(crate) display_type: String, 410 | pub(crate) header: Header, 411 | #[serde(rename = "clientEventInfo")] 412 | pub(crate) client_event_info: TimelineModuleClientEventInfo, 413 | pub(crate) metadata: Metadata, 414 | } 415 | 416 | #[derive(Debug, Serialize, Deserialize)] 417 | pub struct TimelineModuleClientEventInfo { 418 | pub(crate) component: Component, 419 | } 420 | 421 | #[derive(Debug, Serialize, Deserialize)] 422 | pub struct Header { 423 | pub(crate) text: String, 424 | pub(crate) sticky: bool, 425 | #[serde(rename = "socialContext")] 426 | pub(crate) social_context: SocialContext, 427 | #[serde(rename = "displayType")] 428 | pub(crate) display_type: String, 429 | } 430 | 431 | #[derive(Debug, Serialize, Deserialize)] 432 | pub struct SocialContext { 433 | #[serde(rename = "generalContext")] 434 | pub(crate) general_context: GeneralContext, 435 | } 436 | 437 | #[derive(Debug, Serialize, Deserialize)] 438 | pub struct GeneralContext { 439 | #[serde(rename = "contextType")] 440 | pub(crate) context_type: String, 441 | 
pub(crate) text: String, 442 | } 443 | 444 | #[derive(Debug, Serialize, Deserialize)] 445 | pub struct ItemElement { 446 | #[serde(rename = "entryId")] 447 | pub(crate) entry_id: String, 448 | pub(crate) item: ItemItem, 449 | } 450 | 451 | #[derive(Debug, Serialize, Deserialize)] 452 | pub struct ItemItem { 453 | pub(crate) content: FluffyContent, 454 | #[serde(rename = "clientEventInfo")] 455 | pub(crate) client_event_info: ItemClientEventInfo, 456 | #[serde(rename = "feedbackInfo")] 457 | pub(crate) feedback_info: FeedbackInfo, 458 | } 459 | 460 | #[derive(Debug, Serialize, Deserialize)] 461 | pub struct ItemClientEventInfo { 462 | pub(crate) component: Component, 463 | pub(crate) element: Element, 464 | pub(crate) details: Details, 465 | } 466 | 467 | #[derive(Debug, Serialize, Deserialize)] 468 | pub struct Details { 469 | #[serde(rename = "timelinesDetails")] 470 | pub(crate) timelines_details: TimelinesDetails, 471 | } 472 | 473 | #[derive(Debug, Serialize, Deserialize)] 474 | pub struct TimelinesDetails { 475 | #[serde(rename = "controllerData")] 476 | pub(crate) controller_data: String, 477 | } 478 | 479 | #[derive(Debug, Serialize, Deserialize)] 480 | pub struct FluffyContent { 481 | pub(crate) topic: ContentTopic, 482 | } 483 | 484 | #[derive(Debug, Serialize, Deserialize)] 485 | pub struct ContentTopic { 486 | #[serde(rename = "topicId")] 487 | pub(crate) topic_id: String, 488 | #[serde(rename = "topicFunctionalityType")] 489 | pub(crate) topic_functionality_type: TopicFunctionalityType, 490 | #[serde(rename = "topicDisplayType")] 491 | pub(crate) topic_display_type: TopicDisplayType, 492 | } 493 | 494 | #[derive(Debug, Serialize, Deserialize)] 495 | pub struct FeedbackInfo { 496 | #[serde(rename = "feedbackKeys")] 497 | pub(crate) feedback_keys: Vec, 498 | #[serde(rename = "feedbackMetadata")] 499 | pub(crate) feedback_metadata: FeedbackMetadata, 500 | } 501 | 502 | #[derive(Debug, Serialize, Deserialize)] 503 | pub struct Metadata { 504 | #[serde(rename = "gridCarouselMetadata")] 505 | pub(crate) grid_carousel_metadata: GridCarouselMetadata, 506 | } 507 | 508 | #[derive(Debug, Serialize, Deserialize)] 509 | pub struct GridCarouselMetadata { 510 | #[serde(rename = "numRows")] 511 | pub(crate) num_rows: i64, 512 | } 513 | 514 | #[derive(Debug, Serialize, Deserialize)] 515 | pub struct RichBehavior { 516 | #[serde(rename = "markNotInterestedTopic")] 517 | pub(crate) mark_not_interested_topic: MarkNotInterestedTopic, 518 | } 519 | 520 | #[derive(Debug, Serialize, Deserialize)] 521 | pub struct MarkNotInterestedTopic { 522 | #[serde(rename = "topicId")] 523 | pub(crate) topic_id: String, 524 | } 525 | 526 | #[derive(Debug, Serialize, Deserialize)] 527 | #[serde(untagged)] 528 | pub enum RUnion { 529 | Enum(REnum), 530 | RrClass(RRClass), 531 | } 532 | 533 | #[derive(Debug, Serialize, Deserialize)] 534 | pub enum REnum { 535 | Missing, 536 | } 537 | 538 | #[derive(Debug, Serialize, Deserialize)] 539 | pub enum Type { 540 | #[serde(rename = "animated_gif")] 541 | AnimatedGif, 542 | #[serde(rename = "photo")] 543 | Photo, 544 | #[serde(rename = "video")] 545 | Video, 546 | } 547 | 548 | #[derive(Debug, Serialize, Deserialize)] 549 | pub enum Resize { 550 | #[serde(rename = "crop")] 551 | Crop, 552 | #[serde(rename = "fit")] 553 | Fit, 554 | } 555 | 556 | #[derive(Debug, Serialize, Deserialize)] 557 | pub enum ContentType { 558 | #[serde(rename = "application/x-mpegURL")] 559 | ApplicationXMpegUrl, 560 | #[serde(rename = "video/mp4")] 561 | VideoMp4, 562 | } 563 | 564 | #[derive(Debug, 
Serialize, Deserialize)] 565 | pub enum AdvertiserAccountServiceLevel { 566 | #[serde(rename = "analytics")] 567 | Analytics, 568 | #[serde(rename = "media_studio")] 569 | MediaStudio, 570 | #[serde(rename = "mms")] 571 | Mms, 572 | #[serde(rename = "smb")] 573 | Smb, 574 | #[serde(rename = "subscription")] 575 | Subscription, 576 | } 577 | 578 | #[derive(Debug, Serialize, Deserialize)] 579 | pub enum AdvertiserAccountType { 580 | #[serde(rename = "none")] 581 | None, 582 | #[serde(rename = "promotable_user")] 583 | PromotableUser, 584 | } 585 | 586 | #[derive(Debug, Serialize, Deserialize)] 587 | pub enum TranslatorType { 588 | #[serde(rename = "none")] 589 | None, 590 | #[serde(rename = "regular")] 591 | Regular, 592 | } 593 | 594 | #[derive(Debug, Serialize, Deserialize)] 595 | pub enum DisplayType { 596 | Tweet, 597 | } 598 | 599 | #[derive(Debug, Serialize, Deserialize)] 600 | pub enum Component { 601 | #[serde(rename = "suggest_topics_module")] 602 | SuggestTopicsModule, 603 | } 604 | 605 | #[derive(Debug, Serialize, Deserialize)] 606 | pub enum Element { 607 | #[serde(rename = "topic")] 608 | Topic, 609 | } 610 | 611 | #[derive(Debug, Serialize, Deserialize)] 612 | pub enum TopicDisplayType { 613 | Pill, 614 | } 615 | 616 | #[derive(Debug, Serialize, Deserialize)] 617 | pub enum TopicFunctionalityType { 618 | Recommendation, 619 | } 620 | 621 | #[derive(Debug, Serialize, Deserialize)] 622 | pub enum FeedbackMetadata { 623 | #[serde(rename = "FcQBOQwA")] 624 | FcQboQwA, 625 | } 626 | 627 | #[derive(Debug, Serialize, Deserialize)] 628 | pub enum EncodedFeedbackRequest { 629 | #[serde(rename = "LBUeHBXEATkMAAAA")] 630 | LbUeHbxeaTkMaaaa, 631 | } 632 | 633 | #[derive(Debug, Serialize, Deserialize)] 634 | pub enum FeedbackType { 635 | RichBehavior, 636 | } 637 | -------------------------------------------------------------------------------- /src/scraper/providers/united_cube.rs: -------------------------------------------------------------------------------- 1 | use std::{env, path::Path, sync::Arc, time::Instant}; 2 | 3 | use async_trait::async_trait; 4 | use chrono::{DateTime, NaiveDateTime, Utc}; 5 | use log::error; 6 | use reqwest::Client; 7 | use serde::{Deserialize, Serialize}; 8 | 9 | use crate::{ 10 | request::{parse_successful_response, request_default_headers, HttpError}, 11 | scheduler::UnscopedLimiter, 12 | scraper::ProviderCredentials, 13 | }; 14 | 15 | use super::*; 16 | 17 | pub struct UnitedCubeArtistFeed { 18 | pub client: Arc, 19 | pub credentials: SharedCredentials, 20 | pub rate_limiter: UnscopedLimiter, 21 | } 22 | 23 | #[async_trait] 24 | impl RateLimitable for UnitedCubeArtistFeed { 25 | async fn wait(&self, _key: &str) -> () { 26 | self.rate_limiter 27 | .until_ready_with_jitter(default_jitter()) 28 | .await 29 | } 30 | } 31 | 32 | #[derive(Serialize)] 33 | struct LoginInput { 34 | refresh_token: Option, 35 | path: String, 36 | id: String, 37 | pw: String, 38 | remember_me: bool, 39 | } 40 | 41 | #[derive(Serialize)] 42 | struct RefreshInput { 43 | refresh_token: String, 44 | } 45 | 46 | #[derive(Deserialize)] 47 | struct GenericError { 48 | message: String, 49 | } 50 | 51 | #[derive(Deserialize)] 52 | struct RefreshResponse { 53 | token: String, 54 | } 55 | 56 | #[derive(Deserialize)] 57 | struct LoginResponse { 58 | // slug: String, 59 | // email: String, 60 | // name: String, 61 | // language: String, 62 | // role_code: String, 63 | token: String, 64 | refresh_token: String, 65 | } 66 | 67 | /// Posts are divided between images and videos 68 | #[derive(Debug, 
Deserialize, Clone)] 69 | #[serde(tag = "type_code", content = "data")] 70 | enum MediaData { 71 | #[serde(rename = "601")] 72 | Image { path: String }, 73 | #[serde(rename = "602")] 74 | Video { url: String, image: String }, 75 | #[serde(rename = "604")] 76 | Post { title: String }, 77 | } 78 | 79 | #[derive(Debug, Deserialize, Clone)] 80 | struct Post { 81 | slug: String, 82 | content: Option, 83 | register_datetime: DateTime, 84 | media: Vec, 85 | } 86 | 87 | #[derive(Debug, Deserialize, Clone)] 88 | struct Page { 89 | has_next: bool, 90 | // has_prev: bool, 91 | // prev_num: null, 92 | page: i32, 93 | next_num: Option, 94 | // pages: i32, 95 | // per_page: i32, 96 | // total: i32, 97 | items: Vec, 98 | } 99 | 100 | const BASE_URL: &str = "https://www.united-cube.com"; 101 | 102 | fn extract_url_and_id(path: &str, base_url: &url::Url) -> anyhow::Result<(url::Url, String)> { 103 | // ucube is missing a leading slash in their links lol 104 | let parsed_relative_url = format!("/{}", &path); 105 | let url = base_url.join(&parsed_relative_url)?; 106 | // .map_err(|result| { 107 | // anyhow::anyhow!(result) 108 | // })?; 109 | // unbelievably big brain conversion 110 | let unique_identifier = Path::new(&parsed_relative_url) 111 | .file_stem() 112 | .and_then(|str| str.to_str().map(|result| result.to_owned())) 113 | .ok_or_else(|| anyhow::anyhow!("Invalid file format: {}", parsed_relative_url))?; 114 | Ok((url, unique_identifier)) 115 | } 116 | 117 | #[async_trait] 118 | impl Provider for UnitedCubeArtistFeed { 119 | fn id(&self) -> AllProviders { 120 | AllProviders::UnitedCubeArtistFeed 121 | } 122 | fn new(input: ProviderInput) -> Self 123 | where 124 | Self: Sized, 125 | { 126 | Self { 127 | client: input.client, 128 | credentials: create_credentials(), 129 | rate_limiter: Self::rate_limiter(), 130 | } 131 | } 132 | 133 | fn requires_auth(&self) -> bool { 134 | true 135 | } 136 | 137 | async fn initialize(&self) -> () { 138 | if self.requires_auth() { 139 | attempt_first_login(self, &self.credentials).await; 140 | } 141 | } 142 | 143 | fn max_page_size(&self) -> PageSize { 144 | PageSize(200) 145 | } 146 | 147 | fn default_page_size(&self) -> PageSize { 148 | PageSize(20) 149 | } 150 | 151 | fn on_error(&self, http_error: &HttpError) -> anyhow::Result { 152 | let err = match http_error { 153 | HttpError::ReqwestError(_err) => return Ok(ProviderErrorHandle::Halt), 154 | HttpError::FailStatus(err) | HttpError::UnexpectedBody(err) => err, 155 | }; 156 | 157 | let body = match serde_json::from_str::(&err.body) { 158 | Err(err) => { 159 | error!("Couldn't parse the response from united_cube"); 160 | eprintln!("{:?}", err); 161 | return Ok(ProviderErrorHandle::Halt); 162 | } 163 | Ok(body) => body, 164 | }; 165 | Ok(if body.message == "Token Expired" && err.code == 400 { 166 | let cred = self.credentials.read().clone(); 167 | ProviderErrorHandle::RefreshToken((cred).unwrap()) 168 | } else { 169 | // I don't think there is any other response you can get if your token is expired 170 | // so we can probably assume that something else has gone wrong 171 | ProviderErrorHandle::Halt 172 | }) 173 | } 174 | 175 | fn from_provider_destination( 176 | &self, 177 | id: &str, 178 | page_size: PageSize, 179 | pagination: Option, 180 | ) -> Result { 181 | // club_id|board_id 182 | let page_id = id.to_string(); 183 | let parts = page_id.split('|').collect::>(); 184 | let board = parts.get(1).unwrap(); 185 | let mut next_url: UrlBuilder = Default::default(); 186 | next_url.params.push(("board", 
board.to_string())); 187 | next_url.page_size("per_page", page_size); 188 | next_url.pagination("page", &pagination); 189 | let url = next_url.build_scrape_url("https://united-cube.com/v1/posts")?; 190 | Ok(url) 191 | } 192 | 193 | async fn unfold(&self, state: ProviderState) -> Result { 194 | let creds = self.credentials.read().clone(); 195 | let credentials = match creds { 196 | Some(c) => c, 197 | None => return Ok(ProviderStep::NotInitialized), 198 | }; 199 | 200 | let token = credentials.access_token.clone(); 201 | let instant = Instant::now(); 202 | 203 | let response = self 204 | .client 205 | .get(&state.url.0) 206 | .headers(request_default_headers()) 207 | .header("Authorization", &format!("Bearer {}", token)) 208 | .send() 209 | .await?; 210 | let elapsed = instant.elapsed(); 211 | let status = response.status(); 212 | 213 | let cube_url = url::Url::parse(BASE_URL).unwrap(); 214 | let response_json = parse_successful_response::(response).await?; 215 | 216 | let account = state 217 | .default_name 218 | .map(|name| ProviderAccount { 219 | name, 220 | avatar_url: None, 221 | }) 222 | .unwrap_or_default(); 223 | let posts = response_json 224 | .items 225 | .into_iter() 226 | .map(|post| { 227 | ProviderPost { 228 | // UCube does not give us any kind of user information 229 | account: account.clone(), 230 | unique_identifier: post.slug, 231 | // TODO: maybe add page urls to this anyways? 232 | // united-cube doesn't have page-specific links, they all go to 233 | // https://www.united-cube.com/club/qXmD_5exRnmZfkFIwR1cVA/board/cHTUTBaRRpqUWAL2c5nQiw#PostDetail 234 | // which is controlled by JS and can't be linked to 235 | url: None, 236 | // This is HTML but who cares 237 | body: post.content, 238 | post_date: Some(post.register_datetime.naive_utc()), 239 | metadata: None, 240 | images: 241 | post.media 242 | .iter() 243 | .filter_map(|media| { 244 | let (_type, media_url, unique_identifier) = match &media { 245 | // we don't care about posts 246 | MediaData::Post { .. } => return None, 247 | // Every video on ucube is (probably) a link to an external youtube video 248 | // but we can't be sure 249 | MediaData::Video { url, .. 
} => { 250 | let is_probably_external_link = url.starts_with("http"); 251 | if is_probably_external_link { 252 | return None; 253 | } 254 | // assuming that a non-external link would follow the same pattern as 255 | match extract_url_and_id(url.as_str(), &cube_url) { 256 | Err(err) => { 257 | error!("Could not convert a non-external ucube video into a relative path"); 258 | error!("{:?}", err); 259 | return None; 260 | } 261 | Ok((url, id)) => { 262 | (ProviderMediaType::Video, url.as_str().to_owned(), id) 263 | } 264 | } 265 | } 266 | MediaData::Image { path } => { 267 | match extract_url_and_id(path.as_str(), &cube_url) { 268 | Err(err) => { 269 | error!("Could not get relative path from a ucube image {}", path); 270 | error!("{:?}", err); 271 | return None; 272 | } 273 | Ok((url, id)) => { 274 | (ProviderMediaType::Image, url.as_str().to_owned(), id) 275 | } 276 | } 277 | } 278 | }; 279 | Some(ProviderMedia { 280 | _type, 281 | media_url, 282 | // same with reference URL 283 | reference_url: None, 284 | metadata: None, 285 | unique_identifier, 286 | }) 287 | }) 288 | .collect::>(), 289 | } 290 | }) 291 | .collect::>(); 292 | 293 | let result = ProviderResult { 294 | posts, 295 | response_code: status, 296 | response_delay: elapsed, 297 | }; 298 | match response_json.next_num { 299 | Some(next) => Ok(ProviderStep::Next(result, Pagination::NextPage(next))), 300 | None => Ok(ProviderStep::End(result)), 301 | } 302 | } 303 | 304 | fn credentials(&self) -> SharedCredentials { 305 | self.credentials.clone() 306 | } 307 | 308 | async fn login(&self) -> Result { 309 | let response = self 310 | .client 311 | .post("https://united-cube.com/v1/auth/login") 312 | .json(&LoginInput { 313 | refresh_token: None, 314 | path: "https://www.united-cube.com/signin".to_owned(), 315 | id: env::var("UNITED_CUBE_EMAIL") 316 | .expect("Tried to login to united_cube without credentials"), 317 | pw: env::var("UNITED_CUBE_PASSWORD").unwrap(), 318 | remember_me: false, 319 | }) 320 | .send() 321 | .await? 322 | .json::() 323 | .await?; 324 | Ok(ProviderCredentials { 325 | access_token: response.token, 326 | refresh_token: response.refresh_token, 327 | }) 328 | } 329 | async fn token_refresh( 330 | &self, 331 | credentials: &ProviderCredentials, 332 | ) -> anyhow::Result { 333 | let refresh_token = credentials.refresh_token.clone(); 334 | let response = self 335 | .client 336 | .post("https://united-cube.com/v1/auth/refresh") 337 | .json(&RefreshInput { 338 | refresh_token: refresh_token.clone(), 339 | }) 340 | .send() 341 | .await? 
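// The refresh endpoint only returns a new access token, so the existing
// refresh_token is carried over unchanged into the rebuilt credentials below.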
342 | .json::() 343 | .await?; 344 | Ok(CredentialRefresh::Result(ProviderCredentials { 345 | access_token: response.token, 346 | refresh_token, 347 | })) 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /src/scraper/providers/weverse.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::{env, iter::FromIterator, sync::Arc, time::Instant}; 3 | 4 | use async_trait::async_trait; 5 | use bimap::{BiHashMap, BiMap}; 6 | use chrono::{DateTime, NaiveDateTime, Utc}; 7 | use governor::Quota; 8 | use lazy_static::lazy_static; 9 | use log::info; 10 | use rand::rngs::OsRng; 11 | use regex::Regex; 12 | use reqwest::Client; 13 | use rsa::{PaddingScheme, PublicKey, RSAPublicKey}; 14 | use serde::{Deserialize, Serialize}; 15 | use sha1::Sha1; 16 | 17 | use crate::{ 18 | request::{parse_successful_response, request_default_headers, HttpError}, 19 | scheduler::UnscopedLimiter, 20 | scraper::{providers::ProviderMediaType, ProviderMedia, ProviderResult}, 21 | }; 22 | 23 | use super::*; 24 | 25 | /// https://gist.github.com/Xetera/aa59e84f3959a37c16a3309b5d9ab5a0 26 | async fn get_public_key(client: &Client) -> Result { 27 | let login_page = client 28 | .post("https://account.weverse.io/login/auth?client_id=weverse-test&hl=en") 29 | .send() 30 | .await? 31 | .text() 32 | .await?; 33 | let regex = Regex::new(r"/(static/js/main\..*.js)").unwrap(); 34 | let js_bundle_captures = regex.captures(&login_page).ok_or( 35 | ProviderFailure::Other( 36 | "Could not find a bundle matching the regex '/(static/js/main\\..*.js)' in the weverse login page".to_owned() 37 | ) 38 | )?; 39 | 40 | let js_name = js_bundle_captures 41 | .get(1) 42 | .ok_or(ProviderFailure::Other( 43 | "Couldn't match a main js bundle on account.weverse.io, the site was changed" 44 | .to_owned(), 45 | ))? 
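// Pull the bundle path out of the first capture group; the RSA public key used
// to encrypt the login password is embedded inside that main.*.js bundle.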
46 | .as_str(); 47 | let js_bundle_url = format!("https://account.weverse.io/{}", js_name); 48 | let js_bundle = client.get(&js_bundle_url).send().await?.text().await?; 49 | let rsa_captures = 50 | Regex::new(r"(-----BEGIN RSA PUBLIC KEY-----(.|\n)+----END RSA PUBLIC KEY-----)") 51 | .unwrap() 52 | .captures(&js_bundle) 53 | .ok_or(ProviderFailure::Other(format!( 54 | "Could not find RSA key in {}", 55 | &js_bundle_url 56 | )))?; 57 | 58 | let rsa_key = rsa_captures.get(1).unwrap().as_str().to_owned(); 59 | 60 | let der_encoded = rsa_key 61 | .replace("\\n", "\n") 62 | .lines() 63 | .filter(|line| !line.starts_with('-')) 64 | .fold(String::new(), |mut data, line| { 65 | data.push_str(line); 66 | data 67 | }); 68 | 69 | let der_bytes = base64::decode(&der_encoded).expect("failed to decode base64 content"); 70 | let public_key = RSAPublicKey::from_pkcs8(&der_bytes).expect("failed to parse key"); 71 | Ok(public_key) 72 | } 73 | 74 | fn encrypted_password( 75 | password: String, 76 | public_key: RSAPublicKey, 77 | ) -> Result { 78 | let mut rng = OsRng; 79 | let padding = PaddingScheme::new_oaep::(); 80 | let encrypted = match public_key.encrypt(&mut rng, padding, password.as_bytes()) { 81 | Err(error) => return Err(ProviderFailure::Other(error.to_string())), 82 | Ok(ok) => ok, 83 | }; 84 | Ok(base64::encode(encrypted)) 85 | } 86 | 87 | #[derive(Serialize)] 88 | struct WeverseLoginRequest { 89 | grant_type: String, 90 | client_id: String, 91 | username: String, 92 | password: String, 93 | } 94 | 95 | #[derive(Debug, Deserialize)] 96 | pub struct WeverseLoginResponse { 97 | refresh_token: String, 98 | access_token: String, 99 | } 100 | 101 | async fn get_access_token( 102 | email: String, 103 | encrypted_password: String, 104 | client: &Client, 105 | ) -> Result { 106 | Ok(client 107 | .post("https://accountapi.weverse.io/api/v1/oauth/token") 108 | .json(&WeverseAuthorizeInput::Login { 109 | grant_type: "password".to_owned(), 110 | client_id: "weverse-test".to_owned(), 111 | username: email, 112 | password: encrypted_password, 113 | }) 114 | .send() 115 | .await? 116 | .json::() 117 | .await?) 
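// A successful login response carries both an access token and a refresh token,
// which fetch_weverse_auth_token passes along as ProviderCredentials.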
118 | } 119 | 120 | pub async fn fetch_weverse_auth_token( 121 | client: &Client, 122 | ) -> Result, ProviderFailure> { 123 | match ( 124 | env::var("WEVERSE_ACCESS_TOKEN"), 125 | env::var("WEVERSE_EMAIL"), 126 | env::var("WEVERSE_PASSWORD"), 127 | ) { 128 | (Ok(access_token), _, _) => { 129 | info!("An existing weverse token was found"); 130 | Ok(Some(ProviderCredentials { 131 | access_token, 132 | refresh_token: "".to_owned(), 133 | })) 134 | } 135 | (_, Ok(email), Ok(password)) => { 136 | info!("Detected weverse credentials, attempting to login..."); 137 | let public_key = get_public_key(client).await?; 138 | let encrypted = encrypted_password(password, public_key)?; 139 | let token = get_access_token(email, encrypted, client).await?; 140 | Ok(Some(ProviderCredentials { 141 | access_token: token.access_token, 142 | refresh_token: token.refresh_token, 143 | })) 144 | } 145 | _ => { 146 | info!("Weverse credentials missing, not initializing Weverse module"); 147 | Ok(None) 148 | } 149 | } 150 | } 151 | 152 | #[derive(Debug, Clone, Deserialize)] 153 | #[serde(rename_all = "camelCase")] 154 | pub struct WeversePhoto { 155 | id: u64, 156 | org_img_url: String, 157 | org_img_height: u32, 158 | org_img_width: u32, 159 | thumbnail_img_url: String, 160 | post_id: u64, 161 | } 162 | 163 | #[derive(Debug, Deserialize)] 164 | #[serde(rename_all = "camelCase")] 165 | pub struct WeversePost { 166 | id: u64, 167 | // community: WeverseCommunity, 168 | body: Option, 169 | community_user: WeverseCommunityUser, 170 | photos: Option>, 171 | created_at: DateTime, 172 | } 173 | 174 | #[derive(Debug, Deserialize)] 175 | #[serde(rename_all = "camelCase")] 176 | pub struct WeverseCommunityUser { 177 | community_id: u32, 178 | artist_id: u32, 179 | profile_img_path: String, 180 | profile_nickname: String, 181 | } 182 | 183 | #[derive(Debug, Serialize)] 184 | struct PostMetadata { 185 | author_id: u32, 186 | author_name: String, 187 | } 188 | 189 | #[derive(Debug, Serialize)] 190 | pub struct ImageMetadata { 191 | height: u32, 192 | width: u32, 193 | thumbnail_url: String, 194 | } 195 | 196 | #[derive(Debug, Serialize)] 197 | enum WeverseAuthorizeInput { 198 | TokenRefresh { 199 | client_id: String, 200 | grant_type: String, 201 | refresh_token: String, 202 | }, 203 | Login { 204 | client_id: String, 205 | grant_type: String, 206 | username: String, 207 | password: String, 208 | }, 209 | } 210 | 211 | #[derive(Debug, Deserialize)] 212 | struct WeverseAuthorizeResponse { 213 | access_token: String, 214 | token_type: String, 215 | expires_in: i32, 216 | refresh_token: String, 217 | } 218 | 219 | #[derive(Debug, Serialize)] 220 | struct WeverseTokenRefreshInput {} 221 | 222 | #[derive(Debug, Deserialize)] 223 | #[serde(rename_all = "camelCase")] 224 | pub struct WeversePage { 225 | is_ended: bool, 226 | last_id: Option, 227 | posts: Vec, 228 | } 229 | 230 | // #[derive(Clone)] 231 | pub struct WeverseArtistFeed { 232 | pub client: Arc, 233 | pub credentials: SharedCredentials, 234 | pub rate_limiter: UnscopedLimiter, 235 | } 236 | 237 | lazy_static! 
{ 238 | static ref ARTIST_MAPPINGS: BiMap = 239 | BiHashMap::from_iter([(14, "dreamcatcher"), (10, "sunmi")]); 240 | } 241 | 242 | fn url_from_post(artist_id: u32, post_id: u64, photo_id: u64) -> String { 243 | let artist_name = ARTIST_MAPPINGS 244 | .get_by_left(&artist_id) 245 | .expect(&format!("Weverse ID {} is not a valid mapping", artist_id)); 246 | format!( 247 | "https://weverse.io/{}/artist/{}?photoId={}", 248 | artist_name, post_id, photo_id 249 | ) 250 | } 251 | 252 | const MAX_PAGESIZE: usize = 30; 253 | // weverse is stupid and uses a 16 page default pagesize 254 | const DEFAULT_PAGESIZE: usize = 16; 255 | 256 | #[async_trait] 257 | impl RateLimitable for WeverseArtistFeed { 258 | fn quota() -> Quota 259 | where 260 | Self: Sized, 261 | { 262 | default_quota() 263 | } 264 | async fn wait(&self, _key: &str) -> () { 265 | self.rate_limiter 266 | .until_ready_with_jitter(default_jitter()) 267 | .await 268 | } 269 | } 270 | 271 | #[async_trait] 272 | impl Provider for WeverseArtistFeed { 273 | fn new(input: ProviderInput) -> Self 274 | where 275 | Self: Sized, 276 | { 277 | Self { 278 | credentials: create_credentials(), 279 | client: Arc::clone(&input.client), 280 | rate_limiter: Self::rate_limiter(), 281 | } 282 | } 283 | fn id(&self) -> AllProviders { 284 | AllProviders::WeverseArtistFeed 285 | } 286 | 287 | fn requires_auth(&self) -> bool { 288 | true 289 | } 290 | 291 | async fn initialize(&self) -> () { 292 | attempt_first_login(self, &self.credentials).await; 293 | } 294 | 295 | fn max_page_size(&self) -> PageSize { 296 | PageSize(MAX_PAGESIZE) 297 | } 298 | 299 | fn default_page_size(&self) -> PageSize { 300 | PageSize(DEFAULT_PAGESIZE) 301 | } 302 | 303 | // async fn canonical_url_to_id(&self, url: &str) -> CanonicalUrlResolution { 304 | // let res = match self.client.get("https://weverse.io").send().await { 305 | // Ok(res) => res, 306 | // Err(err) => { 307 | // println!("{:?}", err); 308 | // return CanonicalUrlResolution::Fail 309 | // } 310 | // }; 311 | // let html = match res.text().await { 312 | // Ok(ok) => ok, 313 | // Err(err) => return CanonicalUrlResolution::Fail, 314 | // }; 315 | // let regex = Regex::new(r"/(communitiesInfo\s*?=\s*?(\[.*?\]))").unwrap(); 316 | // } 317 | 318 | fn from_provider_destination( 319 | &self, 320 | id: &str, 321 | page_size: PageSize, 322 | pagination: Option, 323 | ) -> Result { 324 | let mut next_url = UrlBuilder::default(); 325 | next_url.page_size("pageSize", page_size); 326 | next_url.pagination("from", &pagination); 327 | let url = next_url.build_scrape_url(&format!( 328 | "https://weversewebapi.weverse.io/wapi/v1/communities/{}/posts/artistTab", 329 | id 330 | ))?; 331 | Ok(url) 332 | } 333 | 334 | fn max_pagination(&self) -> u16 { 335 | 100 336 | } 337 | 338 | async fn unfold(&self, state: ProviderState) -> Result { 339 | let credentials = self.credentials.read().clone(); 340 | // let token = "".to_owned(); 341 | let token = match credentials { 342 | Some(token) => token.access_token, 343 | None => return Ok(ProviderStep::NotInitialized), 344 | }; 345 | 346 | let instant = Instant::now(); 347 | let response = self 348 | .client 349 | .get(&state.url.0) 350 | .headers(request_default_headers()) 351 | .header("Authorization", format!("Bearer {}", token)) 352 | .send() 353 | .await?; 354 | 355 | let response_code = response.status(); 356 | let response_delay = instant.elapsed(); 357 | let response_json = parse_successful_response::(response).await?; 358 | let posts = response_json 359 | .posts 360 | .into_iter() 361 | 
.map(|post| { 362 | let community_id = post.community_user.community_id.to_owned(); 363 | let post_id = post.id; 364 | let user = post.community_user; 365 | let author_name = user.profile_nickname; 366 | let author_id = user.artist_id; 367 | let post_created_at = post.created_at; 368 | let photos = post.photos.unwrap_or_default(); 369 | let page_url = photos 370 | .get(0) 371 | .map(|photo| url_from_post(community_id, post_id, photo.id)); 372 | ProviderPost { 373 | account: ProviderAccount { 374 | avatar_url: Some(user.profile_img_path), 375 | name: author_name.clone(), 376 | }, 377 | unique_identifier: post_id.to_string(), 378 | metadata: serde_json::to_value(PostMetadata { 379 | author_id, 380 | author_name, 381 | }) 382 | .ok(), 383 | body: post.body, 384 | url: page_url, 385 | post_date: Some(post_created_at.naive_utc()), 386 | images: photos 387 | .into_iter() 388 | .map(|photo| { 389 | ProviderMedia { 390 | // should be unique across all of weverse 391 | _type: ProviderMediaType::Image, 392 | unique_identifier: photo.id.to_string(), 393 | media_url: photo.org_img_url.clone(), 394 | reference_url: Some(url_from_post(community_id, post_id, photo.id)), 395 | metadata: serde_json::to_value(ImageMetadata { 396 | height: photo.org_img_height, 397 | width: photo.org_img_width, 398 | thumbnail_url: photo.thumbnail_img_url, 399 | }) 400 | .ok(), 401 | } 402 | }) 403 | // not sure why I have to do this here 404 | .collect::>(), 405 | } 406 | }) 407 | .collect::>(); 408 | // weverse omits last_id when at the end of the content 409 | let has_more = !response_json.is_ended || response_json.last_id.is_some(); 410 | let result = ProviderResult { 411 | posts, 412 | response_code, 413 | response_delay, 414 | }; 415 | if has_more { 416 | return Ok(ProviderStep::Next( 417 | result, 418 | Pagination::NextCursor(response_json.last_id.unwrap().to_string()), 419 | )); 420 | } 421 | Ok(ProviderStep::End(result)) 422 | } 423 | 424 | fn on_error(&self, http_error: &HttpError) -> anyhow::Result { 425 | match http_error { 426 | HttpError::FailStatus(err) | HttpError::UnexpectedBody(err) => { 427 | // :) I don't actually know if weverse returns a 401 on expired tokens 428 | // but I can't test because their tokens last for 6 ENTIRE months!!!! 429 | if err.code == 401 || err.code == 403 { 430 | let handle = self 431 | .credentials 432 | .clone() 433 | .try_read() 434 | .map_or(ProviderErrorHandle::Login, |creds| { 435 | ProviderErrorHandle::RefreshToken(creds.clone().unwrap()) 436 | }); 437 | return Ok(handle); 438 | } 439 | Ok(ProviderErrorHandle::Halt) 440 | } 441 | _ => Ok(ProviderErrorHandle::Halt), 442 | } 443 | } 444 | async fn token_refresh( 445 | &self, 446 | credentials: &ProviderCredentials, 447 | ) -> anyhow::Result { 448 | let input = WeverseAuthorizeInput::TokenRefresh { 449 | grant_type: "refresh_token".to_owned(), 450 | client_id: "weverse-test".to_owned(), 451 | refresh_token: credentials.refresh_token.clone(), 452 | }; 453 | let out = self 454 | .client 455 | .post("https://accountapi.weverse.io/api/v1/oauth/token") 456 | .json(&input) 457 | .send() 458 | .await? 459 | .json::() 460 | .await?; 461 | let credentials_result = ProviderCredentials { 462 | access_token: out.access_token, 463 | refresh_token: out.refresh_token, 464 | }; 465 | Ok(CredentialRefresh::Result(credentials_result)) 466 | } 467 | async fn login(&self) -> Result { 468 | let credentials = fetch_weverse_auth_token(&self.client) 469 | .await? 
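// fetch_weverse_auth_token yields None when neither WEVERSE_ACCESS_TOKEN nor the
// WEVERSE_EMAIL / WEVERSE_PASSWORD pair is configured, so this expect panics if
// the weverse module is asked to log in without any credentials present.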
470 | .expect("Tried to authorize weverse module but the login credentials were not found"); 471 | Ok(credentials) 472 | } 473 | fn credentials(&self) -> SharedCredentials { 474 | self.credentials.clone() 475 | } 476 | } 477 | -------------------------------------------------------------------------------- /src/scraper/scraper.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use async_recursion::async_recursion; 4 | use chrono::{NaiveDateTime, Utc}; 5 | use futures::StreamExt; 6 | use log::{debug, error, info, trace}; 7 | use crate::api::v1::ProviderStat; 8 | 9 | use crate::scraper::{ 10 | providers::{CredentialRefresh, ProviderErrorHandle}, 11 | ProviderPost, 12 | }; 13 | use crate::scraper::scraper::ScraperErrorHandleDecision::{Continue, MaxLoginAttempts}; 14 | 15 | use super::{ 16 | providers::{Provider, ProviderFailure, ProviderState, ProviderStep, ScrapeRequestInput}, 17 | ProviderCredentials, ProviderResult, ScopedProvider, 18 | }; 19 | 20 | #[derive(Debug)] 21 | pub struct Scrape<'a> { 22 | pub provider: &'a ScopedProvider, 23 | pub requests: Vec, 24 | } 25 | 26 | impl Scrape<'_> { 27 | pub fn discovered_new_images(&self) -> bool { 28 | let step = match self.requests.get(0) { 29 | None => return false, 30 | Some(req) => &req.step, 31 | }; 32 | match step { 33 | ScraperStep::Data(data) => !data.posts.is_empty(), 34 | ScraperStep::Error(_) => false, 35 | } 36 | } 37 | } 38 | 39 | #[derive(Debug)] 40 | pub struct ScrapeRequest { 41 | pub date: NaiveDateTime, 42 | pub step: ScraperStep, 43 | } 44 | 45 | #[derive(Debug)] 46 | enum InternalScraperStep { 47 | Data(ProviderResult), 48 | Error(ProviderFailure), 49 | Exit, 50 | } 51 | 52 | #[derive(Debug)] 53 | pub enum ScraperStep { 54 | Data(ProviderResult), 55 | // we only want to forward request related errors to the consumer 56 | Error(ProviderFailure), 57 | } 58 | 59 | enum ScraperErrorHandleDecision { 60 | Continue, 61 | MaxLoginAttempts(u32), 62 | } 63 | 64 | fn write_provider_credentials(provider: &dyn Provider, credentials: ProviderCredentials) { 65 | let creds = provider.credentials(); 66 | let mut credential_ref = creds.write(); 67 | *credential_ref = Some(credentials); 68 | } 69 | 70 | fn should_continue_requests(state: &ProviderState, provider: &dyn Provider) -> ScraperErrorHandleDecision { 71 | let max_attempts = provider.max_login_attempts(); 72 | if state.login_attempts > max_attempts { 73 | error!("Failed to login to {} after {} attempts. 
Giving up.", provider.id().to_string(), max_attempts); 74 | return MaxLoginAttempts(max_attempts); 75 | } 76 | return Continue; 77 | } 78 | 79 | #[async_recursion] 80 | async fn request_page<'a>( 81 | sp: &'a ScopedProvider, 82 | provider: &dyn Provider, 83 | state: ProviderState, 84 | input: &ScrapeRequestInput, 85 | ) -> (InternalScraperStep, Option) { 86 | let iteration = state.iteration; 87 | let error_step = |error| { 88 | debug!("Exiting scrape due to an error {:?}", error); 89 | (InternalScraperStep::Error(error), None) 90 | }; 91 | let give_up = (InternalScraperStep::Exit, None); 92 | let write_credentials_and_continue = |creds: ProviderCredentials| { 93 | write_provider_credentials(provider, creds); 94 | let new_state = ProviderState { 95 | login_attempts: state.login_attempts + 1, 96 | ..state.clone() 97 | }; 98 | request_page(sp, provider, new_state, input) 99 | }; 100 | match provider.unfold(state.to_owned()).await { 101 | // we have to indicate an error to the consumer and stop iteration on the next cycle 102 | Err(error) => match &error { 103 | ProviderFailure::HttpError(http_error) => match provider.on_error(http_error) { 104 | Ok(ProviderErrorHandle::Halt) => error_step(error), 105 | Ok(ProviderErrorHandle::Login) => { 106 | if let MaxLoginAttempts(count) = should_continue_requests(&state, provider) { 107 | error!("Too many login attempts ({}) for {}. Giving up", count, provider.id().to_string()); 108 | return give_up; 109 | } 110 | debug!("Triggering login flow for {}", provider.id().to_string()); 111 | match provider.login().await { 112 | Ok(credentials) => write_credentials_and_continue(credentials).await, 113 | Err(error) => error_step(error), 114 | } 115 | } 116 | Ok(ProviderErrorHandle::RefreshToken(credentials)) => { 117 | if let MaxLoginAttempts(count) = should_continue_requests(&state, provider) { 118 | error!("Too many login attempts ({}) for {}. Giving up", count, provider.id().to_string()); 119 | return give_up; 120 | } 121 | debug!( 122 | "Triggering token refresh flow for {}", 123 | provider.id().to_string() 124 | ); 125 | match provider.token_refresh(&credentials).await { 126 | Ok(CredentialRefresh::Result(credentials)) => { 127 | write_credentials_and_continue(credentials).await 128 | } 129 | Ok(CredentialRefresh::TryLogin) => { 130 | debug!("Triggering login flow for {}", provider.id().to_string()); 131 | match provider.login().await { 132 | Ok(credentials) => { 133 | write_credentials_and_continue(credentials).await 134 | } 135 | Err(err) => { 136 | debug!( 137 | "Error trying to login to {}: {:?}", 138 | provider.id().to_string(), 139 | err 140 | ); 141 | error_step(error) 142 | } 143 | } 144 | } 145 | Ok(CredentialRefresh::Halt) => error_step(error), 146 | _ => error_step(error), 147 | } 148 | } 149 | _ => error_step(error), 150 | }, 151 | // TODO: reduce this nested boilerplate by implementing [From] for a result type? 
152 | _ => error_step(error), 153 | }, 154 | Ok(ProviderStep::End(result)) => (InternalScraperStep::Data(result), None), 155 | Ok(ProviderStep::NotInitialized) => { 156 | info!( 157 | "Skipping {} because the provider was not initialized", 158 | provider.id().to_string() 159 | ); 160 | (InternalScraperStep::Exit, None) 161 | } 162 | Ok(ProviderStep::Next(result, pagination)) => { 163 | let page_size = if input.is_first_scrape { 164 | provider.max_page_size() 165 | } else { 166 | provider.default_page_size() 167 | }; 168 | 169 | let id = sp.destination.clone(); 170 | let maybe_next_url = 171 | provider.from_provider_destination(&id, page_size, Some(pagination.clone())); 172 | 173 | match maybe_next_url { 174 | Err(err) => error_step(err), 175 | Ok(url) => { 176 | let next_state = ProviderState { 177 | id, 178 | default_name: input.default_name.clone(), 179 | url, 180 | pagination: Some(pagination), 181 | iteration: iteration + 1, 182 | ..state.clone() 183 | }; 184 | (InternalScraperStep::Data(result), Some(next_state)) 185 | } 186 | } 187 | } 188 | } 189 | } 190 | 191 | pub async fn scrape<'a>( 192 | sp: &'a ScopedProvider, 193 | provider: &dyn Provider, 194 | input: &ScrapeRequestInput, 195 | ) -> Result, ProviderFailure> { 196 | let initial_iteration = 0; 197 | let page_size = if input.is_first_scrape { 198 | provider.max_page_size() 199 | } else { 200 | provider.default_page_size() 201 | }; 202 | let id = sp.destination.clone(); 203 | let url = provider.from_provider_destination(&id, page_size.to_owned(), None)?; 204 | 205 | let seed = ProviderState { 206 | login_attempts: 0, 207 | id: id.clone(), 208 | default_name: input.default_name.clone(), 209 | url, 210 | pagination: None, 211 | iteration: initial_iteration, 212 | }; 213 | 214 | let mut steps = futures::stream::unfold(Some(seed), |maybe_state| async { 215 | let state = maybe_state?; 216 | info!("Scraping URL: {:?}", state.url.0); 217 | Some(request_page(sp, provider, state, input).await) 218 | }) 219 | .boxed(); 220 | 221 | let mut scrape_requests: Vec = vec![]; 222 | let scrape_start = Instant::now(); 223 | 224 | while let Some(step) = steps.next().await { 225 | let date = Utc::now().naive_utc(); 226 | match step { 227 | InternalScraperStep::Exit => break, 228 | InternalScraperStep::Error(error) => { 229 | scrape_requests.push(ScrapeRequest { 230 | date, 231 | step: ScraperStep::Error(error), 232 | }); 233 | // no reason to continue scraping after an error 234 | break; 235 | } 236 | InternalScraperStep::Data(page) => { 237 | let total_found_images = page.posts.iter().flat_map(|p| &p.images).count(); 238 | let mut posts: Vec = vec![]; 239 | for post in page.posts { 240 | // it SHOULDN'T be possible for us to have seen a post and only 241 | // have it partially saved... 
This should be good enough
242 |                     // This does sadly break the debugging process if you're deleting images
243 |                     // from the db and re-scraping to trigger things
244 |                     let known_image = post
245 |                         .images
246 |                         .iter()
247 |                         .find(|image| input.latest_data.contains(&image.unique_identifier));
248 |                     if let Some(image) = known_image {
249 |                         debug!(
250 |                             "Reached last known image id for {}: {}",
251 |                             sp, image.unique_identifier
252 |                         );
253 |                         break;
254 |                     }
255 |                     posts.push(post)
256 |                 }
257 |                 let new_post_count = posts.len();
258 |                 info!("Found {} new posts in {}", new_post_count, sp);
259 | 
260 |                 scrape_requests.push(ScrapeRequest {
261 |                     date,
262 |                     step: ScraperStep::Data(ProviderResult { posts, ..page }),
263 |                 });
264 | 
265 |                 if new_post_count == 0 {
266 |                     info!(
267 |                         "[{}] has finished crawling because it's back to the last scraped data point",
268 |                         sp
269 |                     );
270 |                     break;
271 |                 }
272 |                 let pagination_limit = provider.max_pagination();
273 |                 // only check the pagination limit if there's at least one image
274 |                 // that's been scraped in the past
275 |                 if !input.latest_data.is_empty() && scrape_requests.len() as u16 > pagination_limit
276 |                 {
277 |                     info!(
278 |                         "[{}] has reached its pagination limit of {}",
279 |                         sp, pagination_limit
280 |                     );
281 |                     break;
282 |                 }
283 |                 trace!("Waiting for provider rate limit...");
284 |                 provider.wait(&sp.destination).await;
285 |             }
286 |         }
287 |     }
288 |     let scrape_count = scrape_requests.len();
289 |     info!(
290 |         "[{}] finished scraping in {:?} after {} request{}",
291 |         sp,
292 |         scrape_start.elapsed(),
293 |         scrape_count,
294 |         if scrape_count != 1 { "s" } else { "" }
295 |     );
296 |     Ok(Scrape {
297 |         provider: sp,
298 |         requests: scrape_requests,
299 |     })
300 | }
301 | 
--------------------------------------------------------------------------------
/src/server.rs:
--------------------------------------------------------------------------------
1 | use std::convert::Infallible;
2 | use std::net::SocketAddr;
3 | use std::ops::Sub;
4 | use std::sync::Arc;
5 | 
6 | use axum::body::{Bytes, Full};
7 | use axum::extract::Extension;
8 | use axum::http::Response;
9 | use axum::response::IntoResponse;
10 | use axum::routing::{get, post};
11 | use axum::{AddExtensionLayer, Json, Router};
12 | use chrono::{Duration, NaiveDate, NaiveDateTime, Utc};
13 | use log::{debug, error, info};
14 | use num_traits::ToPrimitive;
15 | use reqwest::StatusCode;
16 | use serde::Serialize;
17 | use serde_json::json;
18 | use sqlx::types::BigDecimal;
19 | 
20 | use crate::api::v1::providers::v1_add_provider;
21 | use crate::api::v1::{v1_provider_stats, v1_scheduled_scrapes, v1_scrape_history};
22 | use crate::api::{AppError, Context};
23 | use crate::db::{latest_requests, Database};
24 | use crate::scraper::ProviderMap;
25 | 
26 | struct ScheduledProvider {
27 |     id: i32,
28 |     url: String,
29 |     name: String,
30 |     destination: String,
31 |     priority: BigDecimal,
32 |     tokens: BigDecimal,
33 |     default_name: Option<String>,
34 |     last_queue: Option<NaiveDateTime>,
35 |     metadata: Option<serde_json::Value>,
36 | }
37 | 
38 | #[derive(Serialize)]
39 | struct ScheduleResponse {
40 |     id: i32,
41 |     provider: String,
42 |     url: String,
43 |     destination: String,
44 |     wait_days: i16,
45 |     metadata: Option<serde_json::Value>,
46 |     name: String,
47 | }
48 | 
49 | #[deprecated]
50 | async fn scheduled_scrapes(
51 |     Extension(state): Extension<Arc<Context>>,
52 | ) -> Result<Json<Vec<ScheduleResponse>>, AppError> {
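    // Fetch every provider_resource along with its optional amqp_source metadata in one query;
    // rows queued within the last 24 hours are treated as already being scraped today.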
53 |     let rows = sqlx::query_as!(
54 |         ScheduledProvider,
55 |         "SELECT pr.id, pr.priority, pr.name, pr.destination, pr.url, pr.tokens, pr.last_queue, pr.default_name, (
56 |             SELECT metadata FROM amqp_source where provider_destination = pr.destination and provider_name = pr.name
57 |         ) as metadata FROM provider_resource pr"
58 |     )
59 |     .fetch_all(&*state.db)
60 |     .await?;
61 |     let (today, later): (Vec<ScheduledProvider>, Vec<ScheduledProvider>) =
62 |         rows.into_iter().partition(|e| {
63 |             let now = Utc::now().naive_utc();
64 |             // anything that was queued in the last 24 hours is already being scraped;
65 |             // this isn't perfectly accurate, but we only need a general idea,
66 |             // not precision
67 |             e.last_queue
68 |                 .map(|last_queue| {
69 |                     let yesterday = now - Duration::hours(24);
70 |                     last_queue > yesterday
71 |                 })
72 |                 .unwrap_or(false)
73 |         });
74 |     let labeled = later
75 |         .into_iter()
76 |         .map(|row| {
77 |             let wait_days = (1f32 / (row.priority + row.tokens)
78 |                 .to_f32()
79 |                 .unwrap_or(0f32))
80 |             .floor() as i16;
81 |             ScheduleResponse {
82 |                 destination: row.destination,
83 |                 provider: row.name,
84 |                 id: row.id,
85 |                 url: row.url,
86 |                 wait_days,
87 |                 metadata: row.metadata,
88 |                 name: row.default_name.unwrap_or_default(),
89 |             }
90 |         })
91 |         .collect::<Vec<ScheduleResponse>>();
92 |     let mut out = today
93 |         .into_iter()
94 |         .map(|t| ScheduleResponse {
95 |             destination: t.destination,
96 |             provider: t.name,
97 |             id: t.id,
98 |             url: t.url,
99 |             wait_days: 0,
100 |             metadata: t.metadata,
101 |             name: t.default_name.unwrap_or_default(),
102 |         })
103 |         .collect::<Vec<ScheduleResponse>>();
104 |     out.extend(labeled);
105 |     Ok(Json(out))
106 | }
107 | 
108 | pub async fn run_server(db: Arc<Database>, provider_map: Arc<ProviderMap>, port: u16) {
109 |     info!("Starting server");
110 |     let ctx = Arc::new(Context {
111 |         db: Arc::clone(&db),
112 |         providers: provider_map,
113 |     });
114 |     let router = Router::new()
115 |         .route("/schedule", get(scheduled_scrapes))
116 |         .route("/v1/schedule", get(v1_scheduled_scrapes))
117 |         .route("/v1/history", get(v1_scrape_history))
118 |         .route("/v1/provider", post(v1_add_provider))
119 |         .route("/v1/stats", get(v1_provider_stats))
120 |         .layer(AddExtensionLayer::new(ctx));
121 |     let addr = SocketAddr::from(([0, 0, 0, 0], port));
122 |     axum::Server::bind(&addr)
123 |         .serve(router.into_make_service())
124 |         .await
125 |         .unwrap();
126 | }
127 | 
--------------------------------------------------------------------------------