├── .dockerignore ├── .env.example ├── .github └── workflows │ └── build.yaml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── Dockerfile ├── LICENSE.md ├── README.md ├── assets ├── scrape_interval.png └── scraping_history.png ├── docker-compose.yaml ├── migrations ├── 20210722150010_initial_migration.down.sql ├── 20210722150010_initial_migration.up.sql ├── 20211111232248_scrape_resource_priority.down.sql ├── 20211111232248_scrape_resource_priority.up.sql ├── 20211121211206_scraped_at.down.sql ├── 20211121211206_scraped_at.up.sql ├── 20211211152424_unique_amqp_source.down.sql ├── 20211211152424_unique_amqp_source.up.sql ├── 20211211152751_populate_amqp_source.down.sql ├── 20211211152751_populate_amqp_source.up.sql ├── 20211221055954_official_source.down.sql ├── 20211221055954_official_source.up.sql ├── 20211221061158_official_source_nullable.down.sql └── 20211221061158_official_source_nullable.up.sql ├── sqlx-data.json └── src ├── api ├── mod.rs └── v1 │ ├── mod.rs │ ├── providers.rs │ └── stats.rs ├── db.rs ├── dispatcher ├── amqp.rs ├── discord.rs ├── dispatcher.rs └── mod.rs ├── lib.rs ├── main.rs ├── models.rs ├── request.rs ├── scheduler ├── mod.rs ├── priority.rs ├── rate_limiter.rs └── scheduler.rs ├── scraper ├── mod.rs ├── providers │ ├── mod.rs │ ├── pinterest.rs │ ├── providers.rs │ ├── twitter.rs │ ├── twitter_types.rs │ ├── united_cube.rs │ └── weverse.rs └── scraper.rs └── server.rs /.dockerignore: -------------------------------------------------------------------------------- 1 | target 2 | .vscode 3 | dist 4 | manifests 5 | Dockerfile 6 | docker-compose.yaml -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DATABASE_URL=postgres://postgres:password@localhost:5431/jiu 2 | USER_AGENT="Jiu Scraper (https://github.com/xetera/jiu)" 3 | WEVERSE_EMAIL= 4 | WEVERSE_PASSWORD= -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | 8 | jobs: 9 | docker: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up QEMU 14 | uses: docker/setup-qemu-action@v1 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v1 17 | - name: Login to DockerHub 18 | uses: docker/login-action@v1 19 | with: 20 | username: ${{ secrets.DOCKERHUB_USERNAME }} 21 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 22 | - name: Cache Docker layers 23 | uses: actions/cache@v2 24 | with: 25 | path: /tmp/.buildx-cache 26 | key: ${{ runner.os }}-buildx-${{ github.sha }} 27 | restore-keys: | 28 | ${{ runner.os }}-buildx- 29 | - name: Build and push 30 | uses: docker/build-push-action@v2 31 | with: 32 | context: . 33 | push: true 34 | tags: xetera/jiu:latest 35 | cache-from: type=local,src=/tmp/.buildx-cache 36 | cache-to: type=local,dest=/tmp/.buildx-cache-new 37 | # This ugly bit is necessary if you don't want your cache to grow forever 38 | # till it hits GitHub's limit of 5GB. 
39 | # Temp fix 40 | # https://github.com/docker/build-push-action/issues/252 41 | # https://github.com/moby/buildkit/issues/1896 42 | - name: Move cache 43 | run: | 44 | rm -rf /tmp/.buildx-cache 45 | mv /tmp/.buildx-cache-new /tmp/.buildx-cache 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | **/*.rs.bk 3 | **/dist 4 | .vscode 5 | env.toml 6 | seed.sql 7 | .env 8 | .idea 9 | *.dump 10 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jiu" 3 | version = "0.1.1" 4 | authors = ["Xetera"] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | reqwest = { version = "0.11.4", features = ["json", "multipart"] } 9 | chrono = { version= "0.4.19", features = ["serde"]} 10 | async-trait = "0.1.50" 11 | async-recursion = "0.3.2" 12 | tokio = { version = "1.8.1", features = ["full"] } 13 | tokio-test = "0.4.2" 14 | serde = { version = "1.0.126", features = ["derive"] } 15 | serde_json = "1.0.59" 16 | url = "2.2.2" 17 | better-panic = "0.2.0" 18 | futures = "0.3.15" 19 | thiserror = "1.0.26" 20 | num-traits = "0.2.14" 21 | sqlx = { version = "0.5.5", features = ["runtime-tokio-rustls", "postgres", "chrono", "offline", "json", "bigdecimal"] } 22 | dotenv = "0.15.0" 23 | log = "0.4.0" 24 | env_logger = "0.8.4" 25 | anyhow = "1.0.42" 26 | strum = { version = "0.21.0", features = ["derive"] } 27 | strum_macros = "0.21.1" 28 | rsa = "0.4.0" 29 | sha-1 = "0.9.7" 30 | digest = "0.9.0" 31 | base64 = "0.13.0" 32 | rand = "0.8.4" 33 | regex = "1.5.4" 34 | lazy_static = "1.4.0" 35 | dyn-clone = "1.0.4" 36 | bimap = "0.6.1" 37 | governor = "0.3.2" 38 | nonzero_ext = "0.2.0" 39 | axum = "0.3.4" 40 | tower = "0.4.11" 41 | itertools = "0.10.1" 42 | parking_lot = "0.11.1" 43 | tokio-amqp = "1.0.0" 44 | lapin = "1.8.1" 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1.55.0-buster as builder 2 | 3 | # First build a dummy project with our dependencies to cache them in Docker 4 | WORKDIR /usr/src 5 | RUN cargo new --bin builder 6 | WORKDIR /usr/src/builder 7 | COPY ./Cargo.lock ./Cargo.lock 8 | COPY ./Cargo.toml ./Cargo.toml 9 | RUN --mount=type=cache,target=/usr/local/cargo/registry cargo build --release 10 | RUN rm src/*.rs 11 | 12 | # Now copy the sources and do the real build 13 | ADD src src 14 | ADD sqlx-data.json sqlx-data.json 15 | ENV SQLX_OFFLINE true 16 | 17 | RUN cargo build --release 18 | 19 | # Second stage putting the build result into a debian jessie-slim image 20 | FROM debian:buster-slim 21 | 22 | RUN apt-get update \ 23 | && apt-get install -y ca-certificates tzdata libc6 \ 24 | && rm -rf /var/lib/apt/lists/* 25 | ENV NAME=rust-docker 26 | ENV RUST_LOG=jiu=debug 27 | COPY --from=builder /usr/src/builder/target/release/jiu /usr/local/bin/jiu 28 | CMD ["/usr/local/bin/jiu"] 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | 6 | Scrape multiple media providers on a cron job and dispatch webhooks when changes are detected. 7 |
8 | 9 | ## Jiu 10 | 11 | Jiu is a multi-threaded media scraper capable of juggling thousands of endpoints from different providers with unique 12 | restrictions/requirements. 13 | 14 | It is built to fetch media posted on different sites in a slow, eventually consistent fashion, 15 | not for instant change detection. 16 | 17 | ## Providers 18 | 19 | A provider is the umbrella term that encapsulates all endpoints for a given domain. 20 | 21 | For example, https://weverse.io/bts/artist and https://weverse.io/dreamcatcher/artist are 2 endpoints under the Weverse 22 | provider. 23 | 24 | ### Supported providers 25 | 26 | * [Twitter](https://twitter.com/RBW_MAMAMOO) 27 | * [Pinterest Boards](https://www.pinterest.com/janairaoliveira314/handong) 28 | * [Weverse.io](https://weverse.io/dreamcatcher/feed) 29 | * [United Cube](https://www.united-cube.com/) 30 | 31 | ## Dynamic Priority & Tokens 32 | 33 | Dynamic priority is the main idea behind how Jiu can scrape many resources without getting rate limited. 34 | 35 | Unique endpoints that have more than 1 token are grouped by their provider type and get scheduled to be scraped at even 36 | intervals at the start of every day to avoid hammering APIs with requests. 37 | 38 | ![](./assets/scrape_interval.png) 39 | 40 | After each successful request, a 30-day sliding window of that endpoint's request history gets graded on a curve that 41 | determines how its priority should change based on how many new images were found in each request. 42 | 43 | ![](./assets/scraping_history.png) 44 | 45 | Pages that post at least one image regularly get assigned a higher priority, up to a maximum of 3 requests every 2 days. 46 | Pages that don't post anything sink down to a scrape schedule of once every 2 weeks. 47 | 48 | Results found on more recent dates contribute more to the priority than those found further back. This curve 49 | allows Jiu to match its request frequency to the changing posting schedule of the sites it's processing and avoid wasting 50 | requests on resources that are rarely updated. 51 | 52 | At the end of each day, every endpoint gets tokens added to it equal to its current priority, and those tokens are checked as a 53 | criterion when scheduling requests the next day. 54 | 55 | ## Authorization 56 | 57 | Anonymous requests are always preferred when possible. 58 | 59 | There is a customizable login flow for providers that require authorization, which allows logging into APIs after an 60 | authorization error and persists additional data (such as a JWT token) to be shared across each provider during the 61 | lifetime of the process. 62 | 63 | The login flow is reverse engineered for providers that don't have a public API. 64 | 65 | > Juggling multiple accounts per provider is currently not supported and probably won't be as long as your accounts aren't getting banned (and if they are, you're sending too many requests and need to increase your rate limits). 66 | 67 | Jiu will try its best to identify itself in its requests' `User-Agent` header, but will submit a fake UA for providers 68 | that gate posts behind a user-agent check, like Twitter. 69 | 70 | ## Proxies 71 | 72 | Proxies are not supported or needed. 73 | 74 | ## Webhooks 75 | 76 | Jiu is capable of sending webhooks to multiple destinations when an update for a provider is detected.
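As a rough sketch of the receiving side (this is not part of Jiu itself; the `/webhook` path, the port, and the crate choices are illustrative assumptions), a consumer only needs to accept a JSON `POST` matching the payload shown below. A minimal receiver using axum and serde, deserializing only a few of the fields, might look like this:

```rust
use axum::{routing::post, Json, Router};
use serde::Deserialize;

// Only a subset of Jiu's payload is declared here; serde ignores
// undeclared fields (account, metadata, images, ...) by default.
#[derive(Deserialize)]
struct Provider {
    #[serde(rename = "type")]
    kind: String,
    id: String,
}

#[derive(Deserialize)]
struct Post {
    unique_identifier: String,
    url: String,
}

#[derive(Deserialize)]
struct WebhookPayload {
    provider: Provider,
    posts: Vec<Post>,
}

async fn receive(Json(payload): Json<WebhookPayload>) -> &'static str {
    println!(
        "update from {} ({}): {} post(s)",
        payload.provider.kind,
        payload.provider.id,
        payload.posts.len()
    );
    for post in &payload.posts {
        println!("  {} -> {}", post.unique_identifier, post.url);
    }
    "ok"
}

#[tokio::main]
async fn main() {
    // The path and port are arbitrary; register whichever URL this
    // service is reachable at as a webhook destination in Jiu.
    let app = Router::new().route("/webhook", post(receive));
    axum::Server::bind(&"0.0.0.0:3000".parse().unwrap())
        .serve(app.into_make_service())
        .await
        .unwrap();
}
```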
77 | 78 | Although data about posts is aggregated into webhooks, it is not persisted to the database; storing it is the responsibility of the service receiving the events, and it is not relevant for image aggregation. 79 | 80 | ```json 81 | { 82 | "provider": { 83 | "type": "twitter.timeline", 84 | "id": "729935154290925570", 85 | "ephemeral": false 86 | }, 87 | "posts": [ 88 | { 89 | "unique_identifier": "1460196926796623873", 90 | "body": "[#가현] 삐뚤빼뚤 즐거운 라이브였다❣️ 다음 주에도 재밌는 시간 보내 보카?\n\n#드림캐쳐 #Dreamcatcher #4주_집콕_프로젝트 https://t.co/r1ImPUPKkv", 91 | "url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873", 92 | "post_date": null, 93 | "account": { 94 | "name":"드림캐쳐 Dreamcatcher", 95 | "avatar_url":"https://pbs.twimg.com/profile_images/1415983453200261124/4-viIm27_normal.jpg" 96 | }, 97 | "metadata": { 98 | "language": "ko", 99 | "like_count": 12474, 100 | "retweet_count": 2760 101 | }, 102 | "images": [ 103 | { 104 | "type": "Image", 105 | "media_url": "https://pbs.twimg.com/media/FEOpVKmagAELmzI.jpg", 106 | "reference_url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873/photo/1", 107 | "unique_identifier": "1460196885285994497", 108 | "metadata": { 109 | "width": 1128, 110 | "height": 1504 111 | } 112 | }, 113 | { 114 | "type": "Image", 115 | "media_url": "https://pbs.twimg.com/media/FEOpV2FaAAEG4zr.jpg", 116 | "reference_url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873/photo/2", 117 | "unique_identifier": "1460196896958709761", 118 | "metadata": { 119 | "width": 1128, 120 | "height": 1504 121 | } 122 | } 123 | ] 124 | } 125 | ] 126 | } 127 | ``` 128 | 129 | Every provider has its own `provider_metadata` field that _may_ contain extra information about the image or the post it 130 | was found under, but may also be missing. _Documentation WIP_ 131 | 132 | The `unique_identifier` field is unique **per provider** and not globally. 133 | 134 | The `ephemeral` field defines whether an image is only accessible for a short period after dispatch (for example, 135 | Instagram image links expire after some time). 136 | 137 | If a Discord webhook URL is detected, the payload is changed to allow Discord to display the images in the channel. 138 | 139 | There is currently no retry mechanism for webhooks that fail to deliver successfully. 140 | 141 | ## Endpoints 142 | 143 | Jiu runs a webserver on port 8080 to allow dynamically resolving new resources by URL and getting stats at runtime. 144 | 145 | - `POST /v1/provider` Create a new provider by resolving a URL to a resource 146 | - `DELETE /v1/provider` Delete an existing provider (sets it to `enabled=false`) 147 | - `GET /v1/schedule` Get the upcoming scheduled scrapes 148 | - `GET /v1/history` The list of the last 100 scraped endpoints 149 | - `GET /v1/stats` The stats of all the registered providers 150 | 151 | ## Jiu is **NOT**: 152 | 153 | * For bombarding sites like Twitter with requests to detect changes within seconds. 154 | * Capable of executing JavaScript with a headless browser. 155 | * Able to send requests to any social media site without explicit support. 156 | 157 | ## Jiu **IS**: 158 | 159 | * For slowly monitoring changes in different feeds over the course of multiple hours without abusing the provider. 160 | * Capable of adjusting the frequency of scrapes based on how frequently the source is updated. 161 | * Able to send webhooks or push to AMQP on discovery. 162 | * The lead singer of [Dreamcatcher](https://www.youtube.com/watch?v=1QD0FeZyDtQ). 163 | 164 | ## Usage 165 | 166 | 1.
Copy over `.env.example` to `.env` and fill out the relevant fields. 167 | 2. `docker-compose up -d jiu_db` to start Postgres. 168 | 3. `RUST_LOG=jiu cargo run` to start the crawler. 169 | 170 | To create a production-ready image, make sure to run `cargo sqlx prepare` before building if you modified any of the 171 | SQL queries. 172 | 173 | > If you would like to use this project, please change the `USER_AGENT` environment variable to identify your crawler accurately. 174 | 175 | Built for [kiyomi.io](https://github.com/xetera/kiyomi) 176 | -------------------------------------------------------------------------------- /assets/scrape_interval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xetera/jiu/e3a54c908f17359f1233b28b4bcdab31f5b249b8/assets/scrape_interval.png -------------------------------------------------------------------------------- /assets/scraping_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xetera/jiu/e3a54c908f17359f1233b28b4bcdab31f5b249b8/assets/scraping_history.png -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | networks: 4 | jiu_net: 5 | 6 | volumes: 7 | jiu_data: 8 | 9 | services: 10 | jiu_db: 11 | image: postgres:13 12 | container_name: jiu_db 13 | volumes: 14 | - jiu_data:/var/lib/postgresql/data 15 | environment: 16 | POSTGRES_USER: postgres 17 | POSTGRES_PASSWORD: password 18 | POSTGRES_DB: jiu 19 | networks: 20 | - jiu_net 21 | ports: 22 | - 5431:5432 23 | jiu: 24 | image: rust:1.55 25 | volumes: 26 | - ./:/app 27 | networks: 28 | - jiu_net 29 | environment: 30 | USER: xetera 31 | DATABASE_URL: postgres://postgres:password@jiu_db:5432/jiu -------------------------------------------------------------------------------- /migrations/20210722150010_initial_migration.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | DROP TABLE webhook_invocation; 3 | 4 | DROP TABLE webhook_source; 5 | 6 | DROP TABLE scrape_error; 7 | 8 | DROP TABLE media; 9 | 10 | DROP TABLE scrape_request; 11 | 12 | DROP TABLE scrape; 13 | 14 | DROP TABLE webhook; 15 | -------------------------------------------------------------------------------- /migrations/20210722150010_initial_migration.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | CREATE TABLE IF NOT EXISTS webhook( 3 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 4 | destination TEXT NOT NULL, 5 | created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(), 6 | updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(), 7 | -- extra data attached to a webhook invocation 8 | metadata JSONB 9 | ); 10 | 11 | CREATE TABLE IF NOT EXISTS provider_resource( 12 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 13 | -- This can be a FQDN or an identifier that maps to a unique API endpoint 14 | -- on the provider's end 15 | destination TEXT NOT NULL, 16 | name TEXT NOT NULL, 17 | enabled BOOLEAN DEFAULT True, 18 | -- the url for the scraped page 19 | url TEXT NOT NULL, 20 | priority INTEGER NOT NULL DEFAULT 5 CHECK(priority >= 1 AND priority <= 10), 21 | last_scrape TIMESTAMP WITHOUT TIME ZONE NULL, 22 | -- the date the last scrape was requested; this acts as a lock to
prevent resources from being accessed multiple times 23 | last_queue TIMESTAMP WITHOUT TIME ZONE NULL, 24 | created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(), 25 | UNIQUE(destination, name) 26 | ); 27 | 28 | CREATE TABLE IF NOT EXISTS scrape( 29 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 30 | provider_name TEXT, 31 | provider_destination TEXT, 32 | -- the priority this scrape was executed against 33 | priority INTEGER NOT NULL CHECK(priority >= 1 AND priority <= 10), 34 | FOREIGN KEY (provider_name, provider_destination) 35 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE 36 | ); 37 | 38 | -- each scrape can have more than one request 39 | CREATE TABLE IF NOT EXISTS scrape_request( 40 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 41 | scrape_id INTEGER REFERENCES scrape(id), 42 | page INTEGER NOT NULL DEFAULT 1, 43 | response_code INTEGER, 44 | -- how long did the response take in ms 45 | response_delay INTEGER, 46 | scraped_at TIMESTAMP WITHOUT TIME ZONE NOT NULL 47 | ); 48 | 49 | CREATE TABLE IF NOT EXISTS media( 50 | -- This is necessary when trying to sort media that were 51 | -- crawled at the same time 52 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 53 | provider_name TEXT, 54 | provider_destination TEXT, 55 | scrape_request_id INTEGER REFERENCES scrape_request(id) ON DELETE SET NULL, 56 | -- We are assuming there is only one type of url 57 | image_url TEXT NOT NULL UNIQUE, 58 | page_url TEXT NULL, 59 | reference_url TEXT NULL, 60 | -- a unique identifier that's specific to the provider 61 | unique_identifier TEXT NOT NULL, 62 | -- where the image is coming from 63 | -- could be null if the provider doesn't have the information 64 | posted_at TIMESTAMP WITHOUT TIME ZONE NULL, 65 | discovered_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, 66 | UNIQUE(unique_identifier, provider_name), 67 | FOREIGN KEY (provider_name, provider_destination) 68 | REFERENCES provider_resource(name, destination) ON UPDATE CASCADE ON DELETE SET NULL 69 | ); 70 | 71 | CREATE TABLE IF NOT EXISTS scrape_error( 72 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 73 | -- already declared in scrape_request 74 | -- response_code INTEGER, 75 | response_body TEXT NOT NULL DEFAULT '', 76 | response_code TEXT NOT NULL, 77 | message TEXT NULL, 78 | scrape_id INTEGER NOT NULL REFERENCES scrape(id) 79 | ); 80 | 81 | CREATE TABLE IF NOT EXISTS webhook_source( 82 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 83 | webhook_id INTEGER REFERENCES webhook(id), 84 | provider_name TEXT, 85 | provider_destination TEXT, 86 | FOREIGN KEY (provider_name, provider_destination) 87 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE 88 | ); 89 | 90 | CREATE TABLE IF NOT EXISTS webhook_invocation( 91 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 92 | scrape_id INTEGER /* NOT NULL */ REFERENCES scrape(id), 93 | webhook_id INTEGER /* NOT NULL */ REFERENCES webhook(id) ON DELETE SET NULL, 94 | response_code INTEGER, 95 | response_delay INTEGER, 96 | invoked_at TIMESTAMPTZ NOT NULL DEFAULT NOW() 97 | ); 98 | 99 | CREATE INDEX ON scrape (provider_destination, provider_name); 100 | -------------------------------------------------------------------------------- /migrations/20211111232248_scrape_resource_priority.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | ALTER TABLE provider_resource DROP COLUMN tokens; 3 | ALTER TABLE 
provider_resource ALTER COLUMN priority type integer; 4 | ALTER TABLE scrape ALTER COLUMN priority type integer; 5 | 6 | DROP TABLE amqp_source; -------------------------------------------------------------------------------- /migrations/20211111232248_scrape_resource_priority.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE provider_resource ADD COLUMN last_token_update TIMESTAMP WITHOUT TIME ZONE NULL; 3 | ALTER TABLE provider_resource ADD COLUMN tokens DECIMAL NOT NULL DEFAULT 1.0; 4 | ALTER TABLE provider_resource ALTER COLUMN priority type decimal; 5 | ALTER TABLE scrape ALTER COLUMN priority type decimal; 6 | ALTER TABLE scrape ALTER COLUMN priority set not null; 7 | ALTER TABLE scrape ALTER COLUMN priority set default 1.0; 8 | ALTER TABLE webhook ADD CONSTRAINT unique_destination UNIQUE(destination); 9 | ALTER TABLE provider_resource DROP CONSTRAINT provider_resource_priority_check; 10 | ALTER TABLE scrape DROP CONSTRAINT scrape_priority_check; 11 | ALTER TABLE webhook DROP COLUMN metadata; 12 | ALTER TABLE webhook_source ADD COLUMN metadata JSONB; 13 | ALTER TABLE provider_resource ADD COLUMN default_name TEXT; 14 | 15 | CREATE TABLE IF NOT EXISTS amqp_source( 16 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY, 17 | provider_name TEXT, 18 | provider_destination TEXT, 19 | metadata JSONB, 20 | FOREIGN KEY (provider_name, provider_destination) 21 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE 22 | ); 23 | -------------------------------------------------------------------------------- /migrations/20211121211206_scraped_at.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | -------------------------------------------------------------------------------- /migrations/20211121211206_scraped_at.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE scrape ADD COLUMN IF NOT EXISTS scraped_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(); 3 | -------------------------------------------------------------------------------- /migrations/20211211152424_unique_amqp_source.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | 3 | ALTER TABLE amqp_source DROP CONSTRAINT amqp_unique_providers; 4 | -------------------------------------------------------------------------------- /migrations/20211211152424_unique_amqp_source.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE amqp_source ADD CONSTRAINT amqp_unique_providers UNIQUE (provider_name, provider_destination); 3 | -------------------------------------------------------------------------------- /migrations/20211211152751_populate_amqp_source.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | -------------------------------------------------------------------------------- /migrations/20211211152751_populate_amqp_source.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | INSERT INTO amqp_source (provider_name, provider_destination, metadata) 3 | SELECT name, destination, '{}' from provider_resource 4 | ON CONFLICT DO NOTHING; 5 | 
-------------------------------------------------------------------------------- /migrations/20211221055954_official_source.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | ALTER TABLE provider_resource DROP COLUMN official; 3 | -------------------------------------------------------------------------------- /migrations/20211221055954_official_source.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE provider_resource ADD COLUMN official boolean default false; -------------------------------------------------------------------------------- /migrations/20211221061158_official_source_nullable.down.sql: -------------------------------------------------------------------------------- 1 | -- Add down migration script here 2 | ALTER TABLE provider_resource ALTER COLUMN official DROP NOT NULL; 3 | -------------------------------------------------------------------------------- /migrations/20211221061158_official_source_nullable.up.sql: -------------------------------------------------------------------------------- 1 | -- Add up migration script here 2 | ALTER TABLE provider_resource ALTER COLUMN official SET NOT NULL; -------------------------------------------------------------------------------- /src/api/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::db::Database; 2 | use axum::body::{Bytes, Full}; 3 | use axum::http::{Response, StatusCode}; 4 | use axum::response::IntoResponse; 5 | use axum::Json; 6 | use serde_json::json; 7 | use std::convert::Infallible; 8 | use std::sync::Arc; 9 | use crate::scraper::ProviderMap; 10 | 11 | pub mod v1; 12 | 13 | pub struct Context { 14 | pub db: Arc, 15 | pub providers: Arc 16 | } 17 | 18 | pub enum AppError { 19 | SomeError(anyhow::Error), 20 | SqlxError(sqlx::Error), 21 | } 22 | 23 | impl From for AppError { 24 | fn from(inner: anyhow::Error) -> Self { 25 | AppError::SomeError(inner) 26 | } 27 | } 28 | 29 | impl From for AppError { 30 | fn from(inner: sqlx::Error) -> Self { 31 | AppError::SqlxError(inner) 32 | } 33 | } 34 | 35 | impl IntoResponse for AppError { 36 | type Body = Full; 37 | type BodyError = Infallible; 38 | 39 | fn into_response(self) -> Response { 40 | let (status, error_message) = match self { 41 | AppError::SomeError(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()), 42 | AppError::SqlxError(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()), 43 | }; 44 | 45 | let body = Json(json!({ 46 | "error": error_message, 47 | })); 48 | 49 | (status, body).into_response() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/api/v1/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod stats; 2 | pub mod providers; 3 | 4 | pub use stats::*; 5 | -------------------------------------------------------------------------------- /src/api/v1/providers.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use axum::extract::Extension; 4 | use axum::Json; 5 | use log::{debug, error, info}; 6 | use serde::{Deserialize, Serialize}; 7 | use serde_json::{json, Map, Value}; 8 | 9 | use crate::api::{AppError, Context}; 10 | use crate::scraper::{CanonicalUrlResolution, ProviderFailure, WorkableDomain}; 11 | 12 | #[derive(Deserialize)] 13 | pub struct 
ProviderAdd { 14 | url: String, 15 | name: String, 16 | official: bool, 17 | metadata: Option, 18 | add_to_amqp: Option, 19 | } 20 | 21 | #[derive(Serialize)] 22 | pub enum ProviderAddResponse { 23 | InvalidUrl { 24 | url: String, 25 | }, 26 | ProviderExists { 27 | provider: String, 28 | destination: String, 29 | }, 30 | InternalError, 31 | NotImplemented, 32 | Success { 33 | provider: String, 34 | destination: String, 35 | }, 36 | } 37 | 38 | pub async fn v1_add_provider( 39 | Extension(state): Extension>, 40 | Json(input): Json, 41 | ) -> Result, AppError> { 42 | let result = state 43 | .providers 44 | .values() 45 | .find_map(|p| p.match_domain(&input.url).map(|res| (p, res))); 46 | let (provider, domain) = match result { 47 | Some((provider, domain)) => (provider, domain), 48 | None => { 49 | debug!("Url {} was not valid", input.url); 50 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url })); 51 | } 52 | }; 53 | let introspectable = match domain { 54 | WorkableDomain::ToCanonical(resource) => resource, 55 | _ => { 56 | debug!( 57 | "WorkableDomain {:?} from [{}] was not detected as Canonical", 58 | provider.id(), 59 | input.url 60 | ); 61 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url })); 62 | } 63 | }; 64 | let response = provider.introspect_resource(&introspectable).await; 65 | let destination = match response { 66 | Ok(CanonicalUrlResolution::Success { destination }) => destination, 67 | Ok(CanonicalUrlResolution::Fail(reason)) => { 68 | error!("{:?}", reason); 69 | return Ok(Json(ProviderAddResponse::InternalError)); 70 | } 71 | Ok(CanonicalUrlResolution::NotImplemented) => { 72 | return Ok(Json(ProviderAddResponse::NotImplemented)) 73 | } 74 | Err(ProviderFailure::Url) => { 75 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url })); 76 | } 77 | Err(other) => { 78 | return Ok(Json(ProviderAddResponse::InternalError)); 79 | } 80 | }; 81 | info!( 82 | "Successfully resolved [destination: {}] for [{:?}]", 83 | destination, 84 | provider.id() 85 | ); 86 | let provider_name = provider.id().to_string(); 87 | if let Err(sqlx::Error::Database(err)) = sqlx::query!( 88 | "INSERT INTO provider_resource (destination, name, default_name, official, url) VALUES 89 | ($1, $2, $3, $4, $5)", 90 | destination, 91 | provider_name, 92 | input.name, 93 | input.official, 94 | input.url 95 | ) 96 | .execute(&*state.db) 97 | .await 98 | { 99 | // the most disgusting way of checking for primary key conflicts 100 | if err.code().map(|code| code == "23505").unwrap_or(false) { 101 | return Ok(Json(ProviderAddResponse::ProviderExists { 102 | destination, 103 | provider: provider_name, 104 | })); 105 | } 106 | }; 107 | // there's a conflict 108 | // TODO: decouple this kiyomi-specific thing out? 
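// When requested, the block below also registers the resource as an AMQP source;
// the ON CONFLICT upsert keeps the stored metadata current for an existing
// (provider_name, provider_destination) pair.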
109 | if input.add_to_amqp.unwrap_or(false) { 110 | sqlx::query!( 111 | "INSERT INTO amqp_source (provider_name, provider_destination, metadata) 112 | VALUES ($1, $2, $3) 113 | ON CONFLICT(provider_name, provider_destination) DO UPDATE SET metadata = $3", 114 | provider_name, 115 | destination, 116 | input.metadata.unwrap_or(Value::Object(Map::new())) 117 | ) 118 | .fetch_optional(&*state.db) 119 | .await?; 120 | } 121 | Ok(Json(ProviderAddResponse::Success { 122 | destination, 123 | provider: provider_name, 124 | })) 125 | } 126 | 127 | #[derive(Deserialize)] 128 | pub struct ProviderDelete { 129 | name: String, 130 | destination: String, 131 | } 132 | 133 | #[derive(Serialize)] 134 | pub struct ProviderDeleteResponse { 135 | modified: bool, 136 | } 137 | 138 | pub async fn v1_delete_provider( 139 | Extension(state): Extension>, 140 | Json(input): Json, 141 | ) -> Result, AppError> { 142 | let result = sqlx::query!( 143 | "UPDATE provider_resource SET enabled = False WHERE name = $1 and destination = $2 RETURNING *", 144 | input.name, 145 | input.destination, 146 | ) 147 | .fetch_optional(&*state.db) 148 | .await?; 149 | return Ok(Json(ProviderDeleteResponse { 150 | modified: result.is_some(), 151 | })); 152 | } 153 | -------------------------------------------------------------------------------- /src/api/v1/stats.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use axum::extract::Extension; 4 | use axum::Json; 5 | use chrono::{DateTime, Duration, NaiveDateTime, Utc}; 6 | use num_traits::ToPrimitive; 7 | use serde::Serialize; 8 | use sqlx::types::BigDecimal; 9 | 10 | use crate::api::{AppError, Context}; 11 | use crate::models::ScrapeHistory; 12 | 13 | struct ScheduledProvider { 14 | id: i32, 15 | url: String, 16 | name: String, 17 | destination: String, 18 | priority: BigDecimal, 19 | tokens: BigDecimal, 20 | default_name: Option, 21 | last_queue: Option, 22 | metadata: Option, 23 | official: bool, 24 | } 25 | 26 | #[derive(Serialize)] 27 | pub struct ScheduledProviderRun { 28 | id: i32, 29 | provider: String, 30 | url: String, 31 | destination: String, 32 | wait_days: i16, 33 | metadata: Option, 34 | name: String, 35 | official: bool, 36 | } 37 | 38 | struct PreviousScrapeRow { 39 | id: i32, 40 | name: String, 41 | url: String, 42 | destination: String, 43 | date: Option, 44 | // last_post: Option, 45 | priority: BigDecimal, 46 | default_name: Option, 47 | official: bool, 48 | discovered_media: Option, 49 | } 50 | 51 | #[derive(Serialize)] 52 | pub struct PreviousScrape { 53 | id: i32, 54 | name: String, 55 | url: String, 56 | destination: String, 57 | // TODO: make this column not-null 58 | date: Option, 59 | // last_post: Option, 60 | // last_scrape: Option, 61 | // last_post: Option, 62 | priority: f32, 63 | default_name: Option, 64 | official: bool, 65 | discovered_media: i64, 66 | } 67 | 68 | #[derive(Serialize)] 69 | pub struct ScheduleResponse { 70 | pub history: Vec, 71 | pub scheduled: Vec, 72 | } 73 | 74 | pub async fn v1_scheduled_scrapes( 75 | Extension(state): Extension>, 76 | ) -> Result>, AppError> { 77 | let rows = sqlx::query_as!( 78 | ScheduledProvider, 79 | "SELECT pr.id, pr.official, pr.priority, pr.name, pr.destination, pr.url, pr.tokens, pr.last_queue, pr.default_name, ( 80 | SELECT metadata FROM amqp_source where provider_destination = pr.destination and provider_name = pr.name 81 | ) as metadata FROM provider_resource pr" 82 | ) 83 | .fetch_all(&*state.db) 84 | .await?; 85 | let (today, later): 
(Vec, Vec) = 86 | rows.into_iter().partition(|e| { 87 | let now = Utc::now().naive_utc(); 88 | // anything that was queued in the last 24 hours is already being scraped 89 | // it's not SUPER accurate since it's possible but 90 | // we only need a general idea, not precision 91 | e.last_queue 92 | .map(|last_queue| { 93 | let yesterday = now - Duration::hours(24); 94 | last_queue > yesterday 95 | }) 96 | .unwrap_or(false) 97 | }); 98 | let labeled = later 99 | .into_iter() 100 | .map(|row| { 101 | let wait_days = ((1f32 / (row.priority + row.tokens)) 102 | .to_f32() 103 | .unwrap_or(0f32)) 104 | .floor() as i16; 105 | ScheduledProviderRun { 106 | destination: row.destination, 107 | provider: row.name, 108 | id: row.id, 109 | url: row.url, 110 | official: row.official, 111 | wait_days, 112 | metadata: row.metadata, 113 | name: row.default_name.unwrap_or_default(), 114 | } 115 | }) 116 | .collect::>(); 117 | let mut scheduled = today 118 | .into_iter() 119 | .map(|t| ScheduledProviderRun { 120 | destination: t.destination, 121 | provider: t.name, 122 | official: t.official, 123 | id: t.id, 124 | url: t.url, 125 | wait_days: 0, 126 | metadata: t.metadata, 127 | name: t.default_name.unwrap_or_default(), 128 | }) 129 | .collect::>(); 130 | scheduled.extend(labeled); 131 | Ok(Json(scheduled)) 132 | } 133 | 134 | pub async fn v1_scrape_history( 135 | Extension(state): Extension>, 136 | ) -> Result>, AppError> { 137 | let previous = sqlx::query_as!( 138 | PreviousScrapeRow, 139 | "SELECT scrape.id, 140 | pr.url, 141 | pr.default_name, 142 | pr.official, 143 | pr.name, 144 | pr.destination, 145 | scrape.priority, 146 | scrape.scraped_at as date, 147 | COALESCE((SELECT COUNT(*) 148 | from media 149 | inner join public.scrape_request sr on sr.id = media.scrape_request_id 150 | inner join scrape s on s.id = sr.scrape_id 151 | where sr.scrape_id = scrape.id), 0) as discovered_media 152 | FROM scrape 153 | INNER JOIN provider_resource pr on pr.destination = scrape.provider_destination 154 | and scrape.provider_name = pr.name 155 | ORDER BY scrape.scraped_at desc 156 | LIMIT 100 157 | " 158 | ) 159 | .fetch_all(&*state.db) 160 | .await?; 161 | let history = previous 162 | .into_iter() 163 | .map(|row| PreviousScrape { 164 | id: row.id, 165 | name: row.name, 166 | url: row.url, 167 | date: row.date, 168 | destination: row.destination, 169 | // we shouldn't need this, but sqlx doesn't understand the semantics 170 | // of COALESCE for some reason 171 | discovered_media: row.discovered_media.unwrap_or(0), 172 | priority: row.priority.to_f32().unwrap(), 173 | default_name: row.default_name, 174 | official: row.official, 175 | }) 176 | .collect::>(); 177 | Ok(Json(history)) 178 | } 179 | 180 | #[derive(Serialize)] 181 | pub struct ProviderStat { 182 | name: String, 183 | destination: String, 184 | enabled: bool, 185 | url: String, 186 | priority: f32, 187 | tokens: f32, 188 | // TODO: why is this nullable? 
189 | created_at: Option, 190 | default_name: Option, 191 | official: bool, 192 | last_scrape: Option, 193 | last_post: Option, 194 | discovered_images: i64, 195 | scrape_count: i64, 196 | } 197 | 198 | #[derive(Serialize)] 199 | pub struct ProviderStatsResponse { 200 | stats: Vec, 201 | } 202 | 203 | pub async fn v1_provider_stats( 204 | Extension(state): Extension>, 205 | ) -> Result, AppError> { 206 | let stats = sqlx::query!( 207 | "SELECT pr.id, 208 | pr.name, 209 | pr.destination, 210 | pr.enabled, 211 | pr.url, 212 | pr.priority, 213 | pr.tokens, 214 | pr.created_at, 215 | pr.default_name, 216 | pr.official, 217 | (SELECT Max(sr.scraped_at) 218 | FROM scrape_request sr 219 | inner join scrape s on pr.destination = s.provider_destination) as last_scrape, 220 | (SELECT MAX(posted_at) 221 | FROM media 222 | INNER JOIN public.scrape_request s on s.id = media.scrape_request_id 223 | inner join scrape s2 on s2.id = s.scrape_id 224 | where s2.provider_destination = pr.destination 225 | and s2.provider_name = pr.name 226 | ) as last_post, 227 | (SELECT COUNT(s3.*) 228 | from media 229 | inner join public.scrape_request r on r.id = media.scrape_request_id 230 | inner join scrape s3 on s3.id = r.scrape_id 231 | where s3.provider_name = pr.name 232 | and s3.provider_destination = pr.destination 233 | ) as discovered_images, 234 | (SELECT COUNT(*) from scrape inner join scrape_request sr2 on scrape.id = sr2.scrape_id 235 | where scrape.provider_destination = pr.destination and scrape.provider_name = pr.name 236 | ) as scrape_count 237 | FROM provider_resource pr;" 238 | ) 239 | .fetch_all(&*state.db) 240 | .await?; 241 | let data = ProviderStatsResponse { 242 | stats: stats 243 | .iter() 244 | .map(|stat| ProviderStat { 245 | name: stat.name.clone(), 246 | destination: stat.destination.clone(), 247 | enabled: stat.enabled.unwrap_or(false), 248 | url: stat.url.clone(), 249 | priority: stat.priority.to_f32().unwrap_or(0f32), 250 | tokens: stat.tokens.to_f32().unwrap_or(0f32), 251 | created_at: stat.created_at, 252 | default_name: stat.default_name.clone(), 253 | official: stat.official, 254 | last_scrape: stat.last_scrape, 255 | last_post: stat.last_post, 256 | discovered_images: stat.discovered_images.unwrap_or(0), 257 | scrape_count: stat.scrape_count.unwrap_or(0), 258 | }) 259 | .collect::>(), 260 | }; 261 | Ok(Json(data)) 262 | } 263 | 264 | // (SELECT Max(scraped_at) FROM scrape_request sr where sr.scrape_id = scrape.id) as last_scrape, 265 | // (SELECT MAX(posted_at) FROM media 266 | // INNER JOIN public.scrape_request s on s.id = media.scrape_request_id 267 | // inner join scrape s2 on s2.id = s.scrape_id 268 | // where s2.id = scrape.id 269 | // ) as last_post 270 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::env; 3 | use std::iter::FromIterator; 4 | 5 | use anyhow::bail; 6 | use itertools::Itertools; 7 | use log::error; 8 | use sqlx::postgres::PgPoolOptions; 9 | use sqlx::{Error, Pool, Postgres}; 10 | 11 | use crate::dispatcher::dispatcher::WebhookInteraction; 12 | use crate::models::{ 13 | AMQPDestination, DatabaseWebhook, PendingProvider, ScrapeRequestMedia, ScrapeRequestWithMedia, 14 | }; 15 | use crate::request::HttpError; 16 | use crate::scraper::scraper::{Scrape, ScraperStep}; 17 | use crate::scraper::{ProviderFailure, ScopedProvider}; 18 | 19 | pub type Database = Pool; 20 | 21 | pub async fn connect() -> 
Result { 22 | Ok(PgPoolOptions::new() 23 | .max_connections(5) 24 | .connect(&env::var("DATABASE_URL").expect("No DATABASE_URL env")) 25 | .await?) 26 | } 27 | 28 | // Grab the latest N images from a relevant provider destination 29 | pub async fn latest_media_ids_from_provider( 30 | db: &Database, 31 | provider: &ScopedProvider, 32 | ) -> anyhow::Result> { 33 | let out = sqlx::query!( 34 | "SELECT unique_identifier FROM media 35 | WHERE provider_name = $1 AND provider_destination = $2 36 | order by id desc, discovered_at desc limit 100", 37 | provider.name.to_string(), 38 | provider.destination 39 | ) 40 | .map(|e| e.unique_identifier) 41 | .fetch_all(db) 42 | .await?; 43 | Ok(HashSet::from_iter(out.into_iter())) 44 | } 45 | 46 | pub async fn amqp_metadata( 47 | db: &Database, 48 | sp: &ScopedProvider, 49 | ) -> anyhow::Result> { 50 | let result = sqlx::query_as!( 51 | AMQPDestination, 52 | "SELECT id, metadata FROM amqp_source a WHERE a.provider_destination = $1 AND a.provider_name = $2 LIMIT 1", 53 | sp.destination, 54 | sp.name.to_string() 55 | ).fetch_one(db).await; 56 | match result { 57 | Ok(ok) => Ok(Some(ok)), 58 | Err(err) => match err { 59 | Error::RowNotFound => Ok(None), 60 | err_name => bail!(err_name), 61 | }, 62 | } 63 | } 64 | 65 | pub async fn webhooks_for_provider( 66 | db: &Database, 67 | provider_resolvable: &ScopedProvider, 68 | ) -> anyhow::Result> { 69 | Ok(sqlx::query_as!( 70 | DatabaseWebhook, 71 | "SELECT webhook.*, webhook_source.metadata FROM webhook 72 | JOIN webhook_source on webhook_source.webhook_id = webhook.id 73 | WHERE webhook_source.provider_destination = $1 AND webhook_source.provider_name = $2", 74 | provider_resolvable.destination, 75 | provider_resolvable.name.to_string() 76 | ) 77 | .fetch_all(db) 78 | .await?) 79 | } 80 | 81 | #[derive(Debug)] 82 | pub struct ProcessedScrape { 83 | scrape_id: i32, 84 | } 85 | 86 | /// Adds scrapes to the db. 
Reverses the scrape list as a side effect 87 | pub async fn process_scrape<'a>( 88 | db: &Database, 89 | scrape: &mut Scrape<'a>, 90 | pending: &PendingProvider, 91 | ) -> anyhow::Result { 92 | let mut tx = db.begin().await?; 93 | let out = sqlx::query!( 94 | "INSERT INTO scrape (provider_name, provider_destination, priority) VALUES ($1, $2, $3) returning id", 95 | scrape.provider.name.to_string(), 96 | scrape.provider.destination, 97 | pending.priority.level 98 | ) 99 | .fetch_one(&mut tx) 100 | .await?; 101 | // we don't really care about making sure this is completely correct 102 | sqlx::query!( 103 | "UPDATE provider_resource 104 | SET 105 | last_scrape = NOW(), 106 | tokens = tokens - 1 107 | WHERE name = $1 AND destination = $2 108 | RETURNING *", 109 | scrape.provider.name.to_string(), 110 | scrape.provider.destination, 111 | ) 112 | .fetch_one(db) 113 | .await?; 114 | let scrape_id = out.id; 115 | let requests = &mut scrape.requests; 116 | // we specifically need to reverse this list of requests/images 117 | // to make sure that the images that were first scraped get inserted 118 | // last with the highest id 119 | requests.reverse(); 120 | 121 | for (i, request) in requests.iter().enumerate() { 122 | match &request.step { 123 | ScraperStep::Data(provider_result) => { 124 | let response_code = provider_result.response_code.as_u16(); 125 | let scrape_request_row = sqlx::query!( 126 | "INSERT INTO scrape_request (scrape_id, response_code, response_delay, scraped_at, page) 127 | VALUES ($1, $2, $3, $4, $5) 128 | RETURNING id", 129 | scrape_id, 130 | response_code as u32, 131 | // unsafe downcast from u128? I hope the request doesn't take 2 billion milliseconds kekw 132 | provider_result.response_delay.as_millis() as u32, 133 | request.date, 134 | // pages are 1-indexed 135 | (i as i32) + 1 136 | ).fetch_one(&mut tx).await?; 137 | // we're not persisting post data, but that's ok 138 | let mut posts = provider_result.posts.clone(); 139 | posts.reverse(); 140 | for post in posts { 141 | let mut images = post.images.clone(); 142 | images.reverse(); 143 | for media in images.iter() { 144 | sqlx::query!( 145 | "INSERT INTO media ( 146 | provider_name, 147 | provider_destination, 148 | scrape_request_id, 149 | image_url, 150 | page_url, 151 | reference_url, 152 | unique_identifier, 153 | posted_at, 154 | discovered_at 155 | ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) 156 | ON CONFLICT (image_url) DO update set discovered_at = NOW() returning *", 157 | // sometimes we end up re-scraping the latest known images 158 | &scrape.provider.name.to_string(), 159 | &scrape.provider.destination, 160 | scrape_request_row.id, 161 | media.media_url, 162 | post.url, 163 | media.reference_url, 164 | media.unique_identifier, 165 | post.post_date, 166 | request.date 167 | ) 168 | .fetch_optional(&mut tx) 169 | .await?; 170 | } 171 | } 172 | } 173 | ScraperStep::Error(ProviderFailure::HttpError(error)) => { 174 | match &error { 175 | HttpError::ReqwestError(err) => { 176 | // we should not be getting request related errors, only response errors 177 | if err.is_request() { 178 | error!( 179 | "Got an error from a provider that was caused by a request\n{:?}", 180 | err.url() 181 | ); 182 | error!("{:?}", err); 183 | continue; 184 | } 185 | 186 | if let Some(status) = err.status() { 187 | sqlx::query!( 188 | "INSERT INTO scrape_error (scrape_id, response_code) 189 | VALUES ($1, $2)", 190 | scrape_id, 191 | status.as_u16() as i32 192 | ) 193 | .fetch_one(&mut tx) 194 | .await?; 195 | } else { 196 | error!("Got 
an unexpected error from a provider that doesn't have a status",); 197 | error!("{:?}", err); 198 | continue; 199 | } 200 | } 201 | HttpError::FailStatus(ctx) | HttpError::UnexpectedBody(ctx) => { 202 | sqlx::query!( 203 | "INSERT INTO scrape_error (scrape_id, response_code, response_body, message) 204 | VALUES ($1, $2, $3, $4) returning id", 205 | scrape_id, 206 | ctx.code.as_u16() as i32, 207 | ctx.body, 208 | ctx.message, 209 | ) 210 | .fetch_one(&mut tx) 211 | .await?; 212 | } 213 | } 214 | } 215 | ScraperStep::Error(ProviderFailure::Url) => { 216 | println!( 217 | "Could not formal url properly for {}: {}", 218 | scrape.provider.name.to_string(), 219 | scrape.provider.destination 220 | ); 221 | } 222 | _ => {} 223 | } 224 | } 225 | tx.commit().await?; 226 | Ok(ProcessedScrape { scrape_id: out.id }) 227 | } 228 | 229 | pub async fn submit_webhook_responses( 230 | db: &Database, 231 | processed_scrape: ProcessedScrape, 232 | interactions: Vec, 233 | ) -> anyhow::Result<()> { 234 | let mut tx = db.begin().await?; 235 | // can't commit the invocation if we don't have a response status 236 | for interaction in interactions { 237 | let response_time = interaction.response_time.as_millis() as i32; 238 | let response = interaction.response; 239 | let status = match response { 240 | Ok(res) => Some(res.status()), 241 | Err(HttpError::UnexpectedBody(err)) | Err(HttpError::FailStatus(err)) => Some(err.code), 242 | Err(HttpError::ReqwestError(err)) => { 243 | let out = err.status(); 244 | if out.is_none() { 245 | println!("Received a response without a status code"); 246 | eprintln!("{:?}", err); 247 | } 248 | out 249 | } 250 | }; 251 | if let Some(code) = status { 252 | sqlx::query!( 253 | "INSERT INTO webhook_invocation ( 254 | scrape_id, 255 | webhook_id, 256 | response_code, 257 | response_delay 258 | ) VALUES ($1, $2, $3, $4) RETURNING *", 259 | processed_scrape.scrape_id, 260 | interaction.webhook.id, 261 | code.as_u16() as i32, 262 | response_time 263 | ) 264 | .fetch_one(&mut tx) 265 | .await?; 266 | } else { 267 | println!( 268 | "Failed to persist webhook response from {}", 269 | interaction.webhook.destination 270 | ) 271 | } 272 | } 273 | tx.commit().await?; 274 | Ok(()) 275 | } 276 | 277 | pub async fn latest_requests( 278 | db: &Database, 279 | _only_with_media: bool, 280 | ) -> anyhow::Result> { 281 | let results = sqlx::query!( 282 | "select 283 | sr.id as scrape_request_id, 284 | s.id as scrape_id, 285 | pr.name, 286 | sr.response_delay, 287 | sr.response_code, 288 | sr.scraped_at, 289 | pr.url 290 | from scrape_request sr 291 | join scrape s 292 | on s.id = sr.scrape_id 293 | join provider_resource pr 294 | on pr.name = s.provider_name and pr.destination = s.provider_destination 295 | ORDER BY sr.scraped_at desc 296 | LIMIT 50", 297 | ) 298 | .fetch_all(db) 299 | .await?; 300 | let scrape_ids = results 301 | .iter() 302 | .unique_by(|rec| rec.scrape_id) 303 | .map(|rec| rec.scrape_id) 304 | .collect::>(); 305 | // we're using scrape_id and not scrape_request_id because users only care about individual scrapes and not requests 306 | let medias = sqlx::query!( 307 | "SELECT sr.scrape_id, scrape_request_id, page_url, image_url 308 | FROM media m 309 | join scrape_request sr 310 | on sr.id = m.scrape_request_id 311 | join scrape s 312 | on s.id = sr.scrape_id 313 | where s.id = ANY($1)", 314 | &scrape_ids 315 | ) 316 | .fetch_all(db) 317 | .await?; 318 | 319 | let media_map = medias 320 | .into_iter() 321 | .filter(|rec| rec.scrape_id.is_some() && rec.image_url.is_some()) 322 | 
.into_group_map_by(|rec| rec.scrape_id.unwrap()); 323 | 324 | let mut out: Vec = vec![]; 325 | for result in results { 326 | out.push(ScrapeRequestWithMedia { 327 | response_code: result.response_code, 328 | response_delay: result.response_delay, 329 | provider_name: result.name.clone(), 330 | url: result.url.clone(), 331 | date: result.scraped_at, 332 | media: media_map 333 | .get(&result.scrape_id) 334 | .unwrap_or(&vec![]) 335 | .iter() 336 | .filter_map(|m| { 337 | if m.scrape_id.unwrap() == result.scrape_id { 338 | Some(ScrapeRequestMedia { 339 | media_url: m.image_url.clone().unwrap(), 340 | page_url: m.page_url.clone().unwrap(), 341 | }) 342 | } else { 343 | None 344 | } 345 | }) 346 | .collect::>(), 347 | }) 348 | } 349 | Ok(out) 350 | } 351 | -------------------------------------------------------------------------------- /src/dispatcher/amqp.rs: -------------------------------------------------------------------------------- 1 | use lapin::options::{ 2 | BasicPublishOptions, ExchangeBindOptions, ExchangeDeclareOptions, QueueDeclareOptions, 3 | }; 4 | use lapin::types::FieldTable; 5 | use lapin::{ 6 | BasicProperties, Channel, Connection, ConnectionProperties, ExchangeKind, Result as LapinResult, 7 | }; 8 | use log::error; 9 | 10 | use crate::dispatcher::dispatcher::DispatchablePayload; 11 | 12 | pub struct AMQPDispatcher { 13 | channel: Channel, 14 | } 15 | 16 | const DIRECT_QUEUE_NAME: &str = "image_discovery"; 17 | 18 | impl AMQPDispatcher { 19 | pub async fn from_connection_string(url: &str) -> LapinResult { 20 | let conn = Connection::connect(url, ConnectionProperties::default()).await?; 21 | let channel = conn.create_channel().await?; 22 | channel 23 | .exchange_declare( 24 | DIRECT_QUEUE_NAME, 25 | ExchangeKind::Topic, 26 | ExchangeDeclareOptions { 27 | ..ExchangeDeclareOptions::default() 28 | }, 29 | FieldTable::default(), 30 | ) 31 | .await?; 32 | // technically we're only a publisher and shouldn't be 33 | // declaring a queue but whatever 34 | channel 35 | .queue_declare( 36 | DIRECT_QUEUE_NAME, 37 | QueueDeclareOptions { 38 | durable: true, 39 | ..QueueDeclareOptions::default() 40 | }, 41 | FieldTable::default(), 42 | ) 43 | .await?; 44 | channel 45 | .exchange_bind( 46 | DIRECT_QUEUE_NAME, 47 | DIRECT_QUEUE_NAME, 48 | DIRECT_QUEUE_NAME, 49 | ExchangeBindOptions::default(), 50 | FieldTable::default(), 51 | ) 52 | .await?; 53 | LapinResult::Ok(Self { channel }) 54 | } 55 | pub async fn publish(&self, payload: &DispatchablePayload) { 56 | match serde_json::to_vec(&payload) { 57 | Err(err) => { 58 | error!("Error serializing AMQP payload {:?}", err) 59 | } 60 | Ok(value) => { 61 | let result = self 62 | .channel 63 | .basic_publish( 64 | "", 65 | DIRECT_QUEUE_NAME, 66 | BasicPublishOptions::default(), 67 | value, 68 | BasicProperties::default(), 69 | ) 70 | .await; 71 | if let Err(e) = result { 72 | error!("Couldn't publish to AMQP {:?}", e) 73 | } 74 | } 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/dispatcher/discord.rs: -------------------------------------------------------------------------------- 1 | use serde::Serialize; 2 | 3 | #[derive(Debug, Serialize)] 4 | pub struct DiscordImage { 5 | pub url: String, 6 | } 7 | 8 | #[derive(Debug, Serialize)] 9 | pub struct DiscordEmbed { 10 | pub image: DiscordImage, 11 | } 12 | 13 | #[derive(Debug, Serialize)] 14 | pub struct DiscordPayload<'a> { 15 | pub username: &'a str, 16 | pub avatar_url: &'a str, 17 | pub content: String, // Vec, 18 | } 19 | 20 | pub fn 
is_discord_webhook_url(url: &str) -> bool { 21 | url.starts_with("https://discord.com/api/webhooks") 22 | } 23 | -------------------------------------------------------------------------------- /src/dispatcher/dispatcher.rs: -------------------------------------------------------------------------------- 1 | use futures::{stream, StreamExt}; 2 | use tokio::sync::Mutex; 3 | // use parking_lot::Mutex; 4 | use reqwest::{Client, Response}; 5 | use serde::Serialize; 6 | use std::{ 7 | sync::{Arc, RwLock}, 8 | time::Instant, 9 | }; 10 | 11 | use crate::{ 12 | dispatcher::{webhook_type, WebhookDestination}, 13 | models::DatabaseWebhook, 14 | request::{request_default_headers, HttpError}, 15 | scraper::{ 16 | scraper::{Scrape, ScraperStep}, 17 | AllProviders, ProviderPost, 18 | }, 19 | }; 20 | 21 | use super::super::scraper::Provider; 22 | 23 | pub struct WebhookDispatch { 24 | pub webhook: DatabaseWebhook, 25 | } 26 | 27 | #[derive(Debug)] 28 | pub struct WebhookInteraction { 29 | pub webhook: DatabaseWebhook, 30 | pub response: Result, 31 | pub response_time: std::time::Duration, 32 | } 33 | 34 | #[derive(Debug, Serialize, Clone)] 35 | pub struct DispatchablePayloadProviderInfo { 36 | #[serde(rename = "type")] 37 | pub _type: AllProviders, 38 | pub id: String, 39 | pub ephemeral: bool, 40 | pub official: bool, 41 | } 42 | 43 | #[derive(Debug, Serialize, Clone)] 44 | pub struct DispatchablePayload { 45 | pub provider: DispatchablePayloadProviderInfo, 46 | pub posts: Vec, 47 | pub metadata: Option, 48 | } 49 | 50 | impl DispatchablePayload { 51 | pub fn new( 52 | provider: &dyn Provider, 53 | scrape: &Scrape, 54 | metadata: Option, 55 | ) -> Self { 56 | let posts = scrape 57 | .requests 58 | .iter() 59 | .filter_map(|req| match &req.step { 60 | ScraperStep::Data(data) => Some(data), 61 | ScraperStep::Error(_) => None, 62 | }) 63 | .flat_map(|result| result.posts.clone()) 64 | .collect::>(); 65 | DispatchablePayload { 66 | provider: DispatchablePayloadProviderInfo { 67 | _type: scrape.provider.name, 68 | id: scrape.provider.destination.clone(), 69 | ephemeral: provider.ephemeral(), 70 | official: scrape.provider.official, 71 | }, 72 | posts, 73 | metadata, 74 | } 75 | } 76 | } 77 | 78 | const WEBHOOK_DISPATCH_CONCURRENCY_LIMIT: usize = 8; 79 | 80 | pub async fn dispatch_webhooks<'a>( 81 | // provider: &dyn Provider, 82 | // scrape: &Scrape<'a>, 83 | dispatch: Vec<(DatabaseWebhook, DispatchablePayload)>, 84 | ) -> Vec { 85 | let client = &Client::new(); 86 | // request results are not guaranteed to be in order 87 | let mut results: Vec = vec![]; 88 | 89 | let results_lock = Arc::new(Mutex::new(&mut results)); 90 | let iter = |(wh, payload): (DatabaseWebhook, DispatchablePayload)| { 91 | let f = results_lock.lock(); 92 | async move { 93 | let builder = client 94 | .post(&wh.destination) 95 | .headers(request_default_headers()); 96 | let instant = Instant::now(); 97 | if let WebhookDestination::Custom = webhook_type(&wh.destination) { 98 | let response = builder 99 | .json(&payload) 100 | .send() 101 | .await 102 | .map_err(HttpError::ReqwestError); 103 | let response_time = instant.elapsed(); 104 | f.await.push(WebhookInteraction { 105 | webhook: wh, 106 | response, 107 | response_time, 108 | }); 109 | } 110 | } 111 | }; 112 | 113 | stream::iter(dispatch) 114 | // sadly there's no `map_concurrent` for futures 115 | .for_each_concurrent(WEBHOOK_DISPATCH_CONCURRENCY_LIMIT, iter) 116 | .await; 117 | results 118 | } 119 | -------------------------------------------------------------------------------- 
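The `dispatch_webhooks` function above collects results behind an `Arc<tokio::sync::Mutex<..>>` because, as its comment notes, `futures` offers no `map_concurrent` combinator. A roughly equivalent pattern, shown here only as a stand-alone sketch rather than code from this crate, is to `map` each item into a future and drive the stream with `buffer_unordered`, which bounds concurrency and lets the results be collected without a lock; completion order is still not guaranteed. The `fan_out` helper and its parameter names are hypothetical.

use futures::{stream, StreamExt};

/// Hypothetical helper: run `handler` over `items` with at most `limit`
/// futures in flight, collecting results as they complete.
async fn fan_out<T, R, F, Fut>(items: Vec<T>, limit: usize, handler: F) -> Vec<R>
where
    F: FnMut(T) -> Fut,
    Fut: std::future::Future<Output = R>,
{
    stream::iter(items)
        .map(handler)
        .buffer_unordered(limit)
        .collect()
        .await
}

Applied to the dispatcher, each `(DatabaseWebhook, DispatchablePayload)` pair would map to a future producing a `WebhookInteraction`, making the mutex-guarded `Vec` unnecessary.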
/src/dispatcher/mod.rs: -------------------------------------------------------------------------------- 1 | use self::discord::is_discord_webhook_url; 2 | 3 | pub mod amqp; 4 | mod discord; 5 | pub mod dispatcher; 6 | 7 | pub enum WebhookDestination { 8 | #[deprecated] 9 | Discord, 10 | Custom, 11 | } 12 | 13 | pub fn webhook_type(url: &str) -> WebhookDestination { 14 | if is_discord_webhook_url(url) { 15 | WebhookDestination::Discord 16 | } else { 17 | WebhookDestination::Custom 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod db; 2 | pub mod dispatcher; 3 | pub mod models; 4 | pub mod request; 5 | pub mod scheduler; 6 | pub mod scraper; 7 | pub mod server; 8 | pub mod api; 9 | pub use dotenv::dotenv; 10 | pub use std::env; 11 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::{error::Error, sync::Arc, time::Duration}; 3 | 4 | use dotenv::dotenv; 5 | use futures::future::join_all; 6 | use log::{debug, error, info, trace}; 7 | use reqwest::Client; 8 | use sqlx::{Pool, Postgres}; 9 | 10 | use jiu::dispatcher::amqp::AMQPDispatcher; 11 | use jiu::dispatcher::dispatcher::{dispatch_webhooks, DispatchablePayload}; 12 | use jiu::server::run_server; 13 | use jiu::{ 14 | db::*, 15 | models::PendingProvider, 16 | scheduler::*, 17 | scraper::{get_provider_map, scraper::scrape, Provider, ProviderMap, ScrapeRequestInput}, 18 | }; 19 | 20 | struct Context { 21 | db: Arc>, 22 | amqp: Arc>, 23 | client: Arc, 24 | provider_map: Arc, 25 | } 26 | 27 | async fn iter( 28 | ctx: Arc, 29 | pending: &PendingProvider, 30 | provider: &dyn Provider, 31 | ) -> anyhow::Result<()> { 32 | let sp = pending.provider.clone(); 33 | let latest_data = latest_media_ids_from_provider(&ctx.db, &sp).await?; 34 | // there must be at least ONE data found if the scrape isn't the first ever one 35 | let is_first_scrape = latest_data.is_empty(); 36 | if is_first_scrape { 37 | trace!( 38 | "Scraping {}: {} for the first time ever", 39 | sp.name.to_string(), 40 | sp.destination 41 | ) 42 | } 43 | let step = ScrapeRequestInput { 44 | latest_data, 45 | is_first_scrape, 46 | default_name: pending.default_name.clone(), 47 | last_scrape: pending.last_scrape, 48 | }; 49 | let mut result = scrape(&sp, &*provider, &step).await?; 50 | 51 | let webhooks = webhooks_for_provider(&ctx.db, &sp).await?; 52 | let webhook_interactions = if result.discovered_new_images() { 53 | let dispatch = webhooks 54 | .into_iter() 55 | .map(|wh| { 56 | let payload = DispatchablePayload::new(&*provider, &result, wh.metadata.clone()); 57 | (wh, payload) 58 | }) 59 | .collect::>(); 60 | // we don't really care about the interactions in amqp since we have full 61 | // control of that environment anyways 62 | if let Some(amqp) = &*ctx.amqp { 63 | if let Ok(Some(amqp_d)) = amqp_metadata(&*ctx.db, &sp).await { 64 | let payload = DispatchablePayload::new(&*provider, &result, amqp_d.metadata); 65 | trace!("Publishing AMQP message for {}", &provider.id().to_string()); 66 | amqp.publish(&payload).await; 67 | } 68 | } 69 | Some(dispatch_webhooks(dispatch).await) 70 | } else { 71 | None 72 | }; 73 | // process scraping MUST come after dispatcher dispatching since it mutates the array by reversing it 74 | let processed_scrape = process_scrape(&ctx.db, &mut result, 
pending).await?; 75 | if webhook_interactions.is_some() { 76 | submit_webhook_responses(&ctx.db, processed_scrape, webhook_interactions.unwrap()).await? 77 | } 78 | Ok(()) 79 | } 80 | 81 | async fn job_loop(ctx: Arc) { 82 | let arc_db = Arc::clone(&ctx.db); 83 | trace!("Getting pending scrapes"); 84 | let pendings = match pending_scrapes(&arc_db).await { 85 | Err(error) => { 86 | println!("{:?}", error); 87 | return; 88 | } 89 | Ok(result) => result, 90 | }; 91 | trace!("Getting pending scrapes"); 92 | if let Some(err) = update_priorities(&arc_db, &pendings).await.err() { 93 | // should an error here be preventing the scrape? 94 | // Could end up spamming a provider if it's stuck at a high value 95 | error!("{:?}", err); 96 | }; 97 | trace!("Preparing to scrape {} pending providers", pendings.len()); 98 | 99 | let this_scrape = pendings.iter().map(Arc::new).map(|pending| async { 100 | let pp = pending; 101 | let sleep_time = pp.scrape_date; 102 | tokio::time::sleep(sleep_time).await; 103 | if let Err(err) = run(Arc::clone(&ctx), &pp, &ctx.provider_map).await { 104 | error!("{:?}", err); 105 | return; 106 | } 107 | debug!("Finished scraping {}", pp.provider.name.to_string()); 108 | }); 109 | join_all(this_scrape).await; 110 | } 111 | 112 | async fn run( 113 | ctx: Arc, 114 | pp: &PendingProvider, 115 | provider_map: &ProviderMap, 116 | ) -> Result<(), Box> { 117 | let provider = provider_map.get(&pp.provider.name).unwrap_or_else(|| { 118 | panic!( 119 | "Tried to get a provider that doesn't exist {}", 120 | &pp.provider, 121 | ) 122 | }); 123 | if let Err(err) = iter(Arc::clone(&ctx), pp, &**provider).await { 124 | eprintln!("{:?}", err); 125 | } 126 | Ok(()) 127 | } 128 | 129 | async fn setup() -> anyhow::Result<()> { 130 | info!("Starting JiU"); 131 | let client = Arc::new(Client::new()); 132 | let provider_map = Arc::new( 133 | get_provider_map(&Arc::clone(&client)) 134 | .await 135 | .expect("Could not successfully initialize a provider map"), 136 | ); 137 | let pm = Arc::clone(&provider_map); 138 | tokio::spawn(async move { 139 | match connect().await { 140 | Ok(db) => run_server(Arc::new(db), pm, 8080).await, 141 | Err(err) => { 142 | error!("{:?}", err) 143 | } 144 | } 145 | }); 146 | loop { 147 | let client = Arc::clone(&client); 148 | let provider_map = Arc::clone(&provider_map); 149 | info!("Starting job loop {}", SCHEDULER_END_MILLISECONDS); 150 | let data = match env::var("NO_WORKER") { 151 | Ok(_) => { 152 | info!("Not starting worker because 'NO_WORKER' environment was set"); 153 | tokio::task::spawn(async {}) 154 | } 155 | _ => tokio::task::spawn(async move { 156 | let db = match connect().await { 157 | Err(err) => { 158 | error!("{:?}", err); 159 | return; 160 | } 161 | Ok(db) => db, 162 | }; 163 | let db = Arc::new(db); 164 | let amqp = Arc::new(match env::var("AMQP_URL") { 165 | Ok(a) => Some(AMQPDispatcher::from_connection_string(&a).await.unwrap()), 166 | Err(_) => None, 167 | }); 168 | let ctx = Arc::new(Context { 169 | db: Arc::clone(&db), 170 | amqp: Arc::clone(&amqp), 171 | client: Arc::clone(&client), 172 | provider_map, 173 | }); 174 | info!("Starting requests for the day..."); 175 | job_loop(ctx).await; 176 | info!("Requests finished for the day..."); 177 | }), 178 | }; 179 | let delay = tokio::time::sleep(Duration::from_millis(SCHEDULER_END_MILLISECONDS)); 180 | if let (_, Err(join_err)) = tokio::join!(delay, data) { 181 | println!("{:?}", join_err) 182 | } 183 | info!("Finished job loop"); 184 | } 185 | } 186 | 187 | #[tokio::main] 188 | async fn main() { 189 | 
better_panic::install(); 190 | env_logger::init(); 191 | dotenv().ok(); 192 | 193 | info!("Running program"); 194 | if let Err(err) = setup().await { 195 | error!("{:?}", err); 196 | }; 197 | info!("Shutting down...") 198 | } 199 | -------------------------------------------------------------------------------- /src/models.rs: -------------------------------------------------------------------------------- 1 | use crate::{scheduler::Priority, scraper::ScopedProvider}; 2 | use chrono::NaiveDateTime; 3 | use serde::Serialize; 4 | use std::fmt::Display; 5 | use std::time::Duration; 6 | 7 | #[derive(Debug)] 8 | pub struct AMQPDestination { 9 | pub id: i32, 10 | pub metadata: Option, 11 | } 12 | 13 | #[derive(Debug)] 14 | pub struct DatabaseWebhook { 15 | pub id: i32, 16 | pub destination: String, 17 | pub created_at: NaiveDateTime, 18 | pub updated_at: NaiveDateTime, 19 | pub metadata: Option, 20 | } 21 | 22 | #[derive(Debug, Clone, Serialize)] 23 | pub struct ScrapeRequestMedia { 24 | pub media_url: String, 25 | pub page_url: String, 26 | } 27 | 28 | #[derive(Debug, Clone, Serialize)] 29 | pub struct ScrapeRequestWithMedia { 30 | pub provider_name: String, 31 | pub url: String, 32 | pub response_code: Option, 33 | pub response_delay: Option, 34 | pub date: NaiveDateTime, 35 | pub media: Vec, 36 | } 37 | 38 | #[derive(Debug)] 39 | pub struct DatabaseWebhookSource { 40 | pub id: i32, 41 | pub webhook_id: i32, 42 | pub provider_destination: String, 43 | } 44 | 45 | #[derive(Debug, Clone)] 46 | pub struct ScrapeHistory { 47 | pub priority: Priority, 48 | pub provider: ScopedProvider, 49 | pub date: NaiveDateTime, 50 | pub result_count: u32, 51 | } 52 | 53 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 54 | 55 | pub struct PendingProvider { 56 | pub id: i32, 57 | /// the name that is used if a more relevant name for posts cannot be found 58 | pub default_name: Option, 59 | pub priority: Priority, 60 | pub provider: ScopedProvider, 61 | pub scrape_date: Duration, 62 | pub last_scrape: Option, 63 | } 64 | 65 | impl Display for PendingProvider { 66 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 67 | f.write_str(&format!("{}", self.provider)) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/request.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::iter::FromIterator; 3 | 4 | use log::error; 5 | use reqwest; 6 | use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; 7 | use reqwest::{Response, StatusCode}; 8 | use serde::de::DeserializeOwned; 9 | use thiserror::Error; 10 | 11 | #[derive(Debug, Clone)] 12 | pub struct ResponseErrorContext { 13 | pub body: String, 14 | pub code: StatusCode, 15 | pub message: Option, 16 | } 17 | 18 | /// Wrapper for providing actual useful information about 19 | /// why responses failed since reqwest throws that information 20 | /// away when it encounters errors 21 | #[derive(Error, Debug)] 22 | pub enum HttpError { 23 | #[error("Failed response code {0:?}")] 24 | FailStatus(ResponseErrorContext), 25 | #[error("Unexpected body {0:?}")] 26 | UnexpectedBody(ResponseErrorContext), 27 | #[error("Request error")] 28 | ReqwestError(#[from] reqwest::Error), 29 | } 30 | 31 | pub async fn parse_successful_response( 32 | response: Response, 33 | ) -> Result { 34 | let response_code = response.status(); 35 | let url = response.url().clone(); 36 | let response_body = response.text().await?; 37 | if !response_code.is_success() { 38 | // 
sadly shitty reqwest doesn't give us the response body as 39 | // context when trying to handle invalid responses 40 | return Err(HttpError::FailStatus(ResponseErrorContext { 41 | body: response_body, 42 | code: response_code, 43 | message: None, 44 | })); 45 | } 46 | serde_json::from_str::(&response_body).map_err(|error| { 47 | error!("{:?}", error); 48 | error!("Failed to parse response from {}", url); 49 | HttpError::UnexpectedBody(ResponseErrorContext { 50 | body: response_body, 51 | code: response_code, 52 | message: Some(error.to_string()), 53 | }) 54 | }) 55 | } 56 | 57 | pub fn request_default_headers() -> HeaderMap { 58 | // TODO: change the user agent if the program has been forked to modify 59 | // important settings like request speed 60 | let user_agent: String = 61 | env::var("USER_AGENT").expect("Missing USER_AGENT environment variable"); 62 | HeaderMap::from_iter([( 63 | HeaderName::from_static("user-agent"), 64 | HeaderValue::from_str(&user_agent).unwrap(), 65 | )]) 66 | } 67 | -------------------------------------------------------------------------------- /src/scheduler/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod priority; 2 | pub use priority::*; 3 | pub mod scheduler; 4 | pub use scheduler::*; 5 | pub mod rate_limiter; 6 | pub use rate_limiter::*; 7 | 8 | const MIN_PRIORITY: f32 = 0.07; 9 | const MAX_PRIORITY: f32 = 1.75; 10 | -------------------------------------------------------------------------------- /src/scheduler/priority.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | 3 | use num_traits::FromPrimitive; 4 | use sqlx::types::BigDecimal; 5 | 6 | use crate::{models::ScrapeHistory, scheduler::MIN_PRIORITY}; 7 | 8 | use super::MAX_PRIORITY; 9 | 10 | #[derive(Debug)] 11 | pub struct InvalidPriority(f32); 12 | 13 | #[derive(Debug, PartialEq, Eq, PartialOrd, Clone, Hash)] 14 | pub struct Priority { 15 | pub level: BigDecimal, 16 | } 17 | 18 | impl From for Priority { 19 | fn from(level: f32) -> Self { 20 | Self { 21 | level: BigDecimal::from_f32(level).unwrap(), 22 | } 23 | } 24 | } 25 | 26 | impl Default for Priority { 27 | fn default() -> Self { 28 | Priority::unchecked_clamp(1f32) 29 | } 30 | } 31 | 32 | const MAX_RESULT_CONTRIBUTION: u32 = 3; 33 | 34 | impl Priority { 35 | /// Decide the next priority based on the the recent scrape history of the 36 | /// provider priority. 37 | pub fn next(&self, history: &[ScrapeHistory]) -> Self { 38 | if history.is_empty() { 39 | return Self { 40 | level: BigDecimal::from_f32(1f32).unwrap(), 41 | }; 42 | } 43 | let n = history.len() as i32; 44 | 45 | let raw_weights = (0i32..n).map(|x| (x - n - 1).pow(2)); 46 | let sum_raw_weight: i32 = raw_weights.clone().sum(); 47 | let weights = raw_weights.map(|x| x as f32 / sum_raw_weight as f32); 48 | let weight_sum = weights.clone().sum::(); 49 | let z = weights.zip(history); 50 | let raw_weighted_average: f32 = z 51 | .map(|(a, b)| (a * b.result_count.min(MAX_RESULT_CONTRIBUTION) as f32)) 52 | .sum(); 53 | 54 | let weighted_average: f32 = (raw_weighted_average * weight_sum) / weight_sum as f32; 55 | let scaled = weighted_average * (MAX_PRIORITY - MIN_PRIORITY) + MIN_PRIORITY; 56 | let level = scaled.clamp(MIN_PRIORITY, MAX_PRIORITY); 57 | 58 | // in some strange situations f32 is NaN. 59 | // These cases are normally handled at the top of the function but if not... 
we just default to 60 | // the existing thing 61 | let level = BigDecimal::from_f32(level).unwrap_or_else(|| self.level.clone()); 62 | Self { level } 63 | } 64 | pub fn unchecked_clamp(level: f32) -> Self { 65 | level 66 | .clamp(MIN_PRIORITY, MAX_PRIORITY) 67 | .try_into() 68 | // something has gone very wrong if the level is out of bounds 69 | .expect(&format!("{} is not a valid priority", level)) 70 | } 71 | } 72 | 73 | #[cfg(test)] 74 | mod tests { 75 | use chrono::NaiveDateTime; 76 | use num_traits::FromPrimitive; 77 | use sqlx::types::BigDecimal; 78 | 79 | use crate::{ 80 | models::ScrapeHistory, 81 | scheduler::{MAX_PRIORITY, MIN_PRIORITY}, 82 | scraper::ScopedProvider, 83 | }; 84 | 85 | use super::Priority; 86 | 87 | #[test] 88 | fn priority_check() { 89 | let prio = Priority::unchecked_clamp(0f32); 90 | let make_hist = |count| ScrapeHistory { 91 | date: NaiveDateTime::from_timestamp(0, 0), 92 | priority: prio.clone(), 93 | provider: ScopedProvider { 94 | destination: "".to_owned(), 95 | name: crate::scraper::AllProviders::PinterestBoardFeed, 96 | }, 97 | result_count: count, 98 | }; 99 | let hist = make_hist(1); 100 | let n = prio.next(&[hist.clone(), hist.clone(), hist.clone(), hist]); 101 | assert_eq!(n.level, BigDecimal::from_f32(MAX_PRIORITY).unwrap()); 102 | 103 | let n = prio.next(&(0..15).map(|_| make_hist(1)).collect::>()); 104 | assert_eq!(n.level, BigDecimal::from_f32(MAX_PRIORITY).unwrap()); 105 | 106 | let n = prio.next(&(0..15).map(|_| make_hist(0)).collect::>()); 107 | assert_eq!(n.level, BigDecimal::from_f32(MIN_PRIORITY).unwrap()) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/scheduler/rate_limiter.rs: -------------------------------------------------------------------------------- 1 | use governor::{ 2 | clock::QuantaClock, 3 | state::{DirectStateStore, InMemoryState, NotKeyed}, 4 | RateLimiter, 5 | }; 6 | 7 | /// Most providers use rate limiter at the domain level and not at the page level 8 | /// in order to prevent exceeding rate limits imposed by webservers 9 | pub type UnscopedLimiter = RateLimiter; 10 | 11 | /// Some providers can use rate limiting at the page level imposed by set limits of API keys 12 | #[allow(dead_code)] 13 | pub type ScopedLimiter = RateLimiter; 14 | 15 | /// Global rate limiting wrapper for limits imposed on individual providers being run concurrently 16 | pub struct GlobalRateLimiter(UnscopedLimiter); 17 | -------------------------------------------------------------------------------- /src/scheduler/scheduler.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Add; 2 | use std::time::Duration; 3 | use std::{collections::HashSet, convert::TryInto, hash::Hash, iter::FromIterator, str::FromStr}; 4 | 5 | use itertools::{unfold, Itertools}; 6 | use log::{debug, info}; 7 | use num_traits::cast::ToPrimitive; 8 | use rand::Rng; 9 | 10 | use crate::{ 11 | db::Database, 12 | models::{PendingProvider, ScrapeHistory}, 13 | scheduler::Priority, 14 | scraper::{AllProviders, ScopedProvider}, 15 | }; 16 | 17 | const MAX_TESTING_PROVIDERS: usize = 10; 18 | pub const SCHEDULER_START_MILLISECONDS: u64 = if cfg!(debug_assertions) { 19 | 1000 * 3 20 | } else { 21 | 1000 * 30 22 | }; 23 | 24 | // making life easier for testing. Could blow up in my face some day... 
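// Sketch for clarity, not part of the original scheduler.rs: the release-mode value
// of the constant declared just below, `8.64e7 as u64`, is one day expressed in
// milliseconds (24 * 60 * 60 * 1000 = 86_400_000), so the debug/release switch trades
// a full-day scheduling window for a ten-second one. The named constant here is a
// hypothetical restatement of that arithmetic and is not used by the crate.
#[allow(dead_code)]
const ONE_DAY_MILLISECONDS: u64 = 24 * 60 * 60 * 1000; // == 8.64e7 as u64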
25 | pub const SCHEDULER_END_MILLISECONDS: u64 = if cfg!(debug_assertions) { 26 | 1000 * 10 27 | } else { 28 | 8.64e7 as u64 29 | }; 30 | 31 | /// We only want to scrape one single endpoint at most 3 times a day 32 | const MAX_DAILY_SCRAPE_COUNT: i32 = 3; 33 | 34 | pub type RunningProviders = HashSet; 35 | 36 | /// Scheduled providers are ready to be processed 37 | #[derive(Debug)] 38 | pub struct ScheduledProviders(Vec); 39 | 40 | impl ScheduledProviders { 41 | pub fn providers(&self) -> &Vec { 42 | &self.0 43 | } 44 | pub fn len(&self) -> usize { 45 | self.0.len() 46 | } 47 | } 48 | 49 | /// Get a list of sorted scrapes that need to happen for that day 50 | pub async fn pending_scrapes(db: &Database) -> anyhow::Result> { 51 | // all future scrapes that are specifically grouped by their provider name first 52 | let potential_target_providers = sqlx::query!( 53 | "SELECT * FROM provider_resource pr 54 | WHERE pr.enabled AND pr.tokens >= 1 55 | ORDER BY pr.name DESC, pr.destination desc" 56 | ) 57 | .fetch_all(db) 58 | .await?; 59 | 60 | let groups = potential_target_providers 61 | .into_iter() 62 | .flat_map(|row| { 63 | let tokens = row.tokens.to_f32().unwrap().trunc() as i32; 64 | (0..tokens.min(MAX_DAILY_SCRAPE_COUNT)) 65 | .map(|_| { 66 | ( 67 | row.id, 68 | Priority::unchecked_clamp(row.priority.to_f32().unwrap()), 69 | ScopedProvider { 70 | destination: row.destination.clone(), 71 | name: AllProviders::from_str(&row.name).unwrap(), 72 | official: row.official, 73 | }, // last_scrape: row.last_scrape, 74 | row.last_scrape, 75 | row.default_name.clone(), 76 | ) 77 | }) 78 | .collect::>() 79 | }) 80 | .group_by(|p| p.2.name); 81 | 82 | let out: Vec = groups 83 | .into_iter() 84 | .flat_map(|(_, group)| { 85 | let endpoints = group.collect::>(); 86 | let maximized_endpoints = maximize_distance(&endpoints, quality_maxmindist); 87 | let dates = interpolate_dates( 88 | maximized_endpoints.len(), 89 | // We want to give the 90 | &Duration::from_millis(SCHEDULER_START_MILLISECONDS), 91 | // One day 92 | &Duration::from_millis(SCHEDULER_END_MILLISECONDS), 93 | ); 94 | maximized_endpoints 95 | .iter() 96 | .zip(dates) 97 | .map( 98 | |((id, priority, provider, last_scrape, default_name), scrape_date)| { 99 | PendingProvider { 100 | id: *id, 101 | priority: priority.clone(), 102 | provider: provider.clone(), 103 | scrape_date, 104 | last_scrape: *last_scrape, 105 | default_name: default_name.clone(), 106 | } 107 | }, 108 | ) 109 | .collect::>() 110 | }) 111 | .collect::>(); 112 | 113 | let original_length = out.len(); 114 | sqlx::query!( 115 | "UPDATE provider_resource SET last_queue = NOW() WHERE id = ANY($1)", 116 | &out.iter().map(|p| p.id).collect::>() 117 | ) 118 | .fetch_one(db) 119 | .await; 120 | let safe_providers = if cfg!(debug_assertions) { 121 | // making sure we don't blow things up in case we're running this in development with tons of 122 | // pending providers 123 | let slice_boundary = MAX_TESTING_PROVIDERS.min(out.len()); 124 | let result = Vec::from_iter(out[..slice_boundary].iter().map(|p| p.to_owned())); 125 | if result.len() != original_length { 126 | info!( 127 | "Debug mode truncated pending providers from {} to {}", 128 | result.len(), 129 | MAX_TESTING_PROVIDERS 130 | ); 131 | } 132 | result 133 | } else { 134 | out 135 | }; 136 | Ok(safe_providers) 137 | } 138 | 139 | /// Vec length is equal to the length of the items passed in 140 | fn interpolate_dates( 141 | item_count: usize, 142 | start_duration: &Duration, 143 | end_duration: &Duration, 144 | ) -> Vec { 145 | 
let duration = *end_duration - *start_duration; 146 | let initial_gap = duration.checked_div(item_count as u32 + 1).unwrap(); 147 | unfold(*start_duration, |duration| { 148 | let next = duration.add(initial_gap); 149 | *duration = next; 150 | Some(next) 151 | }) 152 | .by_ref() 153 | .take(item_count) 154 | .collect::>() 155 | } 156 | 157 | pub async fn update_priorities(db: &Database, sp: &[PendingProvider]) -> anyhow::Result<()> { 158 | let providers = sqlx::query!( 159 | "SELECT 160 | pr.id, 161 | pr.name, 162 | pr.destination, 163 | pr.official, 164 | s.priority as resource_priority, 165 | s.scraped_at, 166 | s.priority, 167 | (SELECT COUNT(*) 168 | FROM media m 169 | INNER JOIN scrape_request sr 170 | on sr.id = m.scrape_request_id 171 | where sr.scrape_id = s.id 172 | ) as discovery_count 173 | FROM provider_resource pr 174 | INNER JOIN LATERAL ( 175 | SELECT * 176 | FROM scrape s 177 | WHERE s.provider_name = pr.name 178 | AND s.provider_destination = pr.destination 179 | ORDER BY s.scraped_at desc, id 180 | LIMIT 30 181 | ) s on True 182 | WHERE pr.enabled AND pr.id = ANY($1) 183 | ORDER BY s.scraped_at desc", 184 | &sp.iter().map(|pp| pp.id).collect::>() 185 | ) 186 | .fetch_all(db) 187 | .await?; 188 | 189 | let groups = providers.iter().into_group_map_by(|row| { 190 | ( 191 | row.id, 192 | row.name.clone(), 193 | row.destination.clone(), 194 | row.priority.clone(), 195 | ) 196 | }); 197 | 198 | for ((id, name, destination, priority), rows) in groups { 199 | let histories = rows 200 | .into_iter() 201 | .filter(|&row| row.scraped_at.is_some()) 202 | .map(|row| ScrapeHistory { 203 | date: row.scraped_at.unwrap(), 204 | priority: Priority::unchecked_clamp(row.priority.to_f32().unwrap()), 205 | result_count: row.discovery_count.unwrap_or(0i64).try_into().unwrap(), 206 | provider: ScopedProvider { 207 | destination: destination.clone(), 208 | name: AllProviders::from_str(&name).unwrap(), 209 | official: row.official, 210 | }, 211 | }) 212 | .collect::>(); 213 | 214 | if !histories.is_empty() { 215 | let provider_priority = Priority::unchecked_clamp(priority.to_f32().unwrap()); 216 | let next_priority = provider_priority.next(&histories[..]); 217 | debug!( 218 | "Setting the next priority for [{}] from {} to {} because {:?}", 219 | &name, 220 | provider_priority.level.to_f32().unwrap_or(-1.0), 221 | next_priority.level.to_f32().unwrap_or(-1.0), 222 | &histories.iter().map(|h| h.result_count) 223 | ); 224 | // continue; 225 | sqlx::query!( 226 | "UPDATE provider_resource SET priority = $1 where id = $2 227 | AND last_token_update IS NOT NULL 228 | returning id", 229 | next_priority.level, 230 | id 231 | ) 232 | .fetch_optional(db) 233 | .await?; 234 | } 235 | } 236 | // return Ok(()); 237 | // Update tokens for all resources. 
This has to be run after priorities are 238 | // updated 239 | // We don't want to give any endpoint more than 4 tokens (in case something goes wrong) 240 | sqlx::query!( 241 | "UPDATE provider_resource 242 | SET 243 | tokens = LEAST(4, tokens + priority), 244 | last_token_update = NOW() 245 | WHERE enabled = True AND (last_token_update IS NULL OR last_token_update + interval '1 day' <= NOW())" 246 | ) 247 | .fetch_optional(db) 248 | .await?; 249 | Ok(()) 250 | } 251 | 252 | pub fn maximize_distance(items: &[T], quality: fn(&[T]) -> f32) -> Vec { 253 | let mut out = items.to_owned(); 254 | let mut no_improvement = 0; 255 | let mut best = 0f32; 256 | let mut rng = rand::thread_rng(); 257 | while no_improvement < 400 { 258 | let i = rng.gen_range(0..out.len()); 259 | let j = rng.gen_range(0..out.len()); 260 | let mut copy = out.clone(); 261 | copy.swap(i, j); 262 | let q = quality(©); 263 | if q > best { 264 | out = copy; 265 | best = q; 266 | no_improvement = 0; 267 | } else { 268 | no_improvement += 1; 269 | } 270 | } 271 | out 272 | } 273 | 274 | fn quality_maxmindist(items: &[T]) -> f32 { 275 | let mut s = 0f32; 276 | let uniq: HashSet<&T> = HashSet::from_iter(items); 277 | for item in uniq.into_iter() { 278 | let indices = (0..items.len()) 279 | .filter_map(|i| { 280 | if &items[i] == item { 281 | Some(i as i32) 282 | } else { 283 | None 284 | } 285 | }) 286 | .collect::>(); 287 | if indices.len() > 1 { 288 | let summed: f32 = (0..indices.len() - 1) 289 | .map(|i| 1f32 / (indices[i + 1] - indices[i]) as f32) 290 | .sum(); 291 | s += summed; 292 | } 293 | } 294 | 1f32 / s 295 | } 296 | 297 | #[cfg(test)] 298 | mod tests { 299 | use std::time::Duration; 300 | 301 | use crate::scheduler::scheduler::quality_maxmindist; 302 | 303 | use super::{interpolate_dates, maximize_distance}; 304 | 305 | #[test] 306 | fn spacing_test() { 307 | assert_eq!( 308 | maximize_distance(&[1, 1, 1, 2, 2], quality_maxmindist), 309 | &[1, 2, 1, 2, 1], 310 | ); 311 | } 312 | 313 | #[test] 314 | fn interpolate() { 315 | let out: Vec = 316 | interpolate_dates(3, &Duration::from_millis(0), &Duration::from_millis(3000)); 317 | let res: Vec = vec![ 318 | Duration::from_millis(750), 319 | Duration::from_millis(1500), 320 | Duration::from_millis(2250), 321 | ]; 322 | assert_eq!(out, res) 323 | } 324 | } 325 | -------------------------------------------------------------------------------- /src/scraper/mod.rs: -------------------------------------------------------------------------------- 1 | mod providers; 2 | pub use providers::*; 3 | pub mod scraper; 4 | -------------------------------------------------------------------------------- /src/scraper/providers/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::env; 3 | use std::fmt::Display; 4 | use std::iter::FromIterator; 5 | use std::sync::Arc; 6 | 7 | use futures::future::join_all; 8 | use reqwest::Client; 9 | use strum::IntoEnumIterator; 10 | 11 | pub use pinterest::*; 12 | pub use providers::*; 13 | pub use twitter::*; 14 | pub use united_cube::*; 15 | pub use weverse::*; 16 | 17 | pub mod pinterest; 18 | mod providers; 19 | pub mod twitter; 20 | mod twitter_types; 21 | pub mod united_cube; 22 | pub mod weverse; 23 | 24 | /// A scrape url is only transparently available to providers 25 | #[derive(Debug, Clone)] 26 | pub struct ScrapeUrl(pub String); 27 | 28 | #[derive(Debug, Copy, Clone)] 29 | pub struct PageSize(usize); 30 | 31 | /// Identifier for a specific section of a site 32 
| /// [name: pinterest.board_feed] 33 | /// [destination: ] 34 | #[derive(Debug, PartialEq, Eq, Hash, Clone)] 35 | pub struct ScopedProvider { 36 | pub name: AllProviders, 37 | pub destination: String, 38 | pub official: bool, 39 | } 40 | 41 | impl Display for ScopedProvider { 42 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 43 | f.write_str(&format!("{}:{}", self.name.to_string(), self.destination)) 44 | } 45 | } 46 | 47 | pub type ProviderMap = HashMap>; 48 | 49 | pub async fn get_provider_map(client: &Arc) -> anyhow::Result { 50 | let handles = AllProviders::iter().map(|provider_type| async move { 51 | let client = Arc::clone(client); 52 | let input = ProviderInput { client }; 53 | let provider: Box = match provider_type { 54 | AllProviders::PinterestBoardFeed => Box::new(PinterestBoardFeed::new(input)), 55 | AllProviders::WeverseArtistFeed => Box::new(WeverseArtistFeed::new(input)), 56 | AllProviders::UnitedCubeArtistFeed => Box::new(UnitedCubeArtistFeed::new(input)), 57 | AllProviders::TwitterTimeline => Box::new(TwitterTimeline::new(input)), 58 | }; 59 | // we should only initialize providers if NO_WORKER is not set 60 | // this is not typesafe so we should be careful to not try to do 61 | // authenticated requests when workers are off 62 | if let Err(_) = env::var("NO_WORKER") { 63 | provider.initialize().await; 64 | } 65 | (provider_type, provider) 66 | }); 67 | let results = join_all(handles).await; 68 | Ok(HashMap::from_iter(results)) 69 | } 70 | -------------------------------------------------------------------------------- /src/scraper/providers/pinterest.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, sync::Arc, time::Instant}; 2 | 3 | use async_trait::async_trait; 4 | use chrono::NaiveDateTime; 5 | use reqwest::Client; 6 | use serde::{Deserialize, Serialize}; 7 | use url::Url; 8 | 9 | use crate::{ 10 | request::{parse_successful_response, request_default_headers}, 11 | scheduler::UnscopedLimiter, 12 | scraper::providers::ProviderMediaType, 13 | }; 14 | 15 | use super::*; 16 | 17 | #[derive(Debug, Deserialize)] 18 | pub struct PinterestImage { 19 | pub width: u16, 20 | pub height: u16, 21 | pub url: String, 22 | } 23 | 24 | #[derive(Debug, Clone, Deserialize)] 25 | pub struct PinterestRichSummary { 26 | pub url: String, 27 | } 28 | 29 | #[derive(Debug, Clone, Deserialize)] 30 | pub struct PinterestPinner { 31 | pub full_name: String, 32 | // I don't know if these are really optional, but just to be safe 33 | pub image_xlarge_url: Option, 34 | } 35 | 36 | #[derive(Debug, Deserialize)] 37 | pub struct PinterestBoard { 38 | pub name: String, 39 | } 40 | 41 | #[derive(Debug, Deserialize)] 42 | pub struct PinterestImages { 43 | pub id: String, 44 | pub pinner: Option, 45 | pub board: Option, 46 | pub images: HashMap, 47 | pub rich_summary: Option, 48 | } 49 | 50 | #[derive(Debug, Deserialize)] 51 | pub struct PinterestResource { 52 | pub bookmark: Option, 53 | pub data: Vec, 54 | } 55 | 56 | #[derive(Debug, Deserialize)] 57 | pub struct PinterestResponse { 58 | pub resource_response: PinterestResource, 59 | } 60 | 61 | #[derive(Debug, Serialize)] 62 | struct PinterestRequestDictOptions<'a> { 63 | bookmarks: &'a Option>, 64 | board_url: &'a str, 65 | board_id: &'a str, 66 | // max accepted value by the API is 250 67 | page_size: usize, 68 | } 69 | 70 | #[derive(Debug, Serialize)] 71 | struct PinterestRequestDict<'a> { 72 | options: PinterestRequestDictOptions<'a>, 73 | } 74 | 75 | // 
#[derive(Clone)] 76 | pub struct PinterestBoardFeed { 77 | pub client: Arc, 78 | pub rate_limiter: UnscopedLimiter, 79 | } 80 | 81 | const PINTEREST_BOARD_SEPARATOR: &str = "|"; 82 | 83 | const URL_ROOT: &str = "https://www.pinterest.com/resource/BoardFeedResource/get"; 84 | 85 | #[allow(dead_code)] 86 | const MAXIMUM_PAGE_SIZE: usize = 200; 87 | 88 | /// pinterest uses a page size of 25 89 | #[allow(dead_code)] 90 | const PROVIDER_NATIVE_PAGE_SIZE: usize = 25; 91 | 92 | #[async_trait] 93 | impl RateLimitable for PinterestBoardFeed { 94 | async fn wait(&self, _key: &str) -> () { 95 | self.rate_limiter 96 | .until_ready_with_jitter(default_jitter()) 97 | .await; 98 | } 99 | } 100 | 101 | // PinterestBoard ids are made up of 2 pieces, board_url and board_id formatted in this way 102 | // "board_id|board_url" 103 | #[async_trait] 104 | impl Provider for PinterestBoardFeed { 105 | fn new(input: ProviderInput) -> Self 106 | where 107 | Self: Sized, 108 | { 109 | Self { 110 | client: Arc::clone(&input.client), 111 | rate_limiter: Self::rate_limiter(), 112 | } 113 | } 114 | fn id(&self) -> AllProviders { 115 | AllProviders::PinterestBoardFeed 116 | } 117 | fn max_page_size(&self) -> PageSize { 118 | PageSize(100) 119 | } 120 | 121 | fn default_page_size(&self) -> PageSize { 122 | PageSize(20) 123 | } 124 | 125 | fn from_provider_destination( 126 | &self, 127 | scrape_id: &str, 128 | page_size: PageSize, 129 | pagination: Option, 130 | ) -> Result { 131 | let (id, path) = scrape_id 132 | .split_once(PINTEREST_BOARD_SEPARATOR) 133 | .ok_or(ProviderFailure::Url)?; 134 | 135 | let data = PinterestRequestDict { 136 | options: PinterestRequestDictOptions { 137 | bookmarks: &pagination.map(|res| vec![res.next_page()]), 138 | board_id: id, 139 | board_url: path, 140 | page_size: page_size.0, 141 | }, 142 | }; 143 | let data_str = serde_json::to_string(&data) 144 | .ok() 145 | .ok_or(ProviderFailure::Url)?; 146 | 147 | let url = Url::parse_with_params(URL_ROOT, &[("source_url", path), ("data", &data_str)]) 148 | .ok() 149 | .ok_or(ProviderFailure::Url)?; 150 | Ok(ScrapeUrl(url.as_str().to_owned())) 151 | } 152 | async fn unfold(&self, state: ProviderState) -> Result { 153 | let instant = Instant::now(); 154 | let response = self 155 | .client 156 | .get(&state.url.0) 157 | .headers(request_default_headers()) 158 | .send() 159 | .await?; 160 | let response_delay = instant.elapsed(); 161 | 162 | let status = &response.status(); 163 | let response_json = parse_successful_response::(response).await?; 164 | let posts = response_json 165 | .resource_response 166 | .data 167 | .iter() 168 | .filter_map(|pin| { 169 | // I imagine every image has an "orig" size but we can't know for sure 170 | pin.images.get("orig").map(|elem| { 171 | ProviderPost { 172 | account: pin 173 | .pinner 174 | .clone() 175 | .map(|pinner| ProviderAccount { 176 | name: pinner.full_name, 177 | avatar_url: pinner.image_xlarge_url, 178 | }) 179 | .unwrap_or_default(), 180 | unique_identifier: pin.id.clone(), 181 | url: Some(format!("https://www.pinterest.com/pin/{}", pin.id)), 182 | post_date: None, 183 | // There might be a body here but I don't really care, it's pinterest 184 | body: None, 185 | images: vec![ProviderMedia { 186 | _type: ProviderMediaType::Image, 187 | media_url: elem.url.to_owned(), 188 | // yes, pinterest literally does not tell you when things were 189 | // pinned. 
It's so stupid 190 | reference_url: pin.rich_summary.clone().map(|sum| sum.url), 191 | unique_identifier: pin.id.to_owned(), 192 | metadata: None, 193 | }], 194 | metadata: None, 195 | } 196 | }) 197 | }) 198 | .collect::>(); 199 | 200 | let result = ProviderResult { 201 | posts, 202 | response_code: status.to_owned(), 203 | response_delay, 204 | }; 205 | 206 | let bookmark_option = response_json.resource_response.bookmark; 207 | // we receive a bookmark when there are more images to scrape 208 | Ok(match bookmark_option { 209 | Some(bookmark) => ProviderStep::Next(result, Pagination::NextCursor(bookmark)), 210 | None => ProviderStep::End(result), 211 | }) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /src/scraper/providers/providers.rs: -------------------------------------------------------------------------------- 1 | use std::convert::Infallible; 2 | use std::sync::Arc; 3 | use std::{collections::HashSet, ops::Add, time::Duration}; 4 | 5 | use async_trait::async_trait; 6 | use chrono::NaiveDateTime; 7 | use governor::{Jitter, Quota, RateLimiter}; 8 | use log::{debug, error, info}; 9 | use parking_lot::RwLock; 10 | use reqwest::{Client, Error, StatusCode}; 11 | use serde; 12 | use serde::{Deserialize, Serialize}; 13 | use strum_macros::{Display, ToString}; 14 | use strum_macros::{EnumIter, EnumString}; 15 | use thiserror::Error; 16 | use url::Url; 17 | 18 | use crate::request::HttpError; 19 | use crate::scheduler::UnscopedLimiter; 20 | use crate::scraper::providers::providers::DerivedProviderResource::Invalid; 21 | 22 | use super::{PageSize, ScrapeUrl}; 23 | 24 | #[derive(Debug, Clone, Serialize, Deserialize)] 25 | pub enum ProviderMediaType { 26 | Image, 27 | Video, 28 | } 29 | 30 | pub type SharedCredentials = Arc>>; 31 | 32 | /// Placeholder for images that may contain more metadata in the future? 
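// Sketch, not part of providers.rs: the `SharedCredentials` alias declared a few
// lines above wraps credentials in `Arc<parking_lot::RwLock<Option<..>>>`, so reads
// and writes are synchronous and any guard must be dropped before an `.await`. The
// alias and helper below use assumed type parameters for illustration only and are
// not the crate's exact signatures.
#[allow(dead_code)]
mod shared_credentials_sketch {
    use parking_lot::RwLock;
    use std::sync::Arc;

    type Shared<T> = Arc<RwLock<Option<T>>>;

    /// Hypothetical helper: publish a freshly obtained credential and return the
    /// previous one, mirroring how `attempt_first_login` overwrites the slot.
    fn rotate<T: Clone>(slot: &Shared<T>, fresh: T) -> Option<T> {
        let previous = slot.read().clone(); // read guard is dropped at the end of this statement
        *slot.write() = Some(fresh);        // visible to every clone of the Arc
        previous
    }
}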
33 | #[derive(Debug, Clone, Serialize, Deserialize)] 34 | pub struct ProviderMedia { 35 | #[serde(rename = "type")] 36 | pub _type: ProviderMediaType, 37 | pub media_url: String, 38 | // where the image is coming from 39 | pub reference_url: Option, 40 | pub unique_identifier: String, 41 | /// necessary for some providers like weverse which include additional 42 | /// metadata that are unique to the provider being scraped 43 | #[serde(skip_serializing_if = "Option::is_none")] 44 | pub metadata: Option, 45 | } 46 | 47 | #[derive(Debug, Clone, Serialize, Deserialize)] 48 | pub struct ProviderPost { 49 | pub account: ProviderAccount, 50 | pub unique_identifier: String, 51 | pub images: Vec, 52 | pub body: Option, 53 | pub url: Option, 54 | pub post_date: Option, 55 | /// necessary for some providers like weverse which include additional 56 | /// metadata that are unique to the provider being scraped 57 | #[serde(skip_serializing_if = "Option::is_none")] 58 | pub metadata: Option, 59 | } 60 | 61 | #[derive(Debug, Clone, Serialize, Deserialize)] 62 | pub struct ProviderAccount { 63 | pub name: String, 64 | pub avatar_url: Option, 65 | } 66 | 67 | impl Default for ProviderAccount { 68 | fn default() -> Self { 69 | Self { 70 | name: "Unknown user".to_owned(), 71 | avatar_url: None, 72 | } 73 | } 74 | } 75 | 76 | #[derive(Debug)] 77 | pub struct ProviderResult { 78 | pub posts: Vec, 79 | pub response_delay: Duration, 80 | pub response_code: StatusCode, 81 | } 82 | 83 | impl Add for ProviderResult { 84 | type Output = ProviderResult; 85 | fn add(self, rhs: ProviderResult) -> Self::Output { 86 | ProviderResult { 87 | response_code: rhs.response_code, 88 | response_delay: rhs.response_delay, 89 | posts: [self.posts, rhs.posts].concat(), 90 | } 91 | } 92 | } 93 | 94 | #[derive(Debug)] 95 | pub enum ProviderStep { 96 | Next(ProviderResult, Pagination), 97 | End(ProviderResult), 98 | // Provider exits gracefully 99 | NotInitialized, 100 | } 101 | 102 | #[derive(Error, Debug)] 103 | pub enum ProviderFailure { 104 | #[error("Error formatting URL")] 105 | Url, 106 | #[error("Failed to process response from request")] 107 | HttpError(HttpError), 108 | #[error("{0}")] 109 | Other(String), 110 | } 111 | 112 | impl From for ProviderFailure { 113 | fn from(err: reqwest::Error) -> Self { 114 | ProviderFailure::HttpError(HttpError::ReqwestError(err)) 115 | } 116 | } 117 | 118 | #[derive(Debug, Clone)] 119 | pub struct ProviderState { 120 | pub login_attempts: u32, 121 | pub id: String, 122 | pub default_name: Option, 123 | pub url: ScrapeUrl, 124 | pub pagination: Option, 125 | pub iteration: usize, 126 | } 127 | 128 | pub struct ScrapeRequestInput { 129 | pub latest_data: HashSet, 130 | pub default_name: Option, 131 | pub last_scrape: Option, 132 | pub is_first_scrape: bool, 133 | } 134 | 135 | impl From for ProviderFailure { 136 | fn from(err: HttpError) -> Self { 137 | Self::HttpError(err) 138 | } 139 | } 140 | 141 | pub enum CanonicalUrlResolution { 142 | Success { destination: String }, 143 | Fail(String), 144 | NotImplemented, 145 | } 146 | 147 | pub enum CredentialRefresh { 148 | Result(ProviderCredentials), 149 | TryLogin, 150 | Halt, 151 | } 152 | 153 | pub enum ProviderErrorHandle { 154 | RefreshToken(ProviderCredentials), 155 | Login, 156 | Halt, 157 | } 158 | 159 | #[derive(Debug, Clone)] 160 | pub enum Pagination { 161 | NextPage(i32), 162 | NextCursor(String), 163 | } 164 | 165 | impl Pagination { 166 | pub fn next_page(&self) -> String { 167 | match self { 168 | Pagination::NextPage(num) => 
num.to_string(), 169 | Pagination::NextCursor(cursor) => cursor.clone(), 170 | } 171 | } 172 | } 173 | 174 | impl ToString for Pagination { 175 | fn to_string(&self) -> String { 176 | self.next_page() 177 | } 178 | } 179 | 180 | #[async_trait] 181 | pub trait RateLimitable { 182 | /// The available quota for this provider 183 | fn quota() -> Quota 184 | where 185 | Self: Sized, 186 | { 187 | default_quota() 188 | } 189 | /// The default rate limiter implementation 190 | /// This currently only supports global rate limiters 191 | /// but may need to be changed to support local ones as well 192 | fn rate_limiter() -> UnscopedLimiter 193 | where 194 | Self: Sized, 195 | { 196 | RateLimiter::direct(Self::quota()) 197 | } 198 | /// Wait for next request if token is not available 199 | async fn wait(&self, key: &str) -> (); 200 | } 201 | 202 | pub fn default_quota() -> Quota { 203 | // fairly aggressive quota 204 | Quota::with_period(Duration::from_millis(3500u64)).unwrap() 205 | } 206 | 207 | const DEFAULT_WAIT_SECONDS: u64 = if cfg!(debug_assertions) { 2 } else { 8 }; 208 | 209 | pub fn default_jitter() -> Jitter { 210 | Jitter::up_to(Duration::from_secs(DEFAULT_WAIT_SECONDS)) 211 | } 212 | 213 | #[derive(Debug, Clone, Default)] 214 | pub struct ProviderCredentials { 215 | pub access_token: String, 216 | pub refresh_token: String, 217 | } 218 | 219 | pub struct BareProviderInput { 220 | pub client: Arc, 221 | } 222 | 223 | pub struct ProviderInput { 224 | pub client: Arc, 225 | } 226 | 227 | pub fn create_credentials() -> Arc>> { 228 | Arc::new(RwLock::new(None)) 229 | } 230 | 231 | /// Try to override the shared credentials after logging in one time 232 | pub async fn attempt_first_login( 233 | provider: &dyn Provider, 234 | credentials: &SharedCredentials, 235 | ) { 236 | let id = provider.id().to_string(); 237 | info!("Attempting login to {}", &id); 238 | let login = provider.login().await; 239 | let provider_creds = match login { 240 | Ok(login) => { 241 | info!("Logged in into {}", &id); 242 | login 243 | } 244 | Err(err) => { 245 | error!("Could not log into {}, leaving it uninitialized", &id); 246 | eprintln!("{:?}", err); 247 | return; 248 | } 249 | }; 250 | let mut writable = credentials.write(); 251 | *writable = Some(provider_creds); 252 | } 253 | 254 | pub enum DerivedProviderResource { 255 | Invalid, 256 | Error { reason: String }, 257 | Success { destination: String }, 258 | } 259 | 260 | pub struct IntrospectableResource(pub String); 261 | 262 | pub enum WorkableDomain { 263 | /// The url can be turned into a canonical URL using [`Provider::introspect_resource`] 264 | ToCanonical(IntrospectableResource), 265 | /// The url can be scraped for information 266 | ToResource(Url), 267 | } 268 | 269 | /// Providers represent a generic endpoint on a single platform that can be scraped 270 | /// with a unique identifier for each specific resource 271 | #[async_trait] 272 | pub trait Provider: Sync + Send + RateLimitable { 273 | fn new(input: ProviderInput) -> Self 274 | where 275 | Self: Sized; 276 | async fn initialize(&self) {} 277 | 278 | fn requires_auth(&self) -> bool { 279 | false 280 | } 281 | 282 | /// a string that uniquely identifies this provider 283 | fn id(&self) -> AllProviders; 284 | 285 | /// The maximum amount of items the provider can retrieve at one time 286 | /// This value is used whenever a provider is being scraped for the first time 287 | /// in order to quickly enumerate through all past data 288 | fn max_page_size(&self) -> PageSize; 289 | 290 | /// The default 
page size that is used when checking a provider that has 291 | /// already been scraped at least one time in the past 292 | fn default_page_size(&self) -> PageSize; 293 | 294 | /// The maximum number of times a resource can be paginated before exiting. 295 | /// This value is ignored if the context has no images aka the resource 296 | /// is being scraped for the first time 297 | fn max_pagination(&self) -> u16 { 298 | 5 299 | } 300 | 301 | /// The amount of delay between each pagination request. Initial request is not 302 | /// bound by this value 303 | fn scrape_delay(&self) -> Duration { 304 | Duration::from_secs(2) 305 | } 306 | 307 | /// Match the domain input into a pending action the provider can perform 308 | fn match_domain(&self, _url: &str) -> Option { 309 | None 310 | } 311 | 312 | /// Attempt to resolve the data required to construct a scrape destination given a canonical URL 313 | /// # Example 314 | /// introspectable: dreamcatcher 315 | /// Canonical URL: https://weverse.io/dreamcatcher/artist 316 | /// Result: Ok("14") 317 | async fn introspect_resource( 318 | &self, 319 | _introspectable: &IntrospectableResource, 320 | ) -> Result { 321 | Ok(CanonicalUrlResolution::NotImplemented) 322 | } 323 | 324 | /// Provider destination are any unique identifier a provider can try to resolve into an opaque [ScrapeUrl]. 325 | /// This method is called after every successful scrape to resolve the next page of media 326 | fn from_provider_destination( 327 | &self, 328 | id: &str, 329 | page_size: PageSize, 330 | pagination: Option, 331 | ) -> Result; 332 | /// Process a single iteration of the resource 333 | async fn unfold(&self, state: ProviderState) -> Result; 334 | 335 | /// Error handling branch that separates operational errors from authorization 336 | /// related error codes 337 | fn on_error(&self, _http_error: &HttpError) -> anyhow::Result { 338 | debug!( 339 | "{} ran into an unhandled error and is halting", 340 | self.id().to_string() 341 | ); 342 | Ok(ProviderErrorHandle::Halt) 343 | } 344 | 345 | async fn token_refresh( 346 | &self, 347 | _credentials: &ProviderCredentials, 348 | ) -> anyhow::Result { 349 | panic!( 350 | "{}'s on_error branch tried to refresh credentials but it doesn't implement a token refresh flow", 351 | self.id().to_string() 352 | ) 353 | } 354 | 355 | async fn login(&self) -> Result { 356 | panic!( 357 | "{} tried to login but it doesn't implement a login flow", 358 | self.id().to_string() 359 | ) 360 | } 361 | fn credentials(&self) -> SharedCredentials { 362 | panic!( 363 | "Tried to get credentials for {} which doesn't authorization", 364 | self.id().to_string() 365 | ) 366 | } 367 | 368 | fn max_login_attempts(&self) -> u32 { 369 | 3 370 | } 371 | 372 | /// Whether the URLs generated by this scraper expire after a short amount of duration 373 | fn ephemeral(&self) -> bool { 374 | false 375 | } 376 | } 377 | 378 | #[derive(Display, Debug, Hash, Copy, Clone, Serialize, EnumString, EnumIter, PartialEq, Eq)] 379 | pub enum AllProviders { 380 | #[strum(serialize = "pinterest.board_feed")] 381 | PinterestBoardFeed, 382 | #[strum(serialize = "weverse.artist_feed")] 383 | WeverseArtistFeed, 384 | #[strum(serialize = "united_cube.artist_feed")] 385 | UnitedCubeArtistFeed, 386 | #[strum(serialize = "twitter.timeline")] 387 | TwitterTimeline, 388 | } 389 | 390 | pub fn find_matching_domain(domains: &[&str], url: &str) -> Option { 391 | let parsed = Url::parse(url).ok()?; 392 | let dom = parsed.domain()?; 393 | if domains.contains(&dom) { 394 | 
Some(WorkableDomain::ToCanonical(IntrospectableResource( 395 | url.to_owned(), 396 | ))) 397 | } else { 398 | None 399 | } 400 | } 401 | 402 | pub struct UrlBuilder { 403 | pub params: Vec<(&'static str, String)>, 404 | } 405 | 406 | impl Default for UrlBuilder { 407 | fn default() -> Self { 408 | Self { params: vec![] } 409 | } 410 | } 411 | 412 | impl ToString for UrlBuilder { 413 | fn to_string(&self) -> String { 414 | todo!() 415 | } 416 | } 417 | 418 | impl UrlBuilder { 419 | pub fn from_queries(params: Vec<(&'static str, &'static str)>) -> Self { 420 | Self { 421 | params: params 422 | .into_iter() 423 | .map(|(key, value)| (key, value.to_owned())) 424 | .collect::>(), 425 | } 426 | } 427 | pub fn page_size(&mut self, key: &'static str, page_size: PageSize) -> &mut Self { 428 | self.params.push((key, page_size.0.to_string())); 429 | self 430 | } 431 | pub fn pagination(&mut self, key: &'static str, page_option: &Option) -> &mut Self { 432 | if let Some(page) = page_option { 433 | self.params.push((key, page.next_page())) 434 | } 435 | self 436 | } 437 | pub fn build(&self, base_url: &str) -> Result { 438 | url::Url::parse_with_params(base_url, self.params.iter()) 439 | .ok() 440 | .ok_or(ProviderFailure::Url) 441 | } 442 | pub fn build_scrape_url(self, base_url: &str) -> Result { 443 | let res = self.build(base_url)?; 444 | Ok(ScrapeUrl(res.as_str().to_owned())) 445 | } 446 | } 447 | -------------------------------------------------------------------------------- /src/scraper/providers/twitter.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Borrow; 2 | use std::env; 3 | use std::iter::FromIterator; 4 | use std::sync::Arc; 5 | use std::time::{Duration, Instant}; 6 | 7 | use anyhow::Error; 8 | use async_trait::async_trait; 9 | use chrono::NaiveDateTime; 10 | use chrono::{DateTime, FixedOffset, ParseResult}; 11 | use governor::Quota; 12 | use log::{debug, error, info, trace, warn}; 13 | use regex::Regex; 14 | use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; 15 | use reqwest::Client; 16 | use url::Url; 17 | 18 | use crate::request::{parse_successful_response, HttpError}; 19 | use crate::scheduler::UnscopedLimiter; 20 | use crate::scraper::providers::twitter_types::{ 21 | Entries, GuestTokenFetchResponse, Twitter, TwitterImageMetadata, TwitterPostMetadata, 22 | TwitterUserLookupResponse, Type, 23 | }; 24 | 25 | use super::*; 26 | 27 | fn twitter_type_to_provider(media_type: &Type) -> ProviderMediaType { 28 | match media_type { 29 | Type::AnimatedGif => ProviderMediaType::Image, 30 | Type::Photo => ProviderMediaType::Image, 31 | Type::Video => ProviderMediaType::Video, 32 | } 33 | } 34 | 35 | fn replace_twitter_string(s: &str) -> String { 36 | s.replace("\\/", "/") 37 | } 38 | 39 | fn parse_twitter_date(date_str: &str) -> ParseResult> { 40 | DateTime::parse_from_str(date_str, "%a %b %d %H:%M:%S %z %Y") 41 | } 42 | 43 | pub struct TwitterTimeline { 44 | pub guest_token: SharedCredentials, 45 | pub bearer_token: Option, 46 | pub client: Arc, 47 | pub rate_limiter: UnscopedLimiter, 48 | } 49 | 50 | const BASE_URL: &str = "https://twitter.com/"; 51 | /// I have no idea where this token is coming from... 
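/// It looks like the hard-coded bearer token bundled with Twitter's public web
/// client (treat that as an assumption, not something this repo verifies);
/// setting the TWITTER_BEARER_TOKEN environment variable lets the provider
/// skip this fallback entirely.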
52 | const MAGIC_BEARER_TOKEN: &str = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"; 53 | 54 | const USER_AGENT: &str = "HTC Mozilla/5.0 (Linux; Android 7.0; HTC 10 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.83 Mobile Safari/537.36"; 55 | 56 | #[async_trait] 57 | impl RateLimitable for TwitterTimeline { 58 | fn quota() -> Quota 59 | where 60 | Self: Sized, 61 | { 62 | default_quota() 63 | } 64 | async fn wait(&self, _key: &str) -> () { 65 | self.rate_limiter 66 | .until_ready_with_jitter(default_jitter()) 67 | .await 68 | } 69 | } 70 | 71 | #[async_trait] 72 | impl Provider for TwitterTimeline { 73 | fn id(&self) -> AllProviders { 74 | AllProviders::TwitterTimeline 75 | } 76 | fn new(input: ProviderInput) -> Self 77 | where 78 | Self: Sized, 79 | { 80 | Self { 81 | guest_token: create_credentials(), 82 | bearer_token: env::var("TWITTER_BEARER_TOKEN").ok(), 83 | client: Arc::clone(&input.client), 84 | rate_limiter: Self::rate_limiter(), 85 | } 86 | } 87 | 88 | async fn initialize(&self) -> () { 89 | attempt_first_login(self, &self.guest_token).await; 90 | } 91 | 92 | fn max_page_size(&self) -> PageSize { 93 | PageSize(100) 94 | } 95 | 96 | fn default_page_size(&self) -> PageSize { 97 | PageSize(20) 98 | } 99 | 100 | fn from_provider_destination( 101 | &self, 102 | id: &str, 103 | page_size: PageSize, 104 | pagination: Option, 105 | ) -> Result { 106 | let mut url_fragment = UrlBuilder::from_queries(vec![ 107 | ("include_profile_interstitial_type", "1"), 108 | // https://github.com/twintproject/twint/blob/master/twint/url.py 109 | // ("include_blocking", "1"), 110 | // ("include_blocked_by", "1"), 111 | // ("include_followed_by", "1"), 112 | // ("include_want_retweets", "1"), 113 | // ("include_mute_edge", "1"), 114 | // ("include_can_dm", "1"), 115 | // ("include_can_media_tag", "1"), 116 | // ("skip_status", "1"), 117 | // ("cards_platform", "Web - 12"), 118 | // ("include_cards", "1"), 119 | // ("include_ext_alt_text", "true"), 120 | // ("include_quote_count", "true"), 121 | // ("include_reply_count", "1"), 122 | ("tweet_mode", "extended"), 123 | ("include_entities", "true"), 124 | // ("include_user_entities", "true"), 125 | // ("include_ext_media_color", "true"), 126 | // ("include_ext_media_availability", "true"), 127 | // ("send_error_codes", "true"), 128 | // ("simple_quoted_tweet", "true"), 129 | // ("include_tweet_replies", "true"), 130 | ("ext", "mediaStats%2ChighlightedLabel"), 131 | ]); 132 | url_fragment.page_size("count", page_size); 133 | url_fragment.pagination("cursor", &pagination); 134 | let url = url_fragment.build_scrape_url(&format!( 135 | "https://api.twitter.com/2/timeline/profile/{}.json", 136 | id 137 | ))?; 138 | Ok(url) 139 | } 140 | 141 | fn max_pagination(&self) -> u16 { 142 | 3 143 | } 144 | 145 | async fn unfold(&self, state: ProviderState) -> Result { 146 | let credentials = self.guest_token.read().clone(); 147 | let token = match credentials { 148 | Some(token) => token, 149 | None => return Ok(ProviderStep::NotInitialized), 150 | }; 151 | let bearer = self.bearer_token.clone().map_or_else( 152 | || { 153 | warn!( 154 | "Using fallback bearer token. 
This will most likely get rate limited and fail" 155 | ); 156 | MAGIC_BEARER_TOKEN.to_owned() 157 | }, 158 | |token| format!("Bearer {}", &token), 159 | ); 160 | let instant = Instant::now(); 161 | 162 | let response = self 163 | .client 164 | .get(state.url.0) 165 | .headers(HeaderMap::from_iter([ 166 | ( 167 | HeaderName::from_static("user-agent"), 168 | //죄송합니다 169 | HeaderValue::from_static(USER_AGENT), 170 | ), 171 | ( 172 | HeaderName::from_static("authorization"), 173 | HeaderValue::from_str(&bearer).expect("Invalid bearer token format"), 174 | ), 175 | ( 176 | HeaderName::from_static("x-guest-token"), 177 | HeaderValue::from_str(&token.access_token) 178 | .expect("Invalid access token format"), 179 | ), 180 | ])) 181 | .send() 182 | .await?; 183 | let response_code = response.status(); 184 | let response_delay = instant.elapsed(); 185 | let response_json = parse_successful_response::(response).await?; 186 | // Twitter does some really interesting stuff with how they present API data 187 | let maybe_instruction = response_json 188 | .timeline 189 | .instructions 190 | .iter() 191 | .find_map(|instruction| instruction.get("addEntries")); 192 | let tweet_db = response_json.global_objects.tweets; 193 | let user_db = response_json.global_objects.users; 194 | let entries = match maybe_instruction { 195 | Some(Entries::AddEntries { entries }) => entries, 196 | _ => { 197 | return Err(ProviderFailure::Other( 198 | "Could not find an 'addEntries' in instructions".to_owned(), 199 | )) 200 | } 201 | }; 202 | let posts = entries 203 | .iter() 204 | .filter_map(|entry| { 205 | let sort_index = &entry.sort_index; 206 | if !entry.entry_id.starts_with("tweet-") { 207 | return None; 208 | } 209 | // a sort index corresponds to the id of the 210 | // the chances of this being undefined is basically non-existent but we should be safe 211 | let tweet = match tweet_db.get(sort_index) { 212 | None => { 213 | debug!( 214 | "Could not find the corresponding tweet id for {} in the tweet db", 215 | sort_index 216 | ); 217 | return None; 218 | } 219 | Some(t) => t, 220 | }; 221 | Some(tweet) 222 | }) 223 | .filter_map(|tweet| { 224 | let unique_identifier = tweet.id_str.clone(); 225 | let like_count = tweet.favorite_count; 226 | let retweet_count = tweet.retweet_count; 227 | let language = tweet.lang.clone(); 228 | let post_date = parse_twitter_date(&tweet.created_at) 229 | .ok() 230 | .map(|e| e.naive_utc()); 231 | let body = tweet.full_text.clone().map(|t| replace_twitter_string(&t)); 232 | tweet.entities.media.as_ref().map(|media| { 233 | let user_option = user_db.get(&tweet.user_id_str); 234 | let url = user_option.map(|user| { 235 | format!( 236 | "https://twitter.com/{}/status/{}", 237 | &user.screen_name, &unique_identifier 238 | ) 239 | }); 240 | ProviderPost { 241 | account: user_option 242 | .map(|user| ProviderAccount { 243 | name: user.name.clone(), 244 | avatar_url: user.profile_image_url_https.clone(), 245 | }) 246 | .unwrap_or_default(), 247 | unique_identifier, 248 | metadata: serde_json::to_value(TwitterPostMetadata { 249 | like_count, 250 | retweet_count, 251 | language, 252 | }) 253 | .ok(), 254 | url, 255 | post_date, 256 | images: media 257 | .iter() 258 | .map(|media| ProviderMedia { 259 | _type: twitter_type_to_provider(&media.media_type), 260 | unique_identifier: media.id_str.clone(), 261 | media_url: replace_twitter_string(&media.media_url_https), 262 | reference_url: Some(replace_twitter_string(&media.expanded_url)), 263 | metadata: serde_json::to_value(TwitterImageMetadata { 264 | 
height: media.original_info.height, 265 | width: media.original_info.width, 266 | }) 267 | .ok(), 268 | }) 269 | .collect::>(), 270 | body, 271 | } 272 | }) 273 | }) 274 | .collect::>(); 275 | 276 | let cursor_entry = &entries.last(); 277 | let cursor = 278 | cursor_entry.and_then(|c| c.content.operation.as_ref().map(|o| o.cursor.value.clone())); 279 | let result = ProviderResult { 280 | posts, 281 | response_code, 282 | response_delay, 283 | }; 284 | match cursor { 285 | Some(cursor) => Ok(ProviderStep::Next(result, Pagination::NextCursor(cursor))), 286 | None => Ok(ProviderStep::End(result)), 287 | } 288 | } 289 | 290 | fn match_domain(&self, url: &str) -> Option { 291 | find_matching_domain(&["twitter.com"], url) 292 | } 293 | 294 | /// https://twitter.com/:username -> ID 295 | async fn introspect_resource( 296 | &self, 297 | introspectable: &IntrospectableResource, 298 | ) -> Result { 299 | let bearer = match &self.bearer_token { 300 | None => return Ok(CanonicalUrlResolution::NotImplemented), 301 | Some(token) => token, 302 | }; 303 | let input = match Url::parse(&introspectable.0) { 304 | Err(e) => return Err(ProviderFailure::Url), 305 | Ok(e) => e, 306 | }; 307 | let username = input.path().trim_start_matches("/"); 308 | // self.guest_token 309 | let endpoint = format!("https://api.twitter.com/2/users/by/username/{}", username); 310 | let result = self 311 | .client 312 | .get(endpoint) 313 | .header("Authorization", format!("Bearer {}", bearer)) 314 | .send() 315 | .await? 316 | .json::() 317 | .await?; 318 | Ok(CanonicalUrlResolution::Success { 319 | destination: result.data.id, 320 | }) 321 | } 322 | 323 | async fn login(&self) -> Result { 324 | let headers = HeaderMap::from_iter([( 325 | HeaderName::from_static("user-agent"), 326 | HeaderValue::from_static(USER_AGENT), 327 | )]); 328 | let login = self 329 | .client 330 | .get(BASE_URL) 331 | .headers(headers.clone()) 332 | .send() 333 | .await?; 334 | let html = login.text().await?; 335 | // TODO: check cookie response here? 336 | // CONTEXT: https://github.com/JustAnotherArchivist/snscrape/blob/eee06d859338b184fc43f93e424ba70a0e9f4679/snscrape/modules/twitter.py#L231 337 | let regex = Regex::new(r#"gt=(.*?);"#).unwrap(); 338 | match regex.captures(&html) { 339 | Some(captures) => { 340 | let capture = captures.get(1).expect( 341 | "Couldn't match a guest token in the twitter homepage, the site was changed", 342 | ); 343 | Ok(ProviderCredentials { 344 | access_token: capture.as_str().to_owned(), 345 | refresh_token: "".to_owned(), 346 | }) 347 | } 348 | None => { 349 | info!( 350 | "Couldn't find a guest token in the homepage, attempting to fetch from the API" 351 | ); 352 | let bearer = self 353 | .bearer_token 354 | .clone() 355 | .unwrap_or(MAGIC_BEARER_TOKEN.to_owned()); 356 | let mut request_headers = headers.clone(); 357 | request_headers.append( 358 | HeaderName::from_static("authorization"), 359 | HeaderValue::from_str(&format!("Bearer {}", bearer)) 360 | .expect("Header value for authorization request could not be formatted"), 361 | ); 362 | let result = self 363 | .client 364 | .post("https://api.twitter.com/1.1/guest/activate.json") 365 | .headers(request_headers) 366 | .send() 367 | .await? 
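// Deserialize the guest token out of the activation response; unfold() later
// forwards it as the x-guest-token header on timeline requests.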
368 | .json::() 369 | .await?; 370 | let creds = ProviderCredentials { 371 | access_token: result.guest_token, 372 | refresh_token: "".to_owned(), 373 | }; 374 | Ok(creds) 375 | } 376 | } 377 | } 378 | 379 | fn on_error(&self, error: &HttpError) -> anyhow::Result { 380 | match error { 381 | HttpError::FailStatus(e) | HttpError::UnexpectedBody(e) => { 382 | if e.code == 403 { 383 | Ok(ProviderErrorHandle::Login) 384 | } else { 385 | // unknown error at this point 386 | error!("{:?}", e); 387 | Ok(ProviderErrorHandle::Halt) 388 | } 389 | } 390 | error => { 391 | error!("{:?}", error); 392 | Ok(ProviderErrorHandle::Halt) 393 | } 394 | } 395 | } 396 | fn credentials(&self) -> SharedCredentials { 397 | self.guest_token.clone() 398 | } 399 | } 400 | -------------------------------------------------------------------------------- /src/scraper/providers/twitter_types.rs: -------------------------------------------------------------------------------- 1 | // Example code that deserializes and serializes the model. 2 | // extern crate serde; 3 | // #[macro_use] 4 | // extern crate serde_derive; 5 | // extern crate serde_json; 6 | // 7 | // use generated_module::[object Object]; 8 | // 9 | // fn main() { 10 | // let json = r#"{"answer": 42}"#; 11 | // let model: [object Object] = serde_json::from_str(&json).unwrap(); 12 | // } 13 | use serde::{Deserialize, Serialize}; 14 | use std::collections::HashMap; 15 | 16 | #[derive(Deserialize)] 17 | pub struct GuestTokenFetchResponse { 18 | pub(crate) guest_token: String, 19 | } 20 | 21 | #[derive(Deserialize)] 22 | pub struct TwitterUserLookup { 23 | pub(crate) id: String, 24 | } 25 | 26 | #[derive(Deserialize)] 27 | pub struct TwitterUserLookupResponse { 28 | pub(crate) data: TwitterUserLookup, 29 | } 30 | 31 | #[derive(Debug, Serialize, Deserialize)] 32 | pub struct TwitterPostMetadata { 33 | pub(crate) language: Option, 34 | pub(crate) like_count: Option, 35 | pub(crate) retweet_count: Option, 36 | } 37 | 38 | #[derive(Debug, Serialize, Deserialize)] 39 | pub struct TwitterImageMetadata { 40 | pub(crate) width: i64, 41 | pub(crate) height: i64, 42 | } 43 | 44 | #[derive(Debug, Serialize, Deserialize)] 45 | pub struct Twitter { 46 | #[serde(rename = "globalObjects")] 47 | pub(crate) global_objects: GlobalObjects, 48 | pub(crate) timeline: Timeline, 49 | } 50 | 51 | #[derive(Debug, Serialize, Deserialize)] 52 | pub struct GlobalObjects { 53 | pub(crate) tweets: HashMap, 54 | pub(crate) users: HashMap, 55 | } 56 | 57 | #[derive(Debug, Serialize, Deserialize)] 58 | pub struct TopicValue { 59 | pub(crate) id: String, 60 | pub(crate) name: String, 61 | pub(crate) following: bool, 62 | pub(crate) description: String, 63 | pub(crate) not_interested: bool, 64 | pub(crate) icon_url: String, 65 | } 66 | 67 | #[derive(Debug, Serialize, Deserialize)] 68 | pub struct TweetValue { 69 | pub(crate) created_at: String, 70 | pub(crate) id_str: String, 71 | pub(crate) full_text: Option, 72 | pub(crate) display_text_range: Vec, 73 | pub(crate) entities: TweetEntities, 74 | pub(crate) source: Option, 75 | pub(crate) user_id_str: String, 76 | pub(crate) retweeted_status_id_str: Option, 77 | pub(crate) retweet_count: Option, 78 | pub(crate) favorite_count: Option, 79 | pub(crate) conversation_id_str: Option, 80 | pub(crate) lang: Option, 81 | pub(crate) is_quote_status: Option, 82 | pub(crate) quoted_status_id_str: Option, 83 | pub(crate) quoted_status_permalink: Option, 84 | pub(crate) in_reply_to_status_id_str: Option, 85 | pub(crate) in_reply_to_user_id_str: Option, 86 | 
pub(crate) in_reply_to_screen_name: Option, 87 | pub(crate) extended_entities: Option, 88 | pub(crate) possibly_sensitive_editable: Option, 89 | pub(crate) self_thread: Option, 90 | } 91 | 92 | #[derive(Debug, Serialize, Deserialize)] 93 | pub struct TweetEntities { 94 | pub(crate) user_mentions: Option>, 95 | pub(crate) media: Option>, 96 | pub(crate) urls: Option>, 97 | pub(crate) hashtags: Option>, 98 | } 99 | 100 | #[derive(Debug, Serialize, Deserialize)] 101 | pub struct Hashtag { 102 | pub(crate) text: String, 103 | pub(crate) indices: Vec, 104 | } 105 | 106 | #[derive(Debug, Serialize, Deserialize)] 107 | pub struct EntitiesMedia { 108 | pub(crate) id_str: String, 109 | pub(crate) indices: Vec, 110 | pub(crate) media_url: String, 111 | pub(crate) media_url_https: String, 112 | pub(crate) url: String, 113 | pub(crate) display_url: String, 114 | pub(crate) expanded_url: String, 115 | #[serde(rename = "type")] 116 | pub(crate) media_type: Type, 117 | pub(crate) original_info: OriginalInfo, 118 | pub(crate) sizes: Sizes, 119 | pub(crate) media_key: Option, 120 | pub(crate) ext: Option, 121 | pub(crate) source_status_id_str: Option, 122 | pub(crate) source_user_id_str: Option, 123 | pub(crate) video_info: Option, 124 | } 125 | 126 | #[derive(Debug, Serialize, Deserialize)] 127 | pub struct PurpleExt { 128 | #[serde(rename = "mediaStats")] 129 | pub(crate) media_stats: PurpleMediaStats, 130 | } 131 | 132 | #[derive(Debug, Serialize, Deserialize)] 133 | pub struct PurpleMediaStats { 134 | pub(crate) r: REnum, 135 | pub(crate) ttl: i64, 136 | } 137 | 138 | #[derive(Debug, Serialize, Deserialize)] 139 | pub struct OriginalInfo { 140 | pub(crate) width: i64, 141 | pub(crate) height: i64, 142 | pub(crate) focus_rects: Option>, 143 | } 144 | 145 | #[derive(Debug, Serialize, Deserialize)] 146 | pub struct FocusRect { 147 | pub(crate) x: i64, 148 | pub(crate) y: i64, 149 | pub(crate) h: i64, 150 | pub(crate) w: i64, 151 | } 152 | 153 | #[derive(Debug, Serialize, Deserialize)] 154 | pub struct Sizes { 155 | pub(crate) small: Large, 156 | pub(crate) medium: Large, 157 | pub(crate) thumb: Large, 158 | pub(crate) large: Large, 159 | } 160 | 161 | #[derive(Debug, Serialize, Deserialize)] 162 | pub struct Large { 163 | pub(crate) w: i64, 164 | pub(crate) h: i64, 165 | pub(crate) resize: Resize, 166 | } 167 | 168 | #[derive(Debug, Serialize, Deserialize)] 169 | pub struct PurpleVideoInfo { 170 | pub(crate) aspect_ratio: Vec, 171 | pub(crate) variants: Vec, 172 | } 173 | 174 | #[derive(Debug, Serialize, Deserialize)] 175 | pub struct Variant { 176 | pub(crate) bitrate: Option, 177 | pub(crate) content_type: ContentType, 178 | pub(crate) url: String, 179 | } 180 | 181 | #[derive(Debug, Serialize, Deserialize)] 182 | pub struct UrlElement { 183 | pub(crate) url: String, 184 | pub(crate) expanded_url: String, 185 | pub(crate) display_url: String, 186 | pub(crate) indices: Vec, 187 | } 188 | 189 | #[derive(Debug, Serialize, Deserialize)] 190 | pub struct UserMention { 191 | pub(crate) screen_name: String, 192 | pub(crate) name: String, 193 | pub(crate) id_str: String, 194 | pub(crate) indices: Vec, 195 | } 196 | 197 | #[derive(Debug, Serialize, Deserialize)] 198 | pub struct ExtendedEntities { 199 | pub(crate) media: Vec, 200 | } 201 | 202 | #[derive(Debug, Serialize, Deserialize)] 203 | pub struct ExtendedEntitiesMedia { 204 | pub(crate) id_str: String, 205 | pub(crate) indices: Vec, 206 | pub(crate) media_url: String, 207 | pub(crate) media_url_https: String, 208 | pub(crate) url: String, 209 | pub(crate) 
display_url: String, 210 | pub(crate) expanded_url: String, 211 | #[serde(rename = "type")] 212 | pub(crate) media_type: Type, 213 | pub(crate) original_info: OriginalInfo, 214 | pub(crate) sizes: Sizes, 215 | pub(crate) media_key: Option, 216 | pub(crate) ext: Option, 217 | pub(crate) source_status_id_str: Option, 218 | pub(crate) source_user_id_str: Option, 219 | pub(crate) video_info: Option, 220 | pub(crate) additional_media_info: Option, 221 | } 222 | 223 | #[derive(Debug, Serialize, Deserialize)] 224 | pub struct AdditionalMediaInfo { 225 | pub(crate) monetizable: bool, 226 | } 227 | 228 | #[derive(Debug, Serialize, Deserialize)] 229 | pub struct FluffyExt { 230 | #[serde(rename = "mediaStats")] 231 | pub(crate) media_stats: FluffyMediaStats, 232 | } 233 | 234 | #[derive(Debug, Serialize, Deserialize)] 235 | pub struct FluffyMediaStats { 236 | pub(crate) r: RUnion, 237 | pub(crate) ttl: i64, 238 | } 239 | 240 | #[derive(Debug, Serialize, Deserialize)] 241 | pub struct RRClass { 242 | pub(crate) ok: Ok, 243 | } 244 | 245 | #[derive(Debug, Serialize, Deserialize)] 246 | pub struct Ok { 247 | #[serde(rename = "viewCount")] 248 | pub(crate) view_count: String, 249 | } 250 | 251 | #[derive(Debug, Serialize, Deserialize)] 252 | pub struct FluffyVideoInfo { 253 | pub(crate) aspect_ratio: Vec, 254 | pub(crate) duration_millis: Option, 255 | pub(crate) variants: Vec, 256 | } 257 | 258 | #[derive(Debug, Serialize, Deserialize)] 259 | pub struct QuotedStatusPermalink { 260 | pub(crate) url: String, 261 | pub(crate) expanded: String, 262 | pub(crate) display: String, 263 | } 264 | 265 | #[derive(Debug, Serialize, Deserialize)] 266 | pub struct SelfThread { 267 | pub(crate) id_str: String, 268 | } 269 | 270 | #[derive(Debug, Serialize, Deserialize)] 271 | pub struct User { 272 | pub(crate) id_str: String, 273 | pub(crate) name: String, 274 | pub(crate) screen_name: String, 275 | // pub(crate) location: String, 276 | // pub(crate) description: String, 277 | // pub(crate) url: Option, 278 | // pub(crate) entities: UserEntities, 279 | // pub(crate) followers_count: i64, 280 | // pub(crate) fast_followers_count: i64, 281 | // pub(crate) normal_followers_count: i64, 282 | // pub(crate) friends_count: i64, 283 | // pub(crate) listed_count: i64, 284 | // pub(crate) created_at: String, 285 | // pub(crate) favourites_count: i64, 286 | // pub(crate) geo_enabled: Option, 287 | // pub(crate) statuses_count: i64, 288 | // pub(crate) media_count: i64, 289 | pub(crate) profile_image_url_https: Option, 290 | // pub(crate) profile_banner_url: Option, 291 | // pub(crate) profile_image_extensions: ProfileExtensions, 292 | // pub(crate) profile_banner_extensions: Option, 293 | // pub(crate) profile_link_color: String, 294 | // pub(crate) pinned_tweet_ids: Vec, 295 | // pub(crate) pinned_tweet_ids_str: Vec, 296 | // pub(crate) has_custom_timelines: Option, 297 | // pub(crate) profile_interstitial_type: String, 298 | // pub(crate) has_extended_profile: Option, 299 | // pub(crate) default_profile: Option, 300 | // pub(crate) verified: Option, 301 | } 302 | 303 | #[derive(Debug, Serialize, Deserialize)] 304 | pub struct UserEntities { 305 | pub(crate) url: Option, 306 | pub(crate) description: Description, 307 | } 308 | 309 | #[derive(Debug, Serialize, Deserialize)] 310 | pub struct Description { 311 | pub(crate) urls: Option>, 312 | } 313 | 314 | #[derive(Debug, Serialize, Deserialize)] 315 | pub struct PurpleUrl { 316 | pub(crate) urls: Vec, 317 | } 318 | 319 | #[derive(Debug, Serialize, Deserialize)] 320 | pub struct 
ProfileExtensions { 321 | #[serde(rename = "mediaStats")] 322 | pub(crate) media_stats: ProfileImageExtensionsMediaStats, 323 | } 324 | 325 | #[derive(Debug, Serialize, Deserialize)] 326 | pub struct ProfileImageExtensionsMediaStats { 327 | pub(crate) r: MediaStatsRClass, 328 | pub(crate) ttl: i64, 329 | } 330 | 331 | #[derive(Debug, Serialize, Deserialize)] 332 | pub struct MediaStatsRClass { 333 | pub(crate) missing: Option, 334 | } 335 | 336 | #[derive(Debug, Serialize, Deserialize)] 337 | pub struct Timeline { 338 | pub(crate) id: String, 339 | pub(crate) instructions: Vec>, 340 | } 341 | 342 | #[derive(Debug, Serialize, Deserialize)] 343 | #[serde(untagged)] 344 | pub enum Entries { 345 | #[serde(rename = "addEntries")] 346 | AddEntries { 347 | entries: Vec, 348 | }, 349 | Other(serde_json::Value), 350 | } 351 | 352 | #[derive(Debug, Serialize, Deserialize)] 353 | pub struct AddEntries { 354 | pub(crate) entries: Vec, 355 | } 356 | 357 | #[derive(Debug, Serialize, Deserialize)] 358 | pub struct Entry { 359 | #[serde(rename = "entryId")] 360 | pub(crate) entry_id: String, 361 | #[serde(rename = "sortIndex")] 362 | pub(crate) sort_index: String, 363 | pub(crate) content: EntryContent, 364 | } 365 | 366 | #[derive(Debug, Serialize, Deserialize)] 367 | pub struct EntryContent { 368 | pub(crate) item: Option, 369 | #[serde(rename = "timelineModule")] 370 | pub(crate) timeline_module: Option, 371 | pub(crate) operation: Option, 372 | } 373 | 374 | #[derive(Debug, Serialize, Deserialize)] 375 | pub struct ContentItem { 376 | pub(crate) content: PurpleContent, 377 | } 378 | 379 | #[derive(Debug, Serialize, Deserialize)] 380 | pub struct PurpleContent { 381 | pub(crate) tweet: ContentTweet, 382 | } 383 | 384 | #[derive(Debug, Serialize, Deserialize)] 385 | pub struct ContentTweet { 386 | pub(crate) id: String, 387 | #[serde(rename = "displayType")] 388 | pub(crate) display_type: DisplayType, 389 | } 390 | 391 | #[derive(Debug, Serialize, Deserialize)] 392 | pub struct Operation { 393 | pub(crate) cursor: Cursor, 394 | } 395 | 396 | #[derive(Debug, Serialize, Deserialize)] 397 | pub struct Cursor { 398 | pub(crate) value: String, 399 | #[serde(rename = "cursorType")] 400 | pub(crate) cursor_type: String, 401 | #[serde(rename = "stopOnEmptyResponse")] 402 | pub(crate) stop_on_empty_response: Option, 403 | } 404 | 405 | #[derive(Debug, Serialize, Deserialize)] 406 | pub struct TimelineModule { 407 | pub(crate) items: Vec, 408 | #[serde(rename = "displayType")] 409 | pub(crate) display_type: String, 410 | pub(crate) header: Header, 411 | #[serde(rename = "clientEventInfo")] 412 | pub(crate) client_event_info: TimelineModuleClientEventInfo, 413 | pub(crate) metadata: Metadata, 414 | } 415 | 416 | #[derive(Debug, Serialize, Deserialize)] 417 | pub struct TimelineModuleClientEventInfo { 418 | pub(crate) component: Component, 419 | } 420 | 421 | #[derive(Debug, Serialize, Deserialize)] 422 | pub struct Header { 423 | pub(crate) text: String, 424 | pub(crate) sticky: bool, 425 | #[serde(rename = "socialContext")] 426 | pub(crate) social_context: SocialContext, 427 | #[serde(rename = "displayType")] 428 | pub(crate) display_type: String, 429 | } 430 | 431 | #[derive(Debug, Serialize, Deserialize)] 432 | pub struct SocialContext { 433 | #[serde(rename = "generalContext")] 434 | pub(crate) general_context: GeneralContext, 435 | } 436 | 437 | #[derive(Debug, Serialize, Deserialize)] 438 | pub struct GeneralContext { 439 | #[serde(rename = "contextType")] 440 | pub(crate) context_type: String, 441 | 
pub(crate) text: String, 442 | } 443 | 444 | #[derive(Debug, Serialize, Deserialize)] 445 | pub struct ItemElement { 446 | #[serde(rename = "entryId")] 447 | pub(crate) entry_id: String, 448 | pub(crate) item: ItemItem, 449 | } 450 | 451 | #[derive(Debug, Serialize, Deserialize)] 452 | pub struct ItemItem { 453 | pub(crate) content: FluffyContent, 454 | #[serde(rename = "clientEventInfo")] 455 | pub(crate) client_event_info: ItemClientEventInfo, 456 | #[serde(rename = "feedbackInfo")] 457 | pub(crate) feedback_info: FeedbackInfo, 458 | } 459 | 460 | #[derive(Debug, Serialize, Deserialize)] 461 | pub struct ItemClientEventInfo { 462 | pub(crate) component: Component, 463 | pub(crate) element: Element, 464 | pub(crate) details: Details, 465 | } 466 | 467 | #[derive(Debug, Serialize, Deserialize)] 468 | pub struct Details { 469 | #[serde(rename = "timelinesDetails")] 470 | pub(crate) timelines_details: TimelinesDetails, 471 | } 472 | 473 | #[derive(Debug, Serialize, Deserialize)] 474 | pub struct TimelinesDetails { 475 | #[serde(rename = "controllerData")] 476 | pub(crate) controller_data: String, 477 | } 478 | 479 | #[derive(Debug, Serialize, Deserialize)] 480 | pub struct FluffyContent { 481 | pub(crate) topic: ContentTopic, 482 | } 483 | 484 | #[derive(Debug, Serialize, Deserialize)] 485 | pub struct ContentTopic { 486 | #[serde(rename = "topicId")] 487 | pub(crate) topic_id: String, 488 | #[serde(rename = "topicFunctionalityType")] 489 | pub(crate) topic_functionality_type: TopicFunctionalityType, 490 | #[serde(rename = "topicDisplayType")] 491 | pub(crate) topic_display_type: TopicDisplayType, 492 | } 493 | 494 | #[derive(Debug, Serialize, Deserialize)] 495 | pub struct FeedbackInfo { 496 | #[serde(rename = "feedbackKeys")] 497 | pub(crate) feedback_keys: Vec, 498 | #[serde(rename = "feedbackMetadata")] 499 | pub(crate) feedback_metadata: FeedbackMetadata, 500 | } 501 | 502 | #[derive(Debug, Serialize, Deserialize)] 503 | pub struct Metadata { 504 | #[serde(rename = "gridCarouselMetadata")] 505 | pub(crate) grid_carousel_metadata: GridCarouselMetadata, 506 | } 507 | 508 | #[derive(Debug, Serialize, Deserialize)] 509 | pub struct GridCarouselMetadata { 510 | #[serde(rename = "numRows")] 511 | pub(crate) num_rows: i64, 512 | } 513 | 514 | #[derive(Debug, Serialize, Deserialize)] 515 | pub struct RichBehavior { 516 | #[serde(rename = "markNotInterestedTopic")] 517 | pub(crate) mark_not_interested_topic: MarkNotInterestedTopic, 518 | } 519 | 520 | #[derive(Debug, Serialize, Deserialize)] 521 | pub struct MarkNotInterestedTopic { 522 | #[serde(rename = "topicId")] 523 | pub(crate) topic_id: String, 524 | } 525 | 526 | #[derive(Debug, Serialize, Deserialize)] 527 | #[serde(untagged)] 528 | pub enum RUnion { 529 | Enum(REnum), 530 | RrClass(RRClass), 531 | } 532 | 533 | #[derive(Debug, Serialize, Deserialize)] 534 | pub enum REnum { 535 | Missing, 536 | } 537 | 538 | #[derive(Debug, Serialize, Deserialize)] 539 | pub enum Type { 540 | #[serde(rename = "animated_gif")] 541 | AnimatedGif, 542 | #[serde(rename = "photo")] 543 | Photo, 544 | #[serde(rename = "video")] 545 | Video, 546 | } 547 | 548 | #[derive(Debug, Serialize, Deserialize)] 549 | pub enum Resize { 550 | #[serde(rename = "crop")] 551 | Crop, 552 | #[serde(rename = "fit")] 553 | Fit, 554 | } 555 | 556 | #[derive(Debug, Serialize, Deserialize)] 557 | pub enum ContentType { 558 | #[serde(rename = "application/x-mpegURL")] 559 | ApplicationXMpegUrl, 560 | #[serde(rename = "video/mp4")] 561 | VideoMp4, 562 | } 563 | 564 | #[derive(Debug, 
Serialize, Deserialize)] 565 | pub enum AdvertiserAccountServiceLevel { 566 | #[serde(rename = "analytics")] 567 | Analytics, 568 | #[serde(rename = "media_studio")] 569 | MediaStudio, 570 | #[serde(rename = "mms")] 571 | Mms, 572 | #[serde(rename = "smb")] 573 | Smb, 574 | #[serde(rename = "subscription")] 575 | Subscription, 576 | } 577 | 578 | #[derive(Debug, Serialize, Deserialize)] 579 | pub enum AdvertiserAccountType { 580 | #[serde(rename = "none")] 581 | None, 582 | #[serde(rename = "promotable_user")] 583 | PromotableUser, 584 | } 585 | 586 | #[derive(Debug, Serialize, Deserialize)] 587 | pub enum TranslatorType { 588 | #[serde(rename = "none")] 589 | None, 590 | #[serde(rename = "regular")] 591 | Regular, 592 | } 593 | 594 | #[derive(Debug, Serialize, Deserialize)] 595 | pub enum DisplayType { 596 | Tweet, 597 | } 598 | 599 | #[derive(Debug, Serialize, Deserialize)] 600 | pub enum Component { 601 | #[serde(rename = "suggest_topics_module")] 602 | SuggestTopicsModule, 603 | } 604 | 605 | #[derive(Debug, Serialize, Deserialize)] 606 | pub enum Element { 607 | #[serde(rename = "topic")] 608 | Topic, 609 | } 610 | 611 | #[derive(Debug, Serialize, Deserialize)] 612 | pub enum TopicDisplayType { 613 | Pill, 614 | } 615 | 616 | #[derive(Debug, Serialize, Deserialize)] 617 | pub enum TopicFunctionalityType { 618 | Recommendation, 619 | } 620 | 621 | #[derive(Debug, Serialize, Deserialize)] 622 | pub enum FeedbackMetadata { 623 | #[serde(rename = "FcQBOQwA")] 624 | FcQboQwA, 625 | } 626 | 627 | #[derive(Debug, Serialize, Deserialize)] 628 | pub enum EncodedFeedbackRequest { 629 | #[serde(rename = "LBUeHBXEATkMAAAA")] 630 | LbUeHbxeaTkMaaaa, 631 | } 632 | 633 | #[derive(Debug, Serialize, Deserialize)] 634 | pub enum FeedbackType { 635 | RichBehavior, 636 | } 637 | -------------------------------------------------------------------------------- /src/scraper/providers/united_cube.rs: -------------------------------------------------------------------------------- 1 | use std::{env, path::Path, sync::Arc, time::Instant}; 2 | 3 | use async_trait::async_trait; 4 | use chrono::{DateTime, NaiveDateTime, Utc}; 5 | use log::error; 6 | use reqwest::Client; 7 | use serde::{Deserialize, Serialize}; 8 | 9 | use crate::{ 10 | request::{parse_successful_response, request_default_headers, HttpError}, 11 | scheduler::UnscopedLimiter, 12 | scraper::ProviderCredentials, 13 | }; 14 | 15 | use super::*; 16 | 17 | pub struct UnitedCubeArtistFeed { 18 | pub client: Arc, 19 | pub credentials: SharedCredentials, 20 | pub rate_limiter: UnscopedLimiter, 21 | } 22 | 23 | #[async_trait] 24 | impl RateLimitable for UnitedCubeArtistFeed { 25 | async fn wait(&self, _key: &str) -> () { 26 | self.rate_limiter 27 | .until_ready_with_jitter(default_jitter()) 28 | .await 29 | } 30 | } 31 | 32 | #[derive(Serialize)] 33 | struct LoginInput { 34 | refresh_token: Option, 35 | path: String, 36 | id: String, 37 | pw: String, 38 | remember_me: bool, 39 | } 40 | 41 | #[derive(Serialize)] 42 | struct RefreshInput { 43 | refresh_token: String, 44 | } 45 | 46 | #[derive(Deserialize)] 47 | struct GenericError { 48 | message: String, 49 | } 50 | 51 | #[derive(Deserialize)] 52 | struct RefreshResponse { 53 | token: String, 54 | } 55 | 56 | #[derive(Deserialize)] 57 | struct LoginResponse { 58 | // slug: String, 59 | // email: String, 60 | // name: String, 61 | // language: String, 62 | // role_code: String, 63 | token: String, 64 | refresh_token: String, 65 | } 66 | 67 | /// Posts are divided between images and videos 68 | #[derive(Debug, 
Deserialize, Clone)] 69 | #[serde(tag = "type_code", content = "data")] 70 | enum MediaData { 71 | #[serde(rename = "601")] 72 | Image { path: String }, 73 | #[serde(rename = "602")] 74 | Video { url: String, image: String }, 75 | #[serde(rename = "604")] 76 | Post { title: String }, 77 | } 78 | 79 | #[derive(Debug, Deserialize, Clone)] 80 | struct Post { 81 | slug: String, 82 | content: Option, 83 | register_datetime: DateTime, 84 | media: Vec, 85 | } 86 | 87 | #[derive(Debug, Deserialize, Clone)] 88 | struct Page { 89 | has_next: bool, 90 | // has_prev: bool, 91 | // prev_num: null, 92 | page: i32, 93 | next_num: Option, 94 | // pages: i32, 95 | // per_page: i32, 96 | // total: i32, 97 | items: Vec, 98 | } 99 | 100 | const BASE_URL: &str = "https://www.united-cube.com"; 101 | 102 | fn extract_url_and_id(path: &str, base_url: &url::Url) -> anyhow::Result<(url::Url, String)> { 103 | // ucube is missing a leading slash in their links lol 104 | let parsed_relative_url = format!("/{}", &path); 105 | let url = base_url.join(&parsed_relative_url)?; 106 | // .map_err(|result| { 107 | // anyhow::anyhow!(result) 108 | // })?; 109 | // unbelievably big brain conversion 110 | let unique_identifier = Path::new(&parsed_relative_url) 111 | .file_stem() 112 | .and_then(|str| str.to_str().map(|result| result.to_owned())) 113 | .ok_or_else(|| anyhow::anyhow!("Invalid file format: {}", parsed_relative_url))?; 114 | Ok((url, unique_identifier)) 115 | } 116 | 117 | #[async_trait] 118 | impl Provider for UnitedCubeArtistFeed { 119 | fn id(&self) -> AllProviders { 120 | AllProviders::UnitedCubeArtistFeed 121 | } 122 | fn new(input: ProviderInput) -> Self 123 | where 124 | Self: Sized, 125 | { 126 | Self { 127 | client: input.client, 128 | credentials: create_credentials(), 129 | rate_limiter: Self::rate_limiter(), 130 | } 131 | } 132 | 133 | fn requires_auth(&self) -> bool { 134 | true 135 | } 136 | 137 | async fn initialize(&self) -> () { 138 | if self.requires_auth() { 139 | attempt_first_login(self, &self.credentials).await; 140 | } 141 | } 142 | 143 | fn max_page_size(&self) -> PageSize { 144 | PageSize(200) 145 | } 146 | 147 | fn default_page_size(&self) -> PageSize { 148 | PageSize(20) 149 | } 150 | 151 | fn on_error(&self, http_error: &HttpError) -> anyhow::Result { 152 | let err = match http_error { 153 | HttpError::ReqwestError(_err) => return Ok(ProviderErrorHandle::Halt), 154 | HttpError::FailStatus(err) | HttpError::UnexpectedBody(err) => err, 155 | }; 156 | 157 | let body = match serde_json::from_str::(&err.body) { 158 | Err(err) => { 159 | error!("Couldn't parse the response from united_cube"); 160 | eprintln!("{:?}", err); 161 | return Ok(ProviderErrorHandle::Halt); 162 | } 163 | Ok(body) => body, 164 | }; 165 | Ok(if body.message == "Token Expired" && err.code == 400 { 166 | let cred = self.credentials.read().clone(); 167 | ProviderErrorHandle::RefreshToken((cred).unwrap()) 168 | } else { 169 | // I don't think there is any other response you can get if your token is expired 170 | // so we can probably assume that something else has gone wrong 171 | ProviderErrorHandle::Halt 172 | }) 173 | } 174 | 175 | fn from_provider_destination( 176 | &self, 177 | id: &str, 178 | page_size: PageSize, 179 | pagination: Option, 180 | ) -> Result { 181 | // club_id|board_id 182 | let page_id = id.to_string(); 183 | let parts = page_id.split('|').collect::>(); 184 | let board = parts.get(1).unwrap(); 185 | let mut next_url: UrlBuilder = Default::default(); 186 | next_url.params.push(("board", 
board.to_string())); 187 | next_url.page_size("per_page", page_size); 188 | next_url.pagination("page", &pagination); 189 | let url = next_url.build_scrape_url("https://united-cube.com/v1/posts")?; 190 | Ok(url) 191 | } 192 | 193 | async fn unfold(&self, state: ProviderState) -> Result { 194 | let creds = self.credentials.read().clone(); 195 | let credentials = match creds { 196 | Some(c) => c, 197 | None => return Ok(ProviderStep::NotInitialized), 198 | }; 199 | 200 | let token = credentials.access_token.clone(); 201 | let instant = Instant::now(); 202 | 203 | let response = self 204 | .client 205 | .get(&state.url.0) 206 | .headers(request_default_headers()) 207 | .header("Authorization", &format!("Bearer {}", token)) 208 | .send() 209 | .await?; 210 | let elapsed = instant.elapsed(); 211 | let status = response.status(); 212 | 213 | let cube_url = url::Url::parse(BASE_URL).unwrap(); 214 | let response_json = parse_successful_response::(response).await?; 215 | 216 | let account = state 217 | .default_name 218 | .map(|name| ProviderAccount { 219 | name, 220 | avatar_url: None, 221 | }) 222 | .unwrap_or_default(); 223 | let posts = response_json 224 | .items 225 | .into_iter() 226 | .map(|post| { 227 | ProviderPost { 228 | // UCube does not give us any kind of user information 229 | account: account.clone(), 230 | unique_identifier: post.slug, 231 | // TODO: maybe add page urls to this anyways? 232 | // united-cube doesn't have page-specific links, they all go to 233 | // https://www.united-cube.com/club/qXmD_5exRnmZfkFIwR1cVA/board/cHTUTBaRRpqUWAL2c5nQiw#PostDetail 234 | // which is controlled by JS and can't be linked to 235 | url: None, 236 | // This is HTML but who cares 237 | body: post.content, 238 | post_date: Some(post.register_datetime.naive_utc()), 239 | metadata: None, 240 | images: 241 | post.media 242 | .iter() 243 | .filter_map(|media| { 244 | let (_type, media_url, unique_identifier) = match &media { 245 | // we don't care about posts 246 | MediaData::Post { .. } => return None, 247 | // Every video on ucube is (probably) a link to an external youtube video 248 | // but we can't be sure 249 | MediaData::Video { url, .. 
} => { 250 | let is_probably_external_link = url.starts_with("http"); 251 | if is_probably_external_link { 252 | return None; 253 | } 254 | // assuming that a non-external link would follow the same pattern as 255 | match extract_url_and_id(url.as_str(), &cube_url) { 256 | Err(err) => { 257 | error!("Could not convert a non-external ucube video into a relative path"); 258 | error!("{:?}", err); 259 | return None; 260 | } 261 | Ok((url, id)) => { 262 | (ProviderMediaType::Video, url.as_str().to_owned(), id) 263 | } 264 | } 265 | } 266 | MediaData::Image { path } => { 267 | match extract_url_and_id(path.as_str(), &cube_url) { 268 | Err(err) => { 269 | error!("Could not get relative path from a ucube image {}", path); 270 | error!("{:?}", err); 271 | return None; 272 | } 273 | Ok((url, id)) => { 274 | (ProviderMediaType::Image, url.as_str().to_owned(), id) 275 | } 276 | } 277 | } 278 | }; 279 | Some(ProviderMedia { 280 | _type, 281 | media_url, 282 | // same with reference URL 283 | reference_url: None, 284 | metadata: None, 285 | unique_identifier, 286 | }) 287 | }) 288 | .collect::>(), 289 | } 290 | }) 291 | .collect::>(); 292 | 293 | let result = ProviderResult { 294 | posts, 295 | response_code: status, 296 | response_delay: elapsed, 297 | }; 298 | match response_json.next_num { 299 | Some(next) => Ok(ProviderStep::Next(result, Pagination::NextPage(next))), 300 | None => Ok(ProviderStep::End(result)), 301 | } 302 | } 303 | 304 | fn credentials(&self) -> SharedCredentials { 305 | self.credentials.clone() 306 | } 307 | 308 | async fn login(&self) -> Result { 309 | let response = self 310 | .client 311 | .post("https://united-cube.com/v1/auth/login") 312 | .json(&LoginInput { 313 | refresh_token: None, 314 | path: "https://www.united-cube.com/signin".to_owned(), 315 | id: env::var("UNITED_CUBE_EMAIL") 316 | .expect("Tried to login to united_cube without credentials"), 317 | pw: env::var("UNITED_CUBE_PASSWORD").unwrap(), 318 | remember_me: false, 319 | }) 320 | .send() 321 | .await? 322 | .json::() 323 | .await?; 324 | Ok(ProviderCredentials { 325 | access_token: response.token, 326 | refresh_token: response.refresh_token, 327 | }) 328 | } 329 | async fn token_refresh( 330 | &self, 331 | credentials: &ProviderCredentials, 332 | ) -> anyhow::Result { 333 | let refresh_token = credentials.refresh_token.clone(); 334 | let response = self 335 | .client 336 | .post("https://united-cube.com/v1/auth/refresh") 337 | .json(&RefreshInput { 338 | refresh_token: refresh_token.clone(), 339 | }) 340 | .send() 341 | .await? 
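// The refresh endpoint only returns a new access token, so the existing
// refresh_token is carried over unchanged into the rebuilt credentials below.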
342 | .json::() 343 | .await?; 344 | Ok(CredentialRefresh::Result(ProviderCredentials { 345 | access_token: response.token, 346 | refresh_token, 347 | })) 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /src/scraper/providers/weverse.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::{env, iter::FromIterator, sync::Arc, time::Instant}; 3 | 4 | use async_trait::async_trait; 5 | use bimap::{BiHashMap, BiMap}; 6 | use chrono::{DateTime, NaiveDateTime, Utc}; 7 | use governor::Quota; 8 | use lazy_static::lazy_static; 9 | use log::info; 10 | use rand::rngs::OsRng; 11 | use regex::Regex; 12 | use reqwest::Client; 13 | use rsa::{PaddingScheme, PublicKey, RSAPublicKey}; 14 | use serde::{Deserialize, Serialize}; 15 | use sha1::Sha1; 16 | 17 | use crate::{ 18 | request::{parse_successful_response, request_default_headers, HttpError}, 19 | scheduler::UnscopedLimiter, 20 | scraper::{providers::ProviderMediaType, ProviderMedia, ProviderResult}, 21 | }; 22 | 23 | use super::*; 24 | 25 | /// https://gist.github.com/Xetera/aa59e84f3959a37c16a3309b5d9ab5a0 26 | async fn get_public_key(client: &Client) -> Result { 27 | let login_page = client 28 | .post("https://account.weverse.io/login/auth?client_id=weverse-test&hl=en") 29 | .send() 30 | .await? 31 | .text() 32 | .await?; 33 | let regex = Regex::new(r"/(static/js/main\..*.js)").unwrap(); 34 | let js_bundle_captures = regex.captures(&login_page).ok_or( 35 | ProviderFailure::Other( 36 | "Could not find a bundle matching the regex '/(static/js/main\\..*.js)' in the weverse login page".to_owned() 37 | ) 38 | )?; 39 | 40 | let js_name = js_bundle_captures 41 | .get(1) 42 | .ok_or(ProviderFailure::Other( 43 | "Couldn't match a main js bundle on account.weverse.io, the site was changed" 44 | .to_owned(), 45 | ))? 
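// Pull the bundle path out of the first capture group; the RSA public key used
// to encrypt the login password is embedded inside that main.*.js bundle.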
46 | .as_str(); 47 | let js_bundle_url = format!("https://account.weverse.io/{}", js_name); 48 | let js_bundle = client.get(&js_bundle_url).send().await?.text().await?; 49 | let rsa_captures = 50 | Regex::new(r"(-----BEGIN RSA PUBLIC KEY-----(.|\n)+----END RSA PUBLIC KEY-----)") 51 | .unwrap() 52 | .captures(&js_bundle) 53 | .ok_or(ProviderFailure::Other(format!( 54 | "Could not find RSA key in {}", 55 | &js_bundle_url 56 | )))?; 57 | 58 | let rsa_key = rsa_captures.get(1).unwrap().as_str().to_owned(); 59 | 60 | let der_encoded = rsa_key 61 | .replace("\\n", "\n") 62 | .lines() 63 | .filter(|line| !line.starts_with('-')) 64 | .fold(String::new(), |mut data, line| { 65 | data.push_str(line); 66 | data 67 | }); 68 | 69 | let der_bytes = base64::decode(&der_encoded).expect("failed to decode base64 content"); 70 | let public_key = RSAPublicKey::from_pkcs8(&der_bytes).expect("failed to parse key"); 71 | Ok(public_key) 72 | } 73 | 74 | fn encrypted_password( 75 | password: String, 76 | public_key: RSAPublicKey, 77 | ) -> Result { 78 | let mut rng = OsRng; 79 | let padding = PaddingScheme::new_oaep::(); 80 | let encrypted = match public_key.encrypt(&mut rng, padding, password.as_bytes()) { 81 | Err(error) => return Err(ProviderFailure::Other(error.to_string())), 82 | Ok(ok) => ok, 83 | }; 84 | Ok(base64::encode(encrypted)) 85 | } 86 | 87 | #[derive(Serialize)] 88 | struct WeverseLoginRequest { 89 | grant_type: String, 90 | client_id: String, 91 | username: String, 92 | password: String, 93 | } 94 | 95 | #[derive(Debug, Deserialize)] 96 | pub struct WeverseLoginResponse { 97 | refresh_token: String, 98 | access_token: String, 99 | } 100 | 101 | async fn get_access_token( 102 | email: String, 103 | encrypted_password: String, 104 | client: &Client, 105 | ) -> Result { 106 | Ok(client 107 | .post("https://accountapi.weverse.io/api/v1/oauth/token") 108 | .json(&WeverseAuthorizeInput::Login { 109 | grant_type: "password".to_owned(), 110 | client_id: "weverse-test".to_owned(), 111 | username: email, 112 | password: encrypted_password, 113 | }) 114 | .send() 115 | .await? 116 | .json::() 117 | .await?) 
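// A successful login response carries both an access token and a refresh token,
// which fetch_weverse_auth_token passes along as ProviderCredentials.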
118 | } 119 | 120 | pub async fn fetch_weverse_auth_token( 121 | client: &Client, 122 | ) -> Result, ProviderFailure> { 123 | match ( 124 | env::var("WEVERSE_ACCESS_TOKEN"), 125 | env::var("WEVERSE_EMAIL"), 126 | env::var("WEVERSE_PASSWORD"), 127 | ) { 128 | (Ok(access_token), _, _) => { 129 | info!("An existing weverse token was found"); 130 | Ok(Some(ProviderCredentials { 131 | access_token, 132 | refresh_token: "".to_owned(), 133 | })) 134 | } 135 | (_, Ok(email), Ok(password)) => { 136 | info!("Detected weverse credentials, attempting to login..."); 137 | let public_key = get_public_key(client).await?; 138 | let encrypted = encrypted_password(password, public_key)?; 139 | let token = get_access_token(email, encrypted, client).await?; 140 | Ok(Some(ProviderCredentials { 141 | access_token: token.access_token, 142 | refresh_token: token.refresh_token, 143 | })) 144 | } 145 | _ => { 146 | info!("Weverse credentials missing, not initializing Weverse module"); 147 | Ok(None) 148 | } 149 | } 150 | } 151 | 152 | #[derive(Debug, Clone, Deserialize)] 153 | #[serde(rename_all = "camelCase")] 154 | pub struct WeversePhoto { 155 | id: u64, 156 | org_img_url: String, 157 | org_img_height: u32, 158 | org_img_width: u32, 159 | thumbnail_img_url: String, 160 | post_id: u64, 161 | } 162 | 163 | #[derive(Debug, Deserialize)] 164 | #[serde(rename_all = "camelCase")] 165 | pub struct WeversePost { 166 | id: u64, 167 | // community: WeverseCommunity, 168 | body: Option, 169 | community_user: WeverseCommunityUser, 170 | photos: Option>, 171 | created_at: DateTime, 172 | } 173 | 174 | #[derive(Debug, Deserialize)] 175 | #[serde(rename_all = "camelCase")] 176 | pub struct WeverseCommunityUser { 177 | community_id: u32, 178 | artist_id: u32, 179 | profile_img_path: String, 180 | profile_nickname: String, 181 | } 182 | 183 | #[derive(Debug, Serialize)] 184 | struct PostMetadata { 185 | author_id: u32, 186 | author_name: String, 187 | } 188 | 189 | #[derive(Debug, Serialize)] 190 | pub struct ImageMetadata { 191 | height: u32, 192 | width: u32, 193 | thumbnail_url: String, 194 | } 195 | 196 | #[derive(Debug, Serialize)] 197 | enum WeverseAuthorizeInput { 198 | TokenRefresh { 199 | client_id: String, 200 | grant_type: String, 201 | refresh_token: String, 202 | }, 203 | Login { 204 | client_id: String, 205 | grant_type: String, 206 | username: String, 207 | password: String, 208 | }, 209 | } 210 | 211 | #[derive(Debug, Deserialize)] 212 | struct WeverseAuthorizeResponse { 213 | access_token: String, 214 | token_type: String, 215 | expires_in: i32, 216 | refresh_token: String, 217 | } 218 | 219 | #[derive(Debug, Serialize)] 220 | struct WeverseTokenRefreshInput {} 221 | 222 | #[derive(Debug, Deserialize)] 223 | #[serde(rename_all = "camelCase")] 224 | pub struct WeversePage { 225 | is_ended: bool, 226 | last_id: Option, 227 | posts: Vec, 228 | } 229 | 230 | // #[derive(Clone)] 231 | pub struct WeverseArtistFeed { 232 | pub client: Arc, 233 | pub credentials: SharedCredentials, 234 | pub rate_limiter: UnscopedLimiter, 235 | } 236 | 237 | lazy_static! 
{ 238 | static ref ARTIST_MAPPINGS: BiMap = 239 | BiHashMap::from_iter([(14, "dreamcatcher"), (10, "sunmi")]); 240 | } 241 | 242 | fn url_from_post(artist_id: u32, post_id: u64, photo_id: u64) -> String { 243 | let artist_name = ARTIST_MAPPINGS 244 | .get_by_left(&artist_id) 245 | .expect(&format!("Weverse ID {} is not a valid mapping", artist_id)); 246 | format!( 247 | "https://weverse.io/{}/artist/{}?photoId={}", 248 | artist_name, post_id, photo_id 249 | ) 250 | } 251 | 252 | const MAX_PAGESIZE: usize = 30; 253 | // weverse is stupid and uses a 16 page default pagesize 254 | const DEFAULT_PAGESIZE: usize = 16; 255 | 256 | #[async_trait] 257 | impl RateLimitable for WeverseArtistFeed { 258 | fn quota() -> Quota 259 | where 260 | Self: Sized, 261 | { 262 | default_quota() 263 | } 264 | async fn wait(&self, _key: &str) -> () { 265 | self.rate_limiter 266 | .until_ready_with_jitter(default_jitter()) 267 | .await 268 | } 269 | } 270 | 271 | #[async_trait] 272 | impl Provider for WeverseArtistFeed { 273 | fn new(input: ProviderInput) -> Self 274 | where 275 | Self: Sized, 276 | { 277 | Self { 278 | credentials: create_credentials(), 279 | client: Arc::clone(&input.client), 280 | rate_limiter: Self::rate_limiter(), 281 | } 282 | } 283 | fn id(&self) -> AllProviders { 284 | AllProviders::WeverseArtistFeed 285 | } 286 | 287 | fn requires_auth(&self) -> bool { 288 | true 289 | } 290 | 291 | async fn initialize(&self) -> () { 292 | attempt_first_login(self, &self.credentials).await; 293 | } 294 | 295 | fn max_page_size(&self) -> PageSize { 296 | PageSize(MAX_PAGESIZE) 297 | } 298 | 299 | fn default_page_size(&self) -> PageSize { 300 | PageSize(DEFAULT_PAGESIZE) 301 | } 302 | 303 | // async fn canonical_url_to_id(&self, url: &str) -> CanonicalUrlResolution { 304 | // let res = match self.client.get("https://weverse.io").send().await { 305 | // Ok(res) => res, 306 | // Err(err) => { 307 | // println!("{:?}", err); 308 | // return CanonicalUrlResolution::Fail 309 | // } 310 | // }; 311 | // let html = match res.text().await { 312 | // Ok(ok) => ok, 313 | // Err(err) => return CanonicalUrlResolution::Fail, 314 | // }; 315 | // let regex = Regex::new(r"/(communitiesInfo\s*?=\s*?(\[.*?\]))").unwrap(); 316 | // } 317 | 318 | fn from_provider_destination( 319 | &self, 320 | id: &str, 321 | page_size: PageSize, 322 | pagination: Option, 323 | ) -> Result { 324 | let mut next_url = UrlBuilder::default(); 325 | next_url.page_size("pageSize", page_size); 326 | next_url.pagination("from", &pagination); 327 | let url = next_url.build_scrape_url(&format!( 328 | "https://weversewebapi.weverse.io/wapi/v1/communities/{}/posts/artistTab", 329 | id 330 | ))?; 331 | Ok(url) 332 | } 333 | 334 | fn max_pagination(&self) -> u16 { 335 | 100 336 | } 337 | 338 | async fn unfold(&self, state: ProviderState) -> Result { 339 | let credentials = self.credentials.read().clone(); 340 | // let token = "".to_owned(); 341 | let token = match credentials { 342 | Some(token) => token.access_token, 343 | None => return Ok(ProviderStep::NotInitialized), 344 | }; 345 | 346 | let instant = Instant::now(); 347 | let response = self 348 | .client 349 | .get(&state.url.0) 350 | .headers(request_default_headers()) 351 | .header("Authorization", format!("Bearer {}", token)) 352 | .send() 353 | .await?; 354 | 355 | let response_code = response.status(); 356 | let response_delay = instant.elapsed(); 357 | let response_json = parse_successful_response::(response).await?; 358 | let posts = response_json 359 | .posts 360 | .into_iter() 361 | 
.map(|post| { 362 | let community_id = post.community_user.community_id.to_owned(); 363 | let post_id = post.id; 364 | let user = post.community_user; 365 | let author_name = user.profile_nickname; 366 | let author_id = user.artist_id; 367 | let post_created_at = post.created_at; 368 | let photos = post.photos.unwrap_or_default(); 369 | let page_url = photos 370 | .get(0) 371 | .map(|photo| url_from_post(community_id, post_id, photo.id)); 372 | ProviderPost { 373 | account: ProviderAccount { 374 | avatar_url: Some(user.profile_img_path), 375 | name: author_name.clone(), 376 | }, 377 | unique_identifier: post_id.to_string(), 378 | metadata: serde_json::to_value(PostMetadata { 379 | author_id, 380 | author_name, 381 | }) 382 | .ok(), 383 | body: post.body, 384 | url: page_url, 385 | post_date: Some(post_created_at.naive_utc()), 386 | images: photos 387 | .into_iter() 388 | .map(|photo| { 389 | ProviderMedia { 390 | // should be unique across all of weverse 391 | _type: ProviderMediaType::Image, 392 | unique_identifier: photo.id.to_string(), 393 | media_url: photo.org_img_url.clone(), 394 | reference_url: Some(url_from_post(community_id, post_id, photo.id)), 395 | metadata: serde_json::to_value(ImageMetadata { 396 | height: photo.org_img_height, 397 | width: photo.org_img_width, 398 | thumbnail_url: photo.thumbnail_img_url, 399 | }) 400 | .ok(), 401 | } 402 | }) 403 | // not sure why I have to do this here 404 | .collect::>(), 405 | } 406 | }) 407 | .collect::>(); 408 | // weverse omits last_id when at the end of the content 409 | let has_more = !response_json.is_ended || response_json.last_id.is_some(); 410 | let result = ProviderResult { 411 | posts, 412 | response_code, 413 | response_delay, 414 | }; 415 | if has_more { 416 | return Ok(ProviderStep::Next( 417 | result, 418 | Pagination::NextCursor(response_json.last_id.unwrap().to_string()), 419 | )); 420 | } 421 | Ok(ProviderStep::End(result)) 422 | } 423 | 424 | fn on_error(&self, http_error: &HttpError) -> anyhow::Result { 425 | match http_error { 426 | HttpError::FailStatus(err) | HttpError::UnexpectedBody(err) => { 427 | // :) I don't actually know if weverse returns a 401 on expired tokens 428 | // but I can't test because their tokens last for 6 ENTIRE months!!!! 429 | if err.code == 401 || err.code == 403 { 430 | let handle = self 431 | .credentials 432 | .clone() 433 | .try_read() 434 | .map_or(ProviderErrorHandle::Login, |creds| { 435 | ProviderErrorHandle::RefreshToken(creds.clone().unwrap()) 436 | }); 437 | return Ok(handle); 438 | } 439 | Ok(ProviderErrorHandle::Halt) 440 | } 441 | _ => Ok(ProviderErrorHandle::Halt), 442 | } 443 | } 444 | async fn token_refresh( 445 | &self, 446 | credentials: &ProviderCredentials, 447 | ) -> anyhow::Result { 448 | let input = WeverseAuthorizeInput::TokenRefresh { 449 | grant_type: "refresh_token".to_owned(), 450 | client_id: "weverse-test".to_owned(), 451 | refresh_token: credentials.refresh_token.clone(), 452 | }; 453 | let out = self 454 | .client 455 | .post("https://accountapi.weverse.io/api/v1/oauth/token") 456 | .json(&input) 457 | .send() 458 | .await? 459 | .json::() 460 | .await?; 461 | let credentials_result = ProviderCredentials { 462 | access_token: out.access_token, 463 | refresh_token: out.refresh_token, 464 | }; 465 | Ok(CredentialRefresh::Result(credentials_result)) 466 | } 467 | async fn login(&self) -> Result { 468 | let credentials = fetch_weverse_auth_token(&self.client) 469 | .await? 
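// fetch_weverse_auth_token yields None when neither WEVERSE_ACCESS_TOKEN nor the
// WEVERSE_EMAIL / WEVERSE_PASSWORD pair is configured, so this expect panics if
// the weverse module is asked to log in without any credentials present.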
470 | .expect("Tried to authorize weverse module but the login credentials were not found"); 471 | Ok(credentials) 472 | } 473 | fn credentials(&self) -> SharedCredentials { 474 | self.credentials.clone() 475 | } 476 | } 477 | -------------------------------------------------------------------------------- /src/scraper/scraper.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use async_recursion::async_recursion; 4 | use chrono::{NaiveDateTime, Utc}; 5 | use futures::StreamExt; 6 | use log::{debug, error, info, trace}; 7 | use crate::api::v1::ProviderStat; 8 | 9 | use crate::scraper::{ 10 | providers::{CredentialRefresh, ProviderErrorHandle}, 11 | ProviderPost, 12 | }; 13 | use crate::scraper::scraper::ScraperErrorHandleDecision::{Continue, MaxLoginAttempts}; 14 | 15 | use super::{ 16 | providers::{Provider, ProviderFailure, ProviderState, ProviderStep, ScrapeRequestInput}, 17 | ProviderCredentials, ProviderResult, ScopedProvider, 18 | }; 19 | 20 | #[derive(Debug)] 21 | pub struct Scrape<'a> { 22 | pub provider: &'a ScopedProvider, 23 | pub requests: Vec, 24 | } 25 | 26 | impl Scrape<'_> { 27 | pub fn discovered_new_images(&self) -> bool { 28 | let step = match self.requests.get(0) { 29 | None => return false, 30 | Some(req) => &req.step, 31 | }; 32 | match step { 33 | ScraperStep::Data(data) => !data.posts.is_empty(), 34 | ScraperStep::Error(_) => false, 35 | } 36 | } 37 | } 38 | 39 | #[derive(Debug)] 40 | pub struct ScrapeRequest { 41 | pub date: NaiveDateTime, 42 | pub step: ScraperStep, 43 | } 44 | 45 | #[derive(Debug)] 46 | enum InternalScraperStep { 47 | Data(ProviderResult), 48 | Error(ProviderFailure), 49 | Exit, 50 | } 51 | 52 | #[derive(Debug)] 53 | pub enum ScraperStep { 54 | Data(ProviderResult), 55 | // we only want to forward request related errors to the consumer 56 | Error(ProviderFailure), 57 | } 58 | 59 | enum ScraperErrorHandleDecision { 60 | Continue, 61 | MaxLoginAttempts(u32), 62 | } 63 | 64 | fn write_provider_credentials(provider: &dyn Provider, credentials: ProviderCredentials) { 65 | let creds = provider.credentials(); 66 | let mut credential_ref = creds.write(); 67 | *credential_ref = Some(credentials); 68 | } 69 | 70 | fn should_continue_requests(state: &ProviderState, provider: &dyn Provider) -> ScraperErrorHandleDecision { 71 | let max_attempts = provider.max_login_attempts(); 72 | if state.login_attempts > max_attempts { 73 | error!("Failed to login to {} after {} attempts. 
Giving up.", provider.id().to_string(), max_attempts); 74 | return MaxLoginAttempts(max_attempts); 75 | } 76 | return Continue; 77 | } 78 | 79 | #[async_recursion] 80 | async fn request_page<'a>( 81 | sp: &'a ScopedProvider, 82 | provider: &dyn Provider, 83 | state: ProviderState, 84 | input: &ScrapeRequestInput, 85 | ) -> (InternalScraperStep, Option) { 86 | let iteration = state.iteration; 87 | let error_step = |error| { 88 | debug!("Exiting scrape due to an error {:?}", error); 89 | (InternalScraperStep::Error(error), None) 90 | }; 91 | let give_up = (InternalScraperStep::Exit, None); 92 | let write_credentials_and_continue = |creds: ProviderCredentials| { 93 | write_provider_credentials(provider, creds); 94 | let new_state = ProviderState { 95 | login_attempts: state.login_attempts + 1, 96 | ..state.clone() 97 | }; 98 | request_page(sp, provider, new_state, input) 99 | }; 100 | match provider.unfold(state.to_owned()).await { 101 | // we have to indicate an error to the consumer and stop iteration on the next cycle 102 | Err(error) => match &error { 103 | ProviderFailure::HttpError(http_error) => match provider.on_error(http_error) { 104 | Ok(ProviderErrorHandle::Halt) => error_step(error), 105 | Ok(ProviderErrorHandle::Login) => { 106 | if let MaxLoginAttempts(count) = should_continue_requests(&state, provider) { 107 | error!("Too many login attempts ({}) for {}. Giving up", count, provider.id().to_string()); 108 | return give_up; 109 | } 110 | debug!("Triggering login flow for {}", provider.id().to_string()); 111 | match provider.login().await { 112 | Ok(credentials) => write_credentials_and_continue(credentials).await, 113 | Err(error) => error_step(error), 114 | } 115 | } 116 | Ok(ProviderErrorHandle::RefreshToken(credentials)) => { 117 | if let MaxLoginAttempts(count) = should_continue_requests(&state, provider) { 118 | error!("Too many login attempts ({}) for {}. Giving up", count, provider.id().to_string()); 119 | return give_up; 120 | } 121 | debug!( 122 | "Triggering token refresh flow for {}", 123 | provider.id().to_string() 124 | ); 125 | match provider.token_refresh(&credentials).await { 126 | Ok(CredentialRefresh::Result(credentials)) => { 127 | write_credentials_and_continue(credentials).await 128 | } 129 | Ok(CredentialRefresh::TryLogin) => { 130 | debug!("Triggering login flow for {}", provider.id().to_string()); 131 | match provider.login().await { 132 | Ok(credentials) => { 133 | write_credentials_and_continue(credentials).await 134 | } 135 | Err(err) => { 136 | debug!( 137 | "Error trying to login to {}: {:?}", 138 | provider.id().to_string(), 139 | err 140 | ); 141 | error_step(error) 142 | } 143 | } 144 | } 145 | Ok(CredentialRefresh::Halt) => error_step(error), 146 | _ => error_step(error), 147 | } 148 | } 149 | _ => error_step(error), 150 | }, 151 | // TODO: reduce this nested boilerplate by implementing [From] for a result type? 
152 | _ => error_step(error), 153 | }, 154 | Ok(ProviderStep::End(result)) => (InternalScraperStep::Data(result), None), 155 | Ok(ProviderStep::NotInitialized) => { 156 | info!( 157 | "Skipping {} because the provider was not initialized", 158 | provider.id().to_string() 159 | ); 160 | (InternalScraperStep::Exit, None) 161 | } 162 | Ok(ProviderStep::Next(result, pagination)) => { 163 | let page_size = if input.is_first_scrape { 164 | provider.max_page_size() 165 | } else { 166 | provider.default_page_size() 167 | }; 168 | 169 | let id = sp.destination.clone(); 170 | let maybe_next_url = 171 | provider.from_provider_destination(&id, page_size, Some(pagination.clone())); 172 | 173 | match maybe_next_url { 174 | Err(err) => error_step(err), 175 | Ok(url) => { 176 | let next_state = ProviderState { 177 | id, 178 | default_name: input.default_name.clone(), 179 | url, 180 | pagination: Some(pagination), 181 | iteration: iteration + 1, 182 | ..state.clone() 183 | }; 184 | (InternalScraperStep::Data(result), Some(next_state)) 185 | } 186 | } 187 | } 188 | } 189 | } 190 | 191 | pub async fn scrape<'a>( 192 | sp: &'a ScopedProvider, 193 | provider: &dyn Provider, 194 | input: &ScrapeRequestInput, 195 | ) -> Result, ProviderFailure> { 196 | let initial_iteration = 0; 197 | let page_size = if input.is_first_scrape { 198 | provider.max_page_size() 199 | } else { 200 | provider.default_page_size() 201 | }; 202 | let id = sp.destination.clone(); 203 | let url = provider.from_provider_destination(&id, page_size.to_owned(), None)?; 204 | 205 | let seed = ProviderState { 206 | login_attempts: 0, 207 | id: id.clone(), 208 | default_name: input.default_name.clone(), 209 | url, 210 | pagination: None, 211 | iteration: initial_iteration, 212 | }; 213 | 214 | let mut steps = futures::stream::unfold(Some(seed), |maybe_state| async { 215 | let state = maybe_state?; 216 | info!("Scraping URL: {:?}", state.url.0); 217 | Some(request_page(sp, provider, state, input).await) 218 | }) 219 | .boxed(); 220 | 221 | let mut scrape_requests: Vec = vec![]; 222 | let scrape_start = Instant::now(); 223 | 224 | while let Some(step) = steps.next().await { 225 | let date = Utc::now().naive_utc(); 226 | match step { 227 | InternalScraperStep::Exit => break, 228 | InternalScraperStep::Error(error) => { 229 | scrape_requests.push(ScrapeRequest { 230 | date, 231 | step: ScraperStep::Error(error), 232 | }); 233 | // no reason to continue scraping after an error 234 | break; 235 | } 236 | InternalScraperStep::Data(page) => { 237 | let total_found_images = page.posts.iter().flat_map(|p| &p.images).count(); 238 | let mut posts: Vec = vec![]; 239 | for post in page.posts { 240 | // it SHOULDN'T be possible for us to have seen a post and only 241 | // have it partially saved... 
This should be good enough
242 |                     // This does sadly break the debugging process if you're deleting images
243 |                     // from the db and re-scraping to trigger things
244 |                     let known_image = post
245 |                         .images
246 |                         .iter()
247 |                         .find(|image| input.latest_data.contains(&image.unique_identifier));
248 |                     if let Some(image) = known_image {
249 |                         debug!(
250 |                             "Reached last known image id for {}: {}",
251 |                             sp, image.unique_identifier
252 |                         );
253 |                         break;
254 |                     }
255 |                     posts.push(post)
256 |                 }
257 |                 let new_post_count = posts.len();
258 |                 info!("Found {} new posts in {}", new_post_count, sp);
259 | 
260 |                 scrape_requests.push(ScrapeRequest {
261 |                     date,
262 |                     step: ScraperStep::Data(ProviderResult { posts, ..page }),
263 |                 });
264 | 
265 |                 if new_post_count == 0 {
266 |                     info!(
267 |                         "[{}] has finished crawling because it's back to the last scraped data point",
268 |                         sp
269 |                     );
270 |                     break;
271 |                 }
272 |                 let pagination_limit = provider.max_pagination();
273 |                 // only check the pagination limit if there's at least one image
274 |                 // that's been scraped in the past
275 |                 if !input.latest_data.is_empty() && scrape_requests.len() as u16 > pagination_limit
276 |                 {
277 |                     info!(
278 |                         "[{}] has reached its pagination limit of {}",
279 |                         sp, pagination_limit
280 |                     );
281 |                     break;
282 |                 }
283 |                 trace!("Waiting for provider rate limit...");
284 |                 provider.wait(&sp.destination).await;
285 |             }
286 |         }
287 |     }
288 |     let scrape_count = scrape_requests.len();
289 |     info!(
290 |         "[{}] finished scraping in {:?} after {} request{}",
291 |         sp,
292 |         scrape_start.elapsed(),
293 |         scrape_count,
294 |         if scrape_count != 1 { "s" } else { "" }
295 |     );
296 |     Ok(Scrape {
297 |         provider: sp,
298 |         requests: scrape_requests,
299 |     })
300 | }
301 | 
--------------------------------------------------------------------------------
/src/server.rs:
--------------------------------------------------------------------------------
1 | use std::convert::Infallible;
2 | use std::net::SocketAddr;
3 | use std::ops::Sub;
4 | use std::sync::Arc;
5 | 
6 | use axum::body::{Bytes, Full};
7 | use axum::extract::Extension;
8 | use axum::http::Response;
9 | use axum::response::IntoResponse;
10 | use axum::routing::{get, post};
11 | use axum::{AddExtensionLayer, Json, Router};
12 | use chrono::{Duration, NaiveDate, NaiveDateTime, Utc};
13 | use log::{debug, error, info};
14 | use num_traits::ToPrimitive;
15 | use reqwest::StatusCode;
16 | use serde::Serialize;
17 | use serde_json::json;
18 | use sqlx::types::BigDecimal;
19 | 
20 | use crate::api::v1::providers::v1_add_provider;
21 | use crate::api::v1::{v1_provider_stats, v1_scheduled_scrapes, v1_scrape_history};
22 | use crate::api::{AppError, Context};
23 | use crate::db::{latest_requests, Database};
24 | use crate::scraper::ProviderMap;
25 | 
26 | struct ScheduledProvider {
27 |     id: i32,
28 |     url: String,
29 |     name: String,
30 |     destination: String,
31 |     priority: BigDecimal,
32 |     tokens: BigDecimal,
33 |     default_name: Option<String>,
34 |     last_queue: Option<NaiveDateTime>,
35 |     metadata: Option<serde_json::Value>,
36 | }
37 | 
38 | #[derive(Serialize)]
39 | struct ScheduleResponse {
40 |     id: i32,
41 |     provider: String,
42 |     url: String,
43 |     destination: String,
44 |     wait_days: i16,
45 |     metadata: Option<serde_json::Value>,
46 |     name: String,
47 | }
48 | 
49 | #[deprecated]
50 | async fn scheduled_scrapes(
51 |     Extension(state): Extension<Arc<Context>>,
52 | ) -> Result<Json<Vec<ScheduleResponse>>, AppError> {
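    // Fetch every provider_resource along with its optional amqp_source metadata in one query;
    // rows queued within the last 24 hours are treated as already being scraped today.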
53 |     let rows = sqlx::query_as!(
54 |         ScheduledProvider,
55 |         "SELECT pr.id, pr.priority, pr.name, pr.destination, pr.url, pr.tokens, pr.last_queue, pr.default_name, (
56 |             SELECT metadata FROM amqp_source where provider_destination = pr.destination and provider_name = pr.name
57 |         ) as metadata FROM provider_resource pr"
58 |     )
59 |     .fetch_all(&*state.db)
60 |     .await?;
61 |     let (today, later): (Vec<ScheduledProvider>, Vec<ScheduledProvider>) =
62 |         rows.into_iter().partition(|e| {
63 |             let now = Utc::now().naive_utc();
64 |             // anything that was queued in the last 24 hours is already being scraped;
65 |             // this isn't perfectly accurate, but we only need a general idea,
66 |             // not precision
67 |             e.last_queue
68 |                 .map(|last_queue| {
69 |                     let yesterday = now - Duration::hours(24);
70 |                     last_queue > yesterday
71 |                 })
72 |                 .unwrap_or(false)
73 |         });
74 |     let labeled = later
75 |         .into_iter()
76 |         .map(|row| {
77 |             let wait_days = (1f32 / (row.priority + row.tokens)
78 |                 .to_f32()
79 |                 .unwrap_or(0f32))
80 |             .floor() as i16;
81 |             ScheduleResponse {
82 |                 destination: row.destination,
83 |                 provider: row.name,
84 |                 id: row.id,
85 |                 url: row.url,
86 |                 wait_days,
87 |                 metadata: row.metadata,
88 |                 name: row.default_name.unwrap_or_default(),
89 |             }
90 |         })
91 |         .collect::<Vec<ScheduleResponse>>();
92 |     let mut out = today
93 |         .into_iter()
94 |         .map(|t| ScheduleResponse {
95 |             destination: t.destination,
96 |             provider: t.name,
97 |             id: t.id,
98 |             url: t.url,
99 |             wait_days: 0,
100 |             metadata: t.metadata,
101 |             name: t.default_name.unwrap_or_default(),
102 |         })
103 |         .collect::<Vec<ScheduleResponse>>();
104 |     out.extend(labeled);
105 |     Ok(Json(out))
106 | }
107 | 
108 | pub async fn run_server(db: Arc<Database>, provider_map: Arc<ProviderMap>, port: u16) {
109 |     info!("Starting server");
110 |     let ctx = Arc::new(Context {
111 |         db: Arc::clone(&db),
112 |         providers: provider_map,
113 |     });
114 |     let router = Router::new()
115 |         .route("/schedule", get(scheduled_scrapes))
116 |         .route("/v1/schedule", get(v1_scheduled_scrapes))
117 |         .route("/v1/history", get(v1_scrape_history))
118 |         .route("/v1/provider", post(v1_add_provider))
119 |         .route("/v1/stats", get(v1_provider_stats))
120 |         .layer(AddExtensionLayer::new(ctx));
121 |     let addr = SocketAddr::from(([0, 0, 0, 0], port));
122 |     axum::Server::bind(&addr)
123 |         .serve(router.into_make_service())
124 |         .await
125 |         .unwrap();
126 | }
127 | 
--------------------------------------------------------------------------------