├── .dockerignore
├── .env.example
├── .github
│   └── workflows
│       └── build.yaml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── Dockerfile
├── LICENSE.md
├── README.md
├── assets
│   ├── scrape_interval.png
│   └── scraping_history.png
├── docker-compose.yaml
├── migrations
│   ├── 20210722150010_initial_migration.down.sql
│   ├── 20210722150010_initial_migration.up.sql
│   ├── 20211111232248_scrape_resource_priority.down.sql
│   ├── 20211111232248_scrape_resource_priority.up.sql
│   ├── 20211121211206_scraped_at.down.sql
│   ├── 20211121211206_scraped_at.up.sql
│   ├── 20211211152424_unique_amqp_source.down.sql
│   ├── 20211211152424_unique_amqp_source.up.sql
│   ├── 20211211152751_populate_amqp_source.down.sql
│   ├── 20211211152751_populate_amqp_source.up.sql
│   ├── 20211221055954_official_source.down.sql
│   ├── 20211221055954_official_source.up.sql
│   ├── 20211221061158_official_source_nullable.down.sql
│   └── 20211221061158_official_source_nullable.up.sql
├── sqlx-data.json
└── src
    ├── api
    │   ├── mod.rs
    │   └── v1
    │       ├── mod.rs
    │       ├── providers.rs
    │       └── stats.rs
    ├── db.rs
    ├── dispatcher
    │   ├── amqp.rs
    │   ├── discord.rs
    │   ├── dispatcher.rs
    │   └── mod.rs
    ├── lib.rs
    ├── main.rs
    ├── models.rs
    ├── request.rs
    ├── scheduler
    │   ├── mod.rs
    │   ├── priority.rs
    │   ├── rate_limiter.rs
    │   └── scheduler.rs
    ├── scraper
    │   ├── mod.rs
    │   ├── providers
    │   │   ├── mod.rs
    │   │   ├── pinterest.rs
    │   │   ├── providers.rs
    │   │   ├── twitter.rs
    │   │   ├── twitter_types.rs
    │   │   ├── united_cube.rs
    │   │   └── weverse.rs
    │   └── scraper.rs
    └── server.rs
/.dockerignore:
--------------------------------------------------------------------------------
1 | target
2 | .vscode
3 | dist
4 | manifests
5 | Dockerfile
6 | docker-compose.yaml
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | DATABASE_URL=postgres://postgres:password@localhost:5431/jiu
2 | USER_AGENT="Jiu Scraper (https://github.com/xetera/jiu)"
3 | WEVERSE_EMAIL=
4 | WEVERSE_PASSWORD=
--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
1 | name: ci
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - 'main'
7 |
8 | jobs:
9 |   docker:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v2
13 |       - name: Set up QEMU
14 |         uses: docker/setup-qemu-action@v1
15 |       - name: Set up Docker Buildx
16 |         uses: docker/setup-buildx-action@v1
17 |       - name: Login to DockerHub
18 |         uses: docker/login-action@v1
19 |         with:
20 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
21 |           password: ${{ secrets.DOCKERHUB_PASSWORD }}
22 |       - name: Cache Docker layers
23 |         uses: actions/cache@v2
24 |         with:
25 |           path: /tmp/.buildx-cache
26 |           key: ${{ runner.os }}-buildx-${{ github.sha }}
27 |           restore-keys: |
28 |             ${{ runner.os }}-buildx-
29 |       - name: Build and push
30 |         uses: docker/build-push-action@v2
31 |         with:
32 |           context: .
33 |           push: true
34 |           tags: xetera/jiu:latest
35 |           cache-from: type=local,src=/tmp/.buildx-cache
36 |           cache-to: type=local,dest=/tmp/.buildx-cache-new
37 |         # This ugly bit is necessary if you don't want your cache to grow forever
38 |         # till it hits GitHub's limit of 5GB.
39 |         # Temp fix
40 |         # https://github.com/docker/build-push-action/issues/252
41 |         # https://github.com/moby/buildkit/issues/1896
42 |       - name: Move cache
43 |         run: |
44 |           rm -rf /tmp/.buildx-cache
45 |           mv /tmp/.buildx-cache-new /tmp/.buildx-cache
46 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 | **/*.rs.bk
3 | **/dist
4 | .vscode
5 | env.toml
6 | seed.sql
7 | .env
8 | .idea
9 | *.dump
10 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "jiu"
3 | version = "0.1.1"
4 | authors = ["Xetera"]
5 | edition = "2018"
6 |
7 | [dependencies]
8 | reqwest = { version = "0.11.4", features = ["json", "multipart"] }
9 | chrono = { version= "0.4.19", features = ["serde"]}
10 | async-trait = "0.1.50"
11 | async-recursion = "0.3.2"
12 | tokio = { version = "1.8.1", features = ["full"] }
13 | tokio-test = "0.4.2"
14 | serde = { version = "1.0.126", features = ["derive"] }
15 | serde_json = "1.0.59"
16 | url = "2.2.2"
17 | better-panic = "0.2.0"
18 | futures = "0.3.15"
19 | thiserror = "1.0.26"
20 | num-traits = "0.2.14"
21 | sqlx = { version = "0.5.5", features = ["runtime-tokio-rustls", "postgres", "chrono", "offline", "json", "bigdecimal"] }
22 | dotenv = "0.15.0"
23 | log = "0.4.0"
24 | env_logger = "0.8.4"
25 | anyhow = "1.0.42"
26 | strum = { version = "0.21.0", features = ["derive"] }
27 | strum_macros = "0.21.1"
28 | rsa = "0.4.0"
29 | sha-1 = "0.9.7"
30 | digest = "0.9.0"
31 | base64 = "0.13.0"
32 | rand = "0.8.4"
33 | regex = "1.5.4"
34 | lazy_static = "1.4.0"
35 | dyn-clone = "1.0.4"
36 | bimap = "0.6.1"
37 | governor = "0.3.2"
38 | nonzero_ext = "0.2.0"
39 | axum = "0.3.4"
40 | tower = "0.4.11"
41 | itertools = "0.10.1"
42 | parking_lot = "0.11.1"
43 | tokio-amqp = "1.0.0"
44 | lapin = "1.8.1"
45 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM rust:1.55.0-buster as builder
2 |
3 | # First build a dummy project with our dependencies to cache them in Docker
4 | WORKDIR /usr/src
5 | RUN cargo new --bin builder
6 | WORKDIR /usr/src/builder
7 | COPY ./Cargo.lock ./Cargo.lock
8 | COPY ./Cargo.toml ./Cargo.toml
9 | RUN --mount=type=cache,target=/usr/local/cargo/registry cargo build --release
10 | RUN rm src/*.rs
11 |
12 | # Now copy the sources and do the real build
13 | ADD src src
14 | ADD sqlx-data.json sqlx-data.json
15 | ENV SQLX_OFFLINE true
16 |
17 | RUN cargo build --release
18 |
19 | # Second stage: put the build result into a debian buster-slim image
20 | FROM debian:buster-slim
21 |
22 | RUN apt-get update \
23 | && apt-get install -y ca-certificates tzdata libc6 \
24 | && rm -rf /var/lib/apt/lists/*
25 | ENV NAME=rust-docker
26 | ENV RUST_LOG=jiu=debug
27 | COPY --from=builder /usr/src/builder/target/release/jiu /usr/local/bin/jiu
28 | CMD ["/usr/local/bin/jiu"]
29 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Scrape multiple media providers on a cron job and dispatch webhooks when changes are detected.
7 |
8 |
9 | ## Jiu
10 |
11 | Jiu is a multi-threaded media scraper capable of juggling thousands of endpoints from different providers with unique
12 | restrictions/requirements.
13 |
14 | It is built to fetch media posted on different sites in a slow, eventually consistent manner,
15 | not for instant change detection.
16 |
17 | ## Providers
18 |
19 | A provider is the umbrella term that encapsulates all endpoints for a given domain.
20 |
21 | For example, https://weverse.io/bts/artist and https://weverse.io/dreamcatcher/artist are 2 endpoints under the Weverse
22 | provider.
23 |
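
For a rough picture of how that maps onto code, here is a deliberately simplified sketch: the struct below is hypothetical (the crate's real `ScopedProvider` in `src/scraper` is richer), and the provider name string is illustrative only.

```rust
// Hypothetical, simplified model: one provider (the scraping logic for a domain),
// many endpoints under it, each identified by a destination string.
struct Endpoint {
    provider_name: &'static str, // e.g. a dotted provider id like "weverse.artist_feed"
    destination: String,         // e.g. "bts" or "dreamcatcher"
}

fn weverse_endpoints() -> Vec<Endpoint> {
    vec!["bts", "dreamcatcher"]
        .into_iter()
        .map(|artist| Endpoint {
            provider_name: "weverse.artist_feed",
            destination: artist.to_string(),
        })
        .collect()
}
```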
24 | ### Supported providers
25 |
26 | * [Twitter](https://twitter.com/RBW_MAMAMOO)
27 | * [Pinterest Boards](https://www.pinterest.com/janairaoliveira314/handong)
28 | * [Weverse.io](https://weverse.io/dreamcatcher/feed)
29 | * [United Cube](https://www.united-cube.com/)
30 |
31 | ## Dynamic Priority & Tokens
32 |
33 | Dynamic priority is the main idea behind how Jiu can scrape many resources without getting rate limited.
34 |
35 | Unique endpoints that have more than 1 token are grouped by their provider type and get scheduled to be scraped at even
36 | intervals at the start of every day to avoid hammering APIs with requests.
37 |
38 | ![Scrape intervals](./assets/scrape_interval.png)
39 |
40 | After each successful request, a 30-day sliding window of that endpoint's request history is graded on a curve that
41 | determines how its priority should change based on how many new images were found in each request.
42 |
43 | ![Scraping history](./assets/scraping_history.png)
44 |
45 | Pages that post at least one image regularly get assigned a higher priority, up to a maximum of 3 requests every 2 days.
46 | Pages that don't post anything sink down to a scrape schedule of once every 2 weeks.
47 |
48 | New results found on more recent dates contribute more to priority than those found further back. This curve
49 | allows Jiu to match its request frequency to the changing posting schedule of the sites it's processing and avoid
50 | wasting requests on resources that are rarely updated.
51 |
52 | At the end of each day, every endpoint gains tokens equal to its current priority; these tokens are checked as a
53 | criterion when scheduling requests the next day.
54 |
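The real grading logic lives in `src/scheduler/priority.rs`; the sketch below only illustrates the token-bucket half of the idea in the paragraphs above, with hypothetical names rather than the scheduler's actual API.

```rust
// Illustrative only: an endpoint accrues tokens equal to its current priority
// once per day, and a scrape is only scheduled when a whole token can be spent.
struct EndpointBudget {
    priority: f64, // adjusted by the 30-day grading curve
    tokens: f64,   // spent when a scrape is scheduled
}

impl EndpointBudget {
    fn end_of_day(&mut self) {
        self.tokens += self.priority;
    }

    fn try_schedule(&mut self) -> bool {
        if self.tokens >= 1.0 {
            self.tokens -= 1.0;
            true
        } else {
            false
        }
    }
}
```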
55 | ## Authorization
56 |
57 | Anonymous requests are always preferred when possible.
58 |
59 | There is a customizable login flow for providers that require authorization. It allows logging into APIs after an
60 | authorization error and persists additional data (such as a JWT) to be shared across each provider for the
61 | lifetime of the process.
62 |
63 | The login flow is reverse engineered for providers that don't have a public API.
64 |
65 | > Juggling multiple accounts per provider is currently not supported and probably won't be as long as your accounts aren't getting banned (and if they are then you're sending too many requests and need to increase your rate limits).
66 |
67 | Jiu will try its best to identify itself in its requests' `User-Agent` header, but will submit a fake UA for providers
68 | that gate posts behind a user agent check like Twitter.
69 |
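As a rough sketch of what "shared across each provider during the lifetime of the process" can look like, the snippet below keeps a bearer token behind a `parking_lot::RwLock` (already a dependency of the crate). The type and method names are hypothetical; the real flow lives in the provider implementations such as `src/scraper/providers/weverse.rs`.

```rust
use parking_lot::RwLock;

// Hypothetical shared credential holder, typically wrapped in an Arc so every
// in-flight request for the provider sees the refreshed token.
#[derive(Default)]
struct AuthState {
    bearer_token: RwLock<Option<String>>,
}

impl AuthState {
    // Called after an authorization error: store the token obtained by the
    // login flow so subsequent requests can reuse it.
    fn replace_token(&self, new_token: String) {
        *self.bearer_token.write() = Some(new_token);
    }

    fn current_token(&self) -> Option<String> {
        self.bearer_token.read().clone()
    }
}
```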
70 | ## Proxies
71 |
72 | Proxies are not supported or needed.
73 |
74 | ## Webhooks
75 |
76 | Jiu is capable of sending webhooks to multiple destinations when an update for a provider is detected.
77 |
78 | Although data about posts is aggregated within webhooks, it is not persisted to the database; that is the responsibility of the service receiving the events, and it is not relevant for image aggregation.
79 |
80 | ```json
81 | {
82 |   "provider": {
83 |     "type": "twitter.timeline",
84 |     "id": "729935154290925570",
85 |     "ephemeral": false
86 |   },
87 |   "posts": [
88 |     {
89 |       "unique_identifier": "1460196926796623873",
90 |       "body": "[#가현] 삐뚤빼뚤 즐거운 라이브였다❣️ 다음 주에도 재밌는 시간 보내 보카?\n\n#드림캐쳐 #Dreamcatcher #4주_집콕_프로젝트 https://t.co/r1ImPUPKkv",
91 |       "url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873",
92 |       "post_date": null,
93 |       "account": {
94 |         "name": "드림캐쳐 Dreamcatcher",
95 |         "avatar_url": "https://pbs.twimg.com/profile_images/1415983453200261124/4-viIm27_normal.jpg"
96 |       },
97 |       "metadata": {
98 |         "language": "ko",
99 |         "like_count": 12474,
100 |         "retweet_count": 2760
101 |       },
102 |       "images": [
103 |         {
104 |           "type": "Image",
105 |           "media_url": "https://pbs.twimg.com/media/FEOpVKmagAELmzI.jpg",
106 |           "reference_url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873/photo/1",
107 |           "unique_identifier": "1460196885285994497",
108 |           "metadata": {
109 |             "width": 1128,
110 |             "height": 1504
111 |           }
112 |         },
113 |         {
114 |           "type": "Image",
115 |           "media_url": "https://pbs.twimg.com/media/FEOpV2FaAAEG4zr.jpg",
116 |           "reference_url": "https://twitter.com/hf_dreamcatcher/status/1460196926796623873/photo/2",
117 |           "unique_identifier": "1460196896958709761",
118 |           "metadata": {
119 |             "width": 1128,
120 |             "height": 1504
121 |           }
122 |         }
123 |       ]
124 |     }
125 |   ]
126 | }
127 | ```
128 |
129 | Every provider has its own `provider_metadata` field that _may_ contain extra information about the image or the post it
130 | was found under, but may also be missing. _Documentation WIP_
131 |
132 | The `unique_identifier` field is unique **per provider** and not globally.
133 |
134 | The `ephemeral` field defines whether an image is only accessible for a short period after dispatch (for example,
135 | Instagram image links expire after some time).
136 |
137 | If a Discord webhook URL is detected, the payload is changed to allow Discord to display the images in the channel.
138 |
139 | There is currently no retry mechanism for webhooks that fail to deliver successfully.
140 |
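Because `unique_identifier` is only unique per provider, a consumer that stores or deduplicates posts should key on the provider type together with the identifier. A minimal sketch of that on the receiving side (field names come from the payload above; everything else is hypothetical):

```rust
use std::collections::HashSet;

// Returns true if the post hasn't been seen before. Keying on the identifier
// alone could collide across providers, so include the provider type.
fn is_new_post(
    seen: &mut HashSet<(String, String)>,
    provider_type: &str,
    unique_identifier: &str,
) -> bool {
    seen.insert((provider_type.to_string(), unique_identifier.to_string()))
}
```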
141 | ## Endpoints
142 |
143 | Jiu runs a web server on port 8080 to allow dynamically resolving new resources by URL and getting stats at runtime.
144 |
145 | - `POST /v1/provider` Create a new provider by resolving a URL to a resource
146 | - `DELETE /v1/provider` Delete an existing provider (sets it to `enabled=false`)
147 | - `GET /v1/schedule` Get the upcoming scheduled scrapes
148 | - `GET /v1/history` The list of the last 100 scraped endpoints
149 | - `GET /v1/stats` The stats of all the registered providers
150 |
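For example, `POST /v1/provider` takes a JSON body mirroring the `ProviderAdd` struct in `src/api/v1/providers.rs` (`url`, `name`, `official`, and optional `metadata` / `add_to_amqp`). Below is a hedged client-side sketch using `reqwest` (already a dependency); the host and values are placeholders.

```rust
use serde_json::json;

async fn add_provider(client: &reqwest::Client) -> Result<(), reqwest::Error> {
    // Ask Jiu to resolve a URL into a provider resource.
    let response = client
        .post("http://localhost:8080/v1/provider")
        .json(&json!({
            "url": "https://www.pinterest.com/janairaoliveira314/handong",
            "name": "Handong",
            "official": false,
            "add_to_amqp": false
        }))
        .send()
        .await?;
    println!("{}", response.text().await?);
    Ok(())
}
```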
151 | ## Jiu is **NOT**:
152 |
153 | * For bombarding sites like Twitter with requests to detect changes within seconds.
154 | * Capable of executing javascript with a headless browser.
155 | * Able to send requests to any social media site without explicit support.
156 |
157 | ## Jiu **IS**:
158 |
159 | * For slowly monitoring changes in different feeds over the course of multiple hours without abusing the provider.
160 | * Capable of adjusting the frequency of scrapes based on how frequently the source is updated.
161 | * Able to send webhooks or push to AMQP on discovery.
162 | * The lead singer of [Dreamcatcher](https://www.youtube.com/watch?v=1QD0FeZyDtQ).
163 |
164 | ## Usage
165 |
166 | 1. Copy over `.env.example` to `.env` and fill out relevant fields.
167 | 2. `docker-compose up -d jiu_db` to start postgres.
168 | 3. `RUST_LOG=jiu cargo run` to start the crawler.
169 |
170 | To create a production-ready image, make sure to run `cargo sqlx prepare` before building if you modified any of the
171 | SQL queries.
172 |
173 | > If you would like to use this project, please change the `USER_AGENT` environment variable to identify your crawler accurately.
174 |
175 | Built for [kiyomi.io](https://github.com/xetera/kiyomi)
176 |
--------------------------------------------------------------------------------
/assets/scrape_interval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Xetera/jiu/e3a54c908f17359f1233b28b4bcdab31f5b249b8/assets/scrape_interval.png
--------------------------------------------------------------------------------
/assets/scraping_history.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Xetera/jiu/e3a54c908f17359f1233b28b4bcdab31f5b249b8/assets/scraping_history.png
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | networks:
4 |   jiu_net:
5 |
6 | volumes:
7 |   jiu_data:
8 |
9 | services:
10 |   jiu_db:
11 |     image: postgres:13
12 |     container_name: jiu_db
13 |     volumes:
14 |       - jiu_data:/var/lib/postgresql/data
15 |     environment:
16 |       POSTGRES_USER: postgres
17 |       POSTGRES_PASSWORD: password
18 |       POSTGRES_DB: jiu
19 |     networks:
20 |       - jiu_net
21 |     ports:
22 |       - 5431:5432
23 |   jiu:
24 |     image: rust:1.55
25 |     volumes:
26 |       - ./:/app
27 |     networks:
28 |       - jiu_net
29 |     environment:
30 |       USER: xetera
31 |       DATABASE_URL: postgres://postgres:password@jiu_db:5431/jiu
--------------------------------------------------------------------------------
/migrations/20210722150010_initial_migration.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 | DROP TABLE webhook_invocation;
3 |
4 | DROP TABLE webhook_source;
5 |
6 | DROP TABLE scrape_error;
7 |
8 | DROP TABLE media;
9 |
10 | DROP TABLE scrape_request;
11 |
12 | DROP TABLE scrape;
13 |
14 | DROP TABLE webhook;
15 |
--------------------------------------------------------------------------------
/migrations/20210722150010_initial_migration.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | CREATE TABLE IF NOT EXISTS webhook(
3 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
4 | destination TEXT NOT NULL,
5 | created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(),
6 | updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(),
7 | -- extra data attached to a webhook invocation
8 | metadata JSONB
9 | );
10 |
11 | CREATE TABLE IF NOT EXISTS provider_resource(
12 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
13 | -- This can be a FQDN or an identifier that maps to a unique API endpoint
14 | -- on the provider's end
15 | destination TEXT NOT NULL,
16 | name TEXT NOT NULL,
17 | enabled BOOLEAN DEFAULT True,
18 | -- the url for the scraped page
19 | url TEXT NOT NULL,
20 | priority INTEGER NOT NULL DEFAULT 5 CHECK(priority >= 1 AND priority <= 10),
21 | last_scrape TIMESTAMP WITHOUT TIME ZONE NULL,
22 | -- the date last scrape was requested, this acts a lock to prevent resources from being accessed multiple times
23 | last_queue TIMESTAMP WITHOUT TIME ZONE NULL,
24 | created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(),
25 | UNIQUE(destination, name)
26 | );
27 |
28 | CREATE TABLE IF NOT EXISTS scrape(
29 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
30 | provider_name TEXT,
31 | provider_destination TEXT,
32 | -- the priority this scrape was executed against
33 | priority INTEGER NOT NULL CHECK(priority >= 1 AND priority <= 10),
34 | FOREIGN KEY (provider_name, provider_destination)
35 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE
36 | );
37 |
38 | -- each scrape can have more than one request
39 | CREATE TABLE IF NOT EXISTS scrape_request(
40 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
41 | scrape_id INTEGER REFERENCES scrape(id),
42 | page INTEGER NOT NULL DEFAULT 1,
43 | response_code INTEGER,
44 | -- how long did the response take in ms
45 | response_delay INTEGER,
46 | scraped_at TIMESTAMP WITHOUT TIME ZONE NOT NULL
47 | );
48 |
49 | CREATE TABLE IF NOT EXISTS media(
50 | -- This is necessary when trying to sort media that were
51 | -- crawled at the same time
52 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
53 | provider_name TEXT,
54 | provider_destination TEXT,
55 | scrape_request_id INTEGER REFERENCES scrape_request(id) ON DELETE SET NULL,
56 | -- We are assuming there is only one type of url
57 | image_url TEXT NOT NULL UNIQUE,
58 | page_url TEXT NULL,
59 | reference_url TEXT NULL,
60 | -- a unique identifier that's specific to the provider
61 | unique_identifier TEXT NOT NULL,
62 | -- where the image is coming from
63 | -- could be null if the provider doesn't have the information
64 | posted_at TIMESTAMP WITHOUT TIME ZONE NULL,
65 | discovered_at TIMESTAMP WITHOUT TIME ZONE NOT NULL,
66 | UNIQUE(unique_identifier, provider_name),
67 | FOREIGN KEY (provider_name, provider_destination)
68 | REFERENCES provider_resource(name, destination) ON UPDATE CASCADE ON DELETE SET NULL
69 | );
70 |
71 | CREATE TABLE IF NOT EXISTS scrape_error(
72 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
73 | -- already declared in scrape_request
74 | -- response_code INTEGER,
75 | response_body TEXT NOT NULL DEFAULT '',
76 | response_code TEXT NOT NULL,
77 | message TEXT NULL,
78 | scrape_id INTEGER NOT NULL REFERENCES scrape(id)
79 | );
80 |
81 | CREATE TABLE IF NOT EXISTS webhook_source(
82 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
83 | webhook_id INTEGER REFERENCES webhook(id),
84 | provider_name TEXT,
85 | provider_destination TEXT,
86 | FOREIGN KEY (provider_name, provider_destination)
87 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE
88 | );
89 |
90 | CREATE TABLE IF NOT EXISTS webhook_invocation(
91 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
92 | scrape_id INTEGER /* NOT NULL */ REFERENCES scrape(id),
93 | webhook_id INTEGER /* NOT NULL */ REFERENCES webhook(id) ON DELETE SET NULL,
94 | response_code INTEGER,
95 | response_delay INTEGER,
96 | invoked_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
97 | );
98 |
99 | CREATE INDEX ON scrape (provider_destination, provider_name);
100 |
--------------------------------------------------------------------------------
/migrations/20211111232248_scrape_resource_priority.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 | ALTER TABLE provider_resource DROP COLUMN tokens;
3 | ALTER TABLE provider_resource ALTER COLUMN priority type integer;
4 | ALTER TABLE scrape ALTER COLUMN priority type integer;
5 |
6 | DROP TABLE amqp_source;
--------------------------------------------------------------------------------
/migrations/20211111232248_scrape_resource_priority.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | ALTER TABLE provider_resource ADD COLUMN last_token_update TIMESTAMP WITHOUT TIME ZONE NULL;
3 | ALTER TABLE provider_resource ADD COLUMN tokens DECIMAL NOT NULL DEFAULT 1.0;
4 | ALTER TABLE provider_resource ALTER COLUMN priority type decimal;
5 | ALTER TABLE scrape ALTER COLUMN priority type decimal;
6 | ALTER TABLE scrape ALTER COLUMN priority set not null;
7 | ALTER TABLE scrape ALTER COLUMN priority set default 1.0;
8 | ALTER TABLE webhook ADD CONSTRAINT unique_destination UNIQUE(destination);
9 | ALTER TABLE provider_resource DROP CONSTRAINT provider_resource_priority_check;
10 | ALTER TABLE scrape DROP CONSTRAINT scrape_priority_check;
11 | ALTER TABLE webhook DROP COLUMN metadata;
12 | ALTER TABLE webhook_source ADD COLUMN metadata JSONB;
13 | ALTER TABLE provider_resource ADD COLUMN default_name TEXT;
14 |
15 | CREATE TABLE IF NOT EXISTS amqp_source(
16 | id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
17 | provider_name TEXT,
18 | provider_destination TEXT,
19 | metadata JSONB,
20 | FOREIGN KEY (provider_name, provider_destination)
21 | REFERENCES provider_resource(name, destination) ON DELETE SET NULL ON UPDATE CASCADE
22 | );
23 |
--------------------------------------------------------------------------------
/migrations/20211121211206_scraped_at.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 |
--------------------------------------------------------------------------------
/migrations/20211121211206_scraped_at.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | ALTER TABLE scrape ADD COLUMN IF NOT EXISTS scraped_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW();
3 |
--------------------------------------------------------------------------------
/migrations/20211211152424_unique_amqp_source.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 |
3 | ALTER TABLE amqp_source DROP CONSTRAINT amqp_unique_providers;
4 |
--------------------------------------------------------------------------------
/migrations/20211211152424_unique_amqp_source.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | ALTER TABLE amqp_source ADD CONSTRAINT amqp_unique_providers UNIQUE (provider_name, provider_destination);
3 |
--------------------------------------------------------------------------------
/migrations/20211211152751_populate_amqp_source.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 |
--------------------------------------------------------------------------------
/migrations/20211211152751_populate_amqp_source.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | INSERT INTO amqp_source (provider_name, provider_destination, metadata)
3 | SELECT name, destination, '{}' from provider_resource
4 | ON CONFLICT DO NOTHING;
5 |
--------------------------------------------------------------------------------
/migrations/20211221055954_official_source.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 | ALTER TABLE provider_resource DROP COLUMN official;
3 |
--------------------------------------------------------------------------------
/migrations/20211221055954_official_source.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | ALTER TABLE provider_resource ADD COLUMN official boolean default false;
--------------------------------------------------------------------------------
/migrations/20211221061158_official_source_nullable.down.sql:
--------------------------------------------------------------------------------
1 | -- Add down migration script here
2 | ALTER TABLE provider_resource ALTER COLUMN official DROP NOT NULL;
3 |
--------------------------------------------------------------------------------
/migrations/20211221061158_official_source_nullable.up.sql:
--------------------------------------------------------------------------------
1 | -- Add up migration script here
2 | ALTER TABLE provider_resource ALTER COLUMN official SET NOT NULL;
--------------------------------------------------------------------------------
/src/api/mod.rs:
--------------------------------------------------------------------------------
1 | use crate::db::Database;
2 | use axum::body::{Bytes, Full};
3 | use axum::http::{Response, StatusCode};
4 | use axum::response::IntoResponse;
5 | use axum::Json;
6 | use serde_json::json;
7 | use std::convert::Infallible;
8 | use std::sync::Arc;
9 | use crate::scraper::ProviderMap;
10 |
11 | pub mod v1;
12 |
13 | pub struct Context {
14 |     pub db: Arc<Database>,
15 |     pub providers: Arc<ProviderMap>,
16 | }
17 |
18 | pub enum AppError {
19 |     SomeError(anyhow::Error),
20 |     SqlxError(sqlx::Error),
21 | }
22 |
23 | impl From<anyhow::Error> for AppError {
24 |     fn from(inner: anyhow::Error) -> Self {
25 |         AppError::SomeError(inner)
26 |     }
27 | }
28 |
29 | impl From<sqlx::Error> for AppError {
30 |     fn from(inner: sqlx::Error) -> Self {
31 |         AppError::SqlxError(inner)
32 |     }
33 | }
34 |
35 | impl IntoResponse for AppError {
36 |     type Body = Full<Bytes>;
37 |     type BodyError = Infallible;
38 |
39 |     fn into_response(self) -> Response<Self::Body> {
40 |         let (status, error_message) = match self {
41 |             AppError::SomeError(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()),
42 |             AppError::SqlxError(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()),
43 |         };
44 |
45 |         let body = Json(json!({
46 |             "error": error_message,
47 |         }));
48 |
49 |         (status, body).into_response()
50 |     }
51 | }
52 |
--------------------------------------------------------------------------------
/src/api/v1/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod stats;
2 | pub mod providers;
3 |
4 | pub use stats::*;
5 |
--------------------------------------------------------------------------------
/src/api/v1/providers.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 |
3 | use axum::extract::Extension;
4 | use axum::Json;
5 | use log::{debug, error, info};
6 | use serde::{Deserialize, Serialize};
7 | use serde_json::{json, Map, Value};
8 |
9 | use crate::api::{AppError, Context};
10 | use crate::scraper::{CanonicalUrlResolution, ProviderFailure, WorkableDomain};
11 |
12 | #[derive(Deserialize)]
13 | pub struct ProviderAdd {
14 |     url: String,
15 |     name: String,
16 |     official: bool,
17 |     metadata: Option<Value>,
18 |     add_to_amqp: Option<bool>,
19 | }
20 |
21 | #[derive(Serialize)]
22 | pub enum ProviderAddResponse {
23 |     InvalidUrl {
24 |         url: String,
25 |     },
26 |     ProviderExists {
27 |         provider: String,
28 |         destination: String,
29 |     },
30 |     InternalError,
31 |     NotImplemented,
32 |     Success {
33 |         provider: String,
34 |         destination: String,
35 |     },
36 | }
37 |
38 | pub async fn v1_add_provider(
39 |     Extension(state): Extension<Arc<Context>>,
40 |     Json(input): Json<ProviderAdd>,
41 | ) -> Result<Json<ProviderAddResponse>, AppError> {
42 | let result = state
43 | .providers
44 | .values()
45 | .find_map(|p| p.match_domain(&input.url).map(|res| (p, res)));
46 | let (provider, domain) = match result {
47 | Some((provider, domain)) => (provider, domain),
48 | None => {
49 | debug!("Url {} was not valid", input.url);
50 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url }));
51 | }
52 | };
53 | let introspectable = match domain {
54 | WorkableDomain::ToCanonical(resource) => resource,
55 | _ => {
56 | debug!(
57 | "WorkableDomain {:?} from [{}] was not detected as Canonical",
58 | provider.id(),
59 | input.url
60 | );
61 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url }));
62 | }
63 | };
64 | let response = provider.introspect_resource(&introspectable).await;
65 | let destination = match response {
66 | Ok(CanonicalUrlResolution::Success { destination }) => destination,
67 | Ok(CanonicalUrlResolution::Fail(reason)) => {
68 | error!("{:?}", reason);
69 | return Ok(Json(ProviderAddResponse::InternalError));
70 | }
71 | Ok(CanonicalUrlResolution::NotImplemented) => {
72 | return Ok(Json(ProviderAddResponse::NotImplemented))
73 | }
74 | Err(ProviderFailure::Url) => {
75 | return Ok(Json(ProviderAddResponse::InvalidUrl { url: input.url }));
76 | }
77 | Err(other) => {
78 | return Ok(Json(ProviderAddResponse::InternalError));
79 | }
80 | };
81 | info!(
82 | "Successfully resolved [destination: {}] for [{:?}]",
83 | destination,
84 | provider.id()
85 | );
86 | let provider_name = provider.id().to_string();
87 | if let Err(sqlx::Error::Database(err)) = sqlx::query!(
88 | "INSERT INTO provider_resource (destination, name, default_name, official, url) VALUES
89 | ($1, $2, $3, $4, $5)",
90 | destination,
91 | provider_name,
92 | input.name,
93 | input.official,
94 | input.url
95 | )
96 | .execute(&*state.db)
97 | .await
98 | {
99 | // the most disgusting way of checking for primary key conflicts
100 | if err.code().map(|code| code == "23505").unwrap_or(false) {
101 | return Ok(Json(ProviderAddResponse::ProviderExists {
102 | destination,
103 | provider: provider_name,
104 | }));
105 | }
106 | };
107 | // there's a conflict
108 | // TODO: decouple this kiyomi-specific thing out?
109 | if input.add_to_amqp.unwrap_or(false) {
110 | sqlx::query!(
111 | "INSERT INTO amqp_source (provider_name, provider_destination, metadata)
112 | VALUES ($1, $2, $3)
113 | ON CONFLICT(provider_name, provider_destination) DO UPDATE SET metadata = $3",
114 | provider_name,
115 | destination,
116 | input.metadata.unwrap_or(Value::Object(Map::new()))
117 | )
118 | .fetch_optional(&*state.db)
119 | .await?;
120 | }
121 | Ok(Json(ProviderAddResponse::Success {
122 | destination,
123 | provider: provider_name,
124 | }))
125 | }
126 |
127 | #[derive(Deserialize)]
128 | pub struct ProviderDelete {
129 |     name: String,
130 |     destination: String,
131 | }
132 |
133 | #[derive(Serialize)]
134 | pub struct ProviderDeleteResponse {
135 |     modified: bool,
136 | }
137 |
138 | pub async fn v1_delete_provider(
139 |     Extension(state): Extension<Arc<Context>>,
140 |     Json(input): Json<ProviderDelete>,
141 | ) -> Result<Json<ProviderDeleteResponse>, AppError> {
142 | let result = sqlx::query!(
143 | "UPDATE provider_resource SET enabled = False WHERE name = $1 and destination = $2 RETURNING *",
144 | input.name,
145 | input.destination,
146 | )
147 | .fetch_optional(&*state.db)
148 | .await?;
149 | return Ok(Json(ProviderDeleteResponse {
150 | modified: result.is_some(),
151 | }));
152 | }
153 |
--------------------------------------------------------------------------------
/src/api/v1/stats.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 |
3 | use axum::extract::Extension;
4 | use axum::Json;
5 | use chrono::{DateTime, Duration, NaiveDateTime, Utc};
6 | use num_traits::ToPrimitive;
7 | use serde::Serialize;
8 | use sqlx::types::BigDecimal;
9 |
10 | use crate::api::{AppError, Context};
11 | use crate::models::ScrapeHistory;
12 |
13 | struct ScheduledProvider {
14 |     id: i32,
15 |     url: String,
16 |     name: String,
17 |     destination: String,
18 |     priority: BigDecimal,
19 |     tokens: BigDecimal,
20 |     default_name: Option<String>,
21 |     last_queue: Option<NaiveDateTime>,
22 |     metadata: Option<serde_json::Value>,
23 |     official: bool,
24 | }
25 |
26 | #[derive(Serialize)]
27 | pub struct ScheduledProviderRun {
28 |     id: i32,
29 |     provider: String,
30 |     url: String,
31 |     destination: String,
32 |     wait_days: i16,
33 |     metadata: Option<serde_json::Value>,
34 |     name: String,
35 |     official: bool,
36 | }
37 |
38 | struct PreviousScrapeRow {
39 |     id: i32,
40 |     name: String,
41 |     url: String,
42 |     destination: String,
43 |     date: Option<NaiveDateTime>,
44 |     // last_post: Option,
45 |     priority: BigDecimal,
46 |     default_name: Option<String>,
47 |     official: bool,
48 |     discovered_media: Option<i64>,
49 | }
50 |
51 | #[derive(Serialize)]
52 | pub struct PreviousScrape {
53 |     id: i32,
54 |     name: String,
55 |     url: String,
56 |     destination: String,
57 |     // TODO: make this column not-null
58 |     date: Option<NaiveDateTime>,
59 |     // last_post: Option,
60 |     // last_scrape: Option,
61 |     // last_post: Option,
62 |     priority: f32,
63 |     default_name: Option<String>,
64 |     official: bool,
65 |     discovered_media: i64,
66 | }
67 |
68 | #[derive(Serialize)]
69 | pub struct ScheduleResponse {
70 |     pub history: Vec<PreviousScrape>,
71 |     pub scheduled: Vec<ScheduledProviderRun>,
72 | }
73 |
74 | pub async fn v1_scheduled_scrapes(
75 |     Extension(state): Extension<Arc<Context>>,
76 | ) -> Result<Json<Vec<ScheduledProviderRun>>, AppError> {
77 | let rows = sqlx::query_as!(
78 | ScheduledProvider,
79 | "SELECT pr.id, pr.official, pr.priority, pr.name, pr.destination, pr.url, pr.tokens, pr.last_queue, pr.default_name, (
80 | SELECT metadata FROM amqp_source where provider_destination = pr.destination and provider_name = pr.name
81 | ) as metadata FROM provider_resource pr"
82 | )
83 | .fetch_all(&*state.db)
84 | .await?;
85 | let (today, later): (Vec<ScheduledProvider>, Vec<ScheduledProvider>) =
86 | rows.into_iter().partition(|e| {
87 | let now = Utc::now().naive_utc();
88 | // anything that was queued in the last 24 hours is already being scraped
89 | // it's not SUPER accurate since it's possible but
90 | // we only need a general idea, not precision
91 | e.last_queue
92 | .map(|last_queue| {
93 | let yesterday = now - Duration::hours(24);
94 | last_queue > yesterday
95 | })
96 | .unwrap_or(false)
97 | });
98 | let labeled = later
99 | .into_iter()
100 | .map(|row| {
101 | let wait_days = ((1f32 / (row.priority + row.tokens))
102 | .to_f32()
103 | .unwrap_or(0f32))
104 | .floor() as i16;
105 | ScheduledProviderRun {
106 | destination: row.destination,
107 | provider: row.name,
108 | id: row.id,
109 | url: row.url,
110 | official: row.official,
111 | wait_days,
112 | metadata: row.metadata,
113 | name: row.default_name.unwrap_or_default(),
114 | }
115 | })
116 | .collect::<Vec<_>>();
117 | let mut scheduled = today
118 | .into_iter()
119 | .map(|t| ScheduledProviderRun {
120 | destination: t.destination,
121 | provider: t.name,
122 | official: t.official,
123 | id: t.id,
124 | url: t.url,
125 | wait_days: 0,
126 | metadata: t.metadata,
127 | name: t.default_name.unwrap_or_default(),
128 | })
129 | .collect::<Vec<_>>();
130 | scheduled.extend(labeled);
131 | Ok(Json(scheduled))
132 | }
133 |
134 | pub async fn v1_scrape_history(
135 |     Extension(state): Extension<Arc<Context>>,
136 | ) -> Result<Json<Vec<PreviousScrape>>, AppError> {
137 | let previous = sqlx::query_as!(
138 | PreviousScrapeRow,
139 | "SELECT scrape.id,
140 | pr.url,
141 | pr.default_name,
142 | pr.official,
143 | pr.name,
144 | pr.destination,
145 | scrape.priority,
146 | scrape.scraped_at as date,
147 | COALESCE((SELECT COUNT(*)
148 | from media
149 | inner join public.scrape_request sr on sr.id = media.scrape_request_id
150 | inner join scrape s on s.id = sr.scrape_id
151 | where sr.scrape_id = scrape.id), 0) as discovered_media
152 | FROM scrape
153 | INNER JOIN provider_resource pr on pr.destination = scrape.provider_destination
154 | and scrape.provider_name = pr.name
155 | ORDER BY scrape.scraped_at desc
156 | LIMIT 100
157 | "
158 | )
159 | .fetch_all(&*state.db)
160 | .await?;
161 | let history = previous
162 | .into_iter()
163 | .map(|row| PreviousScrape {
164 | id: row.id,
165 | name: row.name,
166 | url: row.url,
167 | date: row.date,
168 | destination: row.destination,
169 | // we shouldn't need this, but sqlx doesn't understand the semantics
170 | // of COALESCE for some reason
171 | discovered_media: row.discovered_media.unwrap_or(0),
172 | priority: row.priority.to_f32().unwrap(),
173 | default_name: row.default_name,
174 | official: row.official,
175 | })
176 | .collect::<Vec<_>>();
177 | Ok(Json(history))
178 | }
179 |
180 | #[derive(Serialize)]
181 | pub struct ProviderStat {
182 |     name: String,
183 |     destination: String,
184 |     enabled: bool,
185 |     url: String,
186 |     priority: f32,
187 |     tokens: f32,
188 |     // TODO: why is this nullable?
189 |     created_at: Option<NaiveDateTime>,
190 |     default_name: Option<String>,
191 |     official: bool,
192 |     last_scrape: Option<NaiveDateTime>,
193 |     last_post: Option<NaiveDateTime>,
194 |     discovered_images: i64,
195 |     scrape_count: i64,
196 | }
197 |
198 | #[derive(Serialize)]
199 | pub struct ProviderStatsResponse {
200 |     stats: Vec<ProviderStat>,
201 | }
202 |
203 | pub async fn v1_provider_stats(
204 |     Extension(state): Extension<Arc<Context>>,
205 | ) -> Result<Json<ProviderStatsResponse>, AppError> {
206 | let stats = sqlx::query!(
207 | "SELECT pr.id,
208 | pr.name,
209 | pr.destination,
210 | pr.enabled,
211 | pr.url,
212 | pr.priority,
213 | pr.tokens,
214 | pr.created_at,
215 | pr.default_name,
216 | pr.official,
217 | (SELECT Max(sr.scraped_at)
218 | FROM scrape_request sr
219 | inner join scrape s on pr.destination = s.provider_destination) as last_scrape,
220 | (SELECT MAX(posted_at)
221 | FROM media
222 | INNER JOIN public.scrape_request s on s.id = media.scrape_request_id
223 | inner join scrape s2 on s2.id = s.scrape_id
224 | where s2.provider_destination = pr.destination
225 | and s2.provider_name = pr.name
226 | ) as last_post,
227 | (SELECT COUNT(s3.*)
228 | from media
229 | inner join public.scrape_request r on r.id = media.scrape_request_id
230 | inner join scrape s3 on s3.id = r.scrape_id
231 | where s3.provider_name = pr.name
232 | and s3.provider_destination = pr.destination
233 | ) as discovered_images,
234 | (SELECT COUNT(*) from scrape inner join scrape_request sr2 on scrape.id = sr2.scrape_id
235 | where scrape.provider_destination = pr.destination and scrape.provider_name = pr.name
236 | ) as scrape_count
237 | FROM provider_resource pr;"
238 | )
239 | .fetch_all(&*state.db)
240 | .await?;
241 | let data = ProviderStatsResponse {
242 | stats: stats
243 | .iter()
244 | .map(|stat| ProviderStat {
245 | name: stat.name.clone(),
246 | destination: stat.destination.clone(),
247 | enabled: stat.enabled.unwrap_or(false),
248 | url: stat.url.clone(),
249 | priority: stat.priority.to_f32().unwrap_or(0f32),
250 | tokens: stat.tokens.to_f32().unwrap_or(0f32),
251 | created_at: stat.created_at,
252 | default_name: stat.default_name.clone(),
253 | official: stat.official,
254 | last_scrape: stat.last_scrape,
255 | last_post: stat.last_post,
256 | discovered_images: stat.discovered_images.unwrap_or(0),
257 | scrape_count: stat.scrape_count.unwrap_or(0),
258 | })
259 | .collect::<Vec<_>>(),
260 | };
261 | Ok(Json(data))
262 | }
263 |
264 | // (SELECT Max(scraped_at) FROM scrape_request sr where sr.scrape_id = scrape.id) as last_scrape,
265 | // (SELECT MAX(posted_at) FROM media
266 | // INNER JOIN public.scrape_request s on s.id = media.scrape_request_id
267 | // inner join scrape s2 on s2.id = s.scrape_id
268 | // where s2.id = scrape.id
269 | // ) as last_post
270 |
--------------------------------------------------------------------------------
/src/db.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashSet;
2 | use std::env;
3 | use std::iter::FromIterator;
4 |
5 | use anyhow::bail;
6 | use itertools::Itertools;
7 | use log::error;
8 | use sqlx::postgres::PgPoolOptions;
9 | use sqlx::{Error, Pool, Postgres};
10 |
11 | use crate::dispatcher::dispatcher::WebhookInteraction;
12 | use crate::models::{
13 | AMQPDestination, DatabaseWebhook, PendingProvider, ScrapeRequestMedia, ScrapeRequestWithMedia,
14 | };
15 | use crate::request::HttpError;
16 | use crate::scraper::scraper::{Scrape, ScraperStep};
17 | use crate::scraper::{ProviderFailure, ScopedProvider};
18 |
19 | pub type Database = Pool<Postgres>;
20 |
21 | pub async fn connect() -> Result<Database, Error> {
22 |     Ok(PgPoolOptions::new()
23 |         .max_connections(5)
24 |         .connect(&env::var("DATABASE_URL").expect("No DATABASE_URL env"))
25 |         .await?)
26 | }
27 |
28 | // Grab the latest N images from a relevant provider destination
29 | pub async fn latest_media_ids_from_provider(
30 |     db: &Database,
31 |     provider: &ScopedProvider,
32 | ) -> anyhow::Result<HashSet<String>> {
33 | let out = sqlx::query!(
34 | "SELECT unique_identifier FROM media
35 | WHERE provider_name = $1 AND provider_destination = $2
36 | order by id desc, discovered_at desc limit 100",
37 | provider.name.to_string(),
38 | provider.destination
39 | )
40 | .map(|e| e.unique_identifier)
41 | .fetch_all(db)
42 | .await?;
43 | Ok(HashSet::from_iter(out.into_iter()))
44 | }
45 |
46 | pub async fn amqp_metadata(
47 | db: &Database,
48 | sp: &ScopedProvider,
49 | ) -> anyhow::Result