├── .dockerignore ├── .github └── workflows │ └── release.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md ├── fuzzysearch-api ├── Cargo.toml ├── Dockerfile ├── sqlx-data.json └── src │ ├── filters.rs │ ├── handlers.rs │ ├── main.rs │ ├── models.rs │ ├── types.rs │ └── utils.rs ├── fuzzysearch-common ├── Cargo.toml └── src │ ├── download.rs │ ├── faktory.rs │ ├── lib.rs │ ├── trace.rs │ └── types.rs ├── fuzzysearch-hash-input ├── Cargo.toml ├── Dockerfile └── src │ └── main.rs ├── fuzzysearch-ingest-e621 ├── Cargo.toml ├── Dockerfile ├── sqlx-data.json └── src │ └── main.rs ├── fuzzysearch-ingest-furaffinity ├── .gitignore ├── Cargo.toml ├── Dockerfile └── src │ └── main.rs ├── fuzzysearch-ingest-weasyl ├── Cargo.toml ├── Dockerfile ├── sqlx-data.json └── src │ └── main.rs ├── fuzzysearch-refresh ├── Cargo.toml ├── Dockerfile ├── sqlx-data.json └── src │ └── main.rs ├── fuzzysearch-webhook ├── Cargo.toml ├── Dockerfile └── src │ └── main.rs ├── migrations ├── 20210221024406_bktree_index.down.sql ├── 20210221024406_bktree_index.up.sql ├── 20210221025236_furaffinity.down.sql ├── 20210221025236_furaffinity.up.sql ├── 20210221025652_e621.down.sql ├── 20210221025652_e621.up.sql ├── 20210221025835_weasyl.down.sql ├── 20210221025835_weasyl.up.sql ├── 20210221030022_twitter.down.sql ├── 20210221030022_twitter.up.sql ├── 20210221030823_hashes.down.sql ├── 20210221030823_hashes.up.sql ├── 20210221033051_authentication.down.sql ├── 20210221033051_authentication.up.sql ├── 20210419174900_index_all_hashes.down.sql ├── 20210419174900_index_all_hashes.up.sql ├── 20210419202830_remove_old_index.down.sql ├── 20210419202830_remove_old_index.up.sql ├── 20210420024815_webhooks.down.sql ├── 20210420024815_webhooks.up.sql ├── 20210422224815_change_hash_index.down.sql ├── 20210422224815_change_hash_index.up.sql ├── 20210822052026_isnumeric_function.down.sql ├── 20210822052026_isnumeric_function.up.sql ├── 20210822052313_deleted_flag.down.sql ├── 20210822052313_deleted_flag.up.sql ├── 20220519161030_index_tag_post_id.down.sql └── 20220519161030_index_tag_post_id.up.sql └── tests ├── fox.gif ├── sample.sql ├── samples ├── 1460136557.psychonautic_syfarore.png ├── 1467485464.oce_syfaro-sketch-web.jpg ├── 1473103034.casual-dhole_fylninsyf2__web_.png ├── 1568810406.kosseart_experimental-syfaro-fa.png ├── 273210894ab3d9f02f02742acead73a2.jpg ├── EmbGP7hWEAEysaF.jpg large.jpg ├── EmbGPKyWEAEi3JI.jpg large.jpg └── EmbGTypW8AA65r_.jpg large.jpg └── video.webm /.dockerignore: -------------------------------------------------------------------------------- 1 | target/ 2 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | CARGO_TERM_COLOR: always 10 | 11 | jobs: 12 | release: 13 | runs-on: ubuntu-24.04 14 | permissions: 15 | contents: read 16 | packages: write 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - name: Cache target 22 | uses: actions/cache@v4 23 | with: 24 | path: | 25 | target/ 26 | key: ${{ runner.os }}-release-${{ hashFiles('Cargo.lock') }} 27 | 28 | - name: Get stable toolchain 29 | uses: actions-rs/toolchain@v1 30 | with: 31 | toolchain: stable 32 | 33 | - name: Install dependencies 34 | run: | 35 | sudo apt-get update -y 36 | sudo apt-get install -y libssl-dev pkg-config clang llvm python3 python3-pip 37 | 38 | - name: Build binaries 39 | uses: 
actions-rs/cargo@v1 40 | with: 41 | command: build 42 | args: --release 43 | 44 | - name: Move binaries 45 | run: | 46 | mv target/release/fuzzysearch-api fuzzysearch-api/fuzzysearch-api 47 | mv target/release/fuzzysearch-webhook fuzzysearch-webhook/fuzzysearch-webhook 48 | mv target/release/fuzzysearch-refresh fuzzysearch-refresh/fuzzysearch-refresh 49 | mv target/release/fuzzysearch-hash-input fuzzysearch-hash-input/fuzzysearch-hash-input 50 | mv target/release/fuzzysearch-ingest-e621 fuzzysearch-ingest-e621/fuzzysearch-ingest-e621 51 | mv target/release/fuzzysearch-ingest-furaffinity fuzzysearch-ingest-furaffinity/fuzzysearch-ingest-furaffinity 52 | mv target/release/fuzzysearch-ingest-weasyl fuzzysearch-ingest-weasyl/fuzzysearch-ingest-weasyl 53 | 54 | - name: Upload binaries 55 | uses: actions/upload-artifact@v4 56 | with: 57 | name: binaries 58 | path: | 59 | fuzzysearch-api/fuzzysearch-api 60 | fuzzysearch-webhook/fuzzysearch-webhook 61 | fuzzysearch-refresh/fuzzysearch-refresh 62 | fuzzysearch-hash-input/fuzzysearch-hash-input 63 | fuzzysearch-ingest-e621/fuzzysearch-ingest-e621 64 | fuzzysearch-ingest-furaffinity/fuzzysearch-ingest-furaffinity 65 | fuzzysearch-ingest-weasyl/fuzzysearch-ingest-weasyl 66 | 67 | - name: Login to GitHub Container Registry 68 | uses: docker/login-action@v1 69 | with: 70 | registry: ghcr.io 71 | username: ${{ github.actor }} 72 | password: ${{ secrets.GITHUB_TOKEN }} 73 | 74 | - name: Extract FuzzySearch API metadata for Docker 75 | id: meta-fuzzysearch-api 76 | uses: docker/metadata-action@v3 77 | with: 78 | images: ghcr.io/syfaro/fuzzysearch-api 79 | 80 | - name: Extract FuzzySearch Webhook metadata for Docker 81 | id: meta-fuzzysearch-webhook 82 | uses: docker/metadata-action@v3 83 | with: 84 | images: ghcr.io/syfaro/fuzzysearch-webhook 85 | 86 | - name: Extract FuzzySearch refresh metadata for Docker 87 | id: meta-fuzzysearch-refresh 88 | uses: docker/metadata-action@v3 89 | with: 90 | images: ghcr.io/syfaro/fuzzysearch-refresh 91 | 92 | - name: Extract FuzzySearch hash input metadata for Docker 93 | id: meta-fuzzysearch-hash-input 94 | uses: docker/metadata-action@v3 95 | with: 96 | images: ghcr.io/syfaro/fuzzysearch-hash-input 97 | 98 | - name: Extract FuzzySearch ingest e621 metadata for Docker 99 | id: meta-fuzzysearch-ingest-e621 100 | uses: docker/metadata-action@v3 101 | with: 102 | images: ghcr.io/syfaro/fuzzysearch-ingest-e621 103 | 104 | - name: Extract FuzzySearch ingest FurAffinity metadata for Docker 105 | id: meta-fuzzysearch-ingest-furaffinity 106 | uses: docker/metadata-action@v3 107 | with: 108 | images: ghcr.io/syfaro/fuzzysearch-ingest-furaffinity 109 | 110 | - name: Extract FuzzySearch ingest Weasyl metadata for Docker 111 | id: meta-fuzzysearch-ingest-weasyl 112 | uses: docker/metadata-action@v3 113 | with: 114 | images: ghcr.io/syfaro/fuzzysearch-ingest-weasyl 115 | 116 | - name: Build and push FuzzySearch API Docker image 117 | uses: docker/build-push-action@v2 118 | with: 119 | context: . 120 | push: true 121 | tags: ${{ steps.meta-fuzzysearch-api.outputs.tags }} 122 | labels: ${{ steps.meta-fuzzysearch-api.outputs.labels }} 123 | file: fuzzysearch-api/Dockerfile 124 | 125 | - name: Build and push FuzzySearch Webhook Docker image 126 | uses: docker/build-push-action@v2 127 | with: 128 | context: . 
129 | push: true 130 | tags: ${{ steps.meta-fuzzysearch-webhook.outputs.tags }} 131 | labels: ${{ steps.meta-fuzzysearch-webhook.outputs.labels }} 132 | file: fuzzysearch-webhook/Dockerfile 133 | 134 | - name: Build and push FuzzySearch hash input Docker image 135 | uses: docker/build-push-action@v2 136 | with: 137 | context: . 138 | push: true 139 | tags: ${{ steps.meta-fuzzysearch-hash-input.outputs.tags }} 140 | labels: ${{ steps.meta-fuzzysearch-hash-input.outputs.labels }} 141 | file: fuzzysearch-hash-input/Dockerfile 142 | 143 | - name: Build and push FuzzySearch refresh Docker image 144 | uses: docker/build-push-action@v2 145 | with: 146 | context: . 147 | push: true 148 | tags: ${{ steps.meta-fuzzysearch-refresh.outputs.tags }} 149 | labels: ${{ steps.meta-fuzzysearch-refresh.outputs.labels }} 150 | file: fuzzysearch-refresh/Dockerfile 151 | 152 | - name: Build and push FuzzySearch ingest e621 Docker image 153 | uses: docker/build-push-action@v2 154 | with: 155 | context: . 156 | push: true 157 | tags: ${{ steps.meta-fuzzysearch-ingest-e621.outputs.tags }} 158 | labels: ${{ steps.meta-fuzzysearch-ingest-e621.outputs.labels }} 159 | file: fuzzysearch-ingest-e621/Dockerfile 160 | 161 | - name: Build and push FuzzySearch ingest FurAffinity Docker image 162 | uses: docker/build-push-action@v2 163 | with: 164 | context: . 165 | push: true 166 | tags: ${{ steps.meta-fuzzysearch-ingest-furaffinity.outputs.tags }} 167 | labels: ${{ steps.meta-fuzzysearch-ingest-furaffinity.outputs.labels }} 168 | file: fuzzysearch-ingest-furaffinity/Dockerfile 169 | 170 | - name: Build and push FuzzySearch ingest Weasyl Docker image 171 | uses: docker/build-push-action@v2 172 | with: 173 | context: . 174 | push: true 175 | tags: ${{ steps.meta-fuzzysearch-ingest-weasyl.outputs.tags }} 176 | labels: ${{ steps.meta-fuzzysearch-ingest-weasyl.outputs.labels }} 177 | file: fuzzysearch-ingest-weasyl/Dockerfile 178 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/target 2 | .env 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "fuzzysearch-common", 4 | 5 | "fuzzysearch-api", 6 | "fuzzysearch-hash-input", 7 | "fuzzysearch-refresh", 8 | "fuzzysearch-webhook", 9 | 10 | "fuzzysearch-ingest-e621", 11 | "fuzzysearch-ingest-furaffinity", 12 | "fuzzysearch-ingest-weasyl", 13 | ] 14 | 15 | [profile.dev.package."*"] 16 | opt-level = 2 17 | debug = true 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fuzzysearch 2 | 3 | A site that allows you to reverse image search millions of furry images in under a second. 
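
The HTTP API is served by `fuzzysearch-api` and authenticated with an `x-api-key` header. As a rough sketch (the host, API key, and hash value below are placeholders, and the snippet assumes the `tokio` and `reqwest` crates already used elsewhere in this workspace), a lookup against the `GET /hashes` endpoint looks like:

```rust
// Hypothetical client: look up matches for a perceptual hash via GET /hashes.
// The host, API key, and hash value are placeholders; `distance` is the
// maximum allowed distance for returned matches.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    let body = client
        .get("https://api.example.invalid/hashes")
        .header("x-api-key", "YOUR_API_KEY")
        .query(&[("hashes", "8574793127382"), ("distance", "3")])
        .send()
        .await?
        .text()
        .await?;

    // The response is a JSON array of matches, each carrying site, artists,
    // rating, posted_at, and distance fields.
    println!("{}", body);
    Ok(())
}
```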
4 | -------------------------------------------------------------------------------- /fuzzysearch-api/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-api" 3 | version = "0.2.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | tracing = "0.1" 9 | tracing-subscriber = "0.3" 10 | tracing-futures = "0.2" 11 | tracing-log = "0.1" 12 | 13 | prometheus = { version = "0.13", features = ["process"] } 14 | lazy_static = "1" 15 | 16 | opentelemetry = { version = "0.17", features = ["rt-tokio"] } 17 | opentelemetry-jaeger = { version = "0.16", features = ["tokio"] } 18 | tracing-opentelemetry = "0.17" 19 | opentelemetry-http = "0.6" 20 | 21 | tokio = { version = "1", features = ["full"] } 22 | tokio-stream = "0.1" 23 | 24 | futures = "0.3" 25 | 26 | chrono = "0.4" 27 | bytes = "1" 28 | 29 | serde = { version = "1", features = ["derive"] } 30 | serde_json = "1" 31 | hex = "0.4" 32 | 33 | warp = "0.3" 34 | reqwest = { version = "0.11", features = ["multipart"] } 35 | hyper = "0.14" 36 | 37 | sqlx = { version = "0.5", features = ["runtime-tokio-native-tls", "postgres", "macros", "json", "offline", "chrono"] } 38 | 39 | image = "0.23" 40 | img_hash = "3" 41 | hamming = "0.1" 42 | 43 | bkapi-client = { git = "https://github.com/Syfaro/bkapi.git" } 44 | 45 | fuzzysearch-common = { path = "../fuzzysearch-common" } 46 | -------------------------------------------------------------------------------- /fuzzysearch-api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | EXPOSE 8080 8081 3 | ENV METRICS_HOST=0.0.0.0:8081 4 | RUN apt-get update -y && apt-get install -y --no-install-recommends openssl ca-certificates && rm -rf /var/lib/apt/lists/* 5 | COPY ./fuzzysearch-api/fuzzysearch-api /bin/fuzzysearch-api 6 | CMD ["/bin/fuzzysearch-api"] 7 | -------------------------------------------------------------------------------- /fuzzysearch-api/sqlx-data.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": "PostgreSQL", 3 | "1984ce60f052d6a29638f8e05b35671b8edfbf273783d4b843ebd35cbb8a391f": { 4 | "query": "INSERT INTO\n rate_limit (api_key_id, time_window, group_name, count)\n VALUES\n ($1, $2, $3, $4)\n ON CONFLICT ON CONSTRAINT unique_window\n DO UPDATE set count = rate_limit.count + $4\n RETURNING rate_limit.count", 5 | "describe": { 6 | "columns": [ 7 | { 8 | "ordinal": 0, 9 | "name": "count", 10 | "type_info": "Int2" 11 | } 12 | ], 13 | "parameters": { 14 | "Left": [ 15 | "Int4", 16 | "Int8", 17 | "Text", 18 | "Int2" 19 | ] 20 | }, 21 | "nullable": [ 22 | false 23 | ] 24 | } 25 | }, 26 | "659ee9ddc1c5ccd42ba9dc1617440544c30ece449ba3ba7f9d39f447b8af3cfe": { 27 | "query": "SELECT\n api_key.id,\n api_key.name_limit,\n api_key.image_limit,\n api_key.hash_limit,\n api_key.name,\n account.email owner_email\n FROM\n api_key\n JOIN account\n ON account.id = api_key.user_id\n WHERE\n api_key.key = $1\n ", 28 | "describe": { 29 | "columns": [ 30 | { 31 | "ordinal": 0, 32 | "name": "id", 33 | "type_info": "Int4" 34 | }, 35 | { 36 | "ordinal": 1, 37 | "name": "name_limit", 38 | "type_info": "Int2" 39 | }, 40 | { 41 | "ordinal": 2, 42 | "name": "image_limit", 43 | "type_info": "Int2" 44 | }, 45 | { 46 | "ordinal": 3, 47 | "name": "hash_limit", 48 | "type_info": "Int2" 49 | }, 50 | { 51 | "ordinal": 4, 52 | "name": "name", 53 | "type_info": "Text" 54 | }, 55 | { 56 | "ordinal": 5, 57 | "name": 
"owner_email", 58 | "type_info": "Text" 59 | } 60 | ], 61 | "parameters": { 62 | "Left": [ 63 | "Text" 64 | ] 65 | }, 66 | "nullable": [ 67 | false, 68 | false, 69 | false, 70 | false, 71 | true, 72 | false 73 | ] 74 | } 75 | }, 76 | "6b8d304fc40fa539ae671e6e24e7978ad271cb7a1cafb20fc4b4096a958d790f": { 77 | "query": "SELECT exists(SELECT 1 FROM twitter_user WHERE lower(data->>'screen_name') = lower($1))", 78 | "describe": { 79 | "columns": [ 80 | { 81 | "ordinal": 0, 82 | "name": "exists", 83 | "type_info": "Bool" 84 | } 85 | ], 86 | "parameters": { 87 | "Left": [ 88 | "Text" 89 | ] 90 | }, 91 | "nullable": [ 92 | null 93 | ] 94 | } 95 | }, 96 | "fe2100bda3d730a69efcb052ff72029f15603e02f5fb1e59e061935be872f6d6": { 97 | "query": "WITH hashes AS (\n SELECT * FROM jsonb_to_recordset($1::jsonb)\n AS hashes(searched_hash bigint, found_hash bigint, distance bigint)\n )\n SELECT\n 'FurAffinity' site,\n submission.id,\n submission.hash_int hash,\n submission.url,\n submission.filename,\n ARRAY(SELECT artist.name) artists,\n submission.file_id,\n null sources,\n submission.rating,\n submission.posted_at,\n hashes.searched_hash,\n hashes.distance,\n submission.file_sha256 sha256\n FROM hashes\n JOIN submission ON hashes.found_hash = submission.hash_int\n JOIN artist ON submission.artist_id = artist.id\n WHERE hash_int IN (SELECT hashes.found_hash)\n UNION ALL\n SELECT\n 'e621' site,\n e621.id,\n e621.hash,\n e621.data->'file'->>'url' url,\n (e621.data->'file'->>'md5') || '.' || (e621.data->'file'->>'ext') filename,\n ARRAY(SELECT jsonb_array_elements_text(e621.data->'tags'->'artist')) artists,\n null file_id,\n ARRAY(SELECT jsonb_array_elements_text(e621.data->'sources')) sources,\n e621.data->>'rating' rating,\n to_timestamp(data->>'created_at', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"') posted_at,\n hashes.searched_hash,\n hashes.distance,\n e621.sha256\n FROM hashes\n JOIN e621 ON hashes.found_hash = e621.hash\n WHERE e621.hash IN (SELECT hashes.found_hash)\n UNION ALL\n SELECT\n 'Weasyl' site,\n weasyl.id,\n weasyl.hash,\n weasyl.data->>'link' url,\n null filename,\n ARRAY(SELECT weasyl.data->>'owner_login') artists,\n null file_id,\n null sources,\n weasyl.data->>'rating' rating,\n to_timestamp(data->>'posted_at', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"') posted_at,\n hashes.searched_hash,\n hashes.distance,\n weasyl.sha256\n FROM hashes\n JOIN weasyl ON hashes.found_hash = weasyl.hash\n WHERE weasyl.hash IN (SELECT hashes.found_hash)\n UNION ALL\n SELECT\n 'Twitter' site,\n tweet.id,\n tweet_media.hash,\n tweet_media.url,\n null filename,\n ARRAY(SELECT tweet.data->'user'->>'screen_name') artists,\n null file_id,\n null sources,\n CASE\n WHEN (tweet.data->'possibly_sensitive')::boolean IS true THEN 'adult'\n WHEN (tweet.data->'possibly_sensitive')::boolean IS false THEN 'general'\n END rating,\n to_timestamp(tweet.data->>'created_at', 'DY Mon DD HH24:MI:SS +0000 YYYY') posted_at,\n hashes.searched_hash,\n hashes.distance,\n null sha256\n FROM hashes\n JOIN tweet_media ON hashes.found_hash = tweet_media.hash\n JOIN tweet ON tweet_media.tweet_id = tweet.id\n WHERE tweet_media.hash IN (SELECT hashes.found_hash)", 98 | "describe": { 99 | "columns": [ 100 | { 101 | "ordinal": 0, 102 | "name": "site", 103 | "type_info": "Text" 104 | }, 105 | { 106 | "ordinal": 1, 107 | "name": "id", 108 | "type_info": "Int8" 109 | }, 110 | { 111 | "ordinal": 2, 112 | "name": "hash", 113 | "type_info": "Int8" 114 | }, 115 | { 116 | "ordinal": 3, 117 | "name": "url", 118 | "type_info": "Text" 119 | }, 120 | { 121 | "ordinal": 4, 122 | 
"name": "filename", 123 | "type_info": "Text" 124 | }, 125 | { 126 | "ordinal": 5, 127 | "name": "artists", 128 | "type_info": "TextArray" 129 | }, 130 | { 131 | "ordinal": 6, 132 | "name": "file_id", 133 | "type_info": "Int4" 134 | }, 135 | { 136 | "ordinal": 7, 137 | "name": "sources", 138 | "type_info": "TextArray" 139 | }, 140 | { 141 | "ordinal": 8, 142 | "name": "rating", 143 | "type_info": "Bpchar" 144 | }, 145 | { 146 | "ordinal": 9, 147 | "name": "posted_at", 148 | "type_info": "Timestamptz" 149 | }, 150 | { 151 | "ordinal": 10, 152 | "name": "searched_hash", 153 | "type_info": "Int8" 154 | }, 155 | { 156 | "ordinal": 11, 157 | "name": "distance", 158 | "type_info": "Int8" 159 | }, 160 | { 161 | "ordinal": 12, 162 | "name": "sha256", 163 | "type_info": "Bytea" 164 | } 165 | ], 166 | "parameters": { 167 | "Left": [ 168 | "Jsonb" 169 | ] 170 | }, 171 | "nullable": [ 172 | null, 173 | null, 174 | null, 175 | null, 176 | null, 177 | null, 178 | null, 179 | null, 180 | null, 181 | null, 182 | null, 183 | null, 184 | null 185 | ] 186 | } 187 | } 188 | } -------------------------------------------------------------------------------- /fuzzysearch-api/src/filters.rs: -------------------------------------------------------------------------------- 1 | use crate::{handlers, Pool}; 2 | use crate::{types::*, Endpoints}; 3 | use std::convert::Infallible; 4 | use tracing_futures::Instrument; 5 | use warp::{Filter, Rejection, Reply}; 6 | 7 | pub fn search( 8 | db: Pool, 9 | bkapi: bkapi_client::BKApiClient, 10 | endpoints: Endpoints, 11 | ) -> impl Filter + Clone { 12 | search_image(db.clone(), bkapi.clone(), endpoints) 13 | .or(search_hashes(db.clone(), bkapi.clone())) 14 | .or(search_file(db.clone())) 15 | .or(check_handle(db.clone())) 16 | .or(search_image_by_url(db, bkapi)) 17 | } 18 | 19 | pub fn search_file(db: Pool) -> impl Filter + Clone { 20 | warp::path("file") 21 | .and(warp::header::headers_cloned()) 22 | .and(warp::get()) 23 | .and(warp::query::()) 24 | .and(with_pool(db)) 25 | .and(with_api_key()) 26 | .and_then(|headers, opts, db, api_key| { 27 | use tracing_opentelemetry::OpenTelemetrySpanExt; 28 | 29 | let span = tracing::info_span!("search_file", ?opts); 30 | span.set_parent(with_telem(headers)); 31 | span.in_scope(|| handlers::search_file(opts, db, api_key).in_current_span()) 32 | }) 33 | } 34 | 35 | pub fn search_image( 36 | db: Pool, 37 | bkapi: bkapi_client::BKApiClient, 38 | endpoints: Endpoints, 39 | ) -> impl Filter + Clone { 40 | warp::path("image") 41 | .and(warp::header::headers_cloned()) 42 | .and(warp::post()) 43 | .and(warp::multipart::form().max_length(1024 * 1024 * 10)) 44 | .and(warp::query::()) 45 | .and(with_pool(db)) 46 | .and(with_bkapi(bkapi)) 47 | .and(with_api_key()) 48 | .and(with_endpoints(endpoints)) 49 | .and_then(|headers, form, opts, pool, bkapi, api_key, endpoints| { 50 | use tracing_opentelemetry::OpenTelemetrySpanExt; 51 | 52 | let span = tracing::info_span!("search_image", ?opts); 53 | span.set_parent(with_telem(headers)); 54 | span.in_scope(|| { 55 | handlers::search_image(form, opts, pool, bkapi, api_key, endpoints) 56 | .in_current_span() 57 | }) 58 | }) 59 | } 60 | 61 | pub fn search_image_by_url( 62 | db: Pool, 63 | bkapi: bkapi_client::BKApiClient, 64 | ) -> impl Filter + Clone { 65 | warp::path("url") 66 | .and(warp::get()) 67 | .and(warp::query::()) 68 | .and(with_pool(db)) 69 | .and(with_bkapi(bkapi)) 70 | .and(with_api_key()) 71 | .and_then(handlers::search_image_by_url) 72 | } 73 | 74 | pub fn search_hashes( 75 | db: Pool, 76 | bkapi: 
bkapi_client::BKApiClient, 77 | ) -> impl Filter + Clone { 78 | warp::path("hashes") 79 | .and(warp::header::headers_cloned()) 80 | .and(warp::get()) 81 | .and(warp::query::()) 82 | .and(with_pool(db)) 83 | .and(with_bkapi(bkapi)) 84 | .and(with_api_key()) 85 | .and_then(|headers, opts, db, bkapi, api_key| { 86 | use tracing_opentelemetry::OpenTelemetrySpanExt; 87 | 88 | let span = tracing::info_span!("search_hashes", ?opts); 89 | span.set_parent(with_telem(headers)); 90 | span.in_scope(|| handlers::search_hashes(opts, db, bkapi, api_key).in_current_span()) 91 | }) 92 | } 93 | 94 | pub fn check_handle(db: Pool) -> impl Filter + Clone { 95 | warp::path("handle") 96 | .and(warp::get()) 97 | .and(warp::query::()) 98 | .and(with_pool(db)) 99 | .and_then(handlers::check_handle) 100 | } 101 | 102 | fn with_api_key() -> impl Filter + Clone { 103 | warp::header::("x-api-key") 104 | } 105 | 106 | fn with_pool(db: Pool) -> impl Filter + Clone { 107 | warp::any().map(move || db.clone()) 108 | } 109 | 110 | fn with_bkapi( 111 | bkapi: bkapi_client::BKApiClient, 112 | ) -> impl Filter + Clone { 113 | warp::any().map(move || bkapi.clone()) 114 | } 115 | 116 | fn with_endpoints( 117 | endpoints: Endpoints, 118 | ) -> impl Filter + Clone { 119 | warp::any().map(move || endpoints.clone()) 120 | } 121 | 122 | fn with_telem(headers: warp::http::HeaderMap) -> opentelemetry::Context { 123 | let remote_context = opentelemetry::global::get_text_map_propagator(|propagator| { 124 | propagator.extract(&opentelemetry_http::HeaderExtractor(&headers)) 125 | }); 126 | 127 | tracing::trace!(?remote_context, "Got remote context"); 128 | 129 | remote_context 130 | } 131 | -------------------------------------------------------------------------------- /fuzzysearch-api/src/handlers.rs: -------------------------------------------------------------------------------- 1 | use futures::StreamExt; 2 | use futures::TryStreamExt; 3 | use hyper::StatusCode; 4 | use lazy_static::lazy_static; 5 | use prometheus::{register_histogram, register_int_counter, Histogram, IntCounter}; 6 | use std::convert::TryInto; 7 | use tracing::{span, warn}; 8 | use tracing_futures::Instrument; 9 | use warp::{Rejection, Reply}; 10 | 11 | use crate::models::image_query; 12 | use crate::types::*; 13 | use crate::Endpoints; 14 | use crate::{early_return, rate_limit, Pool}; 15 | use fuzzysearch_common::{ 16 | trace::InjectContext, 17 | types::{SearchResult, SiteInfo}, 18 | }; 19 | 20 | lazy_static! 
{ 21 | static ref IMAGE_HASH_DURATION: Histogram = register_histogram!( 22 | "fuzzysearch_api_image_hash_seconds", 23 | "Duration to perform an image hash operation" 24 | ) 25 | .unwrap(); 26 | static ref VIDEO_HASH_DURATION: Histogram = register_histogram!( 27 | "fuzzysearch_api_video_hash_seconds", 28 | "Duration to perform a video hash operation" 29 | ) 30 | .unwrap(); 31 | static ref IMAGE_URL_DOWNLOAD_DURATION: Histogram = register_histogram!( 32 | "fuzzysearch_api_image_url_download_seconds", 33 | "Duration to download an image from a provided URL" 34 | ) 35 | .unwrap(); 36 | static ref UNHANDLED_REJECTIONS: IntCounter = register_int_counter!( 37 | "fuzzysearch_api_unhandled_rejections_count", 38 | "Number of unhandled HTTP rejections" 39 | ) 40 | .unwrap(); 41 | } 42 | 43 | #[derive(Debug)] 44 | enum Error { 45 | Postgres(sqlx::Error), 46 | Reqwest(reqwest::Error), 47 | Warp(warp::Error), 48 | InvalidData, 49 | InvalidImage, 50 | ApiKey, 51 | RateLimit, 52 | } 53 | 54 | impl warp::Reply for Error { 55 | fn into_response(self) -> warp::reply::Response { 56 | let msg = match self { 57 | Error::Postgres(_) | Error::Reqwest(_) | Error::Warp(_) => ErrorMessage { 58 | code: 500, 59 | message: "Internal server error".to_string(), 60 | }, 61 | Error::InvalidData => ErrorMessage { 62 | code: 400, 63 | message: "Invalid data provided".to_string(), 64 | }, 65 | Error::InvalidImage => ErrorMessage { 66 | code: 400, 67 | message: "Invalid image provided".to_string(), 68 | }, 69 | Error::ApiKey => ErrorMessage { 70 | code: 401, 71 | message: "Invalid API key".to_string(), 72 | }, 73 | Error::RateLimit => ErrorMessage { 74 | code: 429, 75 | message: "Too many requests".to_string(), 76 | }, 77 | }; 78 | 79 | let body = hyper::body::Body::from(serde_json::to_string(&msg).unwrap()); 80 | 81 | warp::http::Response::builder() 82 | .status(msg.code) 83 | .body(body) 84 | .unwrap() 85 | } 86 | } 87 | 88 | impl From for Error { 89 | fn from(err: sqlx::Error) -> Self { 90 | Error::Postgres(err) 91 | } 92 | } 93 | 94 | impl From for Error { 95 | fn from(err: reqwest::Error) -> Self { 96 | Error::Reqwest(err) 97 | } 98 | } 99 | 100 | impl From for Error { 101 | fn from(err: warp::Error) -> Self { 102 | Self::Warp(err) 103 | } 104 | } 105 | 106 | #[tracing::instrument(skip(endpoints, form))] 107 | async fn hash_input( 108 | endpoints: &Endpoints, 109 | mut form: warp::multipart::FormData, 110 | ) -> Result { 111 | let mut image_part = None; 112 | 113 | tracing::debug!("looking at image parts"); 114 | while let Ok(Some(part)) = form.try_next().await { 115 | if part.name() == "image" { 116 | image_part = Some(part); 117 | } 118 | } 119 | 120 | let image_part = image_part.ok_or(Error::InvalidImage)?; 121 | 122 | tracing::debug!("found image part, reading data"); 123 | let bytes = image_part 124 | .stream() 125 | .fold(bytes::BytesMut::new(), |mut buf, chunk| { 126 | use bytes::BufMut; 127 | 128 | buf.put(chunk.unwrap()); 129 | async move { buf } 130 | }) 131 | .await; 132 | let part = reqwest::multipart::Part::bytes(bytes.to_vec()); 133 | 134 | let form = reqwest::multipart::Form::new().part("image", part); 135 | 136 | tracing::debug!("sending image to hash input service"); 137 | let client = reqwest::Client::new(); 138 | let resp = client 139 | .post(&endpoints.hash_input) 140 | .inject_context() 141 | .multipart(form) 142 | .send() 143 | .await?; 144 | 145 | tracing::debug!("got response"); 146 | if resp.status() != StatusCode::OK { 147 | return Err(Error::InvalidImage); 148 | } 149 | 150 | let hash: i64 = 
resp 151 | .text() 152 | .await? 153 | .parse() 154 | .map_err(|_err| Error::InvalidImage)?; 155 | 156 | Ok(hash) 157 | } 158 | 159 | pub async fn search_image( 160 | form: warp::multipart::FormData, 161 | opts: ImageSearchOpts, 162 | db: Pool, 163 | bkapi: bkapi_client::BKApiClient, 164 | api_key: String, 165 | endpoints: Endpoints, 166 | ) -> Result, Rejection> { 167 | let image_remaining = rate_limit!(&api_key, &db, image_limit, "image"); 168 | let hash_remaining = rate_limit!(&api_key, &db, hash_limit, "hash"); 169 | 170 | let num = early_return!(hash_input(&endpoints, form).await); 171 | 172 | let mut items = { 173 | if opts.search_type == Some(ImageSearchType::Force) { 174 | image_query(db.clone(), bkapi.clone(), vec![num], 10) 175 | .await 176 | .unwrap() 177 | } else { 178 | let results = image_query(db.clone(), bkapi.clone(), vec![num], 0) 179 | .await 180 | .unwrap(); 181 | if results.is_empty() && opts.search_type != Some(ImageSearchType::Exact) { 182 | image_query(db.clone(), bkapi.clone(), vec![num], 10) 183 | .await 184 | .unwrap() 185 | } else { 186 | results 187 | } 188 | } 189 | }; 190 | 191 | items.sort_by(|a, b| { 192 | a.distance 193 | .unwrap_or(u64::max_value()) 194 | .partial_cmp(&b.distance.unwrap_or(u64::max_value())) 195 | .unwrap() 196 | }); 197 | 198 | let similarity = ImageSimilarity { 199 | hash: num, 200 | matches: items, 201 | }; 202 | 203 | let resp = warp::http::Response::builder() 204 | .header("x-image-hash", num.to_string()) 205 | .header("x-rate-limit-total-image", image_remaining.1.to_string()) 206 | .header( 207 | "x-rate-limit-remaining-image", 208 | image_remaining.0.to_string(), 209 | ) 210 | .header("x-rate-limit-total-hash", hash_remaining.1.to_string()) 211 | .header("x-rate-limit-remaining-hash", hash_remaining.0.to_string()) 212 | .header("content-type", "application/json") 213 | .body(serde_json::to_string(&similarity).unwrap()) 214 | .unwrap(); 215 | 216 | Ok(Box::new(resp)) 217 | } 218 | 219 | pub async fn search_hashes( 220 | opts: HashSearchOpts, 221 | db: Pool, 222 | bkapi: bkapi_client::BKApiClient, 223 | api_key: String, 224 | ) -> Result, Rejection> { 225 | let pool = db.clone(); 226 | 227 | let hashes: Vec = opts 228 | .hashes 229 | .split(',') 230 | .take(10) 231 | .filter_map(|hash| hash.parse::().ok()) 232 | .collect(); 233 | 234 | if hashes.is_empty() { 235 | return Ok(Box::new(Error::InvalidData)); 236 | } 237 | 238 | let image_remaining = rate_limit!(&api_key, &db, image_limit, "image", hashes.len() as i16); 239 | 240 | let results = 241 | early_return!(image_query(pool, bkapi, hashes.clone(), opts.distance.unwrap_or(10)).await); 242 | 243 | let resp = warp::http::Response::builder() 244 | .header("x-rate-limit-total-image", image_remaining.1.to_string()) 245 | .header( 246 | "x-rate-limit-remaining-image", 247 | image_remaining.0.to_string(), 248 | ) 249 | .header("content-type", "application/json") 250 | .body(serde_json::to_string(&results).unwrap()) 251 | .unwrap(); 252 | 253 | Ok(Box::new(resp)) 254 | } 255 | 256 | pub async fn search_file( 257 | opts: FileSearchOpts, 258 | db: Pool, 259 | api_key: String, 260 | ) -> Result, Rejection> { 261 | use sqlx::Row; 262 | 263 | let file_remaining = rate_limit!(&api_key, &db, name_limit, "file"); 264 | 265 | let query = if let Some(ref id) = opts.id { 266 | sqlx::query( 267 | "SELECT 268 | submission.id, 269 | submission.url, 270 | submission.filename, 271 | submission.file_id, 272 | submission.rating, 273 | submission.posted_at, 274 | submission.hash_int, 275 | 
submission.file_sha256, 276 | artist.name, 277 | array(SELECT tag.name FROM tag_to_post JOIN tag ON tag_to_post.tag_id = tag.id WHERE tag_to_post.post_id = submission.id) tags 278 | FROM 279 | submission 280 | JOIN artist 281 | ON artist.id = submission.artist_id 282 | WHERE 283 | file_id = $1 284 | LIMIT 10", 285 | ) 286 | .bind(id) 287 | } else if let Some(ref name) = opts.name { 288 | sqlx::query( 289 | "SELECT 290 | submission.id, 291 | submission.url, 292 | submission.filename, 293 | submission.file_id, 294 | submission.rating, 295 | submission.posted_at, 296 | submission.hash_int, 297 | submission.file_sha256, 298 | artist.name, 299 | array(SELECT tag.name FROM tag_to_post JOIN tag ON tag_to_post.tag_id = tag.id WHERE tag_to_post.post_id = submission.id) tags 300 | FROM 301 | submission 302 | JOIN artist 303 | ON artist.id = submission.artist_id 304 | WHERE 305 | lower(filename) = lower($1) 306 | LIMIT 10", 307 | ) 308 | .bind(name) 309 | } else if let Some(ref url) = opts.url { 310 | sqlx::query( 311 | "SELECT 312 | submission.id, 313 | submission.url, 314 | submission.filename, 315 | submission.file_id, 316 | submission.rating, 317 | submission.posted_at, 318 | submission.hash_int, 319 | submission.file_sha256, 320 | artist.name, 321 | array(SELECT tag.name FROM tag_to_post JOIN tag ON tag_to_post.tag_id = tag.id WHERE tag_to_post.post_id = submission.id) tags 322 | FROM 323 | submission 324 | JOIN artist 325 | ON artist.id = submission.artist_id 326 | WHERE 327 | lower(url) = lower($1) 328 | LIMIT 10", 329 | ) 330 | .bind(url) 331 | } else if let Some(ref site_id) = opts.site_id { 332 | sqlx::query( 333 | "SELECT 334 | submission.id, 335 | submission.url, 336 | submission.filename, 337 | submission.file_id, 338 | submission.rating, 339 | submission.posted_at, 340 | submission.hash_int, 341 | submission.file_sha256, 342 | artist.name, 343 | array(SELECT tag.name FROM tag_to_post JOIN tag ON tag_to_post.tag_id = tag.id WHERE tag_to_post.post_id = submission.id) tags 344 | FROM 345 | submission 346 | JOIN artist 347 | ON artist.id = submission.artist_id 348 | WHERE 349 | submission.id = $1 350 | LIMIT 10", 351 | ) 352 | .bind(site_id) 353 | } else { 354 | return Ok(Box::new(Error::InvalidData)); 355 | }; 356 | 357 | let matches: Result, _> = query 358 | .map(|row| SearchResult { 359 | site_id: row.get::("id") as i64, 360 | site_id_str: row.get::("id").to_string(), 361 | url: row.get("url"), 362 | filename: row.get("filename"), 363 | posted_at: row.get("posted_at"), 364 | artists: row 365 | .get::, _>("name") 366 | .map(|artist| vec![artist]), 367 | tags: row.get("tags"), 368 | sha256: row 369 | .get::>, _>("file_sha256") 370 | .map(hex::encode), 371 | distance: None, 372 | hash: row.get::, _>("hash_int"), 373 | searched_hash: None, 374 | site_info: Some(SiteInfo::FurAffinity { 375 | file_id: row.get("file_id"), 376 | }), 377 | rating: row 378 | .get::, _>("rating") 379 | .and_then(|rating| rating.parse().ok()), 380 | }) 381 | .fetch_all(&db) 382 | .await; 383 | 384 | let matches = early_return!(matches); 385 | 386 | let resp = warp::http::Response::builder() 387 | .header("x-rate-limit-total-file", file_remaining.1.to_string()) 388 | .header("x-rate-limit-remaining-file", file_remaining.0.to_string()) 389 | .header("content-type", "application/json") 390 | .body(serde_json::to_string(&matches).unwrap()) 391 | .unwrap(); 392 | 393 | Ok(Box::new(resp)) 394 | } 395 | 396 | pub async fn check_handle(opts: HandleOpts, db: Pool) -> Result, Rejection> { 397 | let exists = if let 
Some(handle) = opts.twitter { 398 | let result = sqlx::query_scalar!("SELECT exists(SELECT 1 FROM twitter_user WHERE lower(data->>'screen_name') = lower($1))", handle) 399 | .fetch_optional(&db) 400 | .await 401 | .map(|row| row.flatten().unwrap_or(false)); 402 | 403 | early_return!(result) 404 | } else { 405 | false 406 | }; 407 | 408 | Ok(Box::new(warp::reply::json(&exists))) 409 | } 410 | 411 | pub async fn search_image_by_url( 412 | opts: UrlSearchOpts, 413 | db: Pool, 414 | bkapi: bkapi_client::BKApiClient, 415 | api_key: String, 416 | ) -> Result, Rejection> { 417 | use bytes::BufMut; 418 | 419 | let url = opts.url; 420 | 421 | let image_remaining = rate_limit!(&api_key, &db, image_limit, "image"); 422 | let hash_remaining = rate_limit!(&api_key, &db, hash_limit, "hash"); 423 | 424 | let _timer = IMAGE_URL_DOWNLOAD_DURATION.start_timer(); 425 | 426 | let mut resp = match reqwest::get(&url).await { 427 | Ok(resp) => resp, 428 | Err(_err) => return Ok(Box::new(Error::InvalidImage)), 429 | }; 430 | 431 | let content_length = resp 432 | .headers() 433 | .get("content-length") 434 | .and_then(|len| { 435 | String::from_utf8_lossy(len.as_bytes()) 436 | .parse::() 437 | .ok() 438 | }) 439 | .unwrap_or(0); 440 | 441 | if content_length > 10_000_000 { 442 | return Ok(Box::new(Error::InvalidImage)); 443 | } 444 | 445 | let mut buf = bytes::BytesMut::with_capacity(content_length); 446 | 447 | while let Some(chunk) = early_return!(resp.chunk().await) { 448 | if buf.len() + chunk.len() > 10_000_000 { 449 | return Ok(Box::new(Error::InvalidImage)); 450 | } 451 | 452 | buf.put(chunk); 453 | } 454 | 455 | drop(_timer); 456 | 457 | let _timer = IMAGE_HASH_DURATION.start_timer(); 458 | let hash = tokio::task::spawn_blocking(move || { 459 | let hasher = fuzzysearch_common::get_hasher(); 460 | let image = image::load_from_memory(&buf).unwrap(); 461 | hasher.hash_image(&image) 462 | }) 463 | .instrument(span!(tracing::Level::TRACE, "hashing image")) 464 | .await 465 | .unwrap(); 466 | drop(_timer); 467 | 468 | let hash: [u8; 8] = hash.as_bytes().try_into().unwrap(); 469 | let num = i64::from_be_bytes(hash); 470 | 471 | let results = image_query(db.clone(), bkapi.clone(), vec![num], 3) 472 | .await 473 | .unwrap(); 474 | 475 | let resp = warp::http::Response::builder() 476 | .header("x-image-hash", num.to_string()) 477 | .header("x-rate-limit-total-image", image_remaining.1.to_string()) 478 | .header( 479 | "x-rate-limit-remaining-image", 480 | image_remaining.0.to_string(), 481 | ) 482 | .header("x-rate-limit-total-hash", hash_remaining.1.to_string()) 483 | .header("x-rate-limit-remaining-hash", hash_remaining.0.to_string()) 484 | .header("content-type", "application/json") 485 | .body(serde_json::to_string(&results).unwrap()) 486 | .unwrap(); 487 | 488 | Ok(Box::new(resp)) 489 | } 490 | 491 | #[tracing::instrument] 492 | pub async fn handle_rejection(err: Rejection) -> Result, std::convert::Infallible> { 493 | warn!("had rejection"); 494 | 495 | UNHANDLED_REJECTIONS.inc(); 496 | 497 | let (code, message) = if err.is_not_found() { 498 | ( 499 | warp::http::StatusCode::NOT_FOUND, 500 | "This page does not exist", 501 | ) 502 | } else if err.find::().is_some() { 503 | return Ok(Box::new(Error::InvalidData) as Box); 504 | } else if err.find::().is_some() { 505 | return Ok(Box::new(Error::InvalidData) as Box); 506 | } else { 507 | ( 508 | warp::http::StatusCode::INTERNAL_SERVER_ERROR, 509 | "An unknown error occured", 510 | ) 511 | }; 512 | 513 | let json = warp::reply::json(&ErrorMessage { 514 | code: 
code.as_u16(), 515 | message: message.into(), 516 | }); 517 | 518 | Ok(Box::new(warp::reply::with_status(json, code))) 519 | } 520 | -------------------------------------------------------------------------------- /fuzzysearch-api/src/main.rs: -------------------------------------------------------------------------------- 1 | #![recursion_limit = "256"] 2 | 3 | use warp::Filter; 4 | 5 | mod filters; 6 | mod handlers; 7 | mod models; 8 | mod types; 9 | mod utils; 10 | 11 | type Pool = sqlx::PgPool; 12 | 13 | #[derive(Clone)] 14 | pub struct Endpoints { 15 | pub hash_input: String, 16 | pub bkapi: String, 17 | } 18 | 19 | #[tokio::main] 20 | async fn main() { 21 | fuzzysearch_common::trace::configure_tracing("fuzzysearch-api"); 22 | fuzzysearch_common::trace::serve_metrics().await; 23 | 24 | let s = std::env::var("DATABASE_URL").expect("Missing DATABASE_URL"); 25 | 26 | let db_pool = sqlx::PgPool::connect(&s) 27 | .await 28 | .expect("Unable to create Postgres pool"); 29 | 30 | let endpoints = Endpoints { 31 | hash_input: std::env::var("ENDPOINT_HASH_INPUT").expect("Missing ENDPOINT_HASH_INPUT"), 32 | bkapi: std::env::var("ENDPOINT_BKAPI").expect("Missing ENDPOINT_BKAPI"), 33 | }; 34 | 35 | let bkapi = bkapi_client::BKApiClient::new(&endpoints.bkapi); 36 | 37 | let log = warp::log("fuzzysearch-api"); 38 | let cors = warp::cors() 39 | .allow_any_origin() 40 | .allow_headers(vec!["x-api-key"]) 41 | .allow_methods(vec!["GET", "POST"]); 42 | 43 | let options = warp::options().map(|| "✓"); 44 | 45 | let api = options.or(filters::search(db_pool, bkapi, endpoints)); 46 | let routes = api 47 | .or(warp::path::end() 48 | .map(|| warp::redirect(warp::http::Uri::from_static("https://fuzzysearch.net")))) 49 | .with(log) 50 | .with(cors) 51 | .recover(handlers::handle_rejection); 52 | 53 | warp::serve(routes).run(([0, 0, 0, 0], 8080)).await; 54 | } 55 | -------------------------------------------------------------------------------- /fuzzysearch-api/src/models.rs: -------------------------------------------------------------------------------- 1 | use lazy_static::lazy_static; 2 | use prometheus::{register_histogram, Histogram}; 3 | 4 | use crate::types::*; 5 | use crate::Pool; 6 | use fuzzysearch_common::types::{SearchResult, SiteInfo}; 7 | 8 | lazy_static! 
{ 9 | static ref IMAGE_QUERY_DURATION: Histogram = register_histogram!( 10 | "fuzzysearch_api_image_query_seconds", 11 | "Duration to perform a single image lookup query" 12 | ) 13 | .unwrap(); 14 | } 15 | 16 | #[tracing::instrument(skip(db))] 17 | pub async fn lookup_api_key(key: &str, db: &sqlx::PgPool) -> Option { 18 | sqlx::query_as!( 19 | ApiKey, 20 | "SELECT 21 | api_key.id, 22 | api_key.name_limit, 23 | api_key.image_limit, 24 | api_key.hash_limit, 25 | api_key.name, 26 | account.email owner_email 27 | FROM 28 | api_key 29 | JOIN account 30 | ON account.id = api_key.user_id 31 | WHERE 32 | api_key.key = $1 33 | ", 34 | key 35 | ) 36 | .fetch_optional(db) 37 | .await 38 | .ok() 39 | .flatten() 40 | } 41 | 42 | #[derive(serde::Serialize)] 43 | struct HashSearch { 44 | searched_hash: i64, 45 | found_hash: i64, 46 | distance: u64, 47 | } 48 | 49 | #[tracing::instrument(skip(pool, bkapi))] 50 | pub async fn image_query( 51 | pool: Pool, 52 | bkapi: bkapi_client::BKApiClient, 53 | hashes: Vec, 54 | distance: i64, 55 | ) -> Result, sqlx::Error> { 56 | let found_hashes: Vec = bkapi 57 | .search_many(&hashes, distance as u64) 58 | .await 59 | .unwrap() 60 | .into_iter() 61 | .flat_map(|results| { 62 | results 63 | .hashes 64 | .iter() 65 | .map(|hash| HashSearch { 66 | searched_hash: results.hash, 67 | found_hash: hash.hash, 68 | distance: hash.distance, 69 | }) 70 | .collect::>() 71 | }) 72 | .collect(); 73 | 74 | let timer = IMAGE_QUERY_DURATION.start_timer(); 75 | let matches = sqlx::query!( 76 | r#"WITH hashes AS ( 77 | SELECT * FROM jsonb_to_recordset($1::jsonb) 78 | AS hashes(searched_hash bigint, found_hash bigint, distance bigint) 79 | ) 80 | SELECT 81 | 'FurAffinity' site, 82 | submission.id, 83 | submission.hash_int hash, 84 | submission.url, 85 | submission.filename, 86 | ARRAY(SELECT artist.name) artists, 87 | submission.file_id, 88 | null sources, 89 | submission.rating, 90 | submission.posted_at, 91 | hashes.searched_hash, 92 | hashes.distance, 93 | submission.file_sha256 sha256 94 | FROM hashes 95 | JOIN submission ON hashes.found_hash = submission.hash_int 96 | JOIN artist ON submission.artist_id = artist.id 97 | WHERE hash_int IN (SELECT hashes.found_hash) 98 | UNION ALL 99 | SELECT 100 | 'e621' site, 101 | e621.id, 102 | e621.hash, 103 | e621.data->'file'->>'url' url, 104 | (e621.data->'file'->>'md5') || '.' 
|| (e621.data->'file'->>'ext') filename, 105 | ARRAY(SELECT jsonb_array_elements_text(e621.data->'tags'->'artist')) artists, 106 | null file_id, 107 | ARRAY(SELECT jsonb_array_elements_text(e621.data->'sources')) sources, 108 | e621.data->>'rating' rating, 109 | to_timestamp(data->>'created_at', 'YYYY-MM-DD"T"HH24:MI:SS"Z"') posted_at, 110 | hashes.searched_hash, 111 | hashes.distance, 112 | e621.sha256 113 | FROM hashes 114 | JOIN e621 ON hashes.found_hash = e621.hash 115 | WHERE e621.hash IN (SELECT hashes.found_hash) 116 | UNION ALL 117 | SELECT 118 | 'Weasyl' site, 119 | weasyl.id, 120 | weasyl.hash, 121 | weasyl.data->>'link' url, 122 | null filename, 123 | ARRAY(SELECT weasyl.data->>'owner_login') artists, 124 | null file_id, 125 | null sources, 126 | weasyl.data->>'rating' rating, 127 | to_timestamp(data->>'posted_at', 'YYYY-MM-DD"T"HH24:MI:SS"Z"') posted_at, 128 | hashes.searched_hash, 129 | hashes.distance, 130 | weasyl.sha256 131 | FROM hashes 132 | JOIN weasyl ON hashes.found_hash = weasyl.hash 133 | WHERE weasyl.hash IN (SELECT hashes.found_hash) 134 | UNION ALL 135 | SELECT 136 | 'Twitter' site, 137 | tweet.id, 138 | tweet_media.hash, 139 | tweet_media.url, 140 | null filename, 141 | ARRAY(SELECT tweet.data->'user'->>'screen_name') artists, 142 | null file_id, 143 | null sources, 144 | CASE 145 | WHEN (tweet.data->'possibly_sensitive')::boolean IS true THEN 'adult' 146 | WHEN (tweet.data->'possibly_sensitive')::boolean IS false THEN 'general' 147 | END rating, 148 | to_timestamp(tweet.data->>'created_at', 'DY Mon DD HH24:MI:SS +0000 YYYY') posted_at, 149 | hashes.searched_hash, 150 | hashes.distance, 151 | null sha256 152 | FROM hashes 153 | JOIN tweet_media ON hashes.found_hash = tweet_media.hash 154 | JOIN tweet ON tweet_media.tweet_id = tweet.id 155 | WHERE tweet_media.hash IN (SELECT hashes.found_hash)"#, 156 | serde_json::to_value(&found_hashes).unwrap() 157 | ) 158 | .map(|row| { 159 | use std::convert::TryFrom; 160 | 161 | let site_info = match row.site.as_deref() { 162 | Some("FurAffinity") => SiteInfo::FurAffinity { 163 | file_id: row.file_id.unwrap_or(-1), 164 | }, 165 | Some("e621") => SiteInfo::E621 { 166 | sources: row.sources, 167 | }, 168 | Some("Twitter") => SiteInfo::Twitter, 169 | Some("Weasyl") => SiteInfo::Weasyl, 170 | _ => panic!("Got unknown site"), 171 | }; 172 | 173 | SearchResult { 174 | site_id: row.id.unwrap_or_default(), 175 | site_info: Some(site_info), 176 | rating: row.rating.and_then(|rating| rating.parse().ok()), 177 | site_id_str: row.id.unwrap_or_default().to_string(), 178 | url: row.url.unwrap_or_default(), 179 | posted_at: row.posted_at, 180 | tags: None, 181 | sha256: row.sha256.map(hex::encode), 182 | hash: row.hash, 183 | distance: row 184 | .distance 185 | .and_then(|distance| u64::try_from(distance).ok()), 186 | artists: row.artists, 187 | filename: row.filename.unwrap_or_default(), 188 | searched_hash: row.searched_hash, 189 | } 190 | }) 191 | .fetch_all(&pool) 192 | .await?; 193 | timer.stop_and_record(); 194 | 195 | Ok(matches) 196 | } 197 | -------------------------------------------------------------------------------- /fuzzysearch-api/src/types.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | use fuzzysearch_common::types::SearchResult; 4 | 5 | /// An API key representation from the database.alloc 6 | /// 7 | /// May contain information about the owner, always has rate limit information. 
8 | /// Limits are the number of requests allowed per minute. 9 | #[derive(Debug)] 10 | pub struct ApiKey { 11 | pub id: i32, 12 | pub name: Option, 13 | pub owner_email: String, 14 | pub name_limit: i16, 15 | pub image_limit: i16, 16 | pub hash_limit: i16, 17 | } 18 | 19 | /// The status of an API key's rate limit. 20 | #[derive(Debug, PartialEq)] 21 | pub enum RateLimit { 22 | /// This key is limited, we should deny the request. 23 | Limited, 24 | /// This key is available, contains the number of requests made. 25 | Available((i16, i16)), 26 | } 27 | 28 | #[derive(Debug, Deserialize)] 29 | pub struct FileSearchOpts { 30 | pub id: Option, 31 | pub name: Option, 32 | pub url: Option, 33 | pub site_id: Option, 34 | } 35 | 36 | #[derive(Debug, Deserialize)] 37 | pub struct ImageSearchOpts { 38 | #[serde(rename = "type")] 39 | pub search_type: Option, 40 | } 41 | 42 | #[derive(Debug, Deserialize, PartialEq)] 43 | #[serde(rename_all = "lowercase")] 44 | pub enum ImageSearchType { 45 | Close, 46 | Exact, 47 | Force, 48 | } 49 | 50 | #[derive(Debug, Serialize)] 51 | pub struct ImageSimilarity { 52 | pub hash: i64, 53 | pub matches: Vec, 54 | } 55 | 56 | #[derive(Serialize)] 57 | pub struct ErrorMessage { 58 | pub code: u16, 59 | pub message: String, 60 | } 61 | 62 | #[derive(Debug, Deserialize)] 63 | pub struct HashSearchOpts { 64 | pub hashes: String, 65 | pub distance: Option, 66 | } 67 | 68 | #[derive(Debug, Deserialize)] 69 | pub struct HandleOpts { 70 | pub twitter: Option, 71 | } 72 | 73 | #[derive(Debug, Deserialize)] 74 | pub struct UrlSearchOpts { 75 | pub url: String, 76 | } 77 | -------------------------------------------------------------------------------- /fuzzysearch-api/src/utils.rs: -------------------------------------------------------------------------------- 1 | use crate::types::*; 2 | use lazy_static::lazy_static; 3 | use prometheus::{register_int_counter_vec, IntCounterVec}; 4 | 5 | lazy_static! { 6 | pub static ref RATE_LIMIT_STATUS: IntCounterVec = register_int_counter_vec!( 7 | "fuzzysearch_api_rate_limit_count", 8 | "Number of allowed and rate limited requests", 9 | &["status"] 10 | ) 11 | .unwrap(); 12 | } 13 | 14 | #[macro_export] 15 | macro_rules! rate_limit { 16 | ($api_key:expr, $db:expr, $limit:tt, $group:expr) => { 17 | rate_limit!($api_key, $db, $limit, $group, 1) 18 | }; 19 | 20 | ($api_key:expr, $db:expr, $limit:tt, $group:expr, $incr_by:expr) => {{ 21 | let api_key = match crate::models::lookup_api_key($api_key, $db).await { 22 | Some(api_key) => api_key, 23 | None => return Ok(Box::new(Error::ApiKey)), 24 | }; 25 | 26 | let rate_limit = match crate::utils::update_rate_limit( 27 | $db, 28 | api_key.id, 29 | api_key.$limit, 30 | $group, 31 | $incr_by, 32 | ) 33 | .await 34 | { 35 | Ok(rate_limit) => rate_limit, 36 | Err(err) => return Ok(Box::new(Error::Postgres(err))), 37 | }; 38 | 39 | match rate_limit { 40 | crate::types::RateLimit::Limited => { 41 | crate::utils::RATE_LIMIT_STATUS 42 | .with_label_values(&["limited"]) 43 | .inc(); 44 | return Ok(Box::new(Error::RateLimit)); 45 | } 46 | crate::types::RateLimit::Available(count) => { 47 | crate::utils::RATE_LIMIT_STATUS 48 | .with_label_values(&["allowed"]) 49 | .inc(); 50 | count 51 | } 52 | } 53 | }}; 54 | } 55 | 56 | #[macro_export] 57 | macro_rules! early_return { 58 | ($val:expr) => { 59 | match $val { 60 | Ok(val) => val, 61 | Err(err) => return Ok(Box::new(Error::from(err))), 62 | } 63 | }; 64 | } 65 | 66 | /// Increment the rate limit for a group. 
67 | /// 68 | /// We need to specify the ID of the API key to increment, the key's limit for 69 | /// the specified group, the name of the group we're incrementing, and the 70 | /// amount to increment for this request. This should remain as 1 except for 71 | /// joined requests. 72 | #[tracing::instrument(skip(db))] 73 | pub async fn update_rate_limit( 74 | db: &sqlx::PgPool, 75 | key_id: i32, 76 | key_group_limit: i16, 77 | group_name: &'static str, 78 | incr_by: i16, 79 | ) -> Result { 80 | let now = chrono::Utc::now(); 81 | let timestamp = now.timestamp(); 82 | let time_window = timestamp - (timestamp % 60); 83 | 84 | let count: i16 = sqlx::query_scalar!( 85 | "INSERT INTO 86 | rate_limit (api_key_id, time_window, group_name, count) 87 | VALUES 88 | ($1, $2, $3, $4) 89 | ON CONFLICT ON CONSTRAINT unique_window 90 | DO UPDATE set count = rate_limit.count + $4 91 | RETURNING rate_limit.count", 92 | key_id, 93 | time_window, 94 | group_name, 95 | incr_by 96 | ) 97 | .fetch_one(db) 98 | .await?; 99 | 100 | if count > key_group_limit { 101 | Ok(RateLimit::Limited) 102 | } else { 103 | Ok(RateLimit::Available(( 104 | key_group_limit - count, 105 | key_group_limit, 106 | ))) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /fuzzysearch-common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-common" 3 | version = "0.1.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [features] 8 | default = ["trace", "download"] 9 | 10 | queue = ["faktory", "tokio", "serde_json"] 11 | trace = ["opentelemetry", "opentelemetry-jaeger", "tracing-opentelemetry", "opentelemetry-http", "hyper", "prometheus", "tokio", "reqwest"] 12 | download = ["tokio"] 13 | 14 | [dependencies] 15 | anyhow = "1" 16 | tracing = "0.1" 17 | tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "time"] } 18 | tracing-log = "0.1" 19 | 20 | tokio = { version = "1", features = ["rt", "fs"], optional = true } 21 | futures = "0.3" 22 | 23 | serde = { version = "1", features = ["derive"] } 24 | serde_json = { version = "1", optional = true } 25 | 26 | base64 = "0.13" 27 | image = "0.23" 28 | img_hash = "3" 29 | hex = "0.4" 30 | chrono = { version = "0.4", features = ["serde"] } 31 | 32 | tempfile = { version = "3", optional = true } 33 | 34 | faktory = { version = "0.11", optional = true } 35 | 36 | opentelemetry = { version = "0.17.0", features = ["rt-tokio"], optional = true } 37 | opentelemetry-jaeger = { version = "0.16", features = ["rt-tokio"], optional = true } 38 | tracing-opentelemetry = { version = "0.17", optional = true } 39 | opentelemetry-http = { version = "0.6", optional = true } 40 | 41 | hyper = { version = "0.14", features = ["server", "http2", "tcp"], optional = true } 42 | prometheus = { version = "0.13", optional = true } 43 | reqwest = { version = "0.11", optional = true } 44 | -------------------------------------------------------------------------------- /fuzzysearch-common/src/download.rs: -------------------------------------------------------------------------------- 1 | use tokio::io::AsyncWriteExt; 2 | 3 | pub async fn write_bytes(folder: &str, hash: &[u8], bytes: &[u8]) -> std::io::Result<()> { 4 | let hex_hash = hex::encode(&hash); 5 | tracing::debug!("writing {} to {}", hex_hash, folder); 6 | 7 | let hash_folder = std::path::PathBuf::from(folder) 8 | .join(&hex_hash[0..2]) 9 | .join(&hex_hash[2..4]); 10 | 11 | match 
tokio::fs::create_dir_all(&hash_folder).await { 12 | Ok(_) => (), 13 | Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => (), 14 | Err(err) => return Err(err), 15 | } 16 | 17 | let file_path = hash_folder.join(hex_hash); 18 | let mut file = match tokio::fs::File::create(file_path).await { 19 | Ok(file) => file, 20 | Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => return Ok(()), 21 | Err(err) => return Err(err), 22 | }; 23 | 24 | file.write_all(bytes).await?; 25 | 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /fuzzysearch-common/src/faktory.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::net::TcpStream; 3 | use std::sync::{Arc, Mutex}; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | 7 | /// A wrapper around Faktory, providing an async interface to common operations. 8 | #[derive(Clone)] 9 | pub struct FaktoryClient { 10 | faktory: Arc>>, 11 | } 12 | 13 | impl FaktoryClient { 14 | /// Connect to a Faktory instance. 15 | pub async fn connect>(host: H) -> anyhow::Result { 16 | let host = host.into(); 17 | 18 | let producer = tokio::task::spawn_blocking(move || { 19 | faktory::Producer::connect(Some(&host)) 20 | .map_err(|err| anyhow::format_err!("Unable to connect to Faktory: {:?}", err)) 21 | }) 22 | .await??; 23 | 24 | let faktory = Arc::new(Mutex::new(producer)); 25 | 26 | Ok(FaktoryClient { faktory }) 27 | } 28 | 29 | /// Enqueue a new job. 30 | #[tracing::instrument(err, skip(self))] 31 | pub async fn enqueue(&self, mut job: faktory::Job) -> anyhow::Result<()> { 32 | let faktory = self.faktory.clone(); 33 | 34 | tracing::trace!("Attempting to enqueue job"); 35 | job.custom = get_faktory_custom() 36 | .into_iter() 37 | .chain(job.custom.into_iter()) 38 | .collect(); 39 | 40 | tokio::task::spawn_blocking(move || { 41 | let mut faktory = faktory.lock().unwrap(); 42 | faktory 43 | .enqueue(job) 44 | .map_err(|err| anyhow::format_err!("Unable to enqueue job: {:?}", err)) 45 | }) 46 | .await??; 47 | 48 | tracing::debug!("Enqueued job"); 49 | 50 | Ok(()) 51 | } 52 | 53 | /// Create a new job for webhook data and enqueue it. 
54 | pub async fn queue_webhook(&self, data: WebHookData) -> anyhow::Result<()> { 55 | let value = serde_json::value::to_value(data)?; 56 | let mut job = 57 | faktory::Job::new("new_submission", vec![value]).on_queue("fuzzysearch_webhook"); 58 | job.retry = Some(3); 59 | job.reserve_for = Some(30); 60 | self.enqueue(job).await 61 | } 62 | } 63 | 64 | fn get_faktory_custom() -> HashMap { 65 | use opentelemetry::propagation::TextMapPropagator; 66 | use tracing_opentelemetry::OpenTelemetrySpanExt; 67 | 68 | let context = tracing::Span::current().context(); 69 | 70 | let mut extra: HashMap = Default::default(); 71 | let propagator = opentelemetry::sdk::propagation::TraceContextPropagator::new(); 72 | propagator.inject_context(&context, &mut extra); 73 | 74 | extra 75 | .into_iter() 76 | .filter_map(|(key, value)| match serde_json::to_value(value) { 77 | Ok(val) => Some((key, val)), 78 | _ => None, 79 | }) 80 | .collect() 81 | } 82 | 83 | #[derive(Clone, Debug, Deserialize, Serialize)] 84 | pub struct WebHookData { 85 | pub site: crate::types::Site, 86 | #[serde(with = "string")] 87 | pub site_id: i64, 88 | pub artist: String, 89 | pub file_url: String, 90 | #[serde(with = "b64_vec")] 91 | pub file_sha256: Option>, 92 | #[serde(with = "b64_u8")] 93 | pub hash: Option<[u8; 8]>, 94 | } 95 | 96 | mod b64_vec { 97 | use serde::Deserialize; 98 | 99 | pub fn serialize(bytes: &Option>, serializer: S) -> Result 100 | where 101 | S: serde::Serializer, 102 | { 103 | match bytes { 104 | Some(bytes) => serializer.serialize_str(&base64::encode(bytes)), 105 | None => serializer.serialize_none(), 106 | } 107 | } 108 | 109 | pub fn deserialize<'de, D>(deserializer: D) -> Result>, D::Error> 110 | where 111 | D: serde::Deserializer<'de>, 112 | { 113 | let val = >::deserialize(deserializer)? 114 | .map(base64::decode) 115 | .transpose() 116 | .map_err(serde::de::Error::custom)?; 117 | 118 | Ok(val) 119 | } 120 | } 121 | 122 | mod b64_u8 { 123 | use std::convert::TryInto; 124 | 125 | use serde::Deserialize; 126 | 127 | pub fn serialize( 128 | bytes: &Option<[u8; N]>, 129 | serializer: S, 130 | ) -> Result 131 | where 132 | S: serde::Serializer, 133 | { 134 | match bytes { 135 | Some(bytes) => serializer.serialize_str(&base64::encode(bytes)), 136 | None => serializer.serialize_none(), 137 | } 138 | } 139 | 140 | pub fn deserialize<'de, D, const N: usize>(deserializer: D) -> Result, D::Error> 141 | where 142 | D: serde::Deserializer<'de>, 143 | { 144 | let val = >::deserialize(deserializer)? 145 | .map(base64::decode) 146 | .transpose() 147 | .map_err(serde::de::Error::custom)? 148 | .map(|bytes| bytes.try_into()) 149 | .transpose() 150 | .map_err(|_err| "value did not have correct number of bytes") 151 | .map_err(serde::de::Error::custom)?; 152 | 153 | Ok(val) 154 | } 155 | } 156 | 157 | pub mod string { 158 | use std::fmt::Display; 159 | use std::str::FromStr; 160 | 161 | use serde::{de, Deserialize, Deserializer, Serializer}; 162 | 163 | pub fn serialize(value: &T, serializer: S) -> Result 164 | where 165 | T: Display, 166 | S: Serializer, 167 | { 168 | serializer.collect_str(value) 169 | } 170 | 171 | pub fn deserialize<'de, T, D>(deserializer: D) -> Result 172 | where 173 | T: FromStr, 174 | T::Err: Display, 175 | D: Deserializer<'de>, 176 | { 177 | String::deserialize(deserializer)? 
178 | .parse() 179 | .map_err(de::Error::custom) 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /fuzzysearch-common/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "queue")] 2 | pub mod faktory; 3 | pub mod types; 4 | 5 | #[cfg(feature = "trace")] 6 | pub mod trace; 7 | 8 | #[cfg(feature = "download")] 9 | pub mod download; 10 | 11 | /// Create an instance of img_hash with project defaults. 12 | pub fn get_hasher() -> img_hash::Hasher<[u8; 8]> { 13 | use img_hash::{HashAlg::Gradient, HasherConfig}; 14 | 15 | HasherConfig::with_bytes_type::<[u8; 8]>() 16 | .hash_alg(Gradient) 17 | .hash_size(8, 8) 18 | .preproc_dct() 19 | .to_hasher() 20 | } 21 | 22 | /// Initialize the logger. This should only be called by the running binary. 23 | pub fn init_logger() { 24 | if matches!(std::env::var("LOG_FMT").as_deref(), Ok("json")) { 25 | tracing_subscriber::fmt::Subscriber::builder() 26 | .json() 27 | .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) 28 | .with_timer(tracing_subscriber::fmt::time::UtcTime::rfc_3339()) 29 | .init(); 30 | } else { 31 | tracing_subscriber::fmt::init(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /fuzzysearch-common/src/trace.rs: -------------------------------------------------------------------------------- 1 | pub fn configure_tracing(service_name: &'static str) { 2 | use opentelemetry::KeyValue; 3 | use tracing_subscriber::layer::SubscriberExt; 4 | 5 | tracing_log::LogTracer::init().unwrap(); 6 | 7 | let env = std::env::var("ENVIRONMENT"); 8 | let env = if let Ok(env) = env.as_ref() { 9 | env.as_str() 10 | } else if cfg!(debug_assertions) { 11 | "debug" 12 | } else { 13 | "release" 14 | }; 15 | 16 | opentelemetry::global::set_text_map_propagator(opentelemetry_jaeger::Propagator::new()); 17 | 18 | let tracer = opentelemetry_jaeger::new_pipeline() 19 | .with_agent_endpoint(std::env::var("JAEGER_COLLECTOR").expect("Missing JAEGER_COLLECTOR")) 20 | .with_service_name(service_name) 21 | .with_tags(vec![ 22 | KeyValue::new("environment", env.to_owned()), 23 | KeyValue::new("version", env!("CARGO_PKG_VERSION")), 24 | ]) 25 | .install_batch(opentelemetry::runtime::Tokio) 26 | .unwrap(); 27 | 28 | let trace = tracing_opentelemetry::layer().with_tracer(tracer); 29 | let env_filter = tracing_subscriber::EnvFilter::from_default_env(); 30 | 31 | if matches!(std::env::var("LOG_FMT").as_deref(), Ok("json")) { 32 | let subscriber = tracing_subscriber::fmt::layer() 33 | .json() 34 | .with_timer(tracing_subscriber::fmt::time::UtcTime::rfc_3339()) 35 | .with_target(true); 36 | let subscriber = tracing_subscriber::Registry::default() 37 | .with(env_filter) 38 | .with(trace) 39 | .with(subscriber); 40 | tracing::subscriber::set_global_default(subscriber).unwrap(); 41 | } else { 42 | let subscriber = tracing_subscriber::fmt::layer(); 43 | let subscriber = tracing_subscriber::Registry::default() 44 | .with(env_filter) 45 | .with(trace) 46 | .with(subscriber); 47 | tracing::subscriber::set_global_default(subscriber).unwrap(); 48 | } 49 | 50 | tracing::debug!(service_name, "set application tracing service name"); 51 | } 52 | 53 | async fn metrics( 54 | req: hyper::Request, 55 | ) -> Result, std::convert::Infallible> { 56 | use hyper::{Body, Response, StatusCode}; 57 | 58 | match req.uri().path() { 59 | "/health" => Ok(Response::new(Body::from("OK"))), 60 | "/metrics" => { 61 | use prometheus::{Encoder, 
TextEncoder}; 62 | 63 | let mut buffer = Vec::new(); 64 | let encoder = TextEncoder::new(); 65 | 66 | let metric_families = prometheus::gather(); 67 | encoder.encode(&metric_families, &mut buffer).unwrap(); 68 | 69 | Ok(Response::new(Body::from(buffer))) 70 | } 71 | _ => { 72 | let mut not_found = Response::new(Body::default()); 73 | *not_found.status_mut() = StatusCode::NOT_FOUND; 74 | Ok(not_found) 75 | } 76 | } 77 | } 78 | 79 | pub async fn serve_metrics() { 80 | use hyper::{ 81 | server::Server, 82 | service::{make_service_fn, service_fn}, 83 | }; 84 | use std::convert::Infallible; 85 | use std::net::SocketAddr; 86 | 87 | let make_svc = make_service_fn(|_conn| async { Ok::<_, Infallible>(service_fn(metrics)) }); 88 | 89 | let addr: SocketAddr = std::env::var("METRICS_HOST") 90 | .expect("Missing METRICS_HOST") 91 | .parse() 92 | .expect("Invalid METRICS_HOST"); 93 | 94 | let server = Server::bind(&addr).serve(make_svc); 95 | 96 | tokio::spawn(async move { 97 | server.await.expect("Metrics server error"); 98 | }); 99 | } 100 | 101 | pub trait InjectContext { 102 | fn inject_context(self) -> Self; 103 | } 104 | 105 | impl InjectContext for reqwest::RequestBuilder { 106 | fn inject_context(self: reqwest::RequestBuilder) -> reqwest::RequestBuilder { 107 | use tracing_opentelemetry::OpenTelemetrySpanExt; 108 | 109 | let mut headers: reqwest::header::HeaderMap = Default::default(); 110 | 111 | let cx = tracing::Span::current().context(); 112 | opentelemetry::global::get_text_map_propagator(|propagator| { 113 | propagator.inject_context(&cx, &mut opentelemetry_http::HeaderInjector(&mut headers)) 114 | }); 115 | 116 | self.headers(headers) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /fuzzysearch-common/src/types.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] 4 | #[serde(rename_all = "lowercase")] 5 | pub enum Rating { 6 | General, 7 | Mature, 8 | Adult, 9 | } 10 | 11 | impl std::str::FromStr for Rating { 12 | type Err = &'static str; 13 | 14 | fn from_str(s: &str) -> Result { 15 | // Each site has their own system of content ratings... 16 | let rating = match s { 17 | "g" | "s" | "general" => Self::General, 18 | "m" | "q" | "mature" => Self::Mature, 19 | "a" | "e" | "adult" | "explicit" => Self::Adult, 20 | _ => return Err("unknown rating"), 21 | }; 22 | 23 | Ok(rating) 24 | } 25 | } 26 | 27 | /// A general type for every result in a search. 
28 | #[derive(Clone, Debug, Default, Deserialize, Serialize)]
29 | pub struct SearchResult {
30 |     pub site_id: i64,
31 |     pub site_id_str: String,
32 | 
33 |     pub url: String,
34 |     pub filename: String,
35 |     pub artists: Option<Vec<String>>,
36 |     pub rating: Option<Rating>,
37 |     pub posted_at: Option<chrono::DateTime<chrono::Utc>>,
38 | 
39 |     pub sha256: Option<String>,
40 | 
41 |     #[serde(skip_serializing_if = "Option::is_none")]
42 |     pub tags: Option<Vec<String>>,
43 | 
44 |     #[serde(skip_serializing_if = "Option::is_none")]
45 |     #[serde(flatten)]
46 |     pub site_info: Option<SiteInfo>,
47 | 
48 |     #[serde(skip_serializing_if = "Option::is_none")]
49 |     pub hash: Option<i64>,
50 |     #[serde(skip_serializing_if = "Option::is_none")]
51 |     pub distance: Option<u64>,
52 | 
53 |     #[serde(skip_serializing_if = "Option::is_none")]
54 |     pub searched_hash: Option<i64>,
55 | }
56 | 
57 | #[derive(Clone, Debug, Deserialize, Serialize)]
58 | #[serde(tag = "site", content = "site_info")]
59 | pub enum SiteInfo {
60 |     FurAffinity {
61 |         file_id: i32,
62 |     },
63 |     #[serde(rename = "e621")]
64 |     E621 {
65 |         sources: Option<Vec<String>>,
66 |     },
67 |     Twitter,
68 |     Weasyl,
69 | }
70 | 
71 | #[derive(Copy, Clone, Deserialize, Serialize, Debug)]
72 | pub enum Site {
73 |     FurAffinity,
74 |     E621,
75 |     Weasyl,
76 |     Twitter,
77 | }
78 | 
79 | impl std::fmt::Display for Site {
80 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81 |         match self {
82 |             Self::FurAffinity => write!(f, "FurAffinity"),
83 |             Self::E621 => write!(f, "e621"),
84 |             Self::Weasyl => write!(f, "Weasyl"),
85 |             Self::Twitter => write!(f, "Twitter"),
86 |         }
87 |     }
88 | }
89 | 
--------------------------------------------------------------------------------
/fuzzysearch-hash-input/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "fuzzysearch-hash-input"
3 | version = "0.1.0"
4 | authors = ["Syfaro "]
5 | edition = "2018"
6 | 
7 | [dependencies]
8 | tracing = "0.1"
9 | anyhow = "1"
10 | 
11 | tokio = { version = "1", features = ["full"] }
12 | tokio-stream = "0.1"
13 | 
14 | tempfile = "3"
15 | image = "0.23"
16 | 
17 | actix-web = "4"
18 | actix-http = "3"
19 | actix-multipart = "0.4"
20 | tracing-actix-web = { version = "0.5", features = ["opentelemetry_0_17"] }
21 | 
22 | lazy_static = "1"
23 | prometheus = { version = "0.13", features = ["process"] }
24 | 
25 | fuzzysearch-common = { path = "../fuzzysearch-common" }
26 | 
--------------------------------------------------------------------------------
/fuzzysearch-hash-input/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:24.04
2 | RUN apt-get update -y && apt-get install -y openssl ca-certificates && rm -rf /var/lib/apt/lists/*
3 | COPY ./fuzzysearch-hash-input/fuzzysearch-hash-input /bin/fuzzysearch-hash-input
4 | CMD ["/bin/fuzzysearch-hash-input"]
5 | 
--------------------------------------------------------------------------------
/fuzzysearch-hash-input/src/main.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 |     convert::TryInto,
3 |     io::{BufReader, SeekFrom},
4 | };
5 | 
6 | use actix_web::{post, web::Data, App, HttpRequest, HttpResponse, HttpServer, Responder};
7 | use tempfile::tempfile;
8 | use tokio::{
9 |     io::{AsyncSeekExt, AsyncWriteExt},
10 |     sync::Semaphore,
11 | };
12 | use tokio_stream::StreamExt;
13 | 
14 | lazy_static::lazy_static!
{ 15 | static ref IMAGE_LOADING_DURATION: prometheus::Histogram = 16 | prometheus::register_histogram!("fuzzysearch_image_image_loading_seconds", "Duration to download and save image").unwrap(); 17 | static ref IMAGE_DECODING_DURATION: prometheus::Histogram = 18 | prometheus::register_histogram!("fuzzysearch_image_image_decoding_seconds", "Duration to decode image data").unwrap(); 19 | static ref IMAGE_HASHING_DURATION: prometheus::Histogram = 20 | prometheus::register_histogram!("fuzzysearch_image_image_hashing_seconds", "Duration to hash image").unwrap(); 21 | } 22 | 23 | enum ImageResponse { 24 | Hash(i64), 25 | Error(anyhow::Error), 26 | } 27 | 28 | impl Responder for ImageResponse { 29 | type Body = actix_web::body::BoxBody; 30 | 31 | fn respond_to(self, _req: &HttpRequest) -> HttpResponse { 32 | match self { 33 | ImageResponse::Hash(hash) => HttpResponse::Ok() 34 | .content_type("text/plain") 35 | .body(hash.to_string()), 36 | ImageResponse::Error(error) => HttpResponse::BadRequest() 37 | .content_type("text/plain") 38 | .body(error.to_string()), 39 | } 40 | } 41 | } 42 | 43 | #[tracing::instrument(err, skip(field, semaphore))] 44 | async fn process_image( 45 | mut field: actix_multipart::Field, 46 | semaphore: Data, 47 | ) -> anyhow::Result { 48 | tracing::debug!("creating temp file"); 49 | 50 | let loading_duration = IMAGE_LOADING_DURATION.start_timer(); 51 | let mut file = 52 | tokio::task::spawn_blocking(move || -> anyhow::Result { 53 | let file = tempfile()?; 54 | Ok(tokio::fs::File::from_std(file)) 55 | }) 56 | .await??; 57 | 58 | tracing::debug!("writing contents to temp file"); 59 | let mut size = 0; 60 | while let Ok(Some(chunk)) = field.try_next().await { 61 | file.write_all(&chunk).await?; 62 | size += chunk.len(); 63 | } 64 | tracing::debug!("file was {} bytes", size); 65 | 66 | tracing::debug!("returning file to beginning"); 67 | file.seek(SeekFrom::Start(0)).await?; 68 | let file = file.into_std().await; 69 | loading_duration.stop_and_record(); 70 | 71 | tracing::debug!("getting semaphore permit"); 72 | let _permit = semaphore.acquire().await?; 73 | 74 | tracing::debug!("decoding and hashing image"); 75 | let hash = tokio::task::spawn_blocking(move || -> anyhow::Result { 76 | let decoding_duration = IMAGE_DECODING_DURATION.start_timer(); 77 | let reader = BufReader::new(file); 78 | let reader = image::io::Reader::new(reader).with_guessed_format()?; 79 | let im = reader.decode()?; 80 | decoding_duration.stop_and_record(); 81 | 82 | let hashing_duration = IMAGE_HASHING_DURATION.start_timer(); 83 | let image_hash = fuzzysearch_common::get_hasher().hash_image(&im); 84 | let hash: [u8; 8] = image_hash.as_bytes().try_into()?; 85 | let hash = i64::from_be_bytes(hash); 86 | hashing_duration.stop_and_record(); 87 | 88 | Ok(hash) 89 | }) 90 | .await??; 91 | 92 | tracing::debug!("calculated image hash: {}", hash); 93 | Ok(hash) 94 | } 95 | 96 | #[post("/image")] 97 | async fn post_image( 98 | mut form: actix_multipart::Multipart, 99 | semaphore: Data, 100 | ) -> impl Responder { 101 | while let Ok(Some(field)) = form.try_next().await { 102 | tracing::debug!("got multipart field: {:?}", field); 103 | 104 | if !matches!(field.content_disposition().get_name(), Some("image")) { 105 | continue; 106 | } 107 | 108 | match process_image(field, semaphore).await { 109 | Ok(hash) => return ImageResponse::Hash(hash), 110 | Err(err) => return ImageResponse::Error(err), 111 | } 112 | } 113 | 114 | ImageResponse::Error(anyhow::anyhow!("missing image field")) 115 | } 116 | 117 | 
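// Usage sketch (illustrative): the handler above expects a multipart field named
// "image" and replies with the i64 hash as plain text, or a 400 response carrying
// the error message if decoding fails. Assuming the service is running on the bind
// address used in main() below, a request could look like:
//
//     curl -F "image=@path/to/image.png" http://localhost:8090/image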
#[actix_web::main] 118 | async fn main() { 119 | fuzzysearch_common::trace::configure_tracing("fuzzysearch-image"); 120 | fuzzysearch_common::trace::serve_metrics().await; 121 | 122 | let semaphore = Data::new(Semaphore::new(4)); 123 | 124 | HttpServer::new(move || { 125 | App::new() 126 | .wrap(tracing_actix_web::TracingLogger::default()) 127 | .app_data(semaphore.clone()) 128 | .service(post_image) 129 | }) 130 | .workers(2) 131 | .bind("0.0.0.0:8090") 132 | .unwrap() 133 | .run() 134 | .await 135 | .unwrap(); 136 | } 137 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-e621/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-ingest-e621" 3 | version = "0.1.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | tokio = { version = "1", features = ["full"] } 9 | 10 | hyper = { version = "0.14", features = ["server"] } 11 | reqwest = { version = "0.11", features = ["json"] } 12 | 13 | serde = "1" 14 | serde_json = "1" 15 | 16 | sqlx = { version = "0.5", features = ["runtime-tokio-native-tls", "postgres", "macros", "json", "offline"] } 17 | 18 | image = "0.23" 19 | img_hash = "3" 20 | sha2 = "0.10" 21 | 22 | tracing = "0.1" 23 | tracing-unwrap = "0.9" 24 | 25 | anyhow = "1" 26 | 27 | lazy_static = "1" 28 | prometheus = { version = "0.13", features = ["process"] } 29 | 30 | fuzzysearch-common = { path = "../fuzzysearch-common", features = ["queue"] } 31 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-e621/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | EXPOSE 8080 3 | ENV METRICS_HOST=0.0.0.0:8080 4 | RUN apt-get update -y && apt-get install -y openssl ca-certificates && rm -rf /var/lib/apt/lists/* 5 | COPY ./fuzzysearch-ingest-e621/fuzzysearch-ingest-e621 /bin/fuzzysearch-ingest-e621 6 | CMD ["/bin/fuzzysearch-ingest-e621"] 7 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-e621/sqlx-data.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": "PostgreSQL", 3 | "02b98e35cf7d650413c2730df732d7ae08119b11a5b2aaddcee08a7f06338924": { 4 | "query": "SELECT max(id) max FROM e621", 5 | "describe": { 6 | "columns": [ 7 | { 8 | "ordinal": 0, 9 | "name": "max", 10 | "type_info": "Int4" 11 | } 12 | ], 13 | "parameters": { 14 | "Left": [] 15 | }, 16 | "nullable": [ 17 | null 18 | ] 19 | } 20 | }, 21 | "a054594f7844f32e5968a54c0dab59716149a10411fcb16184a9070a82bb287d": { 22 | "query": "INSERT INTO e621\n (id, data, hash, hash_error, sha256) VALUES\n ($1, $2, $3, $4, $5)\n ON CONFLICT (id) DO UPDATE SET\n data = EXCLUDED.data,\n hash = EXCLUDED.hash,\n hash_error = EXCLUDED.hash_error,\n sha256 = EXCLUDED.sha256", 23 | "describe": { 24 | "columns": [], 25 | "parameters": { 26 | "Left": [ 27 | "Int4", 28 | "Jsonb", 29 | "Int8", 30 | "Text", 31 | "Bytea" 32 | ] 33 | }, 34 | "nullable": [] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /fuzzysearch-ingest-e621/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | use lazy_static::lazy_static; 3 | use prometheus::{ 4 | register_histogram, register_int_gauge, Histogram, HistogramOpts, IntGauge, Opts, 5 | }; 6 | use sqlx::Connection; 7 | use 
tracing_unwrap::ResultExt; 8 | 9 | use fuzzysearch_common::faktory::FaktoryClient; 10 | 11 | static USER_AGENT: &str = "e621-watcher / FuzzySearch Ingester / Syfaro "; 12 | 13 | lazy_static! { 14 | static ref SUBMISSION_BACKLOG: IntGauge = register_int_gauge!(Opts::new( 15 | "fuzzysearch_watcher_submission_backlog", 16 | "Number of submissions behind the latest ID" 17 | ) 18 | .const_label("site", "e621")) 19 | .unwrap_or_log(); 20 | static ref INDEX_DURATION: Histogram = register_histogram!(HistogramOpts::new( 21 | "fuzzysearch_watcher_index_duration_seconds", 22 | "Duration to load an index of submissions" 23 | ) 24 | .const_label("site", "e621")) 25 | .unwrap_or_log(); 26 | static ref SUBMISSION_DURATION: Histogram = register_histogram!(HistogramOpts::new( 27 | "fuzzysearch_watcher_submission_duration_seconds", 28 | "Duration to load an index of submissions" 29 | ) 30 | .const_label("site", "e621")) 31 | .unwrap_or_log(); 32 | } 33 | 34 | type Auth = (String, Option); 35 | 36 | #[tokio::main] 37 | async fn main() -> anyhow::Result<()> { 38 | fuzzysearch_common::trace::configure_tracing("fuzzysearch-ingest-e621"); 39 | fuzzysearch_common::trace::serve_metrics().await; 40 | 41 | let login = std::env::var("E621_LOGIN").expect_or_log("Missing E621_LOGIN"); 42 | let api_key = std::env::var("E621_API_KEY").expect_or_log("Missing E621_API_KEY"); 43 | let auth = (login, Some(api_key)); 44 | 45 | let download_folder = std::env::var("DOWNLOAD_FOLDER").ok(); 46 | 47 | let client = reqwest::ClientBuilder::default() 48 | .user_agent(USER_AGENT) 49 | .build()?; 50 | 51 | let mut conn = sqlx::PgConnection::connect( 52 | &std::env::var("DATABASE_URL").expect_or_log("Missing DATABASE_URL"), 53 | ) 54 | .await?; 55 | 56 | let faktory_dsn = std::env::var("FAKTORY_URL").expect_or_log("Missing FAKTORY_URL"); 57 | let faktory = FaktoryClient::connect(faktory_dsn) 58 | .await 59 | .expect_or_log("Unable to connect to Faktory"); 60 | 61 | let max_id: i32 = sqlx::query!("SELECT max(id) max FROM e621") 62 | .fetch_one(&mut conn) 63 | .await? 
64 | .max 65 | .unwrap_or(0); 66 | 67 | tracing::info!(max_id, "Found maximum ID in database"); 68 | 69 | let mut now; 70 | let mut min_id = max_id; 71 | 72 | let mut latest_id: Option = None; 73 | 74 | loop { 75 | now = std::time::Instant::now(); 76 | 77 | let lid = match latest_id { 78 | Some(latest_id) => latest_id, 79 | None => { 80 | let _hist = INDEX_DURATION.start_timer(); 81 | let lid = get_latest_id(&client, &auth) 82 | .await 83 | .expect_or_log("Unable to get latest ID"); 84 | drop(_hist); 85 | 86 | latest_id = Some(lid); 87 | 88 | lid 89 | } 90 | }; 91 | 92 | let _hist = INDEX_DURATION.start_timer(); 93 | let page = load_page(&client, &auth, min_id).await?; 94 | drop(_hist); 95 | 96 | let posts = get_page_posts(&page)?; 97 | let post_ids = get_post_ids(posts); 98 | 99 | tracing::trace!(?post_ids, "Collected posts"); 100 | 101 | min_id = match post_ids.iter().max() { 102 | Some(id) => *id, 103 | None => { 104 | tracing::info!("Found no new posts, sleeping"); 105 | tokio::time::sleep(std::time::Duration::from_secs(60 * 5)).await; 106 | continue; 107 | } 108 | }; 109 | 110 | SUBMISSION_BACKLOG.set((lid - min_id).into()); 111 | 112 | let mut tx = conn.begin().await?; 113 | 114 | for post in posts { 115 | let _hist = SUBMISSION_DURATION.start_timer(); 116 | insert_submission(&mut tx, &faktory, &client, post, &download_folder).await?; 117 | drop(_hist); 118 | 119 | SUBMISSION_BACKLOG.sub(1); 120 | } 121 | 122 | tx.commit().await?; 123 | 124 | let elapsed = now.elapsed().as_millis() as u64; 125 | if post_ids.contains(&lid) { 126 | tracing::info!(lid, "Page contained latest ID, sleeping"); 127 | tokio::time::sleep(std::time::Duration::from_secs(60 * 5)).await; 128 | 129 | latest_id = None; 130 | } else if elapsed < 1000 { 131 | let delay = 1000 - elapsed; 132 | tracing::info!(delay, "Delaying before next request"); 133 | tokio::time::sleep(std::time::Duration::from_millis(delay)).await; 134 | } 135 | } 136 | } 137 | 138 | fn get_page_posts(page: &serde_json::Value) -> anyhow::Result<&Vec> { 139 | let page = match page { 140 | serde_json::Value::Object(ref obj) => obj, 141 | _ => return Err(anyhow::anyhow!("Top level object was not an object")), 142 | }; 143 | 144 | let posts = page 145 | .get("posts") 146 | .context("Page did not contain posts object")? 147 | .as_array() 148 | .context("Posts was not an array")?; 149 | 150 | Ok(posts) 151 | } 152 | 153 | fn get_post_ids(posts: &[serde_json::Value]) -> Vec { 154 | let ids: Vec = posts 155 | .iter() 156 | .filter_map(|post| { 157 | let post = match post { 158 | serde_json::Value::Object(post) => post, 159 | _ => return None, 160 | }; 161 | 162 | let id = match post.get("id")? { 163 | serde_json::Value::Number(num) => num.as_i64()? as i32, 164 | _ => return None, 165 | }; 166 | 167 | Some(id) 168 | }) 169 | .collect(); 170 | 171 | ids 172 | } 173 | 174 | #[tracing::instrument(err, skip(client, auth))] 175 | async fn get_latest_id(client: &reqwest::Client, auth: &Auth) -> anyhow::Result { 176 | tracing::debug!("Looking up current highest ID"); 177 | 178 | let query = vec![("limit", "1")]; 179 | 180 | let page: serde_json::Value = client 181 | .get("https://e621.net/posts.json") 182 | .query(&query) 183 | .basic_auth(&auth.0, auth.1.as_ref()) 184 | .send() 185 | .await? 
186 |         .json()
187 |         .await?;
188 | 
189 |     let posts = get_page_posts(&page)?;
190 | 
191 |     let id = get_post_ids(posts)
192 |         .into_iter()
193 |         .max()
194 |         .context("Page had no IDs")?;
195 | 
196 |     tracing::info!(id, "Found maximum ID");
197 | 
198 |     Ok(id)
199 | }
200 | 
201 | #[tracing::instrument(err, skip(client, auth))]
202 | async fn load_page(
203 |     client: &reqwest::Client,
204 |     auth: &Auth,
205 |     after_id: i32,
206 | ) -> anyhow::Result<serde_json::Value> {
207 |     tracing::debug!("Attempting to load page");
208 | 
209 |     let query = vec![
210 |         ("limit", "320".to_string()),
211 |         ("page", format!("a{}", after_id)),
212 |     ];
213 | 
214 |     let body = client
215 |         .get("https://e621.net/posts.json")
216 |         .query(&query)
217 |         .basic_auth(&auth.0, auth.1.as_ref())
218 |         .send()
219 |         .await?
220 |         .json()
221 |         .await?;
222 | 
223 |     Ok(body)
224 | }
225 | 
226 | struct ImageData {
227 |     hash: Option<i64>,
228 |     hash_error: Option<String>,
229 |     sha256: Option<Vec<u8>>,
230 |     bytes: Option<Vec<u8>>,
231 | }
232 | 
233 | #[tracing::instrument(err, skip(conn, faktory, client, post, download_folder), fields(id))]
234 | async fn insert_submission(
235 |     conn: &mut sqlx::Transaction<'_, sqlx::Postgres>,
236 |     faktory: &FaktoryClient,
237 |     client: &reqwest::Client,
238 |     post: &serde_json::Value,
239 |     download_folder: &Option<String>,
240 | ) -> anyhow::Result<()> {
241 |     let id = post
242 |         .get("id")
243 |         .context("Post was missing ID")?
244 |         .as_i64()
245 |         .context("Post ID was not number")? as i32;
246 | 
247 |     tracing::Span::current().record("id", &id);
248 |     tracing::debug!("Inserting submission");
249 | 
250 |     tracing::trace!(?post, "Evaluating post");
251 | 
252 |     let ImageData {
253 |         hash,
254 |         hash_error,
255 |         sha256,
256 |         ..
257 |     } = if let Some((url, ext)) = get_post_url_ext(post) {
258 |         let ImageData {
259 |             hash,
260 |             hash_error,
261 |             sha256,
262 |             bytes,
263 |         } = if url != "/images/deleted-preview.png" && (ext == "jpg" || ext == "png") {
264 |             load_image(client, url).await?
265 | } else { 266 | tracing::debug!("Ignoring post as it is deleted or not a supported image format"); 267 | 268 | ImageData { 269 | hash: None, 270 | hash_error: None, 271 | sha256: None, 272 | bytes: None, 273 | } 274 | }; 275 | 276 | if let (Some(folder), Some(sha256), Some(bytes)) = (download_folder, &sha256, &bytes) { 277 | if let Err(err) = fuzzysearch_common::download::write_bytes(folder, sha256, bytes).await 278 | { 279 | tracing::error!("Could not download file: {:?}", err); 280 | } 281 | } 282 | 283 | let artist = post 284 | .as_object() 285 | .and_then(|post| post.get("tags")) 286 | .and_then(|tags| tags.get("artist")) 287 | .and_then(|artist| artist.as_array()) 288 | .map(|artists| { 289 | artists 290 | .iter() 291 | .filter_map(|artist| artist.as_str()) 292 | .collect::>() 293 | .join(", ") 294 | }) 295 | .unwrap_or_default(); 296 | 297 | faktory 298 | .queue_webhook(fuzzysearch_common::faktory::WebHookData { 299 | site: fuzzysearch_common::types::Site::E621, 300 | site_id: id as i64, 301 | artist, 302 | file_url: url.to_owned(), 303 | file_sha256: sha256.clone(), 304 | hash: hash.map(|hash| hash.to_be_bytes()), 305 | }) 306 | .await?; 307 | 308 | ImageData { 309 | hash, 310 | hash_error, 311 | sha256, 312 | bytes, 313 | } 314 | } else { 315 | tracing::warn!("Post had missing URL or extension"); 316 | 317 | ImageData { 318 | hash: None, 319 | hash_error: None, 320 | sha256: None, 321 | bytes: None, 322 | } 323 | }; 324 | 325 | sqlx::query!( 326 | "INSERT INTO e621 327 | (id, data, hash, hash_error, sha256) VALUES 328 | ($1, $2, $3, $4, $5) 329 | ON CONFLICT (id) DO UPDATE SET 330 | data = EXCLUDED.data, 331 | hash = EXCLUDED.hash, 332 | hash_error = EXCLUDED.hash_error, 333 | sha256 = EXCLUDED.sha256", 334 | id, 335 | post, 336 | hash, 337 | hash_error, 338 | sha256 339 | ) 340 | .execute(conn) 341 | .await?; 342 | 343 | tracing::info!("Completed submission"); 344 | 345 | Ok(()) 346 | } 347 | 348 | fn get_post_url_ext(post: &serde_json::Value) -> Option<(&str, &str)> { 349 | let file = post.as_object()?.get("file")?.as_object()?; 350 | 351 | let url = file.get("url")?.as_str()?; 352 | let ext = file.get("ext")?.as_str()?; 353 | 354 | Some((url, ext)) 355 | } 356 | 357 | #[tracing::instrument(err, skip(client))] 358 | async fn load_image(client: &reqwest::Client, url: &str) -> anyhow::Result { 359 | use sha2::{Digest, Sha256}; 360 | use std::convert::TryInto; 361 | 362 | let bytes = client.get(url).send().await?.bytes().await?.to_vec(); 363 | 364 | tracing::trace!(len = bytes.len(), "Got submission image bytes"); 365 | 366 | let mut hasher = Sha256::new(); 367 | hasher.update(&bytes); 368 | let result = hasher.finalize().to_vec(); 369 | 370 | tracing::trace!(?result, "Calculated image SHA256"); 371 | 372 | let hasher = fuzzysearch_common::get_hasher(); 373 | let img = match image::load_from_memory(&bytes) { 374 | Ok(img) => img, 375 | Err(err) => { 376 | tracing::error!(?err, "Unable to open image"); 377 | return Ok(ImageData { 378 | hash: None, 379 | hash_error: Some(err.to_string()), 380 | sha256: Some(result), 381 | bytes: Some(bytes), 382 | }); 383 | } 384 | }; 385 | 386 | tracing::trace!("Opened image successfully"); 387 | 388 | let hash = hasher.hash_image(&img); 389 | let hash: [u8; 8] = hash.as_bytes().try_into()?; 390 | let hash = i64::from_be_bytes(hash); 391 | 392 | tracing::trace!(?hash, "Calculated image hash"); 393 | 394 | Ok(ImageData { 395 | hash: Some(hash), 396 | hash_error: None, 397 | sha256: Some(result), 398 | bytes: Some(bytes), 399 | }) 400 | } 401 | 
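The ingesters all reduce the 8-byte gradient hash to a signed 64-bit integer before storing it or queueing it for the webhook worker. A minimal sketch of that conversion (the helper name is illustrative; only get_hasher() and the byte order come from the code above):

    use std::convert::TryInto;

    /// Hash an image the same way the ingesters do, returning the i64 stored in Postgres.
    fn perceptual_hash(img: &image::DynamicImage) -> anyhow::Result<i64> {
        let hash = fuzzysearch_common::get_hasher().hash_image(img);
        let bytes: [u8; 8] = hash.as_bytes().try_into()?; // 8-byte Gradient hash
        Ok(i64::from_be_bytes(bytes)) // big-endian, matching load_image()
    }

The webhook payload carries the same value as raw bytes via hash.to_be_bytes(), so a consumer can rebuild the i64 with i64::from_be_bytes.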
-------------------------------------------------------------------------------- /fuzzysearch-ingest-furaffinity/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk -------------------------------------------------------------------------------- /fuzzysearch-ingest-furaffinity/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-ingest-furaffinity" 3 | version = "0.1.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | reqwest = "0.11" 9 | tokio = { version = "1", features = ["full"] } 10 | tokio-postgres = { version = "0.7.0", features = ["with-chrono-0_4"] } 11 | chrono = "0.4" 12 | hyper = { version = "0.14", features = ["server"] } 13 | prometheus = { version = "0.13", features = ["process"] } 14 | lazy_static = "1" 15 | futures-retry = "0.6" 16 | tracing = "0.1" 17 | tracing-unwrap = "0.9" 18 | faktory = "0.11" 19 | anyhow = "1" 20 | serde = { version = "1", features = ["derive"] } 21 | serde_json = "1" 22 | fuzzysearch-common = { path = "../fuzzysearch-common", features = ["queue"] } 23 | furaffinity-rs = { git = "https://github.com/Syfaro/furaffinity-rs" } 24 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-furaffinity/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN apt-get update -y && \ 3 | apt-get install -y openssl ca-certificates && \ 4 | rm -rf /var/lib/apt/lists/* 5 | COPY ./fuzzysearch-ingest-furaffinity/fuzzysearch-ingest-furaffinity /bin/fuzzysearch-ingest-furaffinity 6 | CMD ["/bin/fuzzysearch-ingest-furaffinity"] 7 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-furaffinity/src/main.rs: -------------------------------------------------------------------------------- 1 | use lazy_static::lazy_static; 2 | use prometheus::{ 3 | register_counter, register_histogram, register_int_gauge_vec, Counter, Histogram, 4 | HistogramOpts, IntGaugeVec, Opts, 5 | }; 6 | use tokio_postgres::Client; 7 | use tracing_unwrap::{OptionExt, ResultExt}; 8 | 9 | use fuzzysearch_common::faktory::FaktoryClient; 10 | 11 | lazy_static! 
{ 12 | static ref INDEX_DURATION: Histogram = register_histogram!(HistogramOpts::new( 13 | "fuzzysearch_watcher_index_duration_seconds", 14 | "Duration to load an index of submissions" 15 | ) 16 | .const_label("site", "furaffinity")) 17 | .unwrap_or_log(); 18 | static ref SUBMISSION_DURATION: Histogram = register_histogram!(HistogramOpts::new( 19 | "fuzzysearch_watcher_submission_duration_seconds", 20 | "Duration to load an index of submissions" 21 | ) 22 | .const_label("site", "furaffinity")) 23 | .unwrap_or_log(); 24 | static ref SUBMISSION_MISSING: Counter = register_counter!(Opts::new( 25 | "fuzzysearch_watcher_submission_missing_total", 26 | "Number of submissions that were missing" 27 | ) 28 | .const_label("site", "furaffinity")) 29 | .unwrap_or_log(); 30 | static ref USERS_ONLINE: IntGaugeVec = register_int_gauge_vec!( 31 | Opts::new( 32 | "fuzzysearch_watcher_users_online", 33 | "Number of users online for each category" 34 | ) 35 | .const_label("site", "furaffinity"), 36 | &["group"] 37 | ) 38 | .unwrap_or_log(); 39 | } 40 | 41 | async fn lookup_tag(client: &Client, tag: &str) -> i32 { 42 | if let Some(row) = client 43 | .query("SELECT id FROM tag WHERE name = $1", &[&tag]) 44 | .await 45 | .unwrap_or_log() 46 | .into_iter() 47 | .next() 48 | { 49 | return row.get("id"); 50 | } 51 | 52 | client 53 | .query("INSERT INTO tag (name) VALUES ($1) RETURNING id", &[&tag]) 54 | .await 55 | .unwrap_or_log() 56 | .into_iter() 57 | .next() 58 | .unwrap_or_log() 59 | .get("id") 60 | } 61 | 62 | async fn lookup_artist(client: &Client, artist: &str) -> i32 { 63 | if let Some(row) = client 64 | .query("SELECT id FROM artist WHERE name = $1", &[&artist]) 65 | .await 66 | .unwrap_or_log() 67 | .into_iter() 68 | .next() 69 | { 70 | return row.get("id"); 71 | } 72 | 73 | client 74 | .query( 75 | "INSERT INTO artist (name) VALUES ($1) RETURNING id", 76 | &[&artist], 77 | ) 78 | .await 79 | .unwrap_or_log() 80 | .into_iter() 81 | .next() 82 | .unwrap_or_log() 83 | .get("id") 84 | } 85 | 86 | async fn has_submission(client: &Client, id: i32) -> bool { 87 | client 88 | .query("SELECT id FROM submission WHERE id = $1", &[&id]) 89 | .await 90 | .unwrap_or_log() 91 | .into_iter() 92 | .next() 93 | .is_some() 94 | } 95 | 96 | async fn ids_to_check(client: &Client, max: i32) -> Vec { 97 | let rows = client.query("SELECT sid FROM generate_series((SELECT max(id) FROM submission), $1::int) sid WHERE sid NOT IN (SELECT id FROM submission where id = sid)", &[&max]).await.unwrap_or_log(); 98 | 99 | rows.iter().map(|row| row.get("sid")).collect() 100 | } 101 | 102 | async fn insert_submission( 103 | client: &Client, 104 | sub: &furaffinity_rs::Submission, 105 | ) -> Result<(), tokio_postgres::Error> { 106 | let artist_id = lookup_artist(client, &sub.artist).await; 107 | let mut tag_ids = Vec::with_capacity(sub.tags.len()); 108 | for tag in &sub.tags { 109 | tag_ids.push(lookup_tag(client, tag).await); 110 | } 111 | 112 | let hash = sub.hash.clone(); 113 | let url = sub.content.url(); 114 | 115 | let size = sub.file_size.map(|size| size as i32); 116 | 117 | client.execute("INSERT INTO submission (id, artist_id, url, filename, hash, rating, posted_at, description, hash_int, file_id, file_size, file_sha256) VALUES ($1, $2, $3, $4, decode($5, 'base64'), $6, $7, $8, $9, CASE WHEN isnumeric(split_part($4, '.', 1)) THEN split_part($4, '.', 1)::int ELSE null END, $10, $11)", &[ 118 | &sub.id, &artist_id, &url, &sub.filename, &hash, &sub.rating.serialize(), &sub.posted_at, &sub.description, &sub.hash_num, &size, 
&sub.file_sha256, 119 | ]).await?; 120 | 121 | let stmt = client 122 | .prepare("INSERT INTO tag_to_post (tag_id, post_id) VALUES ($1, $2) ON CONFLICT DO NOTHING") 123 | .await?; 124 | 125 | for tag_id in tag_ids { 126 | client.execute(&stmt, &[&tag_id, &sub.id]).await?; 127 | } 128 | 129 | Ok(()) 130 | } 131 | 132 | async fn insert_null_submission(client: &Client, id: i32) -> Result { 133 | client 134 | .execute("INSERT INTO SUBMISSION (id) VALUES ($1)", &[&id]) 135 | .await 136 | } 137 | 138 | struct RetryHandler { 139 | max_attempts: usize, 140 | } 141 | 142 | impl RetryHandler { 143 | fn new(max_attempts: usize) -> Self { 144 | Self { max_attempts } 145 | } 146 | } 147 | 148 | impl futures_retry::ErrorHandler for RetryHandler { 149 | type OutError = furaffinity_rs::Error; 150 | 151 | #[tracing::instrument(skip(self), fields(max_attempts = self.max_attempts))] 152 | fn handle( 153 | &mut self, 154 | attempt: usize, 155 | err: furaffinity_rs::Error, 156 | ) -> futures_retry::RetryPolicy { 157 | tracing::warn!("Attempt failed"); 158 | 159 | if attempt >= self.max_attempts { 160 | tracing::error!("All attempts have been used"); 161 | return futures_retry::RetryPolicy::ForwardError(err); 162 | } 163 | 164 | if !err.retry { 165 | tracing::error!("Error did not ask for retry"); 166 | return futures_retry::RetryPolicy::ForwardError(err); 167 | } 168 | 169 | futures_retry::RetryPolicy::WaitRetry(std::time::Duration::from_secs(1 + attempt as u64)) 170 | } 171 | } 172 | 173 | #[tracing::instrument(skip(client, fa, faktory, download_folder))] 174 | async fn process_submission( 175 | client: &Client, 176 | fa: &furaffinity_rs::FurAffinity, 177 | faktory: &FaktoryClient, 178 | id: i32, 179 | download_folder: &Option, 180 | ) { 181 | if has_submission(client, id).await { 182 | return; 183 | } 184 | 185 | tracing::info!("Loading submission"); 186 | 187 | let _timer = SUBMISSION_DURATION.start_timer(); 188 | 189 | let sub = futures_retry::FutureRetry::new(|| fa.get_submission(id), RetryHandler::new(3)) 190 | .await 191 | .map(|(sub, _attempts)| sub) 192 | .map_err(|(err, _attempts)| err); 193 | 194 | let sub = match sub { 195 | Ok(sub) => sub, 196 | Err(err) => { 197 | tracing::error!("Failed to load submission: {:?}", err); 198 | _timer.stop_and_discard(); 199 | SUBMISSION_MISSING.inc(); 200 | insert_null_submission(client, id).await.unwrap_or_log(); 201 | return; 202 | } 203 | }; 204 | 205 | let sub = match sub { 206 | Some(sub) => sub, 207 | None => { 208 | tracing::warn!("Submission did not exist"); 209 | _timer.stop_and_discard(); 210 | SUBMISSION_MISSING.inc(); 211 | insert_null_submission(client, id).await.unwrap_or_log(); 212 | return; 213 | } 214 | }; 215 | 216 | let image = 217 | futures_retry::FutureRetry::new(|| fa.calc_image_hash(sub.clone()), RetryHandler::new(3)) 218 | .await 219 | .map(|(sub, _attempt)| sub) 220 | .map_err(|(err, _attempt)| err); 221 | 222 | let sub = match image { 223 | Ok(sub) => sub, 224 | Err(err) => { 225 | tracing::error!("Unable to hash submission image: {:?}", err); 226 | sub 227 | } 228 | }; 229 | 230 | if let (Some(folder), Some(sha256), Some(bytes)) = 231 | (download_folder, &sub.file_sha256, &sub.file) 232 | { 233 | if let Err(err) = fuzzysearch_common::download::write_bytes(folder, sha256, bytes).await { 234 | tracing::error!("Could not download image: {:?}", err); 235 | } 236 | } 237 | 238 | _timer.stop_and_record(); 239 | 240 | if let Err(err) = faktory 241 | .queue_webhook(fuzzysearch_common::faktory::WebHookData { 242 | site: 
fuzzysearch_common::types::Site::FurAffinity, 243 | site_id: sub.id as i64, 244 | artist: sub.artist.clone(), 245 | file_url: sub.content.url().clone(), 246 | file_sha256: sub.file_sha256.clone(), 247 | hash: sub.hash_num.map(|hash| hash.to_be_bytes()), 248 | }) 249 | .await 250 | { 251 | tracing::error!("Unable to queue webhook: {:?}", err); 252 | } 253 | 254 | insert_submission(client, &sub).await.unwrap_or_log(); 255 | } 256 | 257 | #[tokio::main] 258 | async fn main() { 259 | fuzzysearch_common::trace::configure_tracing("fuzzysearch-ingest-furaffinity"); 260 | fuzzysearch_common::trace::serve_metrics().await; 261 | 262 | let (cookie_a, cookie_b) = ( 263 | std::env::var("FA_A").expect_or_log("Missing FA_A"), 264 | std::env::var("FA_B").expect_or_log("Missing FA_B"), 265 | ); 266 | 267 | let download_folder = std::env::var("DOWNLOAD_FOLDER").ok(); 268 | 269 | let user_agent = std::env::var("USER_AGENT").expect_or_log("Missing USER_AGENT"); 270 | let client = reqwest::Client::builder() 271 | .timeout(std::time::Duration::from_secs(10)) 272 | .build() 273 | .unwrap_or_log(); 274 | 275 | let fa = furaffinity_rs::FurAffinity::new(cookie_a, cookie_b, user_agent, Some(client)); 276 | 277 | let dsn = std::env::var("POSTGRES_DSN").expect_or_log("Missing POSTGRES_DSN"); 278 | 279 | let (client, connection) = tokio_postgres::connect(&dsn, tokio_postgres::NoTls) 280 | .await 281 | .unwrap_or_log(); 282 | 283 | tokio::spawn(async move { 284 | if let Err(e) = connection.await { 285 | panic!("PostgreSQL connection error: {:?}", e); 286 | } 287 | }); 288 | 289 | let faktory_dsn = std::env::var("FAKTORY_URL").expect_or_log("Missing FAKTORY_URL"); 290 | let faktory = FaktoryClient::connect(faktory_dsn) 291 | .await 292 | .expect_or_log("Unable to connect to Faktory"); 293 | 294 | tracing::info!("Started"); 295 | 296 | loop { 297 | tracing::debug!("Fetching latest ID... 
"); 298 | let duration = INDEX_DURATION.start_timer(); 299 | let (latest_id, online) = fa 300 | .latest_id() 301 | .await 302 | .expect_or_log("Unable to get latest id"); 303 | duration.stop_and_record(); 304 | tracing::info!(latest_id = latest_id, "Got latest ID"); 305 | 306 | tracing::debug!(?online, "Got updated users online"); 307 | USERS_ONLINE 308 | .with_label_values(&["guest"]) 309 | .set(online.guests as i64); 310 | USERS_ONLINE 311 | .with_label_values(&["registered"]) 312 | .set(online.registered as i64); 313 | USERS_ONLINE 314 | .with_label_values(&["other"]) 315 | .set(online.other as i64); 316 | 317 | for id in ids_to_check(&client, latest_id).await { 318 | process_submission(&client, &fa, &faktory, id, &download_folder).await; 319 | } 320 | 321 | tracing::info!("Completed fetch, waiting a minute before loading more"); 322 | tokio::time::sleep(std::time::Duration::from_secs(60)).await; 323 | } 324 | } 325 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-weasyl/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-ingest-weasyl" 3 | version = "0.1.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | anyhow = "1" 9 | 10 | tracing = "0.1" 11 | tracing-unwrap = "0.9" 12 | 13 | prometheus = "0.13" 14 | lazy_static = "1" 15 | 16 | reqwest = { version = "0.11", features = ["json"] } 17 | tokio = { version = "1", features = ["full"] } 18 | 19 | serde = "1" 20 | serde_json = "1" 21 | 22 | image = "0.23" 23 | img_hash = "3" 24 | 25 | sha2 = "0.10" 26 | 27 | fuzzysearch-common = { path = "../fuzzysearch-common", features = ["queue"] } 28 | 29 | [dependencies.sqlx] 30 | version = "0.5" 31 | default-features = false 32 | features = ["runtime-tokio-native-tls", "macros", "postgres", "json", "offline"] 33 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-weasyl/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN apt-get update -y && apt-get install -y openssl ca-certificates && rm -rf /var/lib/apt/lists/* 3 | COPY ./fuzzysearch-ingest-weasyl/fuzzysearch-ingest-weasyl /bin/fuzzysearch-ingest-weasyl 4 | CMD ["/bin/fuzzysearch-ingest-weasyl"] 5 | -------------------------------------------------------------------------------- /fuzzysearch-ingest-weasyl/sqlx-data.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": "PostgreSQL", 3 | "05da31ef5ee193d5094c6e2dc0f7cb00b4b0720a1902af02069861868f176688": { 4 | "query": "INSERT INTO weasyl (id, hash, sha256, file_size, data) VALUES ($1, $2, $3, $4, $5)", 5 | "describe": { 6 | "columns": [], 7 | "parameters": { 8 | "Left": [ 9 | "Int4", 10 | "Int8", 11 | "Bytea", 12 | "Int4", 13 | "Jsonb" 14 | ] 15 | }, 16 | "nullable": [] 17 | } 18 | }, 19 | "364c5c10ad748d1822c3e909aca601993f0ddb7690368a82ae467b3b0950478e": { 20 | "query": "INSERT INTO WEASYL (id, data) VALUES ($1, $2)", 21 | "describe": { 22 | "columns": [], 23 | "parameters": { 24 | "Left": [ 25 | "Int4", 26 | "Jsonb" 27 | ] 28 | }, 29 | "nullable": [] 30 | } 31 | }, 32 | "7ef3d8fa00b1245440aae6f91bfc23bddee7730fc2de67e2f359762ce8db3bf4": { 33 | "query": "SELECT id FROM weasyl WHERE id = $1", 34 | "describe": { 35 | "columns": [ 36 | { 37 | "ordinal": 0, 38 | "name": "id", 39 | "type_info": "Int4" 40 | } 41 | ], 42 | "parameters": { 43 | "Left": [ 44 | "Int4" 45 | ] 46 | }, 
47 | "nullable": [ 48 | false 49 | ] 50 | } 51 | }, 52 | "949eca4258721007af9db04f43830bd8df525f942b6673c7a5713735ed7746d6": { 53 | "query": "SELECT max(id) id FROM weasyl", 54 | "describe": { 55 | "columns": [ 56 | { 57 | "ordinal": 0, 58 | "name": "id", 59 | "type_info": "Int4" 60 | } 61 | ], 62 | "parameters": { 63 | "Left": [] 64 | }, 65 | "nullable": [ 66 | null 67 | ] 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /fuzzysearch-ingest-weasyl/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use prometheus::{register_counter, register_histogram, Counter, Histogram, HistogramOpts, Opts}; 4 | use serde::{Deserialize, Serialize}; 5 | use sha2::{Digest, Sha256}; 6 | use tracing_unwrap::{OptionExt, ResultExt}; 7 | 8 | use fuzzysearch_common::faktory::FaktoryClient; 9 | 10 | lazy_static::lazy_static! { 11 | static ref INDEX_DURATION: Histogram = register_histogram!(HistogramOpts::new( 12 | "fuzzysearch_watcher_index_duration_seconds", 13 | "Duration to load an index of submissions" 14 | ) 15 | .const_label("site", "weasyl")) 16 | .unwrap_or_log(); 17 | static ref SUBMISSION_DURATION: Histogram = register_histogram!(HistogramOpts::new( 18 | "fuzzysearch_watcher_submission_duration_seconds", 19 | "Duration to load an index of submissions" 20 | ) 21 | .const_label("site", "weasyl")) 22 | .unwrap_or_log(); 23 | static ref SUBMISSION_MISSING: Counter = register_counter!(Opts::new( 24 | "fuzzysearch_watcher_submission_missing_total", 25 | "Number of submissions that were missing" 26 | ) 27 | .const_label("site", "weasyl")) 28 | .unwrap_or_log(); 29 | } 30 | 31 | #[derive(Debug, Serialize, Deserialize)] 32 | struct WeasylMediaSubmission { 33 | #[serde(rename = "mediaid")] 34 | id: i32, 35 | url: String, 36 | } 37 | 38 | #[derive(Debug, Serialize, Deserialize)] 39 | struct WeasylMedia { 40 | submission: Vec, 41 | } 42 | 43 | #[derive(Debug, Serialize, Deserialize, PartialEq)] 44 | #[serde(rename_all = "lowercase")] 45 | enum WeasylSubmissionSubtype { 46 | Multimedia, 47 | Visual, 48 | Literary, 49 | } 50 | 51 | #[derive(Debug, Serialize, Deserialize)] 52 | struct WeasylSubmission { 53 | #[serde(rename = "submitid")] 54 | id: i32, 55 | owner_login: String, 56 | media: WeasylMedia, 57 | subtype: WeasylSubmissionSubtype, 58 | } 59 | 60 | #[derive(Debug, Serialize, Deserialize)] 61 | struct WeasylFrontpageSubmission { 62 | #[serde(rename = "submitid")] 63 | id: i32, 64 | } 65 | 66 | #[derive(Debug, Serialize, Deserialize)] 67 | struct WeasylError { 68 | name: String, 69 | } 70 | 71 | #[derive(Debug, Serialize, Deserialize)] 72 | #[serde(untagged)] 73 | enum WeasylResponse { 74 | Error { error: WeasylError }, 75 | Response(T), 76 | } 77 | 78 | #[tracing::instrument(skip(client, api_key))] 79 | async fn load_frontpage(client: &reqwest::Client, api_key: &str) -> anyhow::Result { 80 | let resp: WeasylResponse> = client 81 | .get("https://www.weasyl.com/api/submissions/frontpage") 82 | .header("X-Weasyl-API-Key", api_key) 83 | .send() 84 | .await? 85 | .error_for_status()? 
86 | .json() 87 | .await?; 88 | 89 | let subs = match resp { 90 | WeasylResponse::Response(subs) => subs, 91 | WeasylResponse::Error { 92 | error: WeasylError { name }, 93 | } => return Err(anyhow::anyhow!(name)), 94 | }; 95 | 96 | let max = subs 97 | .into_iter() 98 | .filter_map(|sub| sub.get("submitid").and_then(|id| id.as_i64())) 99 | .max() 100 | .unwrap_or_default(); 101 | 102 | Ok(max as i32) 103 | } 104 | 105 | #[tracing::instrument(skip(client, api_key))] 106 | async fn load_submission( 107 | client: &reqwest::Client, 108 | api_key: &str, 109 | id: i32, 110 | ) -> anyhow::Result<(Option, serde_json::Value)> { 111 | tracing::debug!("Loading submission"); 112 | 113 | let body: serde_json::Value = client 114 | .get(&format!( 115 | "https://www.weasyl.com/api/submissions/{}/view", 116 | id 117 | )) 118 | .header("X-Weasyl-API-Key", api_key) 119 | .send() 120 | .await? 121 | .json() 122 | .await?; 123 | 124 | let data: WeasylResponse = match serde_json::from_value(body.clone()) { 125 | Ok(data) => data, 126 | Err(err) => { 127 | tracing::error!("Unable to parse submission: {:?}", err); 128 | return Ok((None, body)); 129 | } 130 | }; 131 | 132 | let res = match data { 133 | WeasylResponse::Response(sub) if sub.subtype == WeasylSubmissionSubtype::Visual => { 134 | Some(sub) 135 | } 136 | WeasylResponse::Response(_sub) => None, 137 | WeasylResponse::Error { 138 | error: WeasylError { name }, 139 | } if name == "submissionRecordMissing" => None, 140 | WeasylResponse::Error { 141 | error: WeasylError { name }, 142 | } => return Err(anyhow::anyhow!(name)), 143 | }; 144 | 145 | Ok((res, body)) 146 | } 147 | 148 | #[tracing::instrument(skip(pool, client, faktory, body, sub, download_folder), fields(id = sub.id))] 149 | async fn process_submission( 150 | pool: &sqlx::Pool, 151 | client: &reqwest::Client, 152 | faktory: &FaktoryClient, 153 | body: serde_json::Value, 154 | sub: WeasylSubmission, 155 | download_folder: &Option, 156 | ) -> anyhow::Result<()> { 157 | tracing::debug!("Processing submission"); 158 | 159 | let data = client 160 | .get(&sub.media.submission.first().unwrap_or_log().url) 161 | .send() 162 | .await? 163 | .error_for_status()? 164 | .bytes() 165 | .await? 
166 | .to_vec(); 167 | 168 | let num = if let Ok(image) = image::load_from_memory(&data) { 169 | let hasher = fuzzysearch_common::get_hasher(); 170 | let hash = hasher.hash_image(&image); 171 | let mut bytes: [u8; 8] = [0; 8]; 172 | bytes.copy_from_slice(hash.as_bytes()); 173 | let num = i64::from_be_bytes(bytes); 174 | Some(num) 175 | } else { 176 | tracing::warn!("Unable to decode image"); 177 | 178 | None 179 | }; 180 | 181 | let mut hasher = Sha256::new(); 182 | hasher.update(&data); 183 | let result: [u8; 32] = hasher.finalize().into(); 184 | 185 | if let Some(folder) = download_folder { 186 | if let Err(err) = fuzzysearch_common::download::write_bytes(folder, &result, &data).await { 187 | tracing::error!("Could not download image: {:?}", err); 188 | } 189 | } 190 | 191 | sqlx::query!( 192 | "INSERT INTO weasyl (id, hash, sha256, file_size, data) VALUES ($1, $2, $3, $4, $5)", 193 | sub.id, 194 | num, 195 | result.to_vec(), 196 | data.len() as i32, 197 | body 198 | ) 199 | .execute(pool) 200 | .await?; 201 | 202 | tracing::info!("Completed submission"); 203 | 204 | faktory 205 | .queue_webhook(fuzzysearch_common::faktory::WebHookData { 206 | site: fuzzysearch_common::types::Site::Weasyl, 207 | site_id: sub.id as i64, 208 | artist: sub.owner_login.clone(), 209 | file_url: sub.media.submission.first().unwrap_or_log().url.clone(), 210 | file_sha256: Some(result.to_vec()), 211 | hash: num.map(|hash| hash.to_be_bytes()), 212 | }) 213 | .await?; 214 | 215 | Ok(()) 216 | } 217 | 218 | #[tracing::instrument(skip(pool, body))] 219 | async fn insert_null( 220 | pool: &sqlx::Pool, 221 | body: serde_json::Value, 222 | id: i32, 223 | ) -> anyhow::Result<()> { 224 | tracing::debug!("Inserting null submission"); 225 | 226 | sqlx::query!("INSERT INTO WEASYL (id, data) VALUES ($1, $2)", id, body) 227 | .execute(pool) 228 | .await?; 229 | 230 | Ok(()) 231 | } 232 | 233 | #[tokio::main] 234 | async fn main() { 235 | fuzzysearch_common::trace::configure_tracing("fuzzysearch-ingest-weasyl"); 236 | fuzzysearch_common::trace::serve_metrics().await; 237 | 238 | let api_key = std::env::var("WEASYL_APIKEY").unwrap_or_log(); 239 | let user_agent = std::env::var("USER_AGENT").unwrap_or_log(); 240 | 241 | let download_folder = std::env::var("DOWNLOAD_FOLDER").ok(); 242 | 243 | let pool = sqlx::postgres::PgPoolOptions::new() 244 | .max_connections(2) 245 | .connect(&std::env::var("DATABASE_URL").unwrap_or_log()) 246 | .await 247 | .unwrap_or_log(); 248 | 249 | let client = reqwest::Client::builder() 250 | .user_agent(user_agent) 251 | .build() 252 | .unwrap_or_log(); 253 | 254 | let faktory_dsn = std::env::var("FAKTORY_URL").expect_or_log("Missing FAKTORY_URL"); 255 | let faktory = FaktoryClient::connect(faktory_dsn) 256 | .await 257 | .expect_or_log("Unable to connect to Faktory"); 258 | 259 | loop { 260 | let min = sqlx::query!("SELECT max(id) id FROM weasyl") 261 | .fetch_one(&pool) 262 | .await 263 | .unwrap_or_log() 264 | .id 265 | .unwrap_or_default(); 266 | 267 | let duration = INDEX_DURATION.start_timer(); 268 | let max = load_frontpage(&client, &api_key).await.unwrap_or_log(); 269 | duration.stop_and_record(); 270 | 271 | tracing::info!(min, max, "Calculated range of submissions to check"); 272 | 273 | tokio::time::sleep(Duration::from_secs(1)).await; 274 | 275 | for id in (min + 1)..=max { 276 | let row: Option<_> = sqlx::query!("SELECT id FROM weasyl WHERE id = $1", id) 277 | .fetch_optional(&pool) 278 | .await 279 | .unwrap_or_log(); 280 | if row.is_some() { 281 | continue; 282 | } 283 | 284 | let 
duration = SUBMISSION_DURATION.start_timer(); 285 | 286 | match load_submission(&client, &api_key, id).await.unwrap_or_log() { 287 | (Some(sub), json) => { 288 | process_submission(&pool, &client, &faktory, json, sub, &download_folder) 289 | .await 290 | .unwrap_or_log(); 291 | 292 | duration.stop_and_record(); 293 | } 294 | (None, body) => { 295 | insert_null(&pool, body, id).await.unwrap_or_log(); 296 | 297 | SUBMISSION_MISSING.inc(); 298 | duration.stop_and_discard(); 299 | } 300 | } 301 | 302 | tokio::time::sleep(Duration::from_secs(1)).await; 303 | } 304 | 305 | tokio::time::sleep(std::time::Duration::from_secs(60 * 5)).await; 306 | } 307 | } 308 | -------------------------------------------------------------------------------- /fuzzysearch-refresh/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-refresh" 3 | version = "0.1.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | tracing = "0.1" 9 | tracing-unwrap = "0.9" 10 | anyhow = "1" 11 | thiserror = "1" 12 | 13 | tokio = "1" 14 | tokio-stream = "0.1" 15 | futures = "0.3" 16 | 17 | faktory = "0.11" 18 | sqlx = { version = "0.5", features = ["runtime-tokio-native-tls", "postgres", "macros", "json", "offline", "chrono"] } 19 | 20 | chrono = "0.4" 21 | reqwest = "0.11" 22 | 23 | furaffinity-rs = { git = "https://github.com/Syfaro/furaffinity-rs" } 24 | 25 | fuzzysearch-common = { path = "../fuzzysearch-common" } 26 | -------------------------------------------------------------------------------- /fuzzysearch-refresh/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN apt-get update -y && apt-get install -y openssl ca-certificates && rm -rf /var/lib/apt/lists/* 3 | COPY ./fuzzysearch-refresh/fuzzysearch-refresh /bin/fuzzysearch-refresh 4 | CMD ["/bin/fuzzysearch-refresh"] 5 | -------------------------------------------------------------------------------- /fuzzysearch-refresh/sqlx-data.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": "PostgreSQL", 3 | "36c9a44cf5d7e004912ae72b7f1e3550deb0531aa07144c3cef140381da9bc97": { 4 | "query": "SELECT id FROM artist WHERE name = $1", 5 | "describe": { 6 | "columns": [ 7 | { 8 | "ordinal": 0, 9 | "name": "id", 10 | "type_info": "Int4" 11 | } 12 | ], 13 | "parameters": { 14 | "Left": [ 15 | "Text" 16 | ] 17 | }, 18 | "nullable": [ 19 | false 20 | ] 21 | } 22 | }, 23 | "58683afdbc3906ed624e0daf3acec7079db9bc455b30d647d932a35838419b1b": { 24 | "query": "SELECT id FROM submission", 25 | "describe": { 26 | "columns": [ 27 | { 28 | "ordinal": 0, 29 | "name": "id", 30 | "type_info": "Int4" 31 | } 32 | ], 33 | "parameters": { 34 | "Left": [] 35 | }, 36 | "nullable": [ 37 | false 38 | ] 39 | } 40 | }, 41 | "8fb99c8859fdcc51f095ba191924f3a336358ce6a6e5223e86f8b15cd7ec7f37": { 42 | "query": "INSERT INTO submission (id, updated_at, deleted) VALUES ($1, current_timestamp, true) ON CONFLICT (id) DO UPDATE SET deleted = true", 43 | "describe": { 44 | "columns": [], 45 | "parameters": { 46 | "Left": [ 47 | "Int4" 48 | ] 49 | }, 50 | "nullable": [] 51 | } 52 | }, 53 | "921fcab0b8fed99671fe84fe1b011650b7fa4cfaae3843a5a724f928db4c9734": { 54 | "query": "SELECT id FROM tag WHERE name = $1", 55 | "describe": { 56 | "columns": [ 57 | { 58 | "ordinal": 0, 59 | "name": "id", 60 | "type_info": "Int4" 61 | } 62 | ], 63 | "parameters": { 64 | "Left": [ 65 | "Text" 66 | ] 67 | }, 68 | "nullable": [ 
69 | false 70 | ] 71 | } 72 | }, 73 | "a1dff4a02afe1a8a3ffd42cf86b557709fdb0994518b72342f15f9535f5b6a02": { 74 | "query": "INSERT INTO submission\n (id, artist_id, url, filename, hash, rating, posted_at, description, hash_int, file_id, file_size, file_sha256, updated_at) VALUES\n ($1, $2, $3, $4, decode($5, 'base64'), $6, $7, $8, $9, CASE WHEN isnumeric(split_part($4, '.', 1)) THEN split_part($4, '.', 1)::int ELSE null END, $10, $11, current_timestamp)\n ON CONFLICT (id) DO UPDATE SET url = $3, filename = $4, hash = decode($5, 'base64'), rating = $6, description = $8, hash_int = $9, file_id = CASE WHEN isnumeric(split_part($4, '.', 1)) THEN split_part($4, '.', 1)::int ELSE null END, file_size = $10, file_sha256 = $11, updated_at = current_timestamp", 75 | "describe": { 76 | "columns": [], 77 | "parameters": { 78 | "Left": [ 79 | "Int4", 80 | "Int4", 81 | "Text", 82 | "Text", 83 | "Text", 84 | "Bpchar", 85 | "Timestamptz", 86 | "Text", 87 | "Int8", 88 | "Int4", 89 | "Bytea" 90 | ] 91 | }, 92 | "nullable": [] 93 | } 94 | }, 95 | "a6d0113ac38781a41a717aee7e28940b7f362951402bec50bc54932a6939b217": { 96 | "query": "INSERT INTO artist (name) VALUES ($1) RETURNING id", 97 | "describe": { 98 | "columns": [ 99 | { 100 | "ordinal": 0, 101 | "name": "id", 102 | "type_info": "Int4" 103 | } 104 | ], 105 | "parameters": { 106 | "Left": [ 107 | "Text" 108 | ] 109 | }, 110 | "nullable": [ 111 | false 112 | ] 113 | } 114 | }, 115 | "b9323d762b487be18d991f84cfde591c7b33e0a2530be186ab77ad802781772e": { 116 | "query": "SELECT updated_at FROM submission WHERE id = $1", 117 | "describe": { 118 | "columns": [ 119 | { 120 | "ordinal": 0, 121 | "name": "updated_at", 122 | "type_info": "Timestamptz" 123 | } 124 | ], 125 | "parameters": { 126 | "Left": [ 127 | "Int4" 128 | ] 129 | }, 130 | "nullable": [ 131 | true 132 | ] 133 | } 134 | }, 135 | "cb877c1de895efa7753b25f401036ae61711d95a1c1db233580b50fb36eec0cb": { 136 | "query": "INSERT INTO tag_to_post (tag_id, post_id) VALUES ($1, $2) ON CONFLICT DO NOTHING", 137 | "describe": { 138 | "columns": [], 139 | "parameters": { 140 | "Left": [ 141 | "Int4", 142 | "Int4" 143 | ] 144 | }, 145 | "nullable": [] 146 | } 147 | }, 148 | "f9dfb3a7414c35f112dc30d053fdc546ec4776761346db98982858ddf3afb6d3": { 149 | "query": "INSERT INTO tag (name) VALUES ($1) RETURNING id", 150 | "describe": { 151 | "columns": [ 152 | { 153 | "ordinal": 0, 154 | "name": "id", 155 | "type_info": "Int4" 156 | } 157 | ], 158 | "parameters": { 159 | "Left": [ 160 | "Text" 161 | ] 162 | }, 163 | "nullable": [ 164 | false 165 | ] 166 | } 167 | } 168 | } -------------------------------------------------------------------------------- /fuzzysearch-refresh/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::net::TcpStream; 2 | use std::sync::{Arc, Mutex}; 3 | 4 | use furaffinity_rs::FurAffinity; 5 | use tracing_unwrap::ResultExt; 6 | 7 | #[derive(Debug, thiserror::Error)] 8 | #[non_exhaustive] 9 | enum Error { 10 | #[error("database error: {0}")] 11 | Database(#[from] sqlx::Error), 12 | #[error("missing data: {0}")] 13 | MissingData(&'static str), 14 | #[error("furaffinity error")] 15 | FurAffinity(furaffinity_rs::Error), 16 | #[error("faktory error")] 17 | Faktory, 18 | } 19 | 20 | static FURAFFINITY_QUEUE: &str = "fuzzysearch_refresh_furaffinity"; 21 | 22 | type Producer = Arc>>; 23 | type Db = sqlx::Pool; 24 | 25 | fn main() { 26 | fuzzysearch_common::init_logger(); 27 | 28 | tracing::info!("initializing"); 29 | 30 | let rt = 
Arc::new(tokio::runtime::Runtime::new().unwrap()); 31 | 32 | let mut faktory = faktory::ConsumerBuilder::default(); 33 | faktory.labels(vec!["fuzzysearch-refresh".to_string()]); 34 | faktory.workers(2); 35 | 36 | let p = Arc::new(Mutex::new(faktory::Producer::connect(None).unwrap_or_log())); 37 | 38 | let pool = rt 39 | .block_on( 40 | sqlx::postgres::PgPoolOptions::new() 41 | .max_connections(2) 42 | .connect(&std::env::var("DATABASE_URL").unwrap_or_log()), 43 | ) 44 | .unwrap_or_log(); 45 | 46 | let (cookie_a, cookie_b) = ( 47 | std::env::var("FA_A").unwrap_or_log(), 48 | std::env::var("FA_B").unwrap_or_log(), 49 | ); 50 | let user_agent = std::env::var("USER_AGENT").unwrap_or_log(); 51 | let client = reqwest::Client::new(); 52 | let fa = Arc::new(FurAffinity::new( 53 | cookie_a, 54 | cookie_b, 55 | user_agent, 56 | Some(client), 57 | )); 58 | 59 | rt.spawn(poll_fa_online(fa.clone(), p.clone())); 60 | 61 | let rt_clone = rt.clone(); 62 | let pool_clone = pool.clone(); 63 | faktory.register("furaffinity_load", move |job| -> Result<(), Error> { 64 | use std::convert::TryFrom; 65 | 66 | let id = job 67 | .args() 68 | .iter() 69 | .next() 70 | .ok_or(Error::MissingData("submission id"))? 71 | .as_i64() 72 | .ok_or(Error::MissingData("submission id"))?; 73 | 74 | let id = i32::try_from(id).map_err(|_| Error::MissingData("invalid id"))?; 75 | 76 | let last_updated = rt_clone 77 | .block_on( 78 | sqlx::query_scalar!("SELECT updated_at FROM submission WHERE id = $1", id) 79 | .fetch_optional(&pool_clone), 80 | )? 81 | .flatten(); 82 | 83 | if let Some(last_updated) = last_updated { 84 | let diff = last_updated.signed_duration_since(chrono::Utc::now()); 85 | if diff.num_days() < 30 { 86 | tracing::warn!("attempted to check recent submission, skipping"); 87 | return Ok(()); 88 | } 89 | } 90 | 91 | let sub = rt_clone 92 | .block_on(fa.get_submission(id)) 93 | .map_err(Error::FurAffinity)?; 94 | 95 | tracing::debug!("loaded furaffinity submission"); 96 | 97 | rt_clone.block_on(update_furaffinity_submission( 98 | pool_clone.clone(), 99 | fa.clone(), 100 | id, 101 | sub, 102 | ))?; 103 | 104 | Ok(()) 105 | }); 106 | 107 | faktory.register( 108 | "furaffinity_calculate_missing", 109 | move |job| -> Result<(), Error> { 110 | use std::collections::HashSet; 111 | 112 | let batch_size = job 113 | .args() 114 | .iter() 115 | .next() 116 | .and_then(|arg| arg.as_i64()) 117 | .unwrap_or(1_000); 118 | 119 | tracing::debug!(batch_size, "calculating missing submissions"); 120 | 121 | let known_ids: HashSet<_> = rt 122 | .block_on(sqlx::query_scalar!("SELECT id FROM submission").fetch_all(&pool))? 
123 | .into_iter() 124 | .collect(); 125 | let all_ids: HashSet<_> = (1..=*known_ids.iter().max().unwrap_or(&1)).collect(); 126 | let missing_ids: Vec<_> = all_ids 127 | .difference(&known_ids) 128 | .take(batch_size as usize) 129 | .collect(); 130 | 131 | tracing::info!( 132 | missing = missing_ids.len(), 133 | "enqueueing batch of missing submissions" 134 | ); 135 | 136 | let mut p = p.lock().unwrap_or_log(); 137 | 138 | for id in missing_ids { 139 | let job = 140 | faktory::Job::new("furaffinity_load", vec![*id]).on_queue(FURAFFINITY_QUEUE); 141 | p.enqueue(job).map_err(|_err| Error::Faktory)?; 142 | } 143 | 144 | Ok(()) 145 | }, 146 | ); 147 | 148 | let faktory = faktory.connect(None).unwrap_or_log(); 149 | tracing::info!("starting to run queues"); 150 | faktory.run_to_completion(&["fuzzysearch_refresh", FURAFFINITY_QUEUE]); 151 | } 152 | 153 | /// Check the number of users on FurAffinity every five minutes and control if queues 154 | /// are allowed to run. 155 | async fn poll_fa_online(fa: Arc<FurAffinity>, p: Producer) { 156 | use futures::StreamExt; 157 | use std::{ 158 | sync::atomic::{AtomicBool, Ordering}, 159 | time::Duration, 160 | }; 161 | use tokio::time::interval; 162 | use tokio_stream::wrappers::IntervalStream; 163 | 164 | let max_online = std::env::var("MAX_ONLINE") 165 | .ok() 166 | .and_then(|num| num.parse().ok()) 167 | .unwrap_or(10_000); 168 | 169 | tracing::info!(max_online, "got max fa users online before pause"); 170 | 171 | // Ensure initial state of the queue being enabled. 172 | { 173 | let p = p.clone(); 174 | tokio::task::spawn_blocking(move || { 175 | let mut p = p.lock().unwrap_or_log(); 176 | p.queue_resume(&[FURAFFINITY_QUEUE]).unwrap_or_log(); 177 | }) 178 | .await 179 | .expect_or_log("could not set initial queue state"); 180 | } 181 | 182 | let queue_state = AtomicBool::new(true); 183 | 184 | IntervalStream::new(interval(Duration::from_secs(300))) 185 | .for_each(|_| { 186 | let p = p.clone(); 187 | 188 | async { 189 | let continue_queue = match fa.latest_id().await { 190 | Ok((_latest_id, online)) => { 191 | tracing::debug!(registered = online.registered, "got updated fa online"); 192 | online.registered < max_online 193 | } 194 | Err(err) => { 195 | tracing::error!("unable to get fa online: {:?}", err); 196 | false 197 | } 198 | }; 199 | 200 | if queue_state.load(Ordering::SeqCst) == continue_queue { 201 | tracing::trace!("fa queue was already in correct state"); 202 | return; 203 | } 204 | 205 | tracing::info!(continue_queue, "updating fa queue state"); 206 | 207 | let result = tokio::task::spawn_blocking(move || { 208 | let mut p = p.lock().unwrap_or_log(); 209 | 210 | if continue_queue { 211 | p.queue_resume(&[FURAFFINITY_QUEUE]) 212 | } else { 213 | p.queue_pause(&[FURAFFINITY_QUEUE]) 214 | } 215 | }) 216 | .await; 217 | 218 | match result { 219 | Err(err) => tracing::error!("unable to join queue change: {:?}", err), 220 | Ok(Err(err)) => tracing::error!("unable to change fa queue state: {:?}", err), 221 | _ => queue_state.store(continue_queue, Ordering::SeqCst), 222 | } 223 | } 224 | }) 225 | .await; 226 | } 227 | 228 | async fn get_furaffinity_artist(db: &Db, artist: &str) -> Result<i32, sqlx::Error> { 229 | if let Some(id) = sqlx::query_scalar!("SELECT id FROM artist WHERE name = $1", artist) 230 | .fetch_optional(db) 231 | .await?
232 | { 233 | return Ok(id); 234 | } 235 | 236 | sqlx::query_scalar!("INSERT INTO artist (name) VALUES ($1) RETURNING id", artist) 237 | .fetch_one(db) 238 | .await 239 | } 240 | 241 | async fn get_furaffinity_tag(db: &Db, tag: &str) -> Result<i32, sqlx::Error> { 242 | if let Some(id) = sqlx::query_scalar!("SELECT id FROM tag WHERE name = $1", tag) 243 | .fetch_optional(db) 244 | .await? 245 | { 246 | return Ok(id); 247 | } 248 | 249 | sqlx::query_scalar!("INSERT INTO tag (name) VALUES ($1) RETURNING id", tag) 250 | .fetch_one(db) 251 | .await 252 | } 253 | 254 | async fn associate_furaffinity_tag(db: &Db, id: i32, tag_id: i32) -> Result<(), sqlx::Error> { 255 | sqlx::query!( 256 | "INSERT INTO tag_to_post (tag_id, post_id) VALUES ($1, $2) ON CONFLICT DO NOTHING", 257 | tag_id, 258 | id 259 | ) 260 | .execute(db) 261 | .await 262 | .map(|_| ()) 263 | } 264 | 265 | async fn update_furaffinity_submission( 266 | db: Db, 267 | fa: Arc<FurAffinity>, 268 | id: i32, 269 | sub: Option<furaffinity_rs::Submission>, 270 | ) -> Result<(), Error> { 271 | let sub = match sub { 272 | Some(sub) => sub, 273 | None => { 274 | tracing::info!(id, "furaffinity submission did not exist"); 275 | sqlx::query!("INSERT INTO submission (id, updated_at, deleted) VALUES ($1, current_timestamp, true) ON CONFLICT (id) DO UPDATE SET deleted = true", id).execute(&db).await?; 276 | return Ok(()); 277 | } 278 | }; 279 | 280 | let sub = fa.calc_image_hash(sub).await.map_err(Error::FurAffinity)?; 281 | 282 | let artist_id = get_furaffinity_artist(&db, &sub.artist).await?; 283 | 284 | let mut tag_ids = Vec::with_capacity(sub.tags.len()); 285 | for tag in &sub.tags { 286 | tag_ids.push(get_furaffinity_tag(&db, tag).await?); 287 | } 288 | 289 | let hash = sub.hash.clone(); 290 | let url = sub.content.url(); 291 | 292 | let size = sub.file_size.map(|size| size as i32); 293 | 294 | sqlx::query!( 295 | "INSERT INTO submission 296 | (id, artist_id, url, filename, hash, rating, posted_at, description, hash_int, file_id, file_size, file_sha256, updated_at) VALUES 297 | ($1, $2, $3, $4, decode($5, 'base64'), $6, $7, $8, $9, CASE WHEN isnumeric(split_part($4, '.', 1)) THEN split_part($4, '.', 1)::int ELSE null END, $10, $11, current_timestamp) 298 | ON CONFLICT (id) DO UPDATE SET url = $3, filename = $4, hash = decode($5, 'base64'), rating = $6, description = $8, hash_int = $9, file_id = CASE WHEN isnumeric(split_part($4, '.', 1)) THEN split_part($4, '.', 1)::int ELSE null END, file_size = $10, file_sha256 = $11, updated_at = current_timestamp", 299 | sub.id, artist_id, url, sub.filename, hash, sub.rating.serialize(), sub.posted_at, sub.description, sub.hash_num, size, sub.file_sha256, 300 | ) 301 | .execute(&db).await?; 302 | 303 | for tag_id in tag_ids { 304 | associate_furaffinity_tag(&db, id, tag_id).await?; 305 | } 306 | 307 | Ok(()) 308 | } 309 | -------------------------------------------------------------------------------- /fuzzysearch-webhook/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fuzzysearch-webhook" 3 | version = "0.1.0" 4 | authors = ["Syfaro "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | tracing = "0.1" 9 | tracing-unwrap = "0.9" 10 | thiserror = "1" 11 | 12 | faktory = "0.11" 13 | reqwest = { version = "0.11", features = ["blocking", "json"] } 14 | anyhow = "1" 15 | serde_json = "1" 16 | r2d2 = "0.8" 17 | r2d2_postgres = "0.18" 18 | 19 | fuzzysearch-common = { path = "../fuzzysearch-common" } 20 | --------------------------------------------------------------------------------
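A brief aside before the webhook service's Dockerfile: the fuzzysearch-refresh worker above registers a "furaffinity_load" handler that reads a single numeric submission id and consumes the "fuzzysearch_refresh_furaffinity" queue, so any process talking to the same Faktory server can feed it work. The sketch below is hypothetical and is not a file in this repository; the job name, queue name, and the Producer/Job calls are the ones used in the worker source, while the standalone binary and the id 12345 are invented for illustration, and the same faktory = "0.11" crate used across the workspace is assumed.

```rust
// Hypothetical client, not part of this repository: enqueue one
// "furaffinity_load" job for the fuzzysearch-refresh worker.
use faktory::{Job, Producer};

fn main() {
    // Same call the services above use; None lets the crate pick its
    // default or environment-configured Faktory server address.
    let mut producer = Producer::connect(None).expect("could not connect to Faktory");

    // The worker reads the first job argument as a numeric submission id.
    let job = Job::new("furaffinity_load", vec![12345])
        .on_queue("fuzzysearch_refresh_furaffinity");

    producer.enqueue(job).expect("could not enqueue job");
}
```

Since the handler checks updated_at and skips submissions it considers recently refreshed, enqueueing an id that is already current is harmless.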
/fuzzysearch-webhook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN apt-get update -y && apt-get install -y openssl ca-certificates && rm -rf /var/lib/apt/lists/* 3 | COPY ./fuzzysearch-webhook/fuzzysearch-webhook /bin/fuzzysearch-webhook 4 | CMD ["/bin/fuzzysearch-webhook"] 5 | -------------------------------------------------------------------------------- /fuzzysearch-webhook/src/main.rs: -------------------------------------------------------------------------------- 1 | use r2d2_postgres::{postgres::NoTls, PostgresConnectionManager}; 2 | use thiserror::Error; 3 | use tracing_unwrap::ResultExt; 4 | 5 | static APP_USER_AGENT: &str = concat!( 6 | env!("CARGO_PKG_NAME"), 7 | "/", 8 | env!("CARGO_PKG_VERSION"), 9 | " - ", 10 | env!("CARGO_PKG_AUTHORS") 11 | ); 12 | 13 | #[derive(Error, Debug)] 14 | pub enum WebhookError { 15 | #[error("invalid data")] 16 | Serde(#[from] serde_json::Error), 17 | #[error("missing data")] 18 | MissingData, 19 | #[error("database pool issue")] 20 | Pool(#[from] r2d2_postgres::postgres::Error), 21 | #[error("database error")] 22 | Database(#[from] r2d2::Error), 23 | #[error("network error")] 24 | Network(#[from] reqwest::Error), 25 | #[error("faktory error")] 26 | Faktory, 27 | } 28 | 29 | fn main() { 30 | fuzzysearch_common::init_logger(); 31 | 32 | tracing::info!("Starting..."); 33 | 34 | let dsn = std::env::var("POSTGRES_DSN").unwrap_or_log(); 35 | let manager = PostgresConnectionManager::new(dsn.parse().unwrap_or_log(), NoTls); 36 | let pool = r2d2::Pool::new(manager).unwrap_or_log(); 37 | 38 | let client = reqwest::blocking::ClientBuilder::default() 39 | .user_agent(APP_USER_AGENT) 40 | .timeout(std::time::Duration::from_secs(3)) 41 | .build() 42 | .unwrap_or_log(); 43 | 44 | let mut faktory = faktory::ConsumerBuilder::default(); 45 | faktory.labels(vec!["fuzzysearch-webhook".to_string()]); 46 | faktory.workers(2); 47 | 48 | let producer = std::sync::Mutex::new(faktory::Producer::connect(None).unwrap()); 49 | 50 | faktory.register("new_submission", move |job| -> Result<(), WebhookError> { 51 | let _span = tracing::info_span!("new_submission", job_id = job.id()).entered(); 52 | 53 | let data = job 54 | .args() 55 | .iter() 56 | .next() 57 | .ok_or(WebhookError::MissingData)? 58 | .to_owned(); 59 | 60 | let mut conn = pool.get()?; 61 | 62 | for row in conn.query("SELECT endpoint FROM webhook", &[])? { 63 | let endpoint: &str = row.get(0); 64 | 65 | tracing::debug!(endpoint, "Queueing webhook"); 66 | 67 | let job = faktory::Job::new( 68 | "send_webhook", 69 | vec![data.clone(), serde_json::to_value(endpoint)?], 70 | ) 71 | .on_queue("fuzzysearch_webhook"); 72 | 73 | let mut producer = producer.lock().unwrap(); 74 | producer.enqueue(job).map_err(|_| WebhookError::Faktory)?; 75 | } 76 | 77 | tracing::info!("Queued webhooks"); 78 | 79 | Ok(()) 80 | }); 81 | 82 | faktory.register("send_webhook", move |job| -> Result<(), WebhookError> { 83 | let _span = tracing::info_span!("send_webhook", job_id = job.id()).entered(); 84 | 85 | let mut args = job.args().iter(); 86 | 87 | let data = args.next().ok_or(WebhookError::MissingData)?.to_owned(); 88 | let value: fuzzysearch_common::faktory::WebHookData = serde_json::value::from_value(data)?; 89 | 90 | let endpoint = args 91 | .next() 92 | .ok_or(WebhookError::MissingData)? 
93 | .as_str() 94 | .ok_or(WebhookError::MissingData)?; 95 | 96 | tracing::trace!(endpoint, site = %value.site, site_id = value.site_id, "Sending webhook"); 97 | 98 | client 99 | .post(endpoint) 100 | .json(&value) 101 | .send()? 102 | .error_for_status()?; 103 | 104 | Ok(()) 105 | }); 106 | 107 | let faktory = faktory.connect(None).unwrap_or_log(); 108 | faktory.run_to_completion(&["fuzzysearch_webhook"]); 109 | } 110 | -------------------------------------------------------------------------------- /migrations/20210221024406_bktree_index.down.sql: -------------------------------------------------------------------------------- 1 | DROP EXTENSION bktree; 2 | -------------------------------------------------------------------------------- /migrations/20210221024406_bktree_index.up.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION bktree; 2 | -------------------------------------------------------------------------------- /migrations/20210221025236_furaffinity.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE artist; 2 | DROP TABLE submission; 3 | DROP TABLE tag; 4 | DROP TABLE tag_to_post; 5 | -------------------------------------------------------------------------------- /migrations/20210221025236_furaffinity.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE artist ( 2 | id SERIAL PRIMARY KEY, 3 | name TEXT UNIQUE NOT NULL 4 | ); 5 | 6 | CREATE TABLE submission ( 7 | id INTEGER PRIMARY KEY, 8 | artist_id INTEGER REFERENCES artist (id), 9 | hash BYTEA, 10 | hash_int BIGINT, 11 | url TEXT, 12 | filename TEXT, 13 | rating CHAR(1), 14 | posted_at TIMESTAMP WITH TIME ZONE, 15 | description TEXT, 16 | file_id INTEGER, 17 | file_size INTEGER, 18 | file_sha256 BYTEA, 19 | imported BOOLEAN DEFAULT false, 20 | removed BOOLEAN, 21 | updated_at TIMESTAMP WITH TIME ZONE 22 | ); 23 | 24 | CREATE INDEX ON submission (file_id); 25 | CREATE INDEX ON submission (imported); 26 | CREATE INDEX ON submission (posted_at); 27 | CREATE INDEX ON submission (artist_id); 28 | CREATE INDEX ON submission (file_sha256) WHERE file_sha256 IS NOT NULL; 29 | CREATE INDEX ON submission (lower(url)); 30 | CREATE INDEX ON submission (lower(filename)); 31 | 32 | CREATE TABLE tag ( 33 | id SERIAL PRIMARY KEY, 34 | name TEXT UNIQUE NOT NULL 35 | ); 36 | 37 | CREATE TABLE tag_to_post ( 38 | tag_id INTEGER NOT NULL REFERENCES tag (id), 39 | post_id INTEGER NOT NULL REFERENCES submission (id), 40 | 41 | PRIMARY KEY (tag_id, post_id) 42 | ); 43 | -------------------------------------------------------------------------------- /migrations/20210221025652_e621.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE e621; 2 | -------------------------------------------------------------------------------- /migrations/20210221025652_e621.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE e621 ( 2 | id INTEGER PRIMARY KEY, 3 | hash BIGINT, 4 | data JSONB, 5 | sha256 BYTEA, 6 | hash_error TEXT 7 | ); 8 | 9 | CREATE INDEX ON e621 (sha256); 10 | -------------------------------------------------------------------------------- /migrations/20210221025835_weasyl.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE weasyl; 2 | -------------------------------------------------------------------------------- 
/migrations/20210221025835_weasyl.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE weasyl ( 2 | id INTEGER PRIMARY KEY, 3 | hash BIGINT, 4 | data JSONB, 5 | sha256 BYTEA, 6 | file_size INTEGER 7 | ); 8 | 9 | CREATE INDEX ON weasyl (sha256); 10 | -------------------------------------------------------------------------------- /migrations/20210221030022_twitter.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE tweet_media; 2 | DROP TABLE tweet; 3 | DROP TABLE twitter_user; 4 | -------------------------------------------------------------------------------- /migrations/20210221030022_twitter.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE twitter_user ( 2 | twitter_id BIGINT PRIMARY KEY, 3 | approved BOOLEAN NOT NULL DEFAULT false, 4 | data JSONB, 5 | last_update TIMESTAMP WITHOUT TIME ZONE, 6 | max_id BIGINT, 7 | completed_back BOOLEAN NOT NULL DEFAULT false, 8 | min_id BIGINT 9 | ); 10 | 11 | CREATE INDEX ON twitter_user (last_update); 12 | CREATE INDEX ON twitter_user (lower(data->>'screen_name')); 13 | CREATE INDEX ON twitter_user (min_id); 14 | CREATE INDEX ON twitter_user (twitter_id, approved); 15 | CREATE INDEX ON twitter_user (((data->'protected')::boolean)); 16 | 17 | CREATE TABLE tweet ( 18 | id BIGINT PRIMARY KEY, 19 | twitter_user_id BIGINT NOT NULL REFERENCES twitter_user (twitter_id), 20 | data JSONB 21 | ); 22 | 23 | CREATE TABLE tweet_media ( 24 | media_id BIGINT NOT NULL, 25 | tweet_id BIGINT NOT NULL REFERENCES tweet (id), 26 | hash BIGINT, 27 | url TEXT, 28 | 29 | PRIMARY KEY (media_id, tweet_id) 30 | ); 31 | -------------------------------------------------------------------------------- /migrations/20210221030823_hashes.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE hashes; 2 | -------------------------------------------------------------------------------- /migrations/20210221030823_hashes.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE hashes ( 2 | id SERIAL PRIMARY KEY, 3 | hash BIGINT NOT NULL, 4 | furaffinity_id INTEGER UNIQUE REFERENCES submission (id), 5 | e621_id INTEGER UNIQUE REFERENCES e621 (id), 6 | twitter_id BIGINT REFERENCES tweet (id) 7 | ); 8 | 9 | CREATE FUNCTION hashes_insert_furaffinity() 10 | RETURNS trigger 11 | LANGUAGE plpgsql 12 | AS $$ 13 | BEGIN 14 | if NEW.hash_int IS NOT NULL THEN 15 | INSERT INTO hashes (furaffinity_id, hash) VALUES (NEW.id, NEW.hash_int); 16 | END IF; 17 | 18 | RETURN NEW; 19 | END; 20 | $$; 21 | 22 | CREATE FUNCTION hashes_insert_e621() 23 | RETURNS trigger 24 | LANGUAGE plpgsql 25 | AS $$ 26 | BEGIN 27 | IF NEW.hash IS NOT NULL THEN 28 | IF exists(SELECT 1 FROM hashes WHERE hashes.e621_id = NEW.id) THEN 29 | UPDATE hashes SET hashes.hash = NEW.hash WHERE e621_id = NEW.id; 30 | ELSE 31 | INSERT INTO hashes (e621_id, hash) VALUES (NEW.id, NEW.hash); 32 | END IF; 33 | END IF; 34 | 35 | RETURN NEW; 36 | END; 37 | $$; 38 | 39 | CREATE FUNCTION hashes_insert_twitter() 40 | RETURNS trigger 41 | LANGUAGE plpgsql 42 | AS $$ 43 | BEGIN 44 | IF NEW.hash IS NOT NULL THEN 45 | INSERT INTO hashes (twitter_id, hash) VALUES (NEW.tweet_id, NEW.hash); 46 | END IF; 47 | 48 | RETURN NEW; 49 | END; 50 | $$; 51 | 52 | CREATE TRIGGER hashes_insert_furaffinity AFTER INSERT ON submission 53 | FOR EACH ROW EXECUTE PROCEDURE hashes_insert_furaffinity(); 54 | CREATE 
TRIGGER hashes_insert_e621 AFTER INSERT ON e621 55 | FOR EACH ROW EXECUTE PROCEDURE hashes_insert_e621(); 56 | CREATE TRIGGER hashes_insert_twitter AFTER INSERT ON tweet_media 57 | FOR EACH ROW EXECUTE PROCEDURE hashes_insert_twitter(); 58 | 59 | INSERT INTO hashes (furaffinity_id, hash) 60 | SELECT id, hash_int FROM submission WHERE hash_int IS NOT NULL 61 | ON CONFLICT DO NOTHING; 62 | INSERT INTO hashes (e621_id, hash) 63 | SELECT id, hash FROM e621 WHERE hash IS NOT NULL 64 | ON CONFLICT DO NOTHING; 65 | INSERT INTO hashes (twitter_id, hash) 66 | SELECT tweet_id, hash FROM tweet_media WHERE hash IS NOT NULL 67 | ON CONFLICT DO NOTHING; 68 | 69 | CREATE INDEX ON hashes USING spgist (hash bktree_ops); 70 | 71 | CREATE FUNCTION hashes_notify_inserted() 72 | RETURNS trigger 73 | LANGUAGE plpgsql 74 | AS $$ 75 | BEGIN 76 | PERFORM pg_notify('fuzzysearch_hash_added'::text, 77 | json_build_object('id', NEW.id, 'hash', NEW.hash)::text); 78 | RETURN NEW; 79 | END; 80 | $$; 81 | 82 | CREATE TRIGGER hashes_notify_inserted AFTER INSERT ON hashes 83 | FOR EACH ROW EXECUTE PROCEDURE hashes_notify_inserted(); 84 | -------------------------------------------------------------------------------- /migrations/20210221033051_authentication.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE rate_limit; 2 | DROP TABLE api_key; 3 | DROP TABLE account; 4 | -------------------------------------------------------------------------------- /migrations/20210221033051_authentication.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE account ( 2 | id SERIAL PRIMARY KEY, 3 | email TEXT UNIQUE NOT NULL, 4 | password TEXT NOT NULL, 5 | email_verifier TEXT 6 | ); 7 | 8 | CREATE TABLE api_key ( 9 | id SERIAL PRIMARY KEY, 10 | user_id INTEGER NOT NULL REFERENCES account (id), 11 | name TEXT, 12 | key TEXT UNIQUE NOT NULL, 13 | name_limit SMALLINT NOT NULL, 14 | image_limit SMALLINT NOT NULL, 15 | hash_limit SMALLINT NOT NULL 16 | ); 17 | 18 | CREATE TABLE rate_limit ( 19 | api_key_id INTEGER NOT NULL REFERENCES api_key (id), 20 | time_window BIGINT NOT NULL, 21 | group_name TEXT NOT NULL, 22 | count SMALLINT NOT NULL DEFAULT 0, 23 | 24 | CONSTRAINT unique_window 25 | PRIMARY KEY (api_key_id, time_window, group_name) 26 | ); 27 | -------------------------------------------------------------------------------- /migrations/20210419174900_index_all_hashes.down.sql: -------------------------------------------------------------------------------- 1 | DROP INDEX bk_furaffinity_hash; 2 | DROP INDEX bk_e621_hash; 3 | DROP INDEX bk_twitter_hash; 4 | DROP INDEX bk_weasyl_hash; 5 | -------------------------------------------------------------------------------- /migrations/20210419174900_index_all_hashes.up.sql: -------------------------------------------------------------------------------- 1 | CREATE INDEX bk_furaffinity_hash ON submission USING spgist (hash_int bktree_ops); 2 | CREATE INDEX bk_e621_hash ON e621 USING spgist (hash bktree_ops); 3 | CREATE INDEX bk_twitter_hash ON tweet_media USING spgist (hash bktree_ops); 4 | CREATE INDEX bk_weasyl_hash ON weasyl USING spgist (hash bktree_ops); 5 | -------------------------------------------------------------------------------- /migrations/20210419202830_remove_old_index.down.sql: -------------------------------------------------------------------------------- 1 | DROP FUNCTION update_notify_furaffinity CASCADE; 2 | DROP FUNCTION update_notify_others CASCADE; 3 | 4 | CREATE 
TABLE hashes ( 5 | id SERIAL PRIMARY KEY, 6 | hash BIGINT NOT NULL, 7 | furaffinity_id INTEGER UNIQUE REFERENCES submission (id), 8 | e621_id INTEGER UNIQUE REFERENCES e621 (id), 9 | twitter_id BIGINT REFERENCES tweet (id) 10 | ); 11 | 12 | CREATE FUNCTION hashes_insert_furaffinity() 13 | RETURNS trigger 14 | LANGUAGE plpgsql 15 | AS $$ 16 | BEGIN 17 | if NEW.hash_int IS NOT NULL THEN 18 | INSERT INTO hashes (furaffinity_id, hash) VALUES (NEW.id, NEW.hash_int); 19 | END IF; 20 | 21 | RETURN NEW; 22 | END; 23 | $$; 24 | 25 | CREATE FUNCTION hashes_insert_e621() 26 | RETURNS trigger 27 | LANGUAGE plpgsql 28 | AS $$ 29 | BEGIN 30 | IF NEW.hash IS NOT NULL THEN 31 | IF exists(SELECT 1 FROM hashes WHERE hashes.e621_id = NEW.id) THEN 32 | UPDATE hashes SET hashes.hash = NEW.hash WHERE e621_id = NEW.id; 33 | ELSE 34 | INSERT INTO hashes (e621_id, hash) VALUES (NEW.id, NEW.hash); 35 | END IF; 36 | END IF; 37 | 38 | RETURN NEW; 39 | END; 40 | $$; 41 | 42 | CREATE FUNCTION hashes_insert_twitter() 43 | RETURNS trigger 44 | LANGUAGE plpgsql 45 | AS $$ 46 | BEGIN 47 | IF NEW.hash IS NOT NULL THEN 48 | INSERT INTO hashes (twitter_id, hash) VALUES (NEW.tweet_id, NEW.hash); 49 | END IF; 50 | 51 | RETURN NEW; 52 | END; 53 | $$; 54 | 55 | CREATE TRIGGER hashes_insert_furaffinity AFTER INSERT ON submission 56 | FOR EACH ROW EXECUTE PROCEDURE hashes_insert_furaffinity(); 57 | CREATE TRIGGER hashes_insert_e621 AFTER INSERT ON e621 58 | FOR EACH ROW EXECUTE PROCEDURE hashes_insert_e621(); 59 | CREATE TRIGGER hashes_insert_twitter AFTER INSERT ON tweet_media 60 | FOR EACH ROW EXECUTE PROCEDURE hashes_insert_twitter(); 61 | 62 | INSERT INTO hashes (furaffinity_id, hash) 63 | SELECT id, hash_int FROM submission WHERE hash_int IS NOT NULL 64 | ON CONFLICT DO NOTHING; 65 | INSERT INTO hashes (e621_id, hash) 66 | SELECT id, hash FROM e621 WHERE hash IS NOT NULL 67 | ON CONFLICT DO NOTHING; 68 | INSERT INTO hashes (twitter_id, hash) 69 | SELECT tweet_id, hash FROM tweet_media WHERE hash IS NOT NULL 70 | ON CONFLICT DO NOTHING; 71 | 72 | CREATE INDEX ON hashes USING spgist (hash bktree_ops); 73 | 74 | CREATE FUNCTION hashes_notify_inserted() 75 | RETURNS trigger 76 | LANGUAGE plpgsql 77 | AS $$ 78 | BEGIN 79 | PERFORM pg_notify('fuzzysearch_hash_added'::text, 80 | json_build_object('id', NEW.id, 'hash', NEW.hash)::text); 81 | RETURN NEW; 82 | END; 83 | $$; 84 | 85 | CREATE TRIGGER hashes_notify_inserted AFTER INSERT ON hashes 86 | FOR EACH ROW EXECUTE PROCEDURE hashes_notify_inserted(); 87 | -------------------------------------------------------------------------------- /migrations/20210419202830_remove_old_index.up.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE hashes; 2 | DROP FUNCTION hashes_notify_inserted CASCADE; 3 | DROP FUNCTION hashes_insert_furaffinity CASCADE; 4 | DROP FUNCTION hashes_insert_e621 CASCADE; 5 | DROP FUNCTION hashes_insert_twitter CASCADE; 6 | 7 | CREATE FUNCTION update_notify_furaffinity() 8 | RETURNS trigger 9 | LANGUAGE plpgsql 10 | AS $$ 11 | BEGIN 12 | if NEW.hash_int IS NOT NULL THEN 13 | PERFORM pg_notify('fuzzysearch_hash_added'::text, 14 | json_build_object('hash', NEW.hash_int)::text); 15 | RETURN NEW; 16 | END IF; 17 | 18 | RETURN NEW; 19 | END; 20 | $$; 21 | 22 | CREATE FUNCTION update_notify_others() 23 | RETURNS trigger 24 | LANGUAGE plpgsql 25 | AS $$ 26 | BEGIN 27 | if NEW.hash IS NOT NULL THEN 28 | PERFORM pg_notify('fuzzysearch_hash_added'::text, 29 | json_build_object('hash', NEW.hash)::text); 30 | RETURN NEW; 31 | END 
IF; 32 | 33 | RETURN NEW; 34 | END; 35 | $$; 36 | 37 | CREATE TRIGGER update_notify_furaffinity AFTER INSERT OR UPDATE ON submission 38 | FOR EACH ROW EXECUTE PROCEDURE update_notify_furaffinity(); 39 | CREATE TRIGGER update_notify_e621 AFTER INSERT OR UPDATE ON e621 40 | FOR EACH ROW EXECUTE PROCEDURE update_notify_others(); 41 | CREATE TRIGGER update_notify_twitter AFTER INSERT OR UPDATE ON tweet_media 42 | FOR EACH ROW EXECUTE PROCEDURE update_notify_others(); 43 | CREATE TRIGGER update_notify_weasyl AFTER INSERT OR UPDATE ON weasyl 44 | FOR EACH ROW EXECUTE PROCEDURE update_notify_others(); 45 | -------------------------------------------------------------------------------- /migrations/20210420024815_webhooks.down.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE webhook; 2 | -------------------------------------------------------------------------------- /migrations/20210420024815_webhooks.up.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE webhook ( 2 | id SERIAL PRIMARY KEY, 3 | account_id INTEGER REFERENCES account (id), 4 | endpoint TEXT NOT NULL 5 | ); 6 | -------------------------------------------------------------------------------- /migrations/20210422224815_change_hash_index.down.sql: -------------------------------------------------------------------------------- 1 | DROP INDEX submission_hash_int_idx; 2 | DROP INDEX e621_hash_idx; 3 | DROP INDEX tweet_media_hash_idx; 4 | DROP INDEX weasyl_hash_idx; 5 | 6 | CREATE INDEX bk_furaffinity_hash ON submission USING spgist (hash_int bktree_ops); 7 | CREATE INDEX bk_e621_hash ON e621 USING spgist (hash bktree_ops); 8 | CREATE INDEX bk_twitter_hash ON tweet_media USING spgist (hash bktree_ops); 9 | CREATE INDEX bk_weasyl_hash ON weasyl USING spgist (hash bktree_ops); 10 | -------------------------------------------------------------------------------- /migrations/20210422224815_change_hash_index.up.sql: -------------------------------------------------------------------------------- 1 | DROP INDEX bk_furaffinity_hash; 2 | DROP INDEX bk_e621_hash; 3 | DROP INDEX bk_twitter_hash; 4 | DROP INDEX bk_weasyl_hash; 5 | 6 | CREATE INDEX submission_hash_int_idx ON submission (hash_int); 7 | CREATE INDEX e621_hash_idx ON e621 (hash); 8 | CREATE INDEX tweet_media_hash_idx ON tweet_media (hash); 9 | CREATE INDEX weasyl_hash_idx ON weasyl (hash); 10 | -------------------------------------------------------------------------------- /migrations/20210822052026_isnumeric_function.down.sql: -------------------------------------------------------------------------------- 1 | DROP FUNCTION IF EXISTS isnumeric; 2 | -------------------------------------------------------------------------------- /migrations/20210822052026_isnumeric_function.up.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION isnumeric(text) RETURNS BOOLEAN AS $$ 2 | DECLARE x NUMERIC; 3 | BEGIN 4 | x = $1::NUMERIC; 5 | RETURN TRUE; 6 | EXCEPTION WHEN others THEN 7 | RETURN FALSE; 8 | END; 9 | $$ 10 | STRICT 11 | LANGUAGE plpgsql IMMUTABLE; 12 | -------------------------------------------------------------------------------- /migrations/20210822052313_deleted_flag.down.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE submission DROP COLUMN deleted; 2 | ALTER TABLE e621 DROP COLUMN deleted; 3 | ALTER TABLE weasyl DROP COLUMN deleted; 4 | 
-------------------------------------------------------------------------------- /migrations/20210822052313_deleted_flag.up.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE submission ADD COLUMN deleted BOOLEAN NOT NULL DEFAULT false; 2 | ALTER TABLE e621 ADD COLUMN deleted BOOLEAN NOT NULL DEFAULT false; 3 | ALTER TABLE weasyl ADD COLUMN deleted BOOLEAN NOT NULL DEFAULT false; 4 | -------------------------------------------------------------------------------- /migrations/20220519161030_index_tag_post_id.down.sql: -------------------------------------------------------------------------------- 1 | DROP INDEX tag_to_post_post_id; 2 | -------------------------------------------------------------------------------- /migrations/20220519161030_index_tag_post_id.up.sql: -------------------------------------------------------------------------------- 1 | CREATE INDEX tag_to_post_post_id ON tag_to_post (post_id); 2 | -------------------------------------------------------------------------------- /tests/fox.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/fox.gif -------------------------------------------------------------------------------- /tests/sample.sql: -------------------------------------------------------------------------------- 1 | -- Clear any existing data 2 | 3 | TRUNCATE TABLE 4 | tag_to_post, submission, tag, artist, 5 | e621, 6 | tweet_media, tweet, twitter_user, 7 | rate_limit, api_key, account 8 | CASCADE; 9 | 10 | -- Account and test API keys 11 | 12 | INSERT INTO account (id, email, password) VALUES 13 | (1, 'test@example.com', 0); 14 | INSERT INTO api_key (id, user_id, name, key, name_limit, image_limit, hash_limit) VALUES 15 | (1, 1, 'Test', 'test', 120, 120, 15); 16 | 17 | -- FurAffinity sample data 18 | 19 | INSERT INTO artist (id, name) VALUES 20 | (1, 'casual-dhole'), 21 | (2, 'kosseart'), 22 | (3, 'oce'), 23 | (4, 'psychonautic'); 24 | 25 | INSERT INTO tag (id, name) VALUES 26 | (1, 'syfaro'), 27 | (2, 'male'), 28 | (3, 'fox'), 29 | (4, 'purple'), 30 | (5, 'blue'), 31 | (6, 'light'), 32 | (7, 'dark'), 33 | (8, 'enjoying'), 34 | (9, 'delicious'), 35 | (10, 'grapes'); 36 | 37 | INSERT INTO submission (id, artist_id, hash, hash_int, url, filename, rating, posted_at, description, file_id, file_size, imported, removed, updated_at) VALUES 38 | (21060541, 1, '\x52eda52bc535b1a4', 5975613888001323428, 'https://d.facdn.net/art/casual-dhole/1473103034/1473103034.casual-dhole_fylninsyf2__web_.png', '1473103034.casual-dhole_fylninsyf2__web_.png', 'g', '2016-09-05 21:17:00+00', '', 1473103034, null, false, false, null), 39 | (33088558, 2, '\xb63326dd92c46ad8', -5317864001902449960, 'https://d.facdn.net/art/kosseart/1568810406/1568810406.kosseart_experimental-syfaro-fa.png', '1568810406.kosseart_experimental-syfaro-fa.png', 'g', '2019-09-18 13:40:00+00', '', 1568810406, null, false, false, null), 40 | (20449854, 3, '\x544565d3e5aad6ad', 6072371633344665261, 'https://d.facdn.net/art/oce/1467485464/1467485464.oce_syfaro-sketch-web.jpg', '1467485464.oce_syfaro-sketch-web.jpg', 'g', '2016-07-02 20:51:00+00', '', 1467485464, null, true, false, null), 41 | (19620670, 4, '\x5494a456b9b9ad92', 6094676888129219986, 'https://d.facdn.net/art/psychonautic/1460136557/1460136557.psychonautic_syfarore.png', '1460136557.psychonautic_syfarore.png', 'g', '2016-04-08 19:29:00+00', '', '1460136557', null, true, 
false, null); 42 | 43 | INSERT INTO tag_to_post (tag_id, post_id) VALUES 44 | (1, 20449854), 45 | (2, 20449854), 46 | (3, 20449854), 47 | (4, 20449854), 48 | (5, 20449854), 49 | (6, 20449854), 50 | (7, 20449854), 51 | (8, 20449854), 52 | (9, 20449854), 53 | (10, 20449854); 54 | 55 | -- e621 sample data 56 | 57 | INSERT INTO e621 (id, hash, data, sha256) VALUES 58 | (934261, 6072371633344665261, '{"id": 934261, "file": {"ext": "jpg", "md5": "273210894ab3d9f02f02742acead73a2", "url": "https://static1.e621.net/data/27/32/273210894ab3d9f02f02742acead73a2.jpg", "size": 266900, "width": 681, "height": 900}, "tags": {"lore": [], "meta": [], "artist": ["amara_telgemeier"], "general": ["2016", "anthro", "biped", "eating", "food", "fruit", "fur", "grape", "looking_at_viewer", "male", "nude", "plant", "purple_body", "purple_fur", "purple_theme", "simple_background", "sitting", "solo"], "invalid": [], "species": ["canid", "canine", "fox", "mammal"], "character": ["syfaro"], "copyright": []}, "flags": {"deleted": false, "flagged": false, "pending": false, "note_locked": false, "rating_locked": false, "status_locked": false}, "pools": [], "score": {"up": 0, "down": 0, "total": 14}, "rating": "s", "sample": {"has": false, "url": "https://static1.e621.net/data/27/32/273210894ab3d9f02f02742acead73a2.jpg", "width": 681, "height": 900}, "preview": {"url": "https://static1.e621.net/data/preview/27/32/273210894ab3d9f02f02742acead73a2.jpg", "width": 113, "height": 150}, "sources": ["https://furrynetwork.com/artwork/1275945", "https://d3gz42uwgl1r1y.cloudfront.net/sy/syfaro/submission/2016/07/87f00959822f665716c58c4df43a27c2.jpg", "https://www.furaffinity.net/user/oce", "https://www.furaffinity.net/full/20449854/", "https://d.facdn.net/art/oce/1467485464/1467485464.oce_syfaro-sketch-web.jpg", "https://www.furaffinity.net/user/oce/"], "fav_count": 30, "change_seq": 26745767, "created_at": "2016-07-03T10:44:50.983-04:00", "updated_at": "2020-04-04T15:50:17.669-04:00", "approver_id": null, "description": "", "locked_tags": [], "uploader_id": 2083, "is_favorited": false, "comment_count": 0, "relationships": {"children": [], "parent_id": null, "has_children": false, "has_active_children": false}}', '\x26d16b09a372f780079af7b4bd13128ded8bf0f78395f40e2a3e307a3495955b'); 59 | 60 | --- Twitter sample data 61 | 62 | INSERT INTO twitter_user (twitter_id, approved, data, last_update, max_id, completed_back, min_id) VALUES 63 | (1030062061856993282, true, '{"id": 1030062061856993282, "url": "https://t.co/9QXcrQ32Q2", "lang": null, "name": "𝕯𝖊𝖒𝖔𝖓 𝕯𝖔𝖌 𝕮𝖊𝖓𝖙𝖗𝖆𝖑™", "id_str": "1030062061856993282", "status": {"id": 1221685448407486465, "geo": null, "lang": "en", "text": "@folklaurel_ WHAT that''s so kind of you?? Thank you so much?!??? 
😭💖", "place": null, "id_str": "1221685448407486465", "source": "Twitter Web App", "entities": {"urls": [], "symbols": [], "hashtags": [], "user_mentions": [{"id": 2566142377, "name": "Colin 🐊 FC2020", "id_str": "2566142377", "indices": [0, 12], "screen_name": "folklaurel_"}]}, "favorited": false, "retweeted": false, "truncated": false, "created_at": "Mon Jan 27 06:44:43 +0000 2020", "coordinates": null, "contributors": null, "retweet_count": 0, "favorite_count": 0, "is_quote_status": false, "in_reply_to_user_id": 2566142377, "in_reply_to_status_id": 1221681145135300608, "in_reply_to_screen_name": "folklaurel_", "in_reply_to_user_id_str": "2566142377", "in_reply_to_status_id_str": "1221681145135300608"}, "entities": {"url": {"urls": [{"url": "https://t.co/9QXcrQ32Q2", "indices": [0, 23], "display_url": "deviantart.com/yodelinyote/ga…", "expanded_url": "https://www.deviantart.com/yodelinyote/gallery"}]}, "description": {"urls": []}}, "location": "St.Louis, Misery (MO)", "verified": false, "following": false, "protected": false, "time_zone": null, "created_at": "Thu Aug 16 12:01:47 +0000 2018", "utc_offset": null, "description": "Caim | 21 | transmasc - he/him ONLY! | I draw SFW furry art | Comms are CLOSED | Pfp: @kind7ed | Banner by me | Personal: @yappinyote", "followed_by": false, "geo_enabled": false, "screen_name": "yodelinyote", "listed_count": 13, "can_media_tag": false, "friends_count": 726, "is_translator": false, "notifications": false, "statuses_count": 4094, "default_profile": false, "followers_count": 5073, "translator_type": "none", "favourites_count": 10462, "profile_image_url": "http://pbs.twimg.com/profile_images/1160593354205405184/p-I8E7aX_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1030062061856993282/1571030121", "profile_link_color": "E81C4F", "profile_text_color": "000000", "follow_request_sent": false, "contributors_enabled": false, "has_extended_profile": true, "default_profile_image": false, "is_translation_enabled": false, "profile_background_tile": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/1160593354205405184/p-I8E7aX_normal.jpg", "profile_background_color": "000000", "profile_sidebar_fill_color": "000000", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_sidebar_border_color": "000000", "profile_use_background_image": false, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png"}', '2021-02-21 00:49:04.59449', 1363167747291643906, false, 1218752964501868546); 64 | 65 | INSERT INTO tweet (id, twitter_user_id, data) VALUES 66 | (1325965206934212608, 1030062061856993282, '{"id": 1325965206934212608, "geo": null, "lang": "en", "user": {"id": 1030062061856993282, "url": null, "lang": null, "name": "Caim", "id_str": "1030062061856993282", "entities": {"description": {"urls": [{"url": "https://t.co/mIG5vnu0lj", "indices": [82, 105], "display_url": "infurnalyote.carrd.co", "expanded_url": "http://infurnalyote.carrd.co"}]}}, "location": "Osage & Kickapoo land", "verified": false, "following": true, "protected": false, "time_zone": null, "created_at": "Thu Aug 16 12:01:47 +0000 2018", "utc_offset": null, "description": "♦️ Caim - 22 - transmasc (he/him) - BLM - ACAB ♦️\n♦️ Artist - Comms CLOSED ♦️\n♦️ https://t.co/mIG5vnu0lj - Pfp: @S0LARDOG ♦️", "geo_enabled": false, "screen_name": "infurnalyote", "listed_count": 58, "friends_count": 1108, "is_translator": false, "notifications": false, "statuses_count": 10721, "default_profile": 
false, "followers_count": 8793, "translator_type": "none", "favourites_count": 32013, "profile_image_url": "http://pbs.twimg.com/profile_images/1267191835728007173/oKM3jNzN_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1030062061856993282/1571030121", "profile_link_color": "E81C4F", "profile_text_color": "000000", "follow_request_sent": false, "contributors_enabled": false, "has_extended_profile": true, "default_profile_image": false, "is_translation_enabled": false, "profile_background_tile": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/1267191835728007173/oKM3jNzN_normal.jpg", "profile_background_color": "000000", "profile_sidebar_fill_color": "000000", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_sidebar_border_color": "000000", "profile_use_background_image": false, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png"}, "place": null, "id_str": "1325965206934212608", "source": "Twitter Web App", "entities": {"urls": [], "media": [{"id": 1325965104203042817, "url": "https://t.co/wL19uCgrAF", "type": "photo", "sizes": {"large": {"h": 1250, "w": 1000, "resize": "fit"}, "small": {"h": 680, "w": 544, "resize": "fit"}, "thumb": {"h": 150, "w": 150, "resize": "crop"}, "medium": {"h": 1200, "w": 960, "resize": "fit"}}, "id_str": "1325965104203042817", "indices": [20, 43], "media_url": "http://pbs.twimg.com/media/EmbGPKyWEAEi3JI.jpg", "display_url": "pic.twitter.com/wL19uCgrAF", "expanded_url": "https://twitter.com/infurnalyote/status/1325965206934212608/photo/1", "media_url_https": "https://pbs.twimg.com/media/EmbGPKyWEAEi3JI.jpg"}], "symbols": [], "hashtags": [], "user_mentions": []}, "favorited": false, "full_text": "Some more sillyness https://t.co/wL19uCgrAF", "retweeted": false, "truncated": false, "created_at": "Tue Nov 10 00:55:15 +0000 2020", "coordinates": null, "contributors": null, "retweet_count": 11, "favorite_count": 108, "is_quote_status": false, "extended_entities": {"media": [{"id": 1325965104203042817, "url": "https://t.co/wL19uCgrAF", "type": "photo", "sizes": {"large": {"h": 1250, "w": 1000, "resize": "fit"}, "small": {"h": 680, "w": 544, "resize": "fit"}, "thumb": {"h": 150, "w": 150, "resize": "crop"}, "medium": {"h": 1200, "w": 960, "resize": "fit"}}, "id_str": "1325965104203042817", "indices": [20, 43], "media_url": "http://pbs.twimg.com/media/EmbGPKyWEAEi3JI.jpg", "display_url": "pic.twitter.com/wL19uCgrAF", "expanded_url": "https://twitter.com/infurnalyote/status/1325965206934212608/photo/1", "media_url_https": "https://pbs.twimg.com/media/EmbGPKyWEAEi3JI.jpg"}, {"id": 1325965117285076993, "url": "https://t.co/wL19uCgrAF", "type": "photo", "sizes": {"large": {"h": 683, "w": 2048, "resize": "fit"}, "small": {"h": 227, "w": 680, "resize": "fit"}, "thumb": {"h": 150, "w": 150, "resize": "crop"}, "medium": {"h": 400, "w": 1200, "resize": "fit"}}, "id_str": "1325965117285076993", "indices": [20, 43], "media_url": "http://pbs.twimg.com/media/EmbGP7hWEAEysaF.jpg", "display_url": "pic.twitter.com/wL19uCgrAF", "expanded_url": "https://twitter.com/infurnalyote/status/1325965206934212608/photo/1", "media_url_https": "https://pbs.twimg.com/media/EmbGP7hWEAEysaF.jpg"}, {"id": 1325965183622246400, "url": "https://t.co/wL19uCgrAF", "type": "photo", "sizes": {"large": {"h": 500, "w": 1500, "resize": "fit"}, "small": {"h": 227, "w": 680, "resize": "fit"}, "thumb": {"h": 150, "w": 150, "resize": "crop"}, "medium": {"h": 400, "w": 1200, "resize": 
"fit"}}, "id_str": "1325965183622246400", "indices": [20, 43], "media_url": "http://pbs.twimg.com/media/EmbGTypW8AA65r_.jpg", "display_url": "pic.twitter.com/wL19uCgrAF", "expanded_url": "https://twitter.com/infurnalyote/status/1325965206934212608/photo/1", "media_url_https": "https://pbs.twimg.com/media/EmbGTypW8AA65r_.jpg"}]}, "display_text_range": [0, 19], "possibly_sensitive": false, "in_reply_to_user_id": 1030062061856993282, "in_reply_to_status_id": 1325964607509438470, "in_reply_to_screen_name": "infurnalyote", "in_reply_to_user_id_str": "1030062061856993282", "in_reply_to_status_id_str": "1325964607509438470"}'); 67 | 68 | INSERT INTO tweet_media (media_id, tweet_id, hash, url) VALUES 69 | (1325965183622246400, 1325965206934212608, -3140163608635666133, 'https://pbs.twimg.com/media/EmbGTypW8AA65r_.jpg:large'), 70 | (1325965104203042817, 1325965206934212608, 2641824390885488310, 'https://pbs.twimg.com/media/EmbGPKyWEAEi3JI.jpg:large'), 71 | (1325965117285076993, 1325965206934212608, 5517556289826018726, 'https://pbs.twimg.com/media/EmbGP7hWEAEysaF.jpg:large'); 72 | -------------------------------------------------------------------------------- /tests/samples/1460136557.psychonautic_syfarore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/1460136557.psychonautic_syfarore.png -------------------------------------------------------------------------------- /tests/samples/1467485464.oce_syfaro-sketch-web.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/1467485464.oce_syfaro-sketch-web.jpg -------------------------------------------------------------------------------- /tests/samples/1473103034.casual-dhole_fylninsyf2__web_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/1473103034.casual-dhole_fylninsyf2__web_.png -------------------------------------------------------------------------------- /tests/samples/1568810406.kosseart_experimental-syfaro-fa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/1568810406.kosseart_experimental-syfaro-fa.png -------------------------------------------------------------------------------- /tests/samples/273210894ab3d9f02f02742acead73a2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/273210894ab3d9f02f02742acead73a2.jpg -------------------------------------------------------------------------------- /tests/samples/EmbGP7hWEAEysaF.jpg large.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/EmbGP7hWEAEysaF.jpg large.jpg -------------------------------------------------------------------------------- /tests/samples/EmbGPKyWEAEi3JI.jpg large.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/EmbGPKyWEAEi3JI.jpg large.jpg -------------------------------------------------------------------------------- /tests/samples/EmbGTypW8AA65r_.jpg large.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/samples/EmbGTypW8AA65r_.jpg large.jpg -------------------------------------------------------------------------------- /tests/video.webm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Syfaro/fuzzysearch/990633645cf31e3e85f1f209b4dc51faec4d0638/tests/video.webm --------------------------------------------------------------------------------
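A closing note on the fixtures: tests/sample.sql seeds FurAffinity submission 20449854 and e621 post 934261 with the same 64-bit perceptual hash, 6072371633344665261, so a database that has had the migrations and sample data applied can exercise cross-site hash matches. The sketch below is hypothetical and not part of the repository; it borrows the PgPoolOptions/DATABASE_URL setup style used by the services above, assumes a tokio build with the multi-threaded runtime enabled, and its queries are illustrative rather than one of the project's prepared statements.

```rust
// Hypothetical test helper, not part of this repository: after applying the
// migrations and loading tests/sample.sql, find the rows that share the
// perceptual hash 6072371633344665261.
use sqlx::postgres::PgPoolOptions;

fn main() -> Result<(), sqlx::Error> {
    let rt = tokio::runtime::Runtime::new().expect("could not build tokio runtime");

    rt.block_on(async {
        let pool = PgPoolOptions::new()
            .max_connections(1)
            .connect(&std::env::var("DATABASE_URL").expect("DATABASE_URL must be set"))
            .await?;

        let hash: i64 = 6072371633344665261;

        // The FurAffinity fixture stores the hash in submission.hash_int.
        let fa_ids: Vec<i32> = sqlx::query_scalar("SELECT id FROM submission WHERE hash_int = $1")
            .bind(hash)
            .fetch_all(&pool)
            .await?;

        // The e621 fixture stores it in e621.hash.
        let e621_ids: Vec<i32> = sqlx::query_scalar("SELECT id FROM e621 WHERE hash = $1")
            .bind(hash)
            .fetch_all(&pool)
            .await?;

        println!("furaffinity: {:?}, e621: {:?}", fa_ids, e621_ids);

        Ok(())
    })
}
```

This only demonstrates exact equality against the plain hash indexes created by the 20210422224815_change_hash_index migration; approximate, distance-based matching is beyond the scope of the sketch.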