├── .dockerignore ├── .env.example ├── .gitignore ├── .isort.cfg ├── .prettierignore ├── .prettierrc.json ├── .python-version ├── .rustfmt.toml ├── Cargo.toml ├── Dockerfile.edge-caddy ├── Dockerfile.nodejs-base ├── Dockerfile.python-base ├── Dockerfile.runpod-base ├── Dockerfile.rust-base ├── Dockerfile.telegraf ├── README.md ├── api-worker-broker └── main.ts ├── api-worker-node ├── download_model.py └── main.py ├── api ├── endpoint │ ├── analyzePopularity.ts │ ├── analyzeSentiment.ts │ ├── heatmap.ts │ ├── items.ts │ ├── search.ts │ └── topUsers.ts ├── main.ts └── query.ts ├── app ├── component │ ├── App.css │ ├── App.tsx │ ├── ColorInput.css │ ├── ColorInput.tsx │ ├── Ico.css │ ├── Ico.tsx │ ├── ImageOnLoad.css │ ├── ImageOnLoad.tsx │ ├── Loading.css │ ├── Loading.tsx │ ├── PageSwitcher.css │ ├── PageSwitcher.tsx │ ├── PointMap.css │ ├── PointMap.tsx │ ├── Post.css │ ├── Post.tsx │ └── RouteLink.tsx ├── deploy.js ├── imports.d.ts ├── index.css ├── index.html ├── index.tsx ├── page │ ├── Analysis.css │ ├── Analysis.tsx │ ├── City.css │ ├── City.tsx │ ├── NotFound.tsx │ ├── Search.css │ └── Search.tsx ├── util │ ├── api.ts │ ├── const.ts │ ├── dom.ts │ ├── fetch.ts │ ├── item.ts │ ├── map.ts │ └── router.ts └── worker.PointLabels.ts ├── build-ann └── main.py ├── build-api-data └── main.py ├── build-data ├── common.rs ├── export_comment_embs.rs ├── export_comment_sentiments.rs ├── export_comment_texts.rs ├── export_comments.rs ├── export_interactions.rs ├── export_post_embs.rs ├── export_post_embs_bgem3_dense.rs ├── export_post_texts.rs ├── export_post_titles.rs ├── export_posts.rs ├── export_url_metas.rs ├── export_url_texts.rs ├── export_urls.rs ├── export_users.rs └── main.rs ├── build-edge-data └── main.py ├── build-map └── main.py ├── build-nn-data └── main.rs ├── common ├── arrow.rs ├── const.ts ├── data.py ├── data_gpu.py ├── heatmap.py ├── lib.rs ├── mat.rs ├── msgpack.rs ├── res.ts ├── terrain.py └── util.py ├── crawler ├── archive.rs ├── crawl.rs ├── 
direct.rs ├── main.rs ├── origin.rs └── parse.rs ├── docker-compose.yaml ├── edge ├── Caddyfile └── main.rs ├── embedder ├── download_model.py ├── main.ts ├── worker_embed.py └── worker_embed.ts ├── enqueuer └── main.ts ├── format ├── jest.config.js ├── kmeans └── main.py ├── package.json ├── promtail.yaml ├── requirements.txt ├── runpod ├── docker.entrypoint.sh └── telegraf.conf ├── schema.sql ├── sentiment-analyser ├── download_model.py ├── main.ts ├── model.py └── model.ts ├── telegraf └── telegraf.conf ├── tsconfig.json ├── umap └── main.py └── wrench ├── comment.ts ├── crawler-aws.ts ├── crawler.ts ├── db.ts ├── edge.ts ├── edge ├── build ├── logs └── up ├── kmeans ├── plot-clusters.py ├── plot-inertia.py └── prepare-titles.py ├── queue.ts ├── runpod_gpus.txt └── url.ts /.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | Cargo.lock 3 | dist/ 4 | node_modules/ 5 | package-lock.json 6 | target/ 7 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | API_SSL_CA_BASE64=abc 2 | API_SSL_CERT_BASE64=abc 3 | API_SSL_KEY_BASE64=abc 4 | API_WORKER_NODE_CERT_B64=abc 5 | API_WORKER_NODE_DATASETS=abc 6 | API_WORKER_NODE_KEY_B64=abc 7 | API_WORKER_NODE_LOAD_ANN=0 8 | API_WORKER_NODE_TOKEN=abc 9 | DB_RPC_API_KEY=abc 10 | DOCKER_VOLUME_DIR=/abc 11 | EDGE_CADDY_ACME_EMAIL=abc@my-domain.com 12 | EDGE_DOMAIN_SUFFIX=edge.hndr.my-domain.com 13 | INFLUXDB_BUCKET=abc 14 | INFLUXDB_ENDPOINT=https://influx.my-domain.com 15 | INFLUXDB_ORGANIZATION=abc 16 | INFLUXDB_TOKEN=abc 17 | LOKI_BASICAUTH_PASSWORD=abc 18 | LOKI_BASICAUTH_USERNAME=abc 19 | LOKI_ENDPOINT=https://loki.my-domain.com 20 | MAP_POINT_SET=hndr 21 | QUEUED_API_KEY=abc 22 | UMAP_LOW_MEMORY=1 23 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | __pycache__ 2 | .env 3 | .venv*/ 4 | /wrench/kmeans/*.webp 5 | /wrench/kmeans/cluster-titles*.json 6 | Cargo.lock 7 | dist/ 8 | node_modules/ 9 | package-lock.json 10 | target/ 11 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile=black 3 | force_single_line=True 4 | from_first=True 5 | no_sections=True 6 | order_by_type=False 7 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | *.md 2 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": ["prettier-plugin-organize-imports"] 3 | } 4 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | fn_single_line = false 2 | force_explicit_abi = true 3 | format_macro_bodies = true 4 | format_macro_matchers = true 5 | group_imports = "One" 6 | hard_tabs = false 7 | hex_literal_case = "Lower" 8 | imports_granularity = "Item" 9 | imports_layout = "Horizontal" 10 | merge_derives = true 11 | overflow_delimited_expr = true 12 | remove_nested_parens = true 13 | reorder_impl_items = true 14 | reorder_imports = true 15 | reorder_modules = true 16 | tab_spaces = 2 17 | trailing_semicolon = true 18 | use_field_init_shorthand = true 19 | wrap_comments = false 20 | 
-------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hndr" 3 | publish = false 4 | version = "0.1.0" 5 | edition = "2021" 6 | 7 | [dependencies] 8 | ahash = { version = "0.8.11", features = ["serde"] } 9 | arrow = "51.0.0" 10 | axum = { version = "0.7.5", features = ["http2"] } 11 | axum-msgpack = "0.4.0" 12 | base64 = "0.22.0" 13 | cadence = "1.3.0" 14 | chrono = { version = "0.4.37", features = ["serde"] } 15 | dashmap = "5.5.3" 16 | db-rpc-client-rs = "0.1.1" 17 | futures = "0.3.30" 18 | http = "1.1.0" 19 | itertools = "0.12.1" 20 | once_cell = "1.19.0" 21 | parking_lot = "0.12.1" 22 | queued-client-rs = "0.1.1" 23 | rand = "0.8.5" 24 | regex = "1.10.4" 25 | reqwest = { version = "0.12.3", features = ["stream"] } 26 | rmp-serde = "1.1.2" 27 | rmpv = { version = "1.0.1", features = ["with-serde"] } 28 | scraper = { version = "0.19.0", features = ["atomic"] } 29 | serde = { version = "1.0.197", features = ["derive"] } 30 | serde_bytes = "0.11.14" 31 | serde_json = "1.0.115" 32 | serde_with = "3.7.0" 33 | service-toolkit = "0.4.0" 34 | sysinfo = "0.30.10" 35 | tokio = { version = "1.37.0", features = ["full"] } 36 | tower-http = { version = "0.5.2", features = ["cors"] } 37 | tracing = "0.1.40" 38 | tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] } 39 | 40 | [lib] 41 | name = "common" 42 | path = "common/lib.rs" 43 | 44 | [[bin]] 45 | name = "build-data" 46 | path = "build-data/main.rs" 47 | 48 | [[bin]] 49 | name = "build-nn-data" 50 | path = "build-nn-data/main.rs" 51 | 52 | [[bin]] 53 | name = "crawler" 54 | path = "crawler/main.rs" 55 | 56 | [[bin]] 57 | name = "edge" 58 | path = "edge/main.rs" 59 | 60 | [profile.release] 61 | codegen-units = 1 62 | debug = true 63 | lto = true 64 | opt-level = 3 65 | 66 | [profile.release.package."*"] 67 | codegen-units = 1 68 | opt-level = 3 69 | 
-------------------------------------------------------------------------------- /Dockerfile.edge-caddy: -------------------------------------------------------------------------------- 1 | FROM caddy:2.7 2 | 3 | COPY edge/Caddyfile /etc/caddy/Caddyfile 4 | -------------------------------------------------------------------------------- /Dockerfile.nodejs-base: -------------------------------------------------------------------------------- 1 | FROM node:21 2 | 3 | WORKDIR /app 4 | 5 | COPY package.json . 6 | RUN npm i 7 | 8 | ARG MAIN 9 | COPY common common 10 | COPY $MAIN $MAIN 11 | COPY tsconfig.json . 12 | RUN npx tsc 13 | 14 | ENV MAIN=$MAIN 15 | ENV NODE_NO_WARNINGS=1 16 | ENV NODE_OPTIONS='--max-old-space-size=16384 --stack-trace-limit=1024' 17 | # We cannot use ts-node as it doesn't support node:worker. 18 | CMD node $MAIN/main.js 19 | -------------------------------------------------------------------------------- /Dockerfile.python-base: -------------------------------------------------------------------------------- 1 | FROM python:3.12 2 | 3 | WORKDIR /tmp 4 | RUN pip install huggingface_hub 5 | # MAIN must be declared before its first use: an ARG referenced above its declaration expands to an empty string, which made the COPY below match nothing and silently skipped the model download. 6 | ARG MAIN 7 | COPY $MAIN/download_model.py* dm.py 8 | RUN bash -c 'if [[ -f dm.py ]]; then python dm.py; fi' 9 | 10 | WORKDIR /app 11 | 12 | COPY requirements.txt . 13 | RUN pip install -r requirements.txt 14 | # TODO HACK to disable TQDM from trashing our logs. (TQDM_DISABLE=1 doesn't seem to work.) 15 | RUN sed -i 's%tqdm(%(lambda x, **o: x)(%' /usr/local/lib/python3.12/site-packages/FlagEmbedding/bge_m3.py 16 | 17 | COPY common common 18 | COPY $MAIN $MAIN 19 | 20 | ENV MAIN=$MAIN 21 | ENV PYTHONPATH=/app 22 | ENV PYTHONUNBUFFERED=1 23 | CMD python $MAIN/main.py 24 | -------------------------------------------------------------------------------- /Dockerfile.runpod-base: -------------------------------------------------------------------------------- 1 | # Use the image with the oldest version of CUDA while maintaining PyTorch >=2.0. 
2 | # This is because some RunPod Community Cloud hosts have older NVIDIA driver versions. What matters on the host is the NVIDIA driver version, not the CUDA version. NVIDIA drivers are only compatible up to a certain version of CUDA. 3 | FROM runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04 4 | ARG MAIN 5 | ARG HNDR_EMBEDDER_MODE 6 | ENV HNDR_EMBEDDER_MODE=$HNDR_EMBEDDER_MODE 7 | 8 | RUN pip install huggingface_hub 9 | COPY $MAIN/download_model.py /tmp/download_model.py 10 | RUN python /tmp/download_model.py 11 | 12 | RUN curl -fLSs --output /usr/bin/runpodctl https://github.com/runpod/runpodctl/releases/download/v1.14.2/runpodctl-linux-amd64 13 | RUN chmod +x /usr/bin/runpodctl 14 | 15 | COPY --from=telegraf /usr/bin/telegraf /telegraf 16 | COPY runpod/telegraf.conf / 17 | COPY --from=grafana/promtail /usr/bin/promtail /promtail 18 | COPY promtail.yaml / 19 | 20 | RUN curl -fLSs https://deb.nodesource.com/setup_21.x | bash - && apt install -yq nodejs 21 | 22 | WORKDIR /app 23 | 24 | # https://docs.rapids.ai/install 25 | # This will also install CuPy. 26 | RUN pip install \ 27 | --extra-index-url=https://pypi.nvidia.com \ 28 | cudf-cu11==24.4.* dask-cudf-cu11==24.4.* cuml-cu11==24.4.* \ 29 | cugraph-cu11==24.4.* cuspatial-cu11==24.4.* cuproj-cu11==24.4.* \ 30 | cuxfilter-cu11==24.4.* cucim-cu11==24.4.* pylibraft-cu11==24.4.* \ 31 | raft-dask-cu11==24.4.* cuvs-cu11==24.4.* 32 | COPY requirements.txt . 33 | RUN pip install -r requirements.txt 34 | # TODO HACK to disable TQDM from trashing our logs. (TQDM_DISABLE=1 doesn't seem to work.) 35 | RUN sed -i 's%tqdm(%(lambda x, **o: x)(%' /usr/local/lib/python3.10/dist-packages/FlagEmbedding/bge_m3.py 36 | 37 | COPY package.json . 38 | RUN npm i 39 | 40 | COPY tsconfig.json . 
41 | COPY common common 42 | COPY $MAIN $MAIN 43 | RUN npx tsc 44 | 45 | ENV MAIN=$MAIN 46 | ENV NODE_NO_WARNINGS=1 47 | ENV NODE_OPTIONS='--enable-source-maps --max-old-space-size=16384 --stack-trace-limit=1024' 48 | ENV PYTHONUNBUFFERED=1 49 | ENV TQDM_DISABLE=1 50 | COPY runpod/docker.entrypoint.sh . 51 | CMD bash docker.entrypoint.sh 52 | -------------------------------------------------------------------------------- /Dockerfile.rust-base: -------------------------------------------------------------------------------- 1 | FROM rust:1.77-slim-bookworm 2 | 3 | RUN apt -y update && apt -yq install libssl-dev pkg-config 4 | 5 | WORKDIR /app 6 | 7 | ARG MAIN 8 | COPY Cargo.toml . 9 | COPY common common 10 | COPY $MAIN $MAIN 11 | RUN cargo build --release --bin $MAIN 12 | 13 | FROM debian:bookworm-slim 14 | 15 | ARG MAIN 16 | COPY --from=0 /app/target/release/$MAIN /main 17 | 18 | RUN apt -y update && apt -yq install ca-certificates openssl && rm -rf /var/lib/apt/lists/* 19 | 20 | ENV MAIN=$MAIN 21 | ENV RUST_BACKTRACE=1 22 | ENV RUST_LOG=info 23 | CMD /main 24 | -------------------------------------------------------------------------------- /Dockerfile.telegraf: -------------------------------------------------------------------------------- 1 | FROM telegraf:1.30 2 | 3 | COPY telegraf/telegraf.conf /etc/telegraf/telegraf.conf 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hackerverse 2 | 3 | ![Semantic map of Hacker News posts.](https://blog.wilsonl.in/hackerverse/map.png) 4 | 5 | Interactive semantic map, search engine, automatic subcommunities, and popularity/sentiment analyzer of over 40 million Hacker News posts and comments, powered by text embeddings. 6 | 7 | See the [blog post](https://blog.wilsonl.in/hackerverse/) for more details. 
8 | 9 | Documentation, README WIP 10 | -------------------------------------------------------------------------------- /api-worker-broker/main.ts: -------------------------------------------------------------------------------- 1 | import { decode, encode } from "@msgpack/msgpack"; 2 | import { 3 | VArray, 4 | VInteger, 5 | VOptional, 6 | VString, 7 | VStruct, 8 | VUnknown, 9 | Valid, 10 | } from "@wzlin/valid"; 11 | import Dict from "@xtjs/lib/Dict"; 12 | import assertExists from "@xtjs/lib/assertExists"; 13 | import assertInstanceOf from "@xtjs/lib/assertInstanceOf"; 14 | import assertState from "@xtjs/lib/assertState"; 15 | import findAndRemove from "@xtjs/lib/findAndRemove"; 16 | import randomPick from "@xtjs/lib/randomPick"; 17 | import readBufferStream from "@xtjs/lib/readBufferStream"; 18 | import http from "http"; 19 | import https from "https"; 20 | import { WebSocket, WebSocketServer } from "ws"; 21 | import { lg } from "../common/res"; 22 | 23 | const TOKEN = assertExists(process.env["API_WORKER_NODE_TOKEN"]); 24 | 25 | let nextReqId = 0; 26 | const reqs = new Dict< 27 | number, 28 | { 29 | resolve: ( 30 | res: 31 | | { 32 | output: any; 33 | } 34 | | { 35 | error: any; 36 | }, 37 | ) => void; 38 | reject: (err: Error) => void; 39 | } 40 | >(); 41 | const channelToConns: Record = {}; 42 | const connStates = new WeakMap< 43 | WebSocket, 44 | { 45 | requests: Set; 46 | channels: Set; 47 | } 48 | >(); 49 | 50 | const vNodeInitMessage = new VStruct({ 51 | ip: new VString(), 52 | token: new VString(), 53 | channels: new VArray(new VString(1), 1), 54 | }); 55 | 56 | const vMessageToNode = new VStruct({ 57 | id: new VInteger(0), 58 | input: new VUnknown(), 59 | }); 60 | 61 | const vMessageToBroker = new VStruct({ 62 | id: new VInteger(0), 63 | output: new VOptional(new VUnknown()), 64 | error: new VOptional(new VUnknown()), 65 | }); 66 | 67 | const wsServer = https.createServer({ 68 | key: Buffer.from( 69 | assertExists(process.env["API_WORKER_NODE_KEY_B64"]), 
70 | "base64", 71 | ), 72 | cert: Buffer.from( 73 | assertExists(process.env["API_WORKER_NODE_CERT_B64"]), 74 | "base64", 75 | ), 76 | }); 77 | wsServer.listen(6000, () => lg.info("WS server started")); 78 | const ws = new WebSocketServer({ 79 | server: wsServer, 80 | }); 81 | ws.on("connection", (conn) => { 82 | lg.info("node connected"); 83 | const verifyTimeout = setTimeout(() => { 84 | lg.warn("did not receive token within reasonable time"); 85 | conn.close(); 86 | }, 1000 * 15); 87 | conn.once("message", (raw, isBinary) => { 88 | assertState(isBinary); 89 | const msg = vNodeInitMessage.parseRoot( 90 | decode(assertInstanceOf(raw, Buffer)), 91 | ); 92 | if (msg.token !== TOKEN) { 93 | lg.warn("received invalid token"); 94 | conn.close(); 95 | return; 96 | } 97 | clearTimeout(verifyTimeout); 98 | lg.info({ ip: msg.ip, channels: msg.channels }, "node verified"); 99 | // Set up connection. 100 | const connState = { 101 | requests: new Set(), 102 | channels: new Set(msg.channels), 103 | }; 104 | connStates.set(conn, connState); 105 | for (const ch of msg.channels) { 106 | (channelToConns[ch] ??= []).push(conn); 107 | } 108 | conn.on("message", (raw, isBinary) => { 109 | assertState(isBinary); 110 | const { id, error, output } = vMessageToBroker.parseRoot( 111 | decode(assertInstanceOf(raw, Buffer)), 112 | ); 113 | connState.requests.delete(id); 114 | const prom = reqs.remove(id); 115 | prom?.resolve({ error, output }); 116 | }); 117 | }); 118 | conn.on("close", () => { 119 | lg.info("node disconnected"); 120 | const connState = connStates.get(conn); 121 | if (connState) { 122 | for (const id of connState.requests) { 123 | reqs.remove(id)?.reject(new Error("Node disconnected")); 124 | } 125 | for (const ch of connState.channels) { 126 | findAndRemove(channelToConns[ch], (oc) => oc === conn); 127 | } 128 | } 129 | }); 130 | }); 131 | 132 | const sendToNode = (channel: string, input: any) => 133 | new Promise<{ error?: any; output?: any }>((resolve, reject) => { 134 | 
const id = nextReqId++; 135 | const conn = randomPick(channelToConns[channel] ?? []); 136 | if (!conn) { 137 | return reject(new Error("No node available")); 138 | } 139 | reqs.set(id, { resolve, reject }); 140 | connStates.get(conn)!.requests.add(id); 141 | const msg: Valid = { id, input }; 142 | conn.send(JSON.stringify(msg)); 143 | }); 144 | 145 | http 146 | .createServer(async (req, res) => { 147 | if (req.method !== "POST") { 148 | return res.writeHead(405).end(); 149 | } 150 | const channel = req.url?.slice(1) ?? ""; 151 | let input; 152 | try { 153 | input = decode(await readBufferStream(req)); 154 | } catch (err) { 155 | return res.writeHead(400).end(err.message); 156 | } 157 | let resBody; 158 | try { 159 | resBody = await sendToNode(channel, input); 160 | } catch (err) { 161 | return res.writeHead(500).end(err.message); 162 | } 163 | res 164 | .writeHead(resBody.error ? 502 : 200, { 165 | "content-type": "application/msgpack", 166 | }) 167 | .end(encode(resBody.error ?? resBody.output)); 168 | }) 169 | .listen(6050, () => lg.info("API server started")); 170 | -------------------------------------------------------------------------------- /api-worker-node/download_model.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import snapshot_download 2 | 3 | snapshot_download("BAAI/bge-m3") 4 | snapshot_download("jinaai/jina-embeddings-v2-small-en") 5 | -------------------------------------------------------------------------------- /api/endpoint/analyzePopularity.ts: -------------------------------------------------------------------------------- 1 | import { 2 | VFiniteNumber, 3 | VHumanString, 4 | VInteger, 5 | VStruct, 6 | Valid, 7 | } from "@wzlin/valid"; 8 | import assertInstanceOf from "@xtjs/lib/assertInstanceOf"; 9 | import { QueryGroupByOutput, makeQuery } from "../query"; 10 | 11 | const input = new VStruct({ 12 | query: new VHumanString(1, 512), 13 | simMinHundredths: new VInteger(80, 100), 
14 | }); 15 | 16 | export const endpointAnalyzePopularity = { 17 | input, 18 | handler: async ({ query, simMinHundredths }: Valid) => { 19 | const simThreshold = simMinHundredths / 100; 20 | const res = await makeQuery({ 21 | dataset: "post", 22 | queries: [query], 23 | scales: { 24 | sim: { min: simThreshold, max: 1.0 }, 25 | }, 26 | post_filter_clip: { 27 | sim: { min: simThreshold, max: 1.0 }, 28 | // Some posts have UNIX timestamp 0. 29 | ts_day: { min: 1, max: Number.MAX_SAFE_INTEGER }, 30 | }, 31 | weights: { 32 | votes: "sim", 33 | }, 34 | outputs: [ 35 | { 36 | group_by: { 37 | by: "ts_day", 38 | bucket: 7, 39 | cols: [["final_score", "sum"]], 40 | }, 41 | }, 42 | ], 43 | }); 44 | const data = assertInstanceOf(res[0], QueryGroupByOutput); 45 | return { 46 | timestamps: [...data.groups(new VInteger())].map( 47 | (d) => new Date(d * 7 * 24 * 60 * 60 * 1000), 48 | ), 49 | scores: [...data.column("final_score", new VFiniteNumber())], 50 | }; 51 | }, 52 | }; 53 | -------------------------------------------------------------------------------- /api/endpoint/analyzeSentiment.ts: -------------------------------------------------------------------------------- 1 | import { 2 | VFiniteNumber, 3 | VHumanString, 4 | VInteger, 5 | VStruct, 6 | Valid, 7 | } from "@wzlin/valid"; 8 | import assertInstanceOf from "@xtjs/lib/assertInstanceOf"; 9 | import { QueryGroupByOutput, makeQuery } from "../query"; 10 | 11 | const input = new VStruct({ 12 | query: new VHumanString(1, 512), 13 | simMinHundredths: new VInteger(80, 100), 14 | }); 15 | 16 | export const endpointAnalyzeSentiment = { 17 | input, 18 | handler: async ({ query, simMinHundredths }: Valid) => { 19 | const res = await makeQuery({ 20 | dataset: "comment", 21 | queries: [query], 22 | thresholds: { 23 | sim: simMinHundredths / 100, 24 | sent_pos: 0.5, 25 | sent_neg: 0.5, 26 | }, 27 | post_filter_clip: { 28 | sim_thresh: { min: 1.0, max: 1.0 }, 29 | }, 30 | outputs: [ 31 | { 32 | group_by: { 33 | by: "ts_day", 34 | 
bucket: 7, 35 | cols: [ 36 | ["sent_pos_thresh", "sum"], 37 | ["sent_neg_thresh", "sum"], 38 | ], 39 | }, 40 | }, 41 | ], 42 | }); 43 | const data = assertInstanceOf(res[0], QueryGroupByOutput); 44 | return { 45 | timestamps: [...data.groups(new VInteger())].map( 46 | (d) => new Date(d * 7 * 24 * 60 * 60 * 1000), 47 | ), 48 | positives: [...data.column("sent_pos_thresh", new VFiniteNumber())], 49 | negatives: [...data.column("sent_neg_thresh", new VFiniteNumber())], 50 | }; 51 | }, 52 | }; 53 | -------------------------------------------------------------------------------- /api/endpoint/heatmap.ts: -------------------------------------------------------------------------------- 1 | import { 2 | VHumanString, 3 | VInteger, 4 | VMember, 5 | VStruct, 6 | VTuple, 7 | Valid, 8 | } from "@wzlin/valid"; 9 | import assertInstanceOf from "@xtjs/lib/assertInstanceOf"; 10 | import { QueryHeatmapOutput, makeQuery } from "../query"; 11 | 12 | const input = new VStruct({ 13 | query: new VHumanString(1, 512), 14 | dataset: new VMember(["post", "toppost"] as const), 15 | color: new VTuple([ 16 | new VInteger(0, 255), 17 | new VInteger(0, 255), 18 | new VInteger(0, 255), 19 | ] as const), 20 | }); 21 | 22 | export const endpointHeatmap = { 23 | input, 24 | handler: async ({ dataset, query, color }: Valid) => { 25 | const res = await makeQuery({ 26 | dataset, 27 | queries: [query], 28 | scales: { 29 | sim: { 30 | post: { min: 0.7, max: 1 }, 31 | toppost: { min: 0.55, max: 1 }, 32 | }[dataset], 33 | }, 34 | post_filter_clip: { 35 | scaled: { min: 0.01, max: 1 }, 36 | }, 37 | weights: { 38 | sim_scaled: 1, 39 | }, 40 | outputs: [ 41 | { 42 | heatmap: { 43 | alpha_scale: 2, // TODO This is really a hack, investigate distribution of scores. 
44 | density: 25, 45 | color, 46 | upscale: 2, 47 | sigma: 4, 48 | }, 49 | }, 50 | ], 51 | }); 52 | const data = assertInstanceOf(res[0], QueryHeatmapOutput); 53 | return new Uint8Array(data.raw); 54 | }, 55 | }; 56 | -------------------------------------------------------------------------------- /api/endpoint/items.ts: -------------------------------------------------------------------------------- 1 | import { 2 | VFiniteNumber, 3 | VHumanString, 4 | VInteger, 5 | VMember, 6 | VStruct, 7 | Valid, 8 | } from "@wzlin/valid"; 9 | import assertInstanceOf from "@xtjs/lib/assertInstanceOf"; 10 | import { QueryItemsOutput, makeQuery } from "../query"; 11 | 12 | const input = new VStruct({ 13 | dataset: new VMember(["comment", "post"] as const), 14 | query: new VHumanString(1, 512), 15 | limit: new VInteger(1, 500), 16 | simMinHundredths: new VInteger(80, 100), 17 | orderBy: new VMember(["votes", "ts"] as const), 18 | }); 19 | 20 | export const endpointItems = { 21 | input, 22 | handler: async ({ 23 | dataset, 24 | query, 25 | limit, 26 | simMinHundredths, 27 | orderBy, 28 | }: Valid) => { 29 | const simThreshold = simMinHundredths / 100; 30 | const res = await makeQuery({ 31 | dataset, 32 | queries: [query], 33 | post_filter_clip: { 34 | sim: { min: simThreshold, max: 1 }, 35 | }, 36 | outputs: [ 37 | { 38 | items: { 39 | cols: ["id", "sim"], 40 | limit, 41 | order_asc: false, 42 | order_by: orderBy, 43 | }, 44 | }, 45 | ], 46 | }); 47 | const data = assertInstanceOf(res[0], QueryItemsOutput); 48 | return [ 49 | ...data.items({ 50 | id: new VInteger(1), 51 | sim: new VFiniteNumber(), 52 | }), 53 | ]; 54 | }, 55 | }; 56 | -------------------------------------------------------------------------------- /api/endpoint/search.ts: -------------------------------------------------------------------------------- 1 | import { 2 | VFiniteNumber, 3 | VHumanString, 4 | VInteger, 5 | VMember, 6 | VStruct, 7 | Valid, 8 | } from "@wzlin/valid"; 9 | import assertInstanceOf from 
"@xtjs/lib/assertInstanceOf"; 10 | import { QueryItemsOutput, makeQuery } from "../query"; 11 | 12 | const input = new VStruct({ 13 | query: new VHumanString(1, 512), 14 | limit: new VInteger(1, 128), 15 | dataset: new VMember(["post", "toppost"] as const), 16 | weightSimilarity: new VFiniteNumber(), 17 | weightScore: new VFiniteNumber(), 18 | weightTimestamp: new VFiniteNumber(), 19 | decayTimestamp: new VFiniteNumber(), 20 | }); 21 | 22 | export const endpointSearch = { 23 | input, 24 | handler: async ({ 25 | limit, 26 | dataset, 27 | decayTimestamp, 28 | query, 29 | weightScore, 30 | weightSimilarity, 31 | weightTimestamp, 32 | }: Valid) => { 33 | const res = await makeQuery({ 34 | dataset, 35 | queries: [query], 36 | ts_decay: decayTimestamp, 37 | scales: { 38 | sim: { 39 | post: { min: 0.7, max: 1 }, 40 | toppost: { min: 0.55, max: 1 }, 41 | }[dataset], 42 | }, 43 | weights: { 44 | sim_scaled: weightSimilarity, 45 | ts_norm: weightTimestamp, 46 | votes_norm: weightScore, 47 | }, 48 | outputs: [ 49 | { 50 | items: { 51 | cols: ["id", "x", "y", "sim", "final_score"], 52 | limit, 53 | }, 54 | }, 55 | ], 56 | }); 57 | const data = assertInstanceOf(res[0], QueryItemsOutput); 58 | return [ 59 | ...data.items({ 60 | id: new VInteger(1), 61 | x: new VFiniteNumber(), 62 | y: new VFiniteNumber(), 63 | sim: new VFiniteNumber(), 64 | final_score: new VFiniteNumber(), 65 | }), 66 | ]; 67 | }, 68 | }; 69 | -------------------------------------------------------------------------------- /api/endpoint/topUsers.ts: -------------------------------------------------------------------------------- 1 | import { 2 | VFiniteNumber, 3 | VHumanString, 4 | VInteger, 5 | VString, 6 | VStruct, 7 | Valid, 8 | } from "@wzlin/valid"; 9 | import assertInstanceOf from "@xtjs/lib/assertInstanceOf"; 10 | import { QueryGroupByOutput, makeQuery } from "../query"; 11 | 12 | const input = new VStruct({ 13 | query: new VHumanString(1, 512), 14 | limit: new VInteger(1, 20), 15 | simMinHundredths: new 
VInteger(80, 100), 16 | }); 17 | 18 | export const endpointTopUsers = { 19 | input, 20 | handler: async ({ query, limit, simMinHundredths }: Valid) => { 21 | const simThreshold = simMinHundredths / 100; 22 | const res = await makeQuery({ 23 | dataset: "comment", 24 | queries: [query], 25 | scales: { 26 | sim: { 27 | min: simThreshold, 28 | max: 1, 29 | }, 30 | }, 31 | weights: { 32 | // We can't multiply by votes, because the HN API does not expose votes for anything except posts. 33 | sim_scaled: 1, 34 | }, 35 | outputs: [ 36 | { 37 | group_by: { 38 | by: "user", 39 | cols: [["final_score", "sum"]], 40 | order_by: "final_score", 41 | order_asc: false, 42 | limit, 43 | }, 44 | }, 45 | ], 46 | }); 47 | const data = assertInstanceOf(res[0], QueryGroupByOutput); 48 | return [ 49 | ...data.entries(new VString(), { 50 | final_score: new VFiniteNumber(), 51 | }), 52 | ].map((e) => ({ 53 | user: e[0], 54 | score: e[1].final_score, 55 | })); 56 | }, 57 | }; 58 | -------------------------------------------------------------------------------- /api/main.ts: -------------------------------------------------------------------------------- 1 | import { decode, encode } from "@msgpack/msgpack"; 2 | import { Validator, ValuePath } from "@wzlin/valid"; 3 | import assertExists from "@xtjs/lib/assertExists"; 4 | import decodeBase64 from "@xtjs/lib/decodeBase64"; 5 | import parseInteger from "@xtjs/lib/parseInteger"; 6 | import readBufferStream from "@xtjs/lib/readBufferStream"; 7 | import uint8ArrayToBuffer from "@xtjs/lib/uint8ArrayToBuffer"; 8 | import { createSecureServer } from "http2"; 9 | import { lg } from "../common/res"; 10 | import { endpointAnalyzePopularity } from "./endpoint/analyzePopularity"; 11 | import { endpointAnalyzeSentiment } from "./endpoint/analyzeSentiment"; 12 | import { endpointHeatmap } from "./endpoint/heatmap"; 13 | import { endpointItems } from "./endpoint/items"; 14 | import { endpointSearch } from "./endpoint/search"; 15 | import { endpointTopUsers } 
from "./endpoint/topUsers"; 16 | 17 | const getPemEnv = (name: string) => 18 | uint8ArrayToBuffer( 19 | decodeBase64(assertExists(process.env[`SSL_${name}_BASE64`])), 20 | ); 21 | 22 | type Endpoint = { 23 | input: Validator; 24 | handler: (req: any) => Promise; 25 | }; 26 | 27 | const ENDPOINTS: Record = { 28 | analyzePopularity: endpointAnalyzePopularity, 29 | analyzeSentiment: endpointAnalyzeSentiment, 30 | heatmap: endpointHeatmap, 31 | items: endpointItems, 32 | search: endpointSearch, 33 | topUsers: endpointTopUsers, 34 | }; 35 | 36 | createSecureServer( 37 | { 38 | allowHTTP1: true, 39 | key: getPemEnv("KEY"), 40 | cert: getPemEnv("CERT"), 41 | ca: getPemEnv("CA"), 42 | rejectUnauthorized: true, 43 | requestCert: true, 44 | }, 45 | async (req, res) => { 46 | if (req.url === "/healthz") { 47 | return res.writeHead(200).end("OK"); 48 | } 49 | 50 | res.setHeader("Access-Control-Allow-Origin", "*"); 51 | res.setHeader("Access-Control-Allow-Methods", "*"); 52 | res.setHeader("Access-Control-Allow-Headers", "*"); 53 | if (req.method === "OPTIONS") { 54 | return res.writeHead(200).end(); 55 | } 56 | if (req.method !== "POST") { 57 | return res.writeHead(405).end(); 58 | } 59 | const endpointName = req.url?.slice(1); 60 | const endpoint = ENDPOINTS[endpointName]; 61 | if (!endpoint) { 62 | return res.writeHead(404).end(); 63 | } 64 | let reqBodyRaw; 65 | try { 66 | reqBodyRaw = decode(await readBufferStream(req)); 67 | } catch { 68 | return res.writeHead(400).end(); 69 | } 70 | let reqBody; 71 | try { 72 | reqBody = endpoint.input.parse( 73 | new ValuePath(["request body"]), 74 | reqBodyRaw, 75 | ); 76 | } catch (err) { 77 | return res.writeHead(400).end(err.message); 78 | } 79 | let resBody; 80 | try { 81 | resBody = await endpoint.handler(reqBody); 82 | } catch (error) { 83 | lg.error( 84 | { 85 | error: { 86 | trace: error.stack, 87 | message: error.message, 88 | type: error.constructor?.name, 89 | name: error.name, 90 | data: { ...error }, 91 | }, 92 | endpoint: 
endpointName, 93 | }, 94 | "endpoint error", 95 | ); 96 | return res.writeHead(500).end(); 97 | } 98 | return res 99 | .writeHead(200, { 100 | "content-type": "application/msgpack", 101 | }) 102 | .end(encode(resBody)); 103 | }, 104 | ) 105 | .on("error", (error) => lg.error({ error }, "server error")) 106 | .listen(parseInteger(process.env["PORT"]!), () => lg.info("server started")); 107 | -------------------------------------------------------------------------------- /app/component/App.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilsonzlin/hackerverse/d10a5aadb233fdf10fbcd365db1e4874cc1ddf6f/app/component/App.css -------------------------------------------------------------------------------- /app/component/App.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useMemo, useState } from "react"; 2 | import { AnalysisPage } from "../page/Analysis"; 3 | import { CityPage } from "../page/City"; 4 | import { NotFoundPage } from "../page/NotFound"; 5 | import { SearchPage } from "../page/Search"; 6 | import { DEFAULT_EDGE, EdgeContext, findClosestEdge } from "../util/item"; 7 | import { router } from "../util/router"; 8 | import "./App.css"; 9 | 10 | export const App = () => { 11 | const [path, setPath] = useState(location.pathname); 12 | useEffect(() => { 13 | const handler = () => setPath(location.pathname); 14 | router.addListener(handler); 15 | return () => router.removeListener(handler); 16 | }, []); 17 | const [Page, params] = useMemo(() => { 18 | const [pfx, ...params] = path 19 | .slice(1) 20 | .split("/") 21 | .map((c) => decodeURIComponent(c)); 22 | const Page = 23 | { 24 | "": SearchPage, 25 | s: SearchPage, 26 | c: CityPage, 27 | a: AnalysisPage, 28 | }[pfx] ?? 
NotFoundPage; 29 | return [Page, params] as const; 30 | }, [path]); 31 | 32 | const [edge, setEdge] = useState(DEFAULT_EDGE); 33 | useEffect(() => { 34 | const ac = new AbortController(); 35 | (async () => { 36 | const edge = await findClosestEdge(ac.signal); 37 | console.log("Closest edge:", edge); 38 | setEdge(edge); 39 | })(); 40 | return () => ac.abort(); 41 | }, []); 42 | 43 | return ( 44 | 45 | 46 | 47 | ); 48 | }; 49 | -------------------------------------------------------------------------------- /app/component/ColorInput.css: -------------------------------------------------------------------------------- 1 | .ColorInput { 2 | cursor: pointer; 3 | 4 | > div { 5 | border-radius: 100%; 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /app/component/ColorInput.tsx: -------------------------------------------------------------------------------- 1 | import hexToRgb from "@xtjs/lib/hexToRgb"; 2 | import rgbToHex from "@xtjs/lib/rgbToHex"; 3 | import "./ColorInput.css"; 4 | 5 | export const ColorInput = ({ 6 | color, 7 | onChange, 8 | size, 9 | }: { 10 | color: [number, number, number]; 11 | onChange: (color: [number, number, number]) => void; 12 | size: number; 13 | }) => { 14 | return ( 15 |