├── .envrc
├── .envrc.local.example
├── .github
│   └── workflows
│       ├── build.yml
│       ├── deploy.yml
│       └── format.yml
├── .gitignore
├── .golangci.yml
├── .mergify.yml
├── LICENSE
├── README.md
├── aboutpage.go
├── algorithmspage.go
├── app.go
├── archive.go
├── auth.go
├── canonicaldomain.go
├── compare-against-random-voter.sql
├── database.go
├── devbox.json
├── devbox.lock
├── domain_penalties.go
├── fly.toml
├── frontpage.go
├── go.mod
├── go.sum
├── health.go
├── httpserver.go
├── init.sql
├── justfile
├── logger.go
├── logo.svg
├── main.go
├── middleware.go
├── migrate-volume.sh
├── position.go
├── postprocessing.go
├── prometheus.go
├── rankcrawler.go
├── reset-prior-average-upvote-rate.sql
├── resources.go
├── score-page.go
├── scoring-formula.go
├── scraper.go
├── seed
│   └── domain-penalties.csv
├── sql
│   ├── cumulative-upvotes.sql
│   ├── previous-crawl-index-old.sql
│   ├── previous-crawl.sql
│   ├── qnranks.sql
│   ├── random-new-voter.sql
│   ├── random-top-voter.sql
│   ├── raw-ranks.sql
│   ├── resubmissions.sql
│   └── upvote-rates.sql
├── static
│   ├── android-chrome-192x192.png
│   ├── android-chrome-512x512.png
│   ├── apple-touch-icon.png
│   ├── browserconfig.xml
│   ├── chart-646.png
│   ├── expected-upvotes.png
│   ├── favicon-16x16.png
│   ├── favicon-32x32.png
│   ├── favicon.ico
│   ├── hn-top-page-upvotes-by-rank.png
│   ├── hn-top-page-votehistogram.svg
│   ├── logo.svg
│   ├── mstile-144x144.png
│   ├── mstile-150x150.png
│   ├── mstile-310x150.png
│   ├── mstile-310x310.png
│   ├── mstile-70x70.png
│   ├── rank-history.png
│   ├── safari-pinned-tab.svg
│   ├── site.webmanifest
│   ├── upvote-rate.png
│   └── upvote-share-by-rank.png
├── statspage.go
├── storage.go
├── story-details.go
├── storyplot-data.go
├── templates.go
├── templates
│   ├── about-content.html.tmpl
│   ├── about.html.tmpl
│   ├── algorithms-content.html.tmpl
│   ├── header.html.tmpl
│   ├── index.html.tmpl
│   ├── normalize.css.tmpl
│   ├── ranksPlot.js.tmpl
│   ├── score.html.tmpl
│   ├── scorePlot.js.tmpl
│   ├── spinner.css.tmpl
│   ├── stats.html.tmpl
│   ├── storyDetails.html.tmpl
│   ├── storyplots.js.tmpl
│   ├── styles.css.tmpl
│   ├── upvoteRatePlot.js.tmpl
│   ├── upvotesPlot.js.tmpl
│   ├── vote.html.tmpl
│   └── vote.js.tmpl
├── timeout.go
├── upvote-rate-model.go
├── upvotes-db.sh
├── utils.go
├── vote.go
├── voting-notes.md
└── watch.sh
/.envrc:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # 1) Enable devbox environment
4 | eval "$(devbox generate direnv --print-envrc)"
5 |
6 | # 2) Any shared env variables go here
7 | export SQLITE_DATA_DIR=data
8 | export CACHE_SIZE=100
9 | export LISTEN_ADDRESS=127.0.0.1
10 | export PORT=8080
11 | export R2_BUCKET=news-archive-dev
12 | export R2_USE_SSL=true
13 | export R2_ENDPOINT=https://9e2da4e2b5c6dd05d36f399d4afc7d4c.r2.cloudflarestorage.com
14 |
15 | # 3) Only on macOS unify DEVELOPER_DIR / DEVELOPER_DIR_FOR_TARGET
16 | if [[ "$(uname)" == "Darwin" ]]; then
17 | # Devbox may set both DEVELOPER_DIR and DEVELOPER_DIR_FOR_TARGET to different paths.
18 | # cgo doesn't like that, so unify them.
19 | if [[ -n "$DEVELOPER_DIR" && -n "$DEVELOPER_DIR_FOR_TARGET" ]]; then
20 | export DEVELOPER_DIR_FOR_TARGET="$DEVELOPER_DIR"
21 | fi
22 | fi
23 |
24 | # 4) If there's a local override file, load it
25 | if [[ -f .envrc.local ]]; then
26 | source .envrc.local
27 | echo "Successfully loaded .envrc.local"
28 | fi
29 |
30 | echo "Successfully loaded .envrc"
31 |
--------------------------------------------------------------------------------
/.envrc.local.example:
--------------------------------------------------------------------------------
1 | # If you use nix, you can run the nix-shell directly with the following command
2 | if command -v nix &> /dev/null
3 | then
4 | use nix
5 | # you can add parameters to the nix-shell as well, e.g.
6 | # use nix --command zsh
7 | # if you use lorri, replace `use nix` with (see https://github.com/nix-community/lorri)
8 | # eval "$(lorri direnv)"
9 | fi
10 |
11 | export R2_ACCESS_KEY_ID="DEV.ACCESS.KEY.ID"
12 | export R2_SECRET_ACCESS_KEY="DEV.SECRET.ACCESS.KEY"
13 |
14 | echo "Successfully loaded .envrc.local"
15 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | branches: ["master"]
6 | tags: [v*]
7 | pull_request:
8 | types: [opened, synchronize]
9 | workflow_dispatch:
10 |
11 | permissions:
12 | contents: read
13 |
14 | # automatically cancel previous runs on the same PR
15 | # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre/67939898#67939898
16 | concurrency:
17 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
18 | cancel-in-progress: true
19 |
20 | jobs:
21 | build:
22 | name: "Build"
23 | runs-on: ubuntu-22.04
24 | steps:
25 | - uses: actions/checkout@v3
26 | with:
27 | # https://github.com/actions/checkout/issues/626
28 | # This is correct, because we're using a merge queue (mergify) which only merges when built against the latest target branch.
29 | # https://docs.mergify.com/actions/queue/
30 | ref: ${{ github.event.pull_request.head.sha }}
31 | - uses: actions/setup-go@v3
32 | with:
33 | go-version-file: go.mod
34 | cache: true
35 | - run: go build *.go
36 | - name: Check if working directory is clean
37 | run: git diff --quiet --exit-code || (git status && false)
38 |
39 | lint:
40 | name: "Lint"
41 | runs-on: ubuntu-22.04
42 | steps:
43 | - uses: actions/checkout@v3
44 | with:
45 | # https://github.com/actions/checkout/issues/626
46 | # This is correct, because we're using a merge queue (mergify) which only merges when built against the latest target branch.
47 | # https://docs.mergify.com/actions/queue/
48 | ref: ${{ github.event.pull_request.head.sha }}
49 | - uses: actions/setup-go@v3
50 | with:
51 | go-version-file: go.mod
52 | cache: true
53 | - name: golangci-lint
54 | uses: golangci/golangci-lint-action@v3
55 | with:
56 | version: v1.50.1
57 | - name: Check if go code is formatted
58 | run: |
59 | UNFORMATTED_FILES=$(gofmt -l .)
60 | test -z "$UNFORMATTED_FILES" || (echo -e "Go code not formatted:\n$UNFORMATTED_FILES\n"; exit 1)
61 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy
2 |
3 | on:
4 | push:
5 | branches: [master]
6 | env:
7 | FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
8 | jobs:
9 | deploy:
10 | name: Deploy app
11 | runs-on: ubuntu-22.04
12 | steps:
13 | - uses: actions/checkout@v3
14 | - uses: superfly/flyctl-actions/setup-flyctl@master
15 | - run: flyctl deploy
16 |
--------------------------------------------------------------------------------
/.github/workflows/format.yml:
--------------------------------------------------------------------------------
1 | name: Formatter
2 |
3 | on:
4 | pull_request:
5 | types: [opened]
6 | issue_comment:
7 | types: [created]
8 |
9 | jobs:
10 | format:
11 | name: "Format"
12 | runs-on: ubuntu-22.04
13 | if: github.event.issue.pull_request
14 | steps:
15 | - uses: khan/pull-request-comment-trigger@v1.1.0
16 | id: check
17 | with:
18 | trigger: '/format'
19 | reaction: "+1" # Reaction must be one of the reactions here: https://developer.github.com/v3/reactions/#reaction-types
20 | env:
21 | GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
22 |
23 | - uses: actions/checkout@v3
24 | if: steps.check.outputs.triggered == 'true'
25 |
26 | - name: Check out PR
27 | if: steps.check.outputs.triggered == 'true'
28 | run: gh pr checkout ${{ github.event.issue.number }}
29 | env:
30 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
31 |
32 | - uses: actions/setup-go@v3
33 | with:
34 | go-version-file: go.mod
35 | cache: true
36 |
37 | - name: Format Go Code
38 | if: steps.check.outputs.triggered == 'true'
39 | run: go fmt .
40 |
41 | - name: Commit changes
42 | if: steps.check.outputs.triggered == 'true'
43 | run: |
44 | git config user.name "GitHub Actions Bot"
45 | git config user.email "<>"
46 |
47 | git status
48 | git diff --stat
49 | git commit -am "chore: format code"
50 |
51 | git log --oneline --max-count=10
52 |
53 | git push
54 |
55 | - uses: khan/pull-request-comment-trigger@v1.1.0
56 | if: failure()
57 | with:
58 | trigger: '/format'
59 | reaction: "confused" # Reaction must be one of the reactions here: https://developer.github.com/v3/reactions/#reaction-types
60 | env:
61 | GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # mac
2 | .DS_Store
3 |
4 | # other?
5 | .history
6 |
7 | #general
8 | /node_modules
9 | /data
10 | /.envrc.local
11 |
12 | personal-notes.md
13 | working-query.sql
--------------------------------------------------------------------------------
/.golangci.yml:
--------------------------------------------------------------------------------
1 | linters:
2 | disable:
3 | - staticcheck
4 |
5 | run:
6 | skip-dirs:
7 | - "go/pkg/mod"
8 | - "/Cellar/go"
9 |
--------------------------------------------------------------------------------
/.mergify.yml:
--------------------------------------------------------------------------------
1 | queue_rules:
2 | - name: Merge dependency-update PRs
3 | queue_conditions:
4 | - label=dependencies
5 | - base=master
6 | merge_conditions: []
7 | merge_method: squash
8 |
9 | - name: Merge PRs using label (rebase)
10 | queue_conditions:
11 | - label=ready-to-merge-rebase
12 | - base=master
13 | - "#review-requested=0"
14 | - "#changes-requested-reviews-by=0"
15 | - "#review-threads-unresolved=0"
16 | merge_conditions: []
17 | merge_method: rebase
18 |
19 | - name: Merge PRs using label (squash)
20 | queue_conditions:
21 | - label=ready-to-merge-squash
22 | - base=master
23 | - "#review-requested=0"
24 | - "#changes-requested-reviews-by=0"
25 | - "#review-threads-unresolved=0"
26 | merge_conditions: []
27 | merge_method: squash
28 |
29 | pull_request_rules:
30 | - name: All PRs into queue
31 | conditions: []
32 | actions:
33 | queue:
34 |
--------------------------------------------------------------------------------
/aboutpage.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "net/http"
5 |
6 | "github.com/pkg/errors"
7 | )
8 |
9 | type AboutPageData struct {
10 | PageTemplateData
11 | }
12 |
13 | func (d AboutPageData) IsAboutPage() bool {
14 | return true
15 | }
16 |
17 | func (app app) aboutHandler() func(http.ResponseWriter, *http.Request, struct{}) error {
18 | return func(w http.ResponseWriter, r *http.Request, p struct{}) error {
19 | w.Header().Set("Content-Type", "text/html; charset=utf-8")
20 |
21 | err := templates.ExecuteTemplate(w, "about.html.tmpl", AboutPageData{PageTemplateData{UserID: app.getUserID(r)}})
22 |
23 | return errors.Wrap(err, "executing about page template")
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/algorithmspage.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "net/http"
5 |
6 | "github.com/pkg/errors"
7 | )
8 |
9 | type AlgorithmsPageData struct {
10 | PageTemplateData
11 | }
12 |
13 | func (d AlgorithmsPageData) IsAlgorithmsPage() bool {
14 | return true
15 | }
16 |
17 | func (app app) algorithmsHandler() func(http.ResponseWriter, *http.Request, struct{}) error {
18 | return func(w http.ResponseWriter, r *http.Request, p struct{}) error {
19 | w.Header().Set("Content-Type", "text/html; charset=utf-8")
20 |
21 | err := templates.ExecuteTemplate(w, "about.html.tmpl", AlgorithmsPageData{PageTemplateData{UserID: app.getUserID(r)}})
22 |
23 | return errors.Wrap(err, "executing Algorithms page template")
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/app.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "net/http"
6 | "os"
7 | "strconv"
8 | "time"
9 |
10 | "github.com/johnwarden/hn"
11 | "golang.org/x/exp/slog"
12 |
13 | retryablehttp "github.com/hashicorp/go-retryablehttp"
14 | )
15 |
16 | type app struct {
17 | ndb newsDatabase
18 | hnClient *hn.Client
19 | httpClient *http.Client
20 | logger *slog.Logger
21 | cacheSize int
22 | archiveTriggerChan chan context.Context
23 | }
24 |
25 | func initApp() app {
26 | var err error
27 | var cacheSize int
28 | {
29 | s := os.Getenv("CACHE_SIZE")
30 | if s != "" {
31 | cacheSize, err = strconv.Atoi(s)
32 | if err != nil {
33 | LogFatal(slog.Default(), "CACHE_SIZE", err)
34 | }
35 | }
36 | }
37 |
38 | logLevelString := os.Getenv("LOG_LEVEL")
39 | logFormatString := os.Getenv("LOG_FORMAT")
40 | logger := newLogger(logLevelString, logFormatString)
41 |
42 | sqliteDataDir := os.Getenv("SQLITE_DATA_DIR")
43 | if sqliteDataDir == "" {
44 | panic("SQLITE_DATA_DIR not set")
45 | }
46 |
47 | db, err := openNewsDatabase(sqliteDataDir)
48 | if err != nil {
49 | LogFatal(logger, "openNewsDatabase", err)
50 | }
51 |
52 | retryClient := retryablehttp.NewClient()
53 | retryClient.RetryMax = 3
54 | retryClient.RetryWaitMin = 1 * time.Second
55 | retryClient.RetryWaitMax = 5 * time.Second
56 |
57 | retryClient.Logger = wrapLoggerForRetryableHTTPClient(logger)
58 |
59 | httpClient := retryClient.StandardClient()
60 |
61 | hnClient := hn.NewClient(httpClient)
62 |
63 | return app{
64 | httpClient: httpClient,
65 | hnClient: hnClient,
66 | logger: logger,
67 | ndb: db,
68 | cacheSize: cacheSize,
69 | archiveTriggerChan: make(chan context.Context, 1),
70 | }
71 | }
72 |
73 | func (app app) cleanup() {
74 | app.ndb.close()
75 | }
76 |
--------------------------------------------------------------------------------
/auth.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "database/sql"
5 | "math/rand"
6 | "net/http"
7 | "strconv"
8 |
9 | "github.com/johnwarden/httperror"
10 | "github.com/pkg/errors"
11 | )
12 |
13 | func (app app) getUserID(r *http.Request) sql.NullInt64 {
14 | var id sql.NullInt64
15 |
16 | cookie, err := r.Cookie("userID")
17 | if err != nil {
18 | if !errors.Is(err, http.ErrNoCookie) {
19 | app.logger.Error("r.Cookie('userID')", err)
20 | }
21 | return id
22 | }
23 |
24 | idInt, err := strconv.Atoi(cookie.Value)
25 | if err != nil {
26 | app.logger.Error("Parsing cookie", err)
27 | }
28 |
29 | id.Int64 = int64(idInt)
30 | id.Valid = true
31 |
32 | return id
33 | }
34 |
35 | type loginParams struct {
36 | UserID sql.NullInt64
37 | }
38 |
39 | func (app app) loginHandler() func(http.ResponseWriter, *http.Request, loginParams) error {
40 | return func(w http.ResponseWriter, r *http.Request, p loginParams) error {
41 | userID := p.UserID
42 |
43 | if !userID.Valid {
44 | loggedInUserID := app.getUserID(r)
45 | if loggedInUserID.Valid {
46 | http.Redirect(w, r, "/", http.StatusTemporaryRedirect)
47 | return nil
48 | }
49 |
50 | // Assign a random user ID if none specified as parameter
51 | userID.Int64 = rand.Int63()
52 | userID.Valid = true
53 | }
54 |
55 | if userID.Int64 == 0 {
56 | return httperror.PublicErrorf(http.StatusUnauthorized, "Can't login as user 0")
57 | }
58 |
59 | setUserIDCookie(w, userID)
60 |
61 | http.Redirect(w, r, "/score", http.StatusTemporaryRedirect)
62 |
63 | return nil
64 | }
65 | }
66 |
67 | func (app app) logoutHandler() func(http.ResponseWriter, *http.Request, struct{}) error {
68 | return func(w http.ResponseWriter, r *http.Request, p struct{}) error {
69 | var userID sql.NullInt64
70 | setUserIDCookie(w, userID)
71 |
72 | http.Redirect(w, r, "/", http.StatusTemporaryRedirect)
73 |
74 | return nil
75 | }
76 | }
77 |
78 | func setUserIDCookie(w http.ResponseWriter, userID sql.NullInt64) {
79 | value := strconv.Itoa(int(userID.Int64))
80 | maxAge := 365 * 24 * 60 * 60
81 | if !userID.Valid {
82 | maxAge = -1
83 | value = ""
84 | }
85 |
86 | cookie := http.Cookie{
87 | Name: "userID",
88 | Value: value,
89 | Path: "/",
90 | MaxAge: maxAge,
91 | HttpOnly: true,
92 | Secure: true,
93 | SameSite: http.SameSiteLaxMode,
94 | }
95 |
96 | // Use the http.SetCookie() function to send the cookie to the client.
97 | // Behind the scenes this adds a `Set-Cookie` header to the response
98 | // containing the necessary cookie data.
99 | http.SetCookie(w, &cookie)
100 | }
101 |
--------------------------------------------------------------------------------
/canonicaldomain.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "net/http"
6 | "strings"
7 |
8 | "github.com/johnwarden/httperror"
9 | )
10 |
11 | var nonCanonicalDomains = map[string]string{
12 | "social-protocols-news.fly.dev": "news.social-protocols.org",
13 | "127.0.0.1:8080": "localhost:8080", // just for testing
14 | }
15 |
16 | var canonicalDomains = getValues(nonCanonicalDomains)
17 |
18 | func (app app) canonicalDomainMiddleware(handler http.Handler) http.Handler {
19 | return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
20 | // Redirect any non-canonical domain to the corresponding canonical domain.
21 | for nonCanonicalDomain, canonicalDomain := range nonCanonicalDomains {
22 | if r.Host == nonCanonicalDomain {
23 | url := "https://" + canonicalDomain + r.RequestURI
24 | http.Redirect(w, r, url, http.StatusMovedPermanently)
25 | return
26 | }
27 | }
28 | isCanonical := false
29 | for _, canonicalDomain := range canonicalDomains {
30 | if strings.HasPrefix(r.Host, canonicalDomain) {
31 | isCanonical = true
32 | break
33 | }
34 | }
35 | if !isCanonical {
36 | httperror.DefaultErrorHandler(w, httperror.New(http.StatusForbidden, fmt.Sprintf("Invalid request host: %s", r.Host)))
37 | return
38 | }
39 |
40 | handler.ServeHTTP(w, r)
41 | })
42 | }
43 |
--------------------------------------------------------------------------------
/compare-against-random-voter.sql:
--------------------------------------------------------------------------------
1 | with parameters as (
2 | select
3 | 1.50 as priorWeight
4 | , 0.003462767 as fatigueFactor
5 | ),
6 | stories as (
7 | select
8 | id
9 | , votes.entryTime is not null as mystory
10 | , entryUpvoteRate
11 | , max(cumulativeUpvotes) as cumulativeUpvotes
12 | , max(cumulativeExpectedUpvotes) as cumulativeExpectedUpvotes
13 | , max(score) as score
14 | , (cumulativeUpvotes + priorWeight)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + priorWeight) qualityScore
15 |
16 | , log((cumulativeUpvotes + priorWeight)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + priorWeight))*100 gain
17 |
18 |
19 | from dataset
20 | join parameters
21 | left join votes on
22 | votes.userID = 1
23 | and votes.storyID = dataset.id
24 |
25 |
26 | -- where id >= (select min(storyID) from votes where userID = 1 and storyID > 36754601) and id <= (select max(storyID) from votes where userID = 1 and storyID > 36754601)
27 | -- where id >= (select min(storyID) from votes where userID = 1 and storyID > 36780531) and id <= (select max(storyID) from votes where userID = 1 and storyID > 36780531)
28 | -- where id >= (select min(storyID) from votes where userID = 1)
29 | where id >= (select min(storyID) from votes where userID = 1) and id <= (select max(storyID) from votes where userID = 1)
30 |
31 | -- and id <= (select max(storyID) from votes where userID = 1)
32 |
33 | group by id
34 | )
35 |
36 | -- select * from stories where id = 36805284;
37 |
38 |
39 |
40 | , sums as (
41 | select
42 | sum(case when mystory then cumulativeUpvotes else null end) as myCumulativeUpvotes
43 | , sum(case when mystory then cumulativeExpectedUpvotes else null end) as myCumulativeExpectedUpvotes
44 | , avg(case when mystory then score else null end) as myAverageScore
45 | , avg(case when mystory then cumulativeUpvotes / cumulativeExpectedUpvotes else null end) as myAverageUpvoteRate
46 |
47 | -- The below doesn't make sense, because cumulativeUpvotes is sometimes 0 and the log of 0 is undefined.
48 | -- , exp(avg(case when mystory then log(cumulativeUpvotes / cumulativeExpectedUpvotes) else null end)) as myGeoAverageUpvoteRate
49 |
50 |
51 | -- , sum(case when votes.entryTime is not null then score-1 else null end)/count(distinct votes.storyID) as myAverageScore
52 | , sum(cumulativeUpvotes) as overallCumulativeUpvotes
53 | , sum(cumulativeExpectedUpvotes) as overallCumulativeExpectedUpvotes
54 | , avg(score) as overallAverageScore
55 | , avg(cumulativeUpvotes / cumulativeExpectedUpvotes) as overallAverageUpvoteRate
56 |
57 | -- The below doesn't make sense, because cumulativeUpvotes is sometimes 0 and the log of 0 is undefined.
58 | -- , exp(avg(log(cumulativeUpvotes / cumulativeExpectedUpvotes))) as overallGeoAverageUpvoteRate
59 |
60 |
61 | , exp(avg(log((cumulativeUpvotes + priorWeight)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + priorWeight)))) geoAverageQualityScore
62 |
63 |
64 | , sum(log((cumulativeUpvotes + priorWeight)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + priorWeight)) )*100 baselineGain
65 |
66 |
67 | -- , exp(avg(log((cumulativeUpvotes + priorWeight)/(cumulativeExpectedUpvotes + priorWeight)))) geoAverageQualityScore
68 |
69 |
70 | -- , sum(case when votes.entryTime is null then score-1 else null end)/(count(distinct dataset.id) - count(distinct votes.storyID)) as overallAverageScore
71 | from stories
72 | join parameters
73 | )
74 | select
75 | -- *
76 | myAverageScore
77 | , myAverageUpvoteRate
78 | , myCumulativeUpvotes/myCumulativeExpectedUpvotes as myUpvoteRate
79 | , overallAverageScore
80 | , overallAverageUpvoteRate
81 | , overallCumulativeUpvotes/overallCumulativeExpectedUpvotes as overallUpvoteRate
82 | , geoAverageQualityScore
83 | , baselineGain
84 | from sums;
85 |
86 |
87 | -- Discussion: The geomean quality score is close to 1, as expected. The average score is greater than 1, because that's what will happen
88 | -- if you take the average of exp(x) when the average of x is 0. For example in R:
89 | -- (ins)> x = rnorm(10000, mean=0, sd=2)
90 | -- (ins)> mean(x)
91 | -- [1] -0.007797868
92 | -- (ins)> mean(exp(x))
93 | -- [1] 9.844065
94 |
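95 | -- Added note (a toy illustration; assumes the same exp()/log() math functions used above are available):
96 | -- the gap between the arithmetic and geometric mean of a ratio can be checked directly, e.g.
97 | --
98 | --   select avg(x) as arith_mean, exp(avg(log(x))) as geo_mean
99 | --   from (select 0.5 as x union all select 2.0);
100 | --
101 | -- This returns arith_mean = 1.25 and geo_mean = 1.0: even when the "typical" (geometric-mean) ratio
102 | -- is exactly 1, the arithmetic average is pulled above 1, which is why the average score above
103 | -- exceeds 1 while geoAverageQualityScore stays near 1.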
--------------------------------------------------------------------------------
/devbox.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://raw.githubusercontent.com/jetify-com/devbox/0.13.4/.schema/devbox.schema.json",
3 | "packages": [
4 | "entr@latest",
5 | "git@latest",
6 | "gcc@latest",
7 | "gotools@latest",
8 | "golangci-lint@latest",
9 | "sqlite-interactive@latest",
10 | "go@latest",
11 | "just@latest"
12 | ]
13 | }
14 |
--------------------------------------------------------------------------------
/domain_penalties.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "encoding/csv"
6 | "fmt"
7 | "io"
8 | "strconv"
9 |
10 | "github.com/pkg/errors"
11 | "gorm.io/driver/sqlite"
12 | "gorm.io/gorm"
13 | "gorm.io/gorm/clause"
14 | )
15 |
16 | type DomainPenalty struct {
17 | Domain string `gorm:"primaryKey"`
18 | AvgPenalty float64
19 | }
20 |
21 | func (ndb newsDatabase) importPenaltiesData(sqliteDataDir string) error {
22 | frontpageDatabaseFilename := fmt.Sprintf("%s/%s", sqliteDataDir, sqliteDataFilename)
23 |
24 | db, err := gorm.Open(sqlite.Open(frontpageDatabaseFilename), &gorm.Config{})
25 | if err != nil {
26 | panic("failed to connect database")
27 | }
28 |
29 | err = db.AutoMigrate(&DomainPenalty{})
30 | if err != nil {
31 | return errors.Wrap(err, "db.AutoMigrate Domain Penalties table")
32 | }
33 |
34 | // Open domain penalty seed data file as CSV
35 | b, _ := resources.ReadFile("seed/domain-penalties.csv")
36 | buf := bytes.NewBuffer(b)
37 | r := csv.NewReader(buf)
38 |
39 | // Read the header row.
40 | _, err = r.Read()
41 | if err != nil {
42 | return errors.Wrap(err, "missing header row in domain penalties data")
43 | }
44 |
45 | for {
46 | record, err := r.Read()
47 | if err != nil {
48 | if errors.Is(err, io.EOF) {
49 | break
50 | }
51 | return errors.Wrapf(err, "Parsing penalty CSV")
52 | }
53 |
54 | avgPenalty, err := strconv.ParseFloat(record[1], 64)
55 | if err != nil {
56 | return errors.Wrapf(err, "Parsing penalty record %s, %s", record[0], record[1])
57 | }
58 | err = db.Clauses(clause.OnConflict{ // adding this onConflict clause makes the create into an upsert
59 | UpdateAll: true,
60 | }).Create(&DomainPenalty{Domain: record[0], AvgPenalty: avgPenalty}).Error
61 |
62 | if err != nil {
63 | return errors.Wrapf(err, "Parsing inserting domain penalty %s, %f", record[0], avgPenalty)
64 | }
65 |
66 | }
67 |
68 | return nil
69 | }
70 |
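71 | // Added note: with the clause.OnConflict{UpdateAll: true} used above, gorm turns the Create into an
72 | // upsert roughly along the lines of
73 | //
74 | //   INSERT INTO domain_penalties (domain, avg_penalty) VALUES (?, ?)
75 | //   ON CONFLICT (domain) DO UPDATE SET avg_penalty = excluded.avg_penalty
76 | //
77 | // so re-importing the seed CSV refreshes existing rows instead of failing on the Domain primary key.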
--------------------------------------------------------------------------------
/fly.toml:
--------------------------------------------------------------------------------
1 | # fly.toml file generated for social-protocols-news on 2022-09-14T17:00:08+02:00
2 |
3 | app = "social-protocols-news"
4 | kill_signal = "SIGINT"
5 | kill_timeout = 5
6 | processes = []
7 | primary_region = "ewr"
8 |
9 | [build]
10 | builder = "paketobuildpacks/builder:base"
11 | buildpacks = ["gcr.io/paketo-buildpacks/go"]
12 |
13 | [env]
14 | PORT = "8080"
15 | SQLITE_DATA_DIR="/data"
16 | LOG_LEVEL="DEBUG"
17 | CACHE_SIZE="100"
18 | R2_BUCKET="news-archive"
19 | R2_USE_SSL="true"
20 | R2_ENDPOINT="https://9e2da4e2b5c6dd05d36f399d4afc7d4c.r2.cloudflarestorage.com"
21 |
22 | [experimental]
23 | allowed_public_ports = []
24 | auto_rollback = true
25 |
26 | [[services]]
27 | http_checks = []
28 | internal_port = 8080
29 | processes = ["app"]
30 | protocol = "tcp"
31 | script_checks = []
32 | [services.concurrency]
33 | hard_limit = 25
34 | soft_limit = 20
35 | type = "connections"
36 |
37 | [[services.ports]]
38 | force_https = true
39 | handlers = ["http"]
40 | port = 80
41 |
42 | [[services.ports]]
43 | handlers = ["tls", "http"]
44 | port = 443
45 |
46 | [[services.tcp_checks]]
47 | grace_period = "1s"
48 | interval = "15s"
49 | restart_limit = 0
50 | timeout = "2s"
51 |
52 |
53 |
54 | # flyctl volumes create data --region ewr --size 3
55 | [[mounts]]
56 | source = "data3"
57 | destination = "/data"
58 |
59 | # prometheus metrics
60 | [metrics]
61 | port = 9091
62 | path = "/metrics"
63 |
64 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/social-protocols/news
2 |
3 | go 1.22
4 |
5 | toolchain go1.23.3
6 |
7 | require (
8 | github.com/NYTimes/gziphandler v1.1.1
9 | github.com/VictoriaMetrics/metrics v1.23.0
10 | github.com/dustin/go-humanize v1.0.1
11 | github.com/gocolly/colly/v2 v2.1.0
12 | github.com/gorilla/schema v1.2.0
13 | github.com/hashicorp/go-retryablehttp v0.7.1
14 | github.com/johnwarden/hn v1.0.1
15 | github.com/johnwarden/httperror v1.6.0
16 | github.com/julienschmidt/httprouter v1.3.0
17 | github.com/mattn/go-sqlite3 v1.14.15
18 | github.com/minio/minio-go/v7 v7.0.80
19 | github.com/multiprocessio/go-sqlite3-stdlib v0.0.0-20220822170115-9f6825a1cd25
20 | github.com/pkg/errors v0.9.1
21 | github.com/weppos/publicsuffix-go v0.20.0
22 | golang.org/x/exp v0.0.0-20221114191408-850992195362
23 | gonum.org/v1/gonum v0.12.0
24 | gorm.io/driver/sqlite v1.4.3
25 | gorm.io/gorm v1.24.2
26 | )
27 |
28 | //replace github.com/johnwarden/httperror v1.6.0 => ../httperror
29 | //replace "github.com/johnwarden/hn" v1.0.1 => "../hn"
30 |
31 | require (
32 | github.com/PuerkitoBio/goquery v1.5.1 // indirect
33 | github.com/alitto/pond/v2 v2.1.4 // indirect
34 | github.com/andybalholm/cascadia v1.2.0 // indirect
35 | github.com/antchfx/htmlquery v1.2.3 // indirect
36 | github.com/antchfx/xmlquery v1.2.4 // indirect
37 | github.com/antchfx/xpath v1.1.8 // indirect
38 | github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
39 | github.com/fatih/color v1.13.0 // indirect
40 | github.com/go-ini/ini v1.67.0 // indirect
41 | github.com/gobwas/glob v0.2.3 // indirect
42 | github.com/goccy/go-json v0.10.3 // indirect
43 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
44 | github.com/golang/protobuf v1.4.2 // indirect
45 | github.com/google/uuid v1.6.0 // indirect
46 | github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
47 | github.com/hashicorp/go-hclog v0.16.2 // indirect
48 | github.com/jinzhu/inflection v1.0.0 // indirect
49 | github.com/jinzhu/now v1.1.5 // indirect
50 | github.com/kennygrant/sanitize v1.2.4 // indirect
51 | github.com/klauspost/compress v1.17.11 // indirect
52 | github.com/klauspost/cpuid/v2 v2.2.8 // indirect
53 | github.com/mattn/go-colorable v0.1.13 // indirect
54 | github.com/mattn/go-isatty v0.0.16 // indirect
55 | github.com/minio/md5-simd v1.1.2 // indirect
56 | github.com/rs/xid v1.6.0 // indirect
57 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
58 | github.com/temoto/robotstxt v1.1.1 // indirect
59 | github.com/valyala/fastrand v1.1.0 // indirect
60 | github.com/valyala/histogram v1.2.0 // indirect
61 | golang.org/x/crypto v0.28.0 // indirect
62 | golang.org/x/net v0.30.0 // indirect
63 | golang.org/x/sys v0.26.0 // indirect
64 | golang.org/x/text v0.19.0 // indirect
65 | google.golang.org/appengine v1.6.6 // indirect
66 | google.golang.org/protobuf v1.24.0 // indirect
67 | )
68 |
--------------------------------------------------------------------------------
/health.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "net/http"
6 | "time"
7 |
8 | "github.com/pkg/errors"
9 | )
10 |
11 | const alertAfterMinutes = 5
12 |
13 | func (app app) healthHandler() func(http.ResponseWriter, *http.Request, loginParams) error {
14 | return func(w http.ResponseWriter, r *http.Request, p loginParams) error {
15 | w.Header().Set("Content-Type", "text/plain; charset=utf-8")
16 |
17 | if r.Method != http.MethodHead {
18 | _, err := w.Write([]byte("ok"))
19 | if err != nil {
20 | return errors.Wrap(err, "writing response")
21 | }
22 | }
23 |
24 | return nil
25 | }
26 | }
27 |
28 | func (app app) crawlHealthHandler() func(http.ResponseWriter, *http.Request, loginParams) error {
29 | return func(w http.ResponseWriter, r *http.Request, p loginParams) error {
30 | w.Header().Set("Content-Type", "text/plain; charset=utf-8")
31 |
32 | lastSampleTime, err := app.ndb.selectLastCrawlTime()
33 | if err != nil {
34 | return errors.Wrap(err, "getting last crawl time")
35 | }
36 |
37 | if time.Now().Unix()-int64(lastSampleTime) > alertAfterMinutes*60 {
38 | return fmt.Errorf("last successful crawl of %d is more than %d minutes ago", lastSampleTime, alertAfterMinutes)
39 | }
40 |
41 | if r.Method != http.MethodHead {
42 | _, err = w.Write([]byte("ok"))
43 | if err != nil {
44 | return errors.Wrap(err, "writing response")
45 | }
46 | }
47 |
48 | return nil
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/httpserver.go:
--------------------------------------------------------------------------------
1 | //nolint:typecheck
2 | package main
3 |
4 | import (
5 | "embed"
6 | "io/fs"
7 | "net/http"
8 | "os"
9 | "time"
10 |
11 | "github.com/julienschmidt/httprouter"
12 | "github.com/pkg/errors"
13 | )
14 |
15 | const (
16 | // writeTimeout = 2500 * time.Millisecond
17 | writeTimeout = 60 * time.Second
18 | readHeaderTimeout = 5 * time.Second
19 | )
20 |
21 | //go:embed static
22 | var staticFS embed.FS
23 |
24 | func (app app) httpServer(onPanic func(error)) *http.Server {
25 | l := app.logger
26 |
27 | port := os.Getenv("PORT")
28 | if port == "" {
29 | port = "8080"
30 | }
31 |
32 | listenAddress := os.Getenv("LISTEN_ADDRESS")
33 |
34 | staticRoot, err := fs.Sub(staticFS, "static")
35 | if err != nil {
36 | LogFatal(l, "fs.Sub", err)
37 | }
38 |
39 | server := &http.Server{
40 | Addr: listenAddress + ":" + port,
41 | WriteTimeout: writeTimeout - 100*time.Millisecond,
42 | ReadHeaderTimeout: readHeaderTimeout,
43 | }
44 |
45 | router := httprouter.New()
46 | router.GET("/static/*filepath", app.serveFiles(http.FS(staticRoot)))
47 |
48 | router.GET("/", middleware("hntop", l, onPanic, app.frontpageHandler("hntop")))
49 | router.GET("/new", middleware("new", l, onPanic, app.frontpageHandler("new")))
50 | router.GET("/top", middleware("top", l, onPanic, app.frontpageHandler("hntop")))
51 | router.GET("/best", middleware("best", l, onPanic, app.frontpageHandler("best")))
52 | router.GET("/ask", middleware("ask", l, onPanic, app.frontpageHandler("ask")))
53 | router.GET("/show", middleware("show", l, onPanic, app.frontpageHandler("show")))
54 | router.GET("/raw", middleware("raw", l, onPanic, app.frontpageHandler("raw")))
55 | router.GET("/fair", middleware("fair", l, onPanic, app.frontpageHandler("fair")))
56 | router.GET("/upvoterate", middleware("upvoterate", l, onPanic, app.frontpageHandler("upvoterate")))
57 | router.GET("/best-upvoterate", middleware("best-upvoterate", l, onPanic, app.frontpageHandler("best-upvoterate")))
58 | router.GET("/penalties", middleware("penalties", l, onPanic, app.frontpageHandler("penalties")))
59 | router.GET("/boosts", middleware("boosts", l, onPanic, app.frontpageHandler("boosts")))
60 | router.GET("/resubmissions", middleware("resubmissions", l, onPanic, app.frontpageHandler("resubmissions")))
61 | router.GET("/stats", middleware("stats", l, onPanic, app.statsHandler()))
62 | router.GET("/about", middleware("about", l, onPanic, app.aboutHandler()))
63 | router.GET("/algorithms", middleware("algorithms", l, onPanic, app.algorithmsHandler()))
64 |
65 | router.POST("/vote", middleware("upvote", l, onPanic, app.voteHandler()))
66 |
67 | router.GET("/score", middleware("score", l, onPanic, app.scoreHandler()))
68 |
69 | router.GET("/login", middleware("login", l, onPanic, app.loginHandler()))
70 | router.GET("/logout", middleware("logout", l, onPanic, app.logoutHandler()))
71 |
72 | router.GET("/health", middleware("health", l, onPanic, app.healthHandler()))
73 | router.HEAD("/health", middleware("health", l, onPanic, app.healthHandler()))
74 | router.GET("/crawl-health", middleware("crawl-health", l, onPanic, app.crawlHealthHandler()))
75 | router.HEAD("/crawl-health", middleware("crawl-health", l, onPanic, app.crawlHealthHandler()))
76 |
77 | server.Handler = app.preRouterMiddleware(router, writeTimeout-100*time.Millisecond)
78 |
79 | return server
80 | }
81 |
82 | func (app app) frontpageHandler(ranking string) func(http.ResponseWriter, *http.Request, OptionalFrontPageParams) error {
83 | return func(w http.ResponseWriter, r *http.Request, params OptionalFrontPageParams) error {
84 | w.Header().Set("Content-Type", "text/html; charset=utf-8")
85 |
86 | err := app.serveFrontPage(r, w, ranking, params.WithDefaults())
87 | return errors.Wrap(err, "serveFrontPage")
88 | }
89 | }
90 |
91 | func (app app) statsHandler() func(http.ResponseWriter, *http.Request, StatsPageParams) error {
92 | return func(w http.ResponseWriter, r *http.Request, params StatsPageParams) error {
93 | w.Header().Set("Content-Type", "text/html; charset=utf-8")
94 |
95 | userID := app.getUserID(r)
96 | return app.statsPage(w, r, params, userID)
97 | }
98 | }
99 |
100 | func (app app) serveFiles(root http.FileSystem) func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
101 | fileServer := http.FileServer(root)
102 |
103 | return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
104 | w.Header().Set("Cache-Control", "public, max-age=86400") // 1 hours
105 | r.URL.Path = p.ByName("filepath")
106 | fileServer.ServeHTTP(w, r)
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/init.sql:
--------------------------------------------------------------------------------
1 | attach database 'file:/Users/jwarden/hacker-news-data-datadir/frontpage.sqlite?mode=ro' as frontpage;
2 |
--------------------------------------------------------------------------------
/justfile:
--------------------------------------------------------------------------------
1 | set dotenv-load := true
2 |
3 | # List available recipes in the order in which they appear in this file
4 | _default:
5 | @just --list --unsorted
6 |
7 | watch:
8 | ./watch.sh
9 |
10 | sqlite:
11 | sqlite3 $SQLITE_DATA_DIR/frontpage.sqlite
12 |
13 | upvotes-db:
14 | ./upvotes-db.sh
15 |
16 | format:
17 | go fmt
--------------------------------------------------------------------------------
/logger.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "strings"
7 |
8 | "golang.org/x/exp/slog"
9 | )
10 |
11 | func newLogger(levelString, formatString string) *slog.Logger {
12 | if levelString == "" {
13 | levelString = "DEBUG"
14 | }
15 |
16 | logLevels := map[string]slog.Leveler{
17 | "DEBUG": slog.DebugLevel,
18 | "INFO": slog.InfoLevel,
19 | "WARN": slog.WarnLevel,
20 | "ERROR": slog.ErrorLevel,
21 | }
22 |
23 | l, ok := logLevels[strings.ToUpper(levelString)]
24 | if !ok {
25 | panic("Unrecognized log level: " + levelString)
26 | }
27 |
28 | var lh slog.Handler
29 |
30 | if strings.ToUpper(formatString) == "JSON" {
31 | lh = slog.HandlerOptions{Level: l}.NewJSONHandler(os.Stdout)
32 | } else {
33 | lh = slog.HandlerOptions{Level: l}.NewTextHandler(os.Stdout)
34 | }
35 |
36 | logger := slog.New(lh)
37 | slog.SetDefault(logger)
38 | return logger
39 | }
40 |
41 | func LogErrorf(logger *slog.Logger, msg string, args ...interface{}) {
42 | logger.Error(fmt.Sprintf(msg, args...), nil)
43 | }
44 |
45 | func Debugf(logger *slog.Logger, msg string, args ...interface{}) {
46 | logger.Debug(fmt.Sprintf(msg, args...))
47 | }
48 |
49 | func LogFatal(logger *slog.Logger, msg string, err error, args ...interface{}) {
50 | if len(args) > 0 {
51 | logger.Error(msg, err, args...)
52 | } else {
53 | logger.Error(msg, err)
54 | }
55 | os.Exit(2)
56 | }
57 |
58 | type retryableHTTPClientloggerWrapper struct {
59 | *slog.Logger
60 | }
61 |
62 | func (l retryableHTTPClientloggerWrapper) Error(msg string, keysAndValues ...interface{}) {
63 | l.Logger.Error("retryableHTTPClient: "+msg, nil, keysAndValues...)
64 | }
65 |
66 | func (l retryableHTTPClientloggerWrapper) Debug(msg string, keysAndValues ...interface{}) {
67 | // ignore very verbose debug output from retryableHTTPClientloggerWrapper
68 | }
69 |
70 | // wrapLoggerForRetryableHTTPClient wraps a logger so that it implements an interface required by retryableHTTPClient
71 | func wrapLoggerForRetryableHTTPClient(logger *slog.Logger) retryableHTTPClientloggerWrapper {
72 | // ignore debug messages from this retry client.
73 | l := slog.New(logger.Handler())
74 | return retryableHTTPClientloggerWrapper{l}
75 | }
76 |
--------------------------------------------------------------------------------
/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
76 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "net/http"
6 | "os"
7 | "os/signal"
8 | "syscall"
9 | "time"
10 |
11 | "github.com/pkg/errors"
12 | )
13 |
14 | const maxShutDownTimeout = 5 * time.Second
15 |
16 | func main() {
17 | app := initApp()
18 | defer app.cleanup()
19 |
20 | logger := app.logger
21 |
22 | ctx, cancelContext := context.WithCancel(context.Background())
23 | defer cancelContext()
24 |
25 | shutdownPrometheusServer := servePrometheusMetrics()
26 |
27 | // Start the archive worker
28 | go app.archiveWorker(ctx)
29 |
30 | // Listen for a soft kill signal (INT, TERM, HUP)
31 | c := make(chan os.Signal, 1)
32 | signal.Notify(c, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
33 |
34 | // shutdown function call in case of 1) panic 2) soft kill signal
35 | var httpServer *http.Server // this variable included in shutdown closure
36 |
37 | shutdown := func() {
38 | // cancel the current background context
39 | cancelContext()
40 |
41 | err := shutdownPrometheusServer(ctx)
42 | if err != nil {
43 | logger.Error("shutdownPrometheusServer", err)
44 | }
45 |
46 | if httpServer != nil {
47 | logger.Info("Shutting down HTTP server")
48 | // shut down the HTTP server with a timeout in case the server doesn't want to shut down.
49 | // use background context, because we just cancelled ctx
50 | ctxWithTimeout, cancel := context.WithTimeout(context.Background(), maxShutDownTimeout)
51 | defer cancel()
52 | err := httpServer.Shutdown(ctxWithTimeout)
53 | if err != nil {
54 | logger.Error("httpServer.Shutdown", err)
55 | // if server doesn't respond to shutdown signal, nothing remains but to panic.
56 | panic("HTTP server shutdown failed")
57 | }
58 |
59 | logger.Info("HTTP server shutdown complete")
60 | }
61 | }
62 |
63 | go func() {
64 | sig := <-c
65 |
66 | // Clean shutdown
67 | logger.Info("Received shutdown signal", "signal", sig)
68 | shutdown()
69 |
70 | // now exit process
71 | logger.Info("Main loop exited. Terminating process")
72 |
73 | os.Exit(0)
74 | }()
75 |
76 | httpServer = app.httpServer(
77 | func(error) {
78 | logger.Info("Panic in HTTP handler. Shutting down")
79 | shutdown()
80 | os.Exit(2)
81 | },
82 | )
83 |
84 | go func() {
85 | logger.Info("HTTP server listening", "address", httpServer.Addr)
86 | err := httpServer.ListenAndServe()
87 | if err != nil && err != http.ErrServerClosed {
88 | logger.Error("server.ListenAndServe", err)
89 | }
90 | logger.Info("Server shut down")
91 | }()
92 |
93 | app.mainLoop(ctx)
94 | }
95 |
96 | func (app app) mainLoop(ctx context.Context) {
97 | logger := app.logger
98 |
99 | lastCrawlTime, err := app.ndb.selectLastCrawlTime()
100 | if err != nil {
101 | LogFatal(logger, "selectLastCrawlTime", err)
102 | }
103 |
104 | t := time.Now().Unix()
105 |
106 | elapsed := int(t) - lastCrawlTime
107 |
108 | // If it has been more than a minute since our last crawl,
109 | // then crawl right away.
110 | if elapsed >= 60 {
111 | logger.Info("60 seconds since last crawl. Crawling now.")
112 | if err = app.crawlAndPostprocess(ctx); err != nil {
113 | logger.Error("crawlAndPostprocess", err)
114 |
115 | if errors.Is(err, context.Canceled) {
116 | return
117 | }
118 | }
119 | } else {
120 | logger.Info("Less than 60 seconds since last crawl.", "waitSeconds", 60-time.Now().Unix()%60)
121 | }
122 |
123 | // And now set a ticker so we crawl every minute going forward
124 | ticker := make(chan int64)
125 |
126 | // Make the first tick happen at the next
127 | // Minute mark.
128 | go func() {
129 | t := time.Now().Unix()
130 | delay := 60 - t%60
131 | <-time.After(time.Duration(delay) * time.Second)
132 | ticker <- t + delay
133 | }()
134 |
135 | for {
136 | select {
137 | case <-ticker:
138 | t := time.Now().Unix()
139 | // Set the next tick at the minute mark. We use this instead of using
140 | // time.NewTicker because in dev mode our app can be suspended, and I
141 | // want to see all the timestamps in the DB as multiples of 60.
142 | delay := 60 - t%60
143 | nextTickTime := t + delay
144 | go func() {
145 | <-time.After(time.Duration(delay) * time.Second)
146 | ticker <- nextTickTime
147 | }()
148 |
149 | logger.Info("Beginning crawl")
150 |
151 | // Create a context with deadline for both crawl and idle period
152 | crawlCtx, cancel := context.WithDeadline(ctx, time.Unix(nextTickTime-1, 0))
153 | defer cancel()
154 |
155 | if err = app.crawlAndPostprocess(crawlCtx); err != nil {
156 | logger.Error("crawlAndPostprocess", err)
157 | } else {
158 | app.logger.Info("Finished crawl and postprocess")
159 |
160 | // Only send idle context if we have enough time (at least 5 seconds)
161 | if delay >= 5 {
162 | // Try to send the same context to the archive worker
163 | select {
164 | case app.archiveTriggerChan <- crawlCtx:
165 | app.logger.Debug("Sent idle context to archive worker")
166 | default:
167 | app.logger.Debug("Archive trigger channel full, skipping signal")
168 | }
169 | } else {
170 | app.logger.Debug("Skipping idle context - not enough time", "delay", delay)
171 | }
172 | }
173 |
174 | case <-ctx.Done():
175 | return
176 | }
177 | }
178 | }
179 |
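180 | // Added note on the minute-mark scheduling above: delay = 60 - t%60 is the number of seconds until
181 | // the next whole minute, so nextTickTime = t + delay always falls on a multiple of 60. For example,
182 | // a tick that fires 23 seconds past the minute gets delay = 37, and the next tick lands exactly on
183 | // the following minute boundary. This keeps the crawl timestamps recorded in the database aligned to
184 | // multiples of 60 even if the process was briefly suspended (which time.NewTicker would not guarantee).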
--------------------------------------------------------------------------------
/middleware.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "database/sql"
5 | "encoding/json"
6 | "net/http"
7 | "reflect"
8 | "strconv"
9 | "strings"
10 | "time"
11 |
12 | "github.com/pkg/errors"
13 | "golang.org/x/exp/slog"
14 |
15 | "github.com/julienschmidt/httprouter"
16 |
17 | "github.com/johnwarden/httperror"
18 |
19 | "github.com/gorilla/schema"
20 |
21 | "github.com/NYTimes/gziphandler"
22 | )
23 |
24 | // middleware converts a handler of type httperror.XHandlerFunc[P] into an
25 | // httprouter.Handle. We use the former type for our http handler functions:
26 | // this is a clean function signature that accepts parameters as a struct and
27 | // returns an error. But we need to pass an httprouter.Handle to our router.
28 | // So we wrap our httperror.XHandlerFunc[P], parsing the URL parameters to
29 | // produce the parameter struct, passing it to the inner handler, then
30 | // handling any errors that are returned.
31 | func middleware[P any](routeName string, logger *slog.Logger, onPanic func(error), h httperror.XHandlerFunc[P]) httprouter.Handle {
32 | h = httperror.XPanicMiddleware[P](h)
33 |
34 | h = prometheusMiddleware[P](routeName, h)
35 |
36 | handleError := func(w http.ResponseWriter, err error) {
37 | if errors.Is(err, httperror.Panic) {
38 | // do this in a goroutine otherwise we get deadlock if onPanic shuts downs the HTTP server
39 | // because the http server shutdown function will wait for all requests to terminate,
40 | // including this one!
41 | go onPanic(err)
42 | }
43 | httperror.DefaultErrorHandler(w, err)
44 | }
45 |
46 | return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
47 | var params P
48 | err := unmarshalRouterRequest(r, ps, &params)
49 | if err != nil {
50 | err = httperror.Wrap(err, http.StatusBadRequest)
51 | logger.Error("unmarshalRouterRequest", err, "url", r.URL)
52 | handleError(w, err)
53 | return
54 | }
55 |
56 | err = h(w, r, params)
57 | if err != nil {
58 | if httperror.StatusCode(err) >= 500 {
59 | logger.Error("executing handler", err, "url", r.URL)
60 | requestErrorsTotal.Inc()
61 | }
62 | handleError(w, err)
63 | }
64 | }
65 | }
66 |
67 | var decoder = schema.NewDecoder()
68 |
69 | func nullInt64Converter(value string) reflect.Value {
70 | var result sql.NullInt64
71 | if value != "" {
72 | v, _ := strconv.ParseInt(value, 10, 64)
73 | result = sql.NullInt64{Int64: v, Valid: true}
74 | }
75 | return reflect.ValueOf(result)
76 | }
77 |
78 | func nullFloat64Converter(value string) reflect.Value {
79 | var result sql.NullFloat64
80 | if value != "" {
81 | v, _ := strconv.ParseFloat(value, 64)
82 | result = sql.NullFloat64{Float64: v, Valid: true}
83 | }
84 | return reflect.ValueOf(result)
85 | }
86 |
87 | func init() {
88 | decoder.RegisterConverter(sql.NullInt64{}, nullInt64Converter)
89 | decoder.RegisterConverter(sql.NullFloat64{}, nullFloat64Converter)
90 | }
91 |
92 | // unmarshalRouterRequest is a generic request URL unmarshaler for use with
93 | // httprouter. It unmarshals the request parameters parsed by httprouter, as
94 | // well as any URL parameters, into a struct of any type, matching query
95 | // names to struct field names.
96 | func unmarshalRouterRequest(r *http.Request, ps httprouter.Params, params any) error {
97 | if r.Method == "POST" {
98 | err := json.NewDecoder(r.Body).Decode(params)
99 | if err != nil {
100 | return errors.Wrap(err, "decode json")
101 | }
102 | return nil
103 | }
104 |
105 | m := make(map[string][]string)
106 |
107 | // First convert the httprouter.Params into a map
108 | for _, p := range ps {
109 | key := p.Key
110 | if v, ok := m[key]; ok {
111 | m[key] = append(v, p.Value)
112 | } else {
113 | m[key] = []string{p.Value}
114 | }
115 | }
116 |
117 | // Then merge in the URL query parameters.
118 | for key, values := range r.URL.Query() {
119 | if v, ok := m[key]; ok {
120 | m[key] = append(v, values...)
121 | } else {
122 | m[key] = values
123 | }
124 | }
125 |
126 | // Then unmarshal.
127 | err := decoder.Decode(params, m)
128 | if err != nil {
129 | if !strings.HasPrefix(err.Error(), "schema: invalid path") {
130 | // ignore errors due to unrecognized parameters
131 | return errors.Wrap(err, "decode parameters")
132 | }
133 | }
134 |
135 | return nil
136 | }
137 |
138 | // preRouterMiddleware wraps the router itself. It is for middleware that does
139 | // not need to know anything about the route (params, name, etc)
140 | func (app app) preRouterMiddleware(handler http.Handler, writeTimeout time.Duration) http.Handler {
141 | handler = app.cacheAndCompressMiddleware(handler)
142 | handler = app.canonicalDomainMiddleware(handler) // redirects must happen before caching!
143 | handler = app.timeoutMiddleware(handler, writeTimeout)
144 | return handler
145 | }
146 |
147 | // We could improve this middleware. Currently we cache before we
148 | // compress, because the cache middleware we use here doesn't recognize the
149 | // accept-encoding header, and if we compressed before we cache, cache
150 | // entries would be randomly compressed or not, regardless of the
151 | // accept-encoding header. Unfortunately by caching before we compress,
152 | // requests are cached uncompressed. A compressed-cache middleware would be a
153 | // nice improvement. Also our cache-control headers should be synced with the
154 | // exact cache expiration time, which should be synced with the crawl. But
155 | // what we have here is simple and probably good enough.
156 |
157 | func (app app) cacheAndCompressMiddleware(handler http.Handler) http.Handler {
158 | // if app.cacheSize > 0 {
159 |
160 | // memorycached, err := memory.NewAdapter(
161 | // memory.AdapterWithAlgorithm(memory.LRU),
162 | // memory.AdapterWithCapacity(app.cacheSize),
163 | // )
164 | // if err != nil {
165 | // LogFatal(app.logger, "memory.NewAdapater", err)
166 | // }
167 |
168 | // cacheClient, err := cache.NewClient(
169 | // cache.ClientWithAdapter(memorycached),
170 | // cache.ClientWithTTL(1*time.Minute),
171 | // cache.ClientWithRefreshKey("opn"),
172 | // )
173 | // if err != nil {
174 | // LogFatal(app.logger, "cache.NewClient", err)
175 | // }
176 |
177 | // var h http.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
178 | // // since we update data only every minute, tell browsers to cache for one minute
179 | // handler.ServeHTTP(w, r)
180 | // })
181 |
182 | // h = cacheClient.Middleware(h)
183 | // }
184 | h := handler
185 |
186 | return gziphandler.GzipHandler(h)
187 | }
188 |
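189 | // Added illustration for unmarshalRouterRequest above (hypothetical types, not used elsewhere):
190 | // given a parameter struct such as
191 | //
192 | //   type exampleParams struct {
193 | //       StoryID int
194 | //       Compact bool
195 | //   }
196 | //
197 | // and a route registered as /example/:StoryID, a request for /example/42?Compact=true would be
198 | // decoded into exampleParams{StoryID: 42, Compact: true}: the :StoryID path segment and the
199 | // Compact query parameter are merged into one map and matched to struct fields by name via the
200 | // gorilla/schema decoder, while unrecognized query parameters are ignored.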
--------------------------------------------------------------------------------
/migrate-volume.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e # Exit on any error
3 |
4 | # Configuration
5 | APP_NAME="news"
6 | OLD_VOLUME_NAME="data2"
7 | NEW_VOLUME_NAME="data3"
8 | NEW_VOLUME_SIZE="3" # Adjust this to your needs
9 | REGION="ewr" # Your current region
10 |
11 | # Function to wait for VM to be ready
12 | wait_for_vm() {
13 | echo "Waiting for VM to be ready..."
14 | while true; do
15 | STATUS=$(fly status --app $APP_NAME)
16 | if echo "$STATUS" | grep -q "running"; then
17 | echo "VM is ready"
18 | break
19 | fi
20 | echo "VM not ready yet, waiting..."
21 | sleep 5
22 | done
23 | }
24 |
25 | echo "Stopping the application..."
26 | fly scale count 0 --app $APP_NAME
27 |
28 | echo "Creating new volume..."
29 | fly volumes create $NEW_VOLUME_NAME --size $NEW_VOLUME_SIZE --region $REGION
30 |
31 | echo "Creating temporary machine with old volume..."
32 | cat > migrate-old.toml << EOL
33 | app = "$APP_NAME"
34 | primary_region = "$REGION"
35 |
36 | [build]
37 | image = "alpine:latest"
38 |
39 | [mounts]
40 | source = "$OLD_VOLUME_NAME"
41 | destination = "/data"
42 |
43 | [processes]
44 | app = "sleep infinity"
45 | EOL
46 |
47 | echo "Deploying temporary machine with old volume..."
48 | fly deploy --config migrate-old.toml --app $APP_NAME
49 | wait_for_vm
50 |
51 | echo "Copying data from old volume to temporary storage..."
52 | fly ssh console --command 'cd /data && gzip -c frontpage.sqlite > frontpage.sqlite.gz && gzip -c frontpage.sqlite-shm > frontpage.sqlite-shm.gz && gzip -c frontpage.sqlite-wal > frontpage.sqlite-wal.gz' --app $APP_NAME
53 |
54 | echo "Downloading database files from old volume..."
55 | fly sftp shell --app $APP_NAME << EOF
56 | get /data/frontpage.sqlite.gz ~/social-protocols-data/recover/frontpage.sqlite.gz
57 | get /data/frontpage.sqlite-shm.gz ~/social-protocols-data/recover/frontpage.sqlite-shm.gz
58 | get /data/frontpage.sqlite-wal.gz ~/social-protocols-data/recover/frontpage.sqlite-wal.gz
59 | exit
60 | EOF
61 |
62 | echo "Destroying temporary machine..."
63 | fly scale count 0 --app $APP_NAME
64 | fly machines destroy $(fly machines list --json | jq -r '.[].id') --force --app $APP_NAME
65 |
66 | echo "Creating temporary machine with new volume..."
67 | cat > migrate-new.toml << EOL
68 | app = "$APP_NAME"
69 | primary_region = "$REGION"
70 |
71 | [build]
72 | image = "alpine:latest"
73 |
74 | [mounts]
75 | source = "$NEW_VOLUME_NAME"
76 | destination = "/data"
77 |
78 | [processes]
79 | app = "sleep infinity"
80 | EOL
81 |
82 | echo "Deploying temporary machine with new volume..."
83 | fly deploy --config migrate-new.toml --app $APP_NAME
84 | wait_for_vm
85 |
86 | echo "Uploading database files to new volume..."
87 | fly sftp shell --app $APP_NAME << EOF
88 | put ~/social-protocols-data/recover/frontpage.sqlite.gz /data/frontpage.sqlite.gz
89 | put ~/social-protocols-data/recover/frontpage.sqlite-shm.gz /data/frontpage.sqlite-shm.gz
90 | put ~/social-protocols-data/recover/frontpage.sqlite-wal.gz /data/frontpage.sqlite-wal.gz
91 | exit
92 | EOF
93 |
94 | echo "Extracting database files on new volume..."
95 | fly ssh console --command 'cd /data && gunzip frontpage.sqlite.gz && gunzip frontpage.sqlite-shm.gz && gunzip frontpage.sqlite-wal.gz' --app $APP_NAME
96 |
97 | echo "Updating mount configuration..."
98 | # Create a temporary file for the new fly.toml
99 | cat > fly.toml.new << EOL
100 | [mounts]
101 | source = "$NEW_VOLUME_NAME"
102 | destination = "/data"
103 | EOL
104 |
105 | # Backup the original fly.toml
106 | cp fly.toml fly.toml.backup
107 |
108 | # Update the mounts section in fly.toml
109 | sed -i.bak '/\[mounts\]/,/^$/c\' fly.toml
110 | cat fly.toml.new >> fly.toml
111 | rm fly.toml.new migrate-old.toml migrate-new.toml
112 |
113 | echo "Deploying application with new volume..."
114 | fly deploy
115 | wait_for_vm
116 |
117 | echo "Verifying application is running..."
118 | fly status --app $APP_NAME
119 |
120 | echo "If everything looks good, you can delete the old volume with:"
121 | echo "fly volumes delete $OLD_VOLUME_NAME --app $APP_NAME"
122 | echo ""
123 | echo "To rollback, restore the original fly.toml:"
124 | echo "mv fly.toml.backup fly.toml"
--------------------------------------------------------------------------------
/postprocessing.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "database/sql"
7 | "fmt"
8 | "io"
9 | "strings"
10 | "time"
11 |
12 | "github.com/pkg/errors"
13 | "golang.org/x/exp/slog"
14 | )
15 |
16 | const (
17 | qnRankFormulaSQL = "pow(ageHours * (cumulativeUpvotes + overallPriorWeight)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + overallPriorWeight), 0.8) / pow(ageHours + 2, gravity/0.8) desc"
18 |
19 | // qnRankFormulaSQL = `
20 | // pow(
21 | // ageHours *
22 | // sample_from_gamma_distribution(
23 | // cumulativeUpvotes + overallPriorWeight,
24 | // (
25 | // 1-exp(-fatigueFactor*cumulativeExpectedUpvotes)
26 | // ) / fatigueFactor + overallPriorWeight
27 | // )
28 | // , 0.8
29 | // ) / pow(
30 | // ageHours + 2
31 | // , gravity/0.8
32 | // ) desc`
33 |
34 | hnRankFormulaSQL = "(score-1) / pow(ageHours + 2, gravity/0.8) desc"
35 | )
36 |
37 | func (app app) crawlPostprocess(ctx context.Context, tx *sql.Tx) error {
38 | t := time.Now()
39 | defer crawlPostprocessingDuration.UpdateDuration(t)
40 |
41 | var err error
42 |
43 | // for _, filename := range []string{"previous-crawl.sql", "resubmissions.sql", "raw-ranks.sql", "upvote-rates.sql"} {
44 | for _, filename := range []string{
45 | "previous-crawl.sql",
46 | "resubmissions.sql",
47 | "raw-ranks.sql",
48 | } {
49 | app.logger.Info("Processing SQL file", slog.String("filename", filename))
50 | err = executeSQLFile(ctx, tx, filename)
51 | if err != nil {
52 | return err
53 | }
54 | }
55 |
56 | err = app.updateQNRanks(ctx, tx)
57 | if err != nil {
58 | return errors.Wrap(err, "updateQNRanks")
59 | }
60 |
61 | app.logger.Info("Finished crawl postprocessing", slog.Duration("elapsed", time.Since(t)))
62 |
63 | return err
64 | }
65 |
66 | var qnRanksSQL = readSQLSource("qnranks.sql")
67 |
68 | func (app app) updateQNRanks(ctx context.Context, tx *sql.Tx) error {
69 | t := time.Now()
70 |
71 | d := defaultFrontPageParams
72 | sql := fmt.Sprintf(qnRanksSQL, d.PriorWeight, d.OverallPriorWeight, d.Gravity, d.PenaltyWeight, d.FatigueFactor, qnRankFormulaSQL)
73 |
74 | stmt, err := tx.Prepare(sql)
75 | if err != nil {
76 | return errors.Wrap(err, "preparing updateQNRanksSQL")
77 | }
78 |
79 | _, err = stmt.ExecContext(ctx)
80 |
81 | app.logger.Info("Finished executing updateQNRanks", slog.Duration("elapsed", time.Since(t)))
82 |
83 | return errors.Wrap(err, "executing updateQNRanksSQL")
84 | }
85 |
86 | func readSQLSource(filename string) string {
87 | f, err := resources.Open("sql/" + filename)
88 | if err != nil {
89 | panic(err)
90 | }
91 | defer f.Close()
92 | buf := bytes.NewBuffer(nil)
93 | _, err = io.Copy(buf, f)
94 | if err != nil {
95 | panic(err)
96 | }
97 |
98 | return buf.String()
99 | }
100 |
101 | func executeSQLFile(ctx context.Context, tx *sql.Tx, filename string) error {
102 | sql := readSQLSource(filename)
103 |
104 | sql = strings.Trim(sql, " \n\r;")
105 |
106 | parts := strings.Split(sql, ";\n")
107 |
108 | for _, sql := range parts {
109 |
110 | stmt, err := tx.Prepare(sql)
111 | if err != nil {
112 | return errors.Wrapf(err, "preparing SQL in file %s", filename)
113 | }
114 |
115 | _, err = stmt.ExecContext(ctx)
116 |
117 | if err != nil {
118 | return errors.Wrapf(err, "executing SQL in file %s", filename)
119 | }
120 | }
121 | return nil
122 | }
123 |
--------------------------------------------------------------------------------
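The qnRankFormulaSQL constant above is the order-by expression for the quality-news ranking. As a rough illustration only (the real computation happens in SQL), the same arithmetic written as a plain Go function:

```go
package main

import (
	"fmt"
	"math"
)

// qnRankingScore mirrors the arithmetic in qnRankFormulaSQL: a fatigue-adjusted,
// prior-weighted upvote rate, scaled by age and divided by an HN-style gravity term.
// Stories are ranked by this value in descending order.
func qnRankingScore(ageHours, cumulativeUpvotes, cumulativeExpectedUpvotes,
	overallPriorWeight, fatigueFactor, gravity float64) float64 {

	fatiguedExpected := (1 - math.Exp(-fatigueFactor*cumulativeExpectedUpvotes)) / fatigueFactor
	upvoteRate := (cumulativeUpvotes + overallPriorWeight) / (fatiguedExpected + overallPriorWeight)

	return math.Pow(ageHours*upvoteRate, 0.8) / math.Pow(ageHours+2, gravity/0.8)
}

func main() {
	// Parameter values here are illustrative only; the real defaults come from defaultFrontPageParams.
	fmt.Println(qnRankingScore(3.0, 40, 25, 2.3, 0.003462767, 1.4))
}
```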
/prometheus.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "net/http"
6 | "os"
7 | "time"
8 |
9 | "github.com/VictoriaMetrics/metrics"
10 | "github.com/johnwarden/httperror"
11 | "golang.org/x/exp/slog"
12 | )
13 |
14 | // Register various metrics.
15 | // Metric name may contain labels in Prometheus format - see below.
16 |
17 | var (
18 | crawlErrorsTotal = metrics.NewCounter(`errors_total{type="crawl"}`)
19 | archiveErrorsTotal = metrics.NewCounter(`errors_total{type="archive"}`)
20 | requestErrorsTotal = metrics.NewCounter(`errors_total{type="request"}`)
21 | crawlDuration = metrics.NewHistogram("crawl_duration_seconds")
22 | crawlPostprocessingDuration = metrics.NewHistogram("crawl_postprocessing_duration_seconds")
23 |
24 | upvotesTotal = metrics.NewCounter(`upvotes_total`)
25 | submissionsTotal = metrics.NewCounter(`submissions_total`)
26 | storiesArchivedTotal = metrics.NewCounter(`stories_archived_total`)
27 | storiesPurgedTotal = metrics.NewCounter(`stories_purged_total`)
28 |
29 | vacuumOperationsTotal = metrics.NewCounter(`database_vacuum_operations_total{database="frontpage"}`)
30 |
31 | // Store histograms per route to avoid duplicate registration
32 | routeHistograms = make(map[string]*metrics.Histogram)
33 | )
34 |
35 | // getRouteHistogram returns an existing histogram for a route or creates a new one
36 | func getRouteHistogram(routeName string) *metrics.Histogram {
37 | if h, exists := routeHistograms[routeName]; exists {
38 | return h
39 | }
40 | h := metrics.NewHistogram(`requests_duration_seconds{route="` + routeName + `"}`)
41 | routeHistograms[routeName] = h
42 | return h
43 | }
44 |
45 | func servePrometheusMetrics() func(ctx context.Context) error {
46 | mux := http.NewServeMux()
47 |
48 | // Export all the registered metrics in Prometheus format at `/metrics` http path.
49 | mux.HandleFunc("/metrics", func(w http.ResponseWriter, req *http.Request) {
50 | metrics.WritePrometheus(w, true)
51 | })
52 |
53 | listenAddress := os.Getenv("LISTEN_ADDRESS")
54 |
55 | s := &http.Server{
56 | Addr: listenAddress + ":9091",
57 | Handler: mux,
58 | }
59 |
60 | go func() {
61 | LogFatal(slog.Default(), "Listen and serve prometheus", s.ListenAndServe())
62 | }()
63 |
64 | return s.Shutdown
65 | }
66 |
67 | func prometheusMiddleware[P any](routeName string, h httperror.XHandler[P]) httperror.XHandlerFunc[P] {
68 | requestDuration := getRouteHistogram(routeName)
69 |
70 | return func(w http.ResponseWriter, r *http.Request, p P) error {
71 | var startTime time.Time
72 | if r.Method != http.MethodHead {
73 | startTime = time.Now()
74 | }
75 |
76 | err := h.Serve(w, r, p)
77 |
78 | if r.Method != http.MethodHead && routeName != "health" && routeName != "crawl-health" {
79 | requestDuration.UpdateDuration(startTime)
80 | }
81 |
82 | return err
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
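getRouteHistogram relies on all routes being registered before concurrent traffic arrives, since the routeHistograms map is not guarded by a lock. The VictoriaMetrics library also provides GetOrCreateHistogram, which is safe for concurrent use; a minimal sketch of the library calls involved (illustrative, not a change to the code above):

```go
package main

import (
	"net/http"
	"time"

	"github.com/VictoriaMetrics/metrics"
)

func main() {
	// GetOrCreateHistogram registers the metric on first use and returns the
	// existing histogram afterwards, so it can replace a hand-rolled lookup map.
	h := metrics.GetOrCreateHistogram(`requests_duration_seconds{route="frontpage"}`)

	start := time.Now()
	// ... handle the request here ...
	h.UpdateDuration(start)

	// Expose everything registered so far, as servePrometheusMetrics does.
	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, true)
	})
}
```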
/reset-prior-average-upvote-rate.sql:
--------------------------------------------------------------------------------
1 |
2 | with parameters as (
3 | select
4 | -- 2.2956 as priorWeight
5 | -- 4.0 as priorWeight
6 | 1.7 as priorWeight
7 | , 0.003462767 as fatigueFactor
8 | -- , 1.036 as priorAverage
9 | -- , 1.036 as priorAverage
10 | -- , .99 as priorAverage
11 | -- , 1.0 as priorAverage
12 | ), entryRates as (
13 | select
14 | userID
15 | , storyID
16 | , entryTime
17 | , entryUpvoteRate
18 | , max(cumulativeUpvotes) cumulativeUpvotes
19 | , max(cumulativeExpectedUpvotes) cumulativeExpectedUpvotes
20 | , (cumulativeUpvotes + priorWeight)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + priorWeight) newEntryUpvoteRate
21 | -- , (cumulativeUpvotes + priorWeight*1.174)/((1-exp(-fatigueFactor*cumulativeExpectedUpvotes))/fatigueFactor + priorWeight) newEntryUpvoteRate
22 | -- , (cumulativeUpvotes + priorWeight*1.145)/(cumulativeExpectedUpvotes + priorWeight) as newEntryUpvoteRate
23 |
24 |
25 | from
26 | votes
27 | join dataset
28 | on dataset.id = storyID
29 | join parameters
30 | where
31 | dataset.sampleTime
32 | and sampleTime <= entryTime
33 | -- and votes.userID != 0
34 | group by userID, storyID, entryTime
35 | )
36 | -- select * from entryRates where userID = 0 and storyID = 36805231 limit 10;
37 |
38 | update votes as u
39 | set entryUpvotes = entryRates.cumulativeUpvotes
40 | , entryExpectedUpvotes = entryRates.cumulativeExpectedUpvotes
41 | , entryUpvoteRate = entryRates.newEntryUpvoteRate
42 | from
43 | entryRates
44 | where entryRates.userID = u.userID
45 | and entryRates.storyID = u.storyID ;
46 |
47 |
--------------------------------------------------------------------------------
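The newEntryUpvoteRate expression above is the fatigue-adjusted Bayesian average used throughout the repo. Written out, with $w$ the priorWeight and $f$ the fatigueFactor from the parameters CTE:

```latex
\text{newEntryUpvoteRate}
  = \frac{\text{cumulativeUpvotes} + w}
         {\dfrac{1 - e^{-f \,\cdot\, \text{cumulativeExpectedUpvotes}}}{f} + w}
```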
/resources.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import "embed"
4 |
5 | //go:embed templates/*
6 | //go:embed sql/*
7 | //go:embed seed/*
8 | var resources embed.FS
9 |
--------------------------------------------------------------------------------
/score-page.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "database/sql"
5 | "fmt"
6 | "net/http"
7 |
8 | "github.com/johnwarden/httperror"
9 | "github.com/pkg/errors"
10 | )
11 |
12 | type ScorePageData struct {
13 | PageTemplateData
14 | Positions []Position
15 | Score float64
16 | ScorePlotData [][]any
17 | }
18 |
19 | // Override IsScorePage since it's not determined by Ranking
20 | func (d ScorePageData) IsScorePage() bool {
21 | return true
22 | }
23 |
24 | func (p ScorePageData) ScoreString() string {
25 | return fmt.Sprintf("%.2f", p.Score)
26 | }
27 |
28 | func (p ScorePageData) AverageScoreString() string {
29 | return fmt.Sprintf("%.2f", p.Score/float64(len(p.Positions)))
30 | }
31 |
32 | type ScorePageParams struct {
33 | UserID sql.NullInt64
34 | OptionalModelParams
35 | ScoringFormula string
36 | }
37 |
38 | func (app app) scoreHandler() func(http.ResponseWriter, *http.Request, ScorePageParams) error {
39 | return func(w http.ResponseWriter, r *http.Request, params ScorePageParams) error {
40 | nullUserID := params.UserID
41 | if !nullUserID.Valid {
42 |
43 | nullUserID = app.getUserID(r)
44 |
45 | if !nullUserID.Valid {
46 | return httperror.PublicErrorf(http.StatusUnauthorized, "not logged in")
47 | }
48 | }
49 |
50 | modelParams := params.OptionalModelParams.WithDefaults()
51 |
52 | userID := int(nullUserID.Int64)
53 |
54 | positions, err := app.getDetailedPositions(r.Context(), userID)
55 | if err != nil {
56 | return errors.Wrap(err, "getDetailedPositions")
57 | }
58 |
59 | var score float64
60 | for i, p := range positions {
61 |
62 | p.EntryUpvoteRate = modelParams.upvoteRate(p.EntryUpvotes, p.EntryExpectedUpvotes)
63 | p.CurrentUpvoteRate = modelParams.upvoteRate(p.CurrentUpvotes, p.CurrentExpectedUpvotes)
64 | p.Story.UpvoteRate = p.CurrentUpvoteRate
65 |
66 | if p.ExitUpvotes.Valid && p.ExitExpectedUpvotes.Valid {
67 | p.ExitUpvoteRate = sql.NullFloat64{
68 | Float64: modelParams.upvoteRate(int(p.ExitUpvotes.Int64), p.ExitExpectedUpvotes.Float64),
69 | Valid: true,
70 | }
71 | }
72 |
73 | p.UserScore = UserScore(p, modelParams, params.ScoringFormula)
74 |
75 | score += p.UserScore
76 | p.RunningScore = score
77 |
78 | p.Story.UpvoteRate = p.UpvoteRate
79 |
80 | positions[i] = p
81 | }
82 |
83 | n := len(positions)
84 | for i := range positions {
85 | positions[i].RunningScore = score - positions[i].RunningScore + positions[i].UserScore
86 | positions[i].Label = intToAlphaLabel(n - i - 1)
87 | }
88 |
89 | scorePlotData := make([][]any, n)
90 | for i, p := range positions {
91 | scorePlotData[n-i-1] = []any{
92 | p.EntryTime, p.RunningScore, fmt.Sprintf("%d", p.PositionID), p.Story.Title, p.UserScoreString(), p.Direction, p.EntryUpvoteRateString(), p.CurrentUpvoteRateString(), p.ExitUpvoteRateString(),
93 | }
94 | }
95 |
96 | pageSize := 1000
97 | if n > pageSize {
98 | n = pageSize
99 | }
100 |
101 | d := ScorePageData{
102 | PageTemplateData: PageTemplateData{
103 | UserID: nullUserID,
104 | Ranking: "score",
105 | },
106 | Positions: positions[0:n],
107 | Score: score,
108 | ScorePlotData: scorePlotData,
109 | }
110 |
111 | if err = templates.ExecuteTemplate(w, "score.html.tmpl", d); err != nil {
112 | return errors.Wrap(err, "executing score template")
113 | }
114 |
115 | return nil
116 | }
117 | }
118 |
119 | // convert an integer into an alphabetical label starting with A through Z, then continuing AA, AB, etc.
120 |
121 | func intToAlphaLabel(i int) string {
122 | r := make([]byte, 0, 1)
123 |
124 | // result := ""
125 | n := 0
126 | for {
127 | digit := i % 26
128 | letter := 'A' + digit
129 | // result = string(letter) + result
130 |
131 | r = append(r, byte(letter))
132 |
133 | i -= digit
134 | if i == 0 {
135 | break
136 | }
137 | i /= 26
138 | i -= 1
139 | n++
140 | }
141 |
142 | n = len(r)
143 | for i := 0; i < n/2; i++ {
144 | j := n - i - 1
145 |
146 | r[i], r[j] = r[j], r[i]
147 | }
148 |
149 | return string(r)
150 | }
151 |
--------------------------------------------------------------------------------
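intToAlphaLabel produces spreadsheet-style column labels (bijective base-26) from 0-based indices. A quick usage sketch, calling the function defined above, with the expected outputs noted:

```go
package main

import "fmt"

func main() {
	// Expected output:
	//   0 -> A, 25 -> Z, 26 -> AA, 27 -> AB, 701 -> ZZ, 702 -> AAA
	for _, i := range []int{0, 25, 26, 27, 701, 702} {
		fmt.Printf("%d -> %s\n", i, intToAlphaLabel(i))
	}
}
```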
/scraper.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "strconv"
7 | "strings"
8 | "sync"
9 | "time"
10 |
11 | colly "github.com/gocolly/colly/v2"
12 | "github.com/pkg/errors"
13 | "golang.org/x/exp/slog"
14 | )
15 |
16 | type rawStory struct {
17 | ID string
18 | row1
19 | row2
20 | }
21 |
22 | type row1 struct {
23 | Title string `selector:"span.titleline a"`
24 | FullTitle string `selector:"span.titleline"`
25 | URL string `selector:"span.titleline a" attr:"href"`
26 | Rank string `selector:"span.rank"`
27 | }
28 |
29 | type row2 struct {
30 | Author string `selector:"a.hnuser"`
31 | Score string `selector:"span.score"`
32 | SubmissionTime string `selector:"span.age" attr:"title"`
33 | AgeApprox string `selector:"span.age"`
34 | Links []string `selector:"a"`
35 | }
36 |
37 | type ScrapedStory struct {
38 | Story
39 | Rank int
40 | Source string
41 | }
42 |
43 | func (rs rawStory) Clean() (ScrapedStory, error) {
44 | story := ScrapedStory{
45 | Story: Story{
46 | Title: rs.Title,
47 | By: rs.Author,
48 | URL: rs.URL,
49 | },
50 | Source: "scraper",
51 | }
52 |
53 | // parse id
54 | {
55 | id, err := strconv.Atoi(rs.ID)
56 | if err != nil {
57 | return story, errors.Wrapf(err, "parse story id %s", rs.ID)
58 | }
59 | story.ID = id
60 | }
61 |
62 | // fix url
63 | if strings.HasPrefix(story.Story.URL, "item?id=") {
64 | story.Story.URL = "https://news.ycombinator.com/" + story.Story.URL
65 | }
66 |
67 | // parse score. This field will look like "4 points"
68 | {
69 | if fs := strings.Fields(rs.Score); len(fs) > 0 {
70 | scoreStr := strings.Fields(rs.Score)[0]
71 |
72 | score, err := strconv.Atoi(scoreStr)
73 | story.Score = score
74 | if err != nil {
75 | return story, errors.Wrapf(err, "parse story score %s", rs.Score)
76 | }
77 | } else {
78 | // if there is no upvotes field, then this is an HN job.
79 | // we want to include these in the database because they get ranked
80 | story.Job = true
81 | }
82 | }
83 |
84 | // parse submission time
85 | {
86 | // submission times now contain a timestamp string, followed by a
87 | // space then a unix timestamp with what looks like the *current*
88 | // time which I suppose we can just ignore. For
89 | // example "2024-10-23T16:44:01 1729713776"
90 | parts := strings.Split(rs.SubmissionTime, " ")
91 |
92 | var submissionTime time.Time
93 | var err error
94 |
95 | if strings.HasSuffix(parts[0], "Z") {
96 | // Old format with "Z" indicating UTC
97 | submissionTime, err = time.Parse("2006-01-02T15:04:05Z", parts[0])
98 | } else {
99 | // New format without "Z"
100 | submissionTime, err = time.Parse("2006-01-02T15:04:05", parts[0])
101 | }
102 |
103 | if err != nil {
104 | return story, errors.Wrapf(err, "parse submission time %s", rs.SubmissionTime)
105 | }
106 | story.SubmissionTime = submissionTime.Unix()
107 | story.OriginalSubmissionTime = story.SubmissionTime
108 | }
109 |
110 | // parse approximate age
111 | {
112 | // this will be something like "1 minute ago" or "3 hours ago"
113 | if fs := strings.Fields(rs.AgeApprox); len(fs) > 1 {
114 | n, err := strconv.Atoi(fs[0])
115 | if err != nil {
116 | return story, errors.Wrapf(err, "parse relative age %s", rs.AgeApprox)
117 | }
118 |
119 | var units int64
120 | if strings.HasPrefix(fs[1], "minute") { // "minute" or "minutes"
121 | units = 60
122 | } else if strings.HasPrefix(fs[1], "hour") {
123 | units = 3600
124 | } else if strings.HasPrefix(fs[1], "day") {
125 | units = 3600 * 24
126 | } else if strings.HasPrefix(fs[1], "month") {
127 | units = 3600 * 24 * 30
128 | } else if strings.HasPrefix(fs[1], "year") {
129 | units = 3600 * 24 * 364
130 | }
131 |
132 | story.AgeApprox = int64(n) * units
133 | } else {
134 | return story, fmt.Errorf("parse age %s", rs.AgeApprox)
135 | }
136 |
137 | // parse rank. we know the rank because of the order it appears in.
138 | // we just use this to do an integrity check later.
139 | {
140 | tRank := strings.Trim(rs.Rank, ".")
141 | var err error
142 | story.Rank, err = strconv.Atoi(tRank)
143 | if err != nil || story.Rank == 0 {
144 | return story, errors.Wrapf(err, "parse rank %s", rs.Rank)
145 | }
146 | }
147 |
148 | // parse the number of comments
149 | {
150 | // if there are comments, this will be the last tag. Unfortunately, it doesn't have an id or class.
151 | commentString := rs.Links[len(rs.Links)-1]
152 |
153 | // this string will be a single word like "comment" or "hide" if there are no comments.
154 | // otherwise it will be something like "12 comments"
155 | if fs := strings.Fields(commentString); len(fs) > 1 {
156 | c, err := strconv.Atoi(fs[0])
157 | if err != nil {
158 | return story, errors.Wrapf(err, "parse comments %s", commentString)
159 | }
160 | story.Comments = c
161 | }
162 | }
163 |
164 | // parse [flagged] and [dupe] tags
165 | {
166 | if strings.Contains(rs.FullTitle, "[flagged]") {
167 | story.Flagged = true
168 | }
169 | if strings.Contains(rs.FullTitle, "[dupe]") {
170 | story.Dupe = true
171 | }
172 | }
173 |
174 | return story, nil
175 | }
176 | }
177 |
178 | func (app app) newScraper(resultCh chan ScrapedStory, errCh chan error, moreLinkCh chan string) *colly.Collector {
179 | c := colly.NewCollector()
180 | c.SetClient(app.httpClient)
181 |
182 | var rs rawStory
183 |
184 | c.OnHTML("a.morelink", func(e *colly.HTMLElement) {
185 | moreLinkCh <- e.Attr("href")
186 | })
187 |
188 | c.OnHTML("tr table", func(e *colly.HTMLElement) {
189 | n := 0
190 | lastStoryRownum := 0
191 | e.ForEach("tr", func(i int, e *colly.HTMLElement) {
192 | class := e.Attr("class")
193 |
194 | // stories will always start with a tr of class athing
195 | if strings.Contains(class, "athing") && n < 30 {
196 | n = n + 1
197 | lastStoryRownum = i
198 | if n > 30 {
199 | return
200 | }
201 |
202 | rs = rawStory{
203 | ID: e.Attr("id"),
204 | }
205 | err := e.Unmarshal(&rs.row1)
206 | if err != nil {
207 | errCh <- err
208 | }
209 | } else if class == "" && i == lastStoryRownum+1 && n > 0 && n <= 30 {
210 | // the first tr after the "athing" contains the second row of
211 | // details for the story. Note also we must skip any trs
212 | // before the first athing because sometimes they contain
213 | // general page content.
214 |
215 | err := e.Unmarshal(&rs.row2)
216 |
217 | if err != nil {
218 | errCh <- err
219 | } else {
220 | st, err := rs.Clean()
221 | rank := st.Rank
222 |
223 | // Do an integrity check. If the rank shown for the story matches the
224 | // count we are keeping, we are all good.
225 | if err == nil && ((rank-1)%30)+1 != n {
226 | err = fmt.Errorf("Ranks out of order. Expected %d but parsed %d", n, (rank-1)%30+1)
227 | }
228 |
229 | if err != nil {
230 | Debugf(app.logger, "Failed to parse story %d. Raw story %#v", n, rs)
231 | errCh <- err
232 | } else {
233 | resultCh <- st
234 | }
235 | }
236 | }
237 | })
238 | })
239 |
240 | c.OnError(func(r *colly.Response, err error) {
241 | err = errors.Wrapf(err, "Failed to parse page %s", r.Request.URL)
242 | errCh <- err
243 | })
244 |
245 | return c
246 | }
247 |
248 | func (app app) scrapeHN(pageType string, resultCh chan ScrapedStory, errCh chan error) {
249 | baseUrl := "https://news.ycombinator.com/"
250 | url := baseUrl
251 | if pageType == "new" {
252 | url = url + "newest"
253 | } else if pageType != "top" {
254 | url = url + pageType
255 | }
256 | for p := 1; p <= 3; p++ {
257 | moreLinkCh := make(chan string, 1)
258 | c := app.newScraper(resultCh, errCh, moreLinkCh)
259 | err := c.Visit(url)
260 | if err != nil {
261 | errCh <- err
262 | }
263 | select {
264 | case relativeURL := <-moreLinkCh:
265 | url = baseUrl + relativeURL
266 | default:
267 | // there won't always be a next link; in particular, the show page could have fewer than 3 pages' worth of stories
268 | }
269 |
270 | }
271 | close(resultCh)
272 | close(errCh)
273 | }
274 |
275 | func (app app) scrapeFrontPageStories(ctx context.Context) (map[int]ScrapedStory, error) {
276 | app.logger.Info("Scraping front page stories")
277 |
278 | stories := map[int]ScrapedStory{}
279 |
280 | pageTypeName := "top"
281 |
282 | nSuccess := 0
283 |
284 | resultCh := make(chan ScrapedStory)
285 | errCh := make(chan error)
286 |
287 | var wg sync.WaitGroup
288 |
289 | t := time.Now()
290 |
291 | // scrape in a goroutine. the scraper will write results to the channel
292 | // we provide
293 | wg.Add(1)
294 | go func() {
295 | defer wg.Done()
296 | app.scrapeHN(pageTypeName, resultCh, errCh)
297 | }()
298 |
299 | // read from the error channel and print errors in a separate goroutine.
300 | // The scraper will block writing to the error channel if nothing is reading
301 | // from it.
302 | wg.Add(1)
303 | go func() {
304 | defer wg.Done()
305 | for err := range errCh {
306 | app.logger.Error("Error parsing story", err)
307 | crawlErrorsTotal.Inc()
308 | }
309 | }()
310 |
311 | for story := range resultCh {
312 | id := story.ID
313 |
314 | stories[id] = story
315 |
316 | nSuccess += 1
317 | }
318 |
319 | if nSuccess == 0 {
320 | return stories, fmt.Errorf("Didn't successfully parse any stories from %s page", pageTypeName)
321 | }
322 | Debugf(app.logger, "Crawled %d stories on %s page", nSuccess, pageTypeName)
323 |
324 | wg.Wait()
325 |
326 | app.logger.Info("Scraped stories", "pageTypeName", pageTypeName, slog.Duration("elapsed", time.Since(t)))
327 |
328 | return stories, nil
329 | }
330 |
--------------------------------------------------------------------------------
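The approximate-age parsing in Clean() converts strings such as "3 hours ago" into seconds by multiplying the leading number by a unit size. A standalone sketch of just that step (simplified from the code above, using the same rough month/year approximations):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// approxAgeSeconds reproduces the age parsing in (rawStory).Clean for strings
// such as "1 minute ago", "3 hours ago" or "2 days ago".
func approxAgeSeconds(s string) (int64, error) {
	fs := strings.Fields(s)
	if len(fs) < 2 {
		return 0, fmt.Errorf("parse age %s", s)
	}
	n, err := strconv.Atoi(fs[0])
	if err != nil {
		return 0, err
	}
	var unit int64
	switch {
	case strings.HasPrefix(fs[1], "minute"):
		unit = 60
	case strings.HasPrefix(fs[1], "hour"):
		unit = 3600
	case strings.HasPrefix(fs[1], "day"):
		unit = 3600 * 24
	case strings.HasPrefix(fs[1], "month"):
		unit = 3600 * 24 * 30
	case strings.HasPrefix(fs[1], "year"):
		unit = 3600 * 24 * 364 // same rough approximation as the scraper
	}
	return int64(n) * unit, nil
}

func main() {
	age, _ := approxAgeSeconds("3 hours ago")
	fmt.Println(age) // 10800
}
```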
/seed/domain-penalties.csv:
--------------------------------------------------------------------------------
1 | domain,avgPenalty
2 | www.phoronix.com,0.255983153311316
3 | arstechnica.com,0.236416125252806
4 | www.theguardian.com,0.304580202256946
5 | old.reddit.com,0.329146893687822
6 | twitter.com,0.343862039075023
7 | www.theregister.com,0.281049828586506
8 | www.theatlantic.com,0.269041168885244
9 | www.cnn.com,0.287677200012903
10 | www.latimes.com,0.442553344733379
11 | apnews.com,0.306319646291724
12 | www.wired.com,0.247088570701928
13 | torrentfreak.com,0.378240055055204
14 | www.fastcompany.com,0.259302489090707
15 | www.protocol.com,0.295085830042498
16 | www.forbes.com,0.373620224179877
17 | reason.com,0.393667070368929
18 | drewdevault.com,0.426765471415533
19 | www.washingtonpost.com,0.281714743939709
20 | www.scmp.com,0.357802047615181
21 | www.politico.com,0.300041785931081
22 | medium.com,0.280016222997068
23 | www.wsj.com,0.246757013133426
24 | www.cnbc.com,0.292165034038854
25 | www.ft.com,0.271791649480615
26 | nypost.com,0.426552760633264
27 | www.nytimes.com,0.247017260946097
28 | reclaimthenet.org,0.449452850524531
29 | tech.marksblogg.com,0.681258036075086
30 | gizmodo.com,0.235581227145393
31 | www.vice.com,0.273699442401756
32 | www.bbc.com,0.243614162442268
33 | techcrunch.com,0.267693218312698
34 | en.wikipedia.org,0.200554150996098
35 | www.macrumors.com,0.252190681850287
36 | www.bleepingcomputer.com,0.298953498978752
37 | www.telegraph.co.uk,0.328513269271261
38 | www.tomshardware.com,0.225581267219099
39 | www.thedrive.com,0.253433255715287
40 | www.sfchronicle.com,0.295735132759634
41 | www.businessinsider.com,0.327460617838536
42 | www.theverge.com,0.233671896247822
43 | www.eff.org,0.30545264142663
44 | theconversation.com,0.238252498612625
45 | www.bbc.co.uk,0.266873076770723
46 | astralcodexten.substack.com,0.274378958851422
47 | www.engadget.com,0.222515465389002
48 | www.marketwatch.com,0.345967596506456
49 | www.nasa.gov,0.235891052875635
50 | www.nationalreview.com,0.386958948863856
51 | web.archive.org,0.299904628615894
52 | www.dw.com,0.319977862059942
53 | nationalpost.com,0.442595555825156
54 | www.newsweek.com,0.389042499337081
55 | www.bloomberg.com,0.242968773325337
56 | www.nbcnews.com,0.272889000950255
57 | www.technologyreview.com,0.257431302810684
58 | lite.cnn.com,0.250700151641931
59 | venturebeat.com,0.276781089118411
60 | www.sfgate.com,0.281372733111948
61 | phys.org,0.191695575116859
62 | petapixel.com,0.202737368232906
63 | jalopnik.com,0.224182778568468
64 | www.cbsnews.com,0.215748434523396
65 | www.sciencealert.com,0.291226974242294
66 | appleinsider.com,0.358670726355805
67 | hackernoon.com,0.314810633979195
68 | www.space.com,0.209511817616723
69 | www.techdirt.com,0.279776442812103
70 | www.cbc.ca,0.247250210810386
71 | slate.com,0.265835164534057
72 | 9to5mac.com,0.251672380939431
73 | quillette.com,0.30212810108685
74 | www.independent.co.uk,0.287744881377527
75 | news.yahoo.com,0.306129953355535
76 | www.newscientist.com,0.248626857306742
77 | marginalrevolution.com,0.240319697857452
78 | www.cnet.com,0.240582214090158
79 | www.usatoday.com,0.249900498713825
80 | futurism.com,0.311144077242749
81 | www.scientificamerican.com,0.250479576916355
82 | thehill.com,0.297360919489742
83 | www.indiehackers.com,0.280577764502318
84 | finance.yahoo.com,0.242354394239265
85 | docs.google.com,0.246768467189723
86 | therecord.media,0.265035130188274
87 | blogs.nasa.gov,0.234768976839632
88 | www.micahlerner.com,0.228182741047012
89 | themarkup.org,0.169215301594475
90 | restofworld.org,0.277497103221563
91 | www.politico.eu,0.231618820954611
92 | www.france24.com,0.298707256113616
93 | betterprogramming.pub,0.239944839492751
94 | time.com,0.220345275200395
95 | www.schneier.com,0.24690892186614
96 | www.lesswrong.com,0.198347134793858
97 | www.pcmag.com,0.252328948155656
98 | thebulletin.org,0.185234962067628
99 | www.teslaoracle.com,0.218183317161904
100 | hbr.org,0.193063289207588
101 | thenewstack.io,0.226202908952288
102 | www.productlessons.xyz,0.165012975629553
103 | www.polygon.com,0.209155781150655
104 | medicalxpress.com,0.187450245477553
105 | www.anandtech.com,0.174405165433974
106 | electrek.co,0.142642514874113
107 |
--------------------------------------------------------------------------------
/sql/cumulative-upvotes.sql:
--------------------------------------------------------------------------------
1 | -- this query updates cumulativeUpvotes and cumulativeExpectedUpvotes
2 | -- accounting for possible gaps in the data (stories in the latest crawl but not the previous crawl).
3 | -- We only want cumulativeUpvotes or cumulativeExpectedUpvotes to increase if we have two consecutive data
4 | -- points (one minute apart).
5 |
6 | with latest as (
7 | select * from dataset where sampleTime = (select max(sampleTime) from dataset)
8 | )
9 | update dataset as d
10 | set
11 | cumulativeUpvotes = case
12 | when not gapInData then previousCrawl.cumulativeUpvotes + latest.score - previousCrawl.score
13 | else previousCrawl.cumulativeUpvotes
14 | end
15 | , cumulativeExpectedUpvotes = case
16 | when not gapInData then latest.cumulativeExpectedUpvotes
17 | else previousCrawl.cumulativeExpectedUpvotes
18 | end
19 | from latest left join previousCrawl using (id)
20 | where
21 | d.id = latest.id
22 | and d.sampleTime = (select max(sampleTime) from dataset)
23 |
--------------------------------------------------------------------------------
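The case expressions above boil down to one rule: when the story has two consecutive datapoints, grow cumulativeUpvotes by the score delta and take the latest cumulativeExpectedUpvotes; when there is a gap, carry the previous values forward unchanged. The same rule sketched in Go for illustration (integer types are a simplification):

```go
package main

import "fmt"

// accumulate applies the update rule from cumulative-upvotes.sql.
// It returns the new (cumulativeUpvotes, cumulativeExpectedUpvotes).
func accumulate(prevCumUpvotes, prevScore, latestScore, prevCumExpected, latestCumExpected int, gapInData bool) (int, int) {
	if gapInData {
		// The story was missing from the previous crawl: don't credit the gap.
		return prevCumUpvotes, prevCumExpected
	}
	return prevCumUpvotes + latestScore - prevScore, latestCumExpected
}

func main() {
	fmt.Println(accumulate(10, 12, 15, 8, 9, false)) // 13 9
	fmt.Println(accumulate(10, 12, 15, 8, 9, true))  // 10 8
}
```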
/sql/previous-crawl-index-old.sql:
--------------------------------------------------------------------------------
1 | create index previousCrawl_id_idx on previousCrawl (id);
--------------------------------------------------------------------------------
/sql/previous-crawl.sql:
--------------------------------------------------------------------------------
1 | -- This query selects the previous datapoint for every story in the latest crawl
2 | -- It is a bit tricky because the sampleTime may be different for each story, because
3 | -- some stories may appear and disappear from crawl results if they fall off the front page and reappear.
4 |
5 | create view if not exists previousCrawl as
6 | with latest as (
7 | select * from dataset
8 | where sampleTime = (select max(sampleTime) from dataset)
9 | )
10 | -- identify stories that are in the previous crawl. This is a quick indexed lookup
11 | , previousCrawl as (
12 | select
13 | id
14 | , sampleTime
15 | from dataset
16 | where sampleTime = (select max(sampleTime) from dataset where sampleTime != (select max(sampleTime) from dataset))
17 | )
18 | -- this query finds the sampleTime of the last time each story was
19 | -- crawled, for all stories that were not in the previous crawl. This
20 | -- subquery can be slow, so only do it for stories that weren't in the
21 | -- previous crawl.
22 | , previousSampleForStory as (
23 | select
24 | latest.id
25 | , ifnull(previousCrawl.sampleTime, max(dataset.sampleTime)) as sampleTime
26 | , previousCrawl.sampleTime is null as gapInData
27 | from latest left join previousCrawl using (id)
28 | left join dataset on (
29 | previousCrawl.id is null
30 | and latest.id = dataset.id
31 | and dataset.sampleTime < (select max(sampleTime) from dataset)
32 | )
33 | group by 1
34 | )
35 | select dataset.*, gapInData from previousSampleForStory join dataset using (id, sampleTime);
36 |
--------------------------------------------------------------------------------
/sql/qnranks.sql:
--------------------------------------------------------------------------------
1 | with parameters as (select %f as priorWeight, %f as overallPriorWeight, %f as gravity, %f as penaltyWeight, %f as fatigueFactor)
2 | , latestData as (
3 | select
4 | id
5 | , score
6 | , sampleTime
7 | , cast(sampleTime-submissionTime as real)/3600 as ageHours
8 | , cumulativeUpvotes
9 | , cumulativeExpectedUpvotes
10 | , penalty
11 | from dataset
12 | where sampleTime = (select max(sampleTime) from dataset)
13 | and score >= 3 -- story can't reach front page until score >= 3
14 | and coalesce(topRank, bestRank, newRank, askRank, showRank) is not null -- let's not rank stories if they aren't accumulating attention
15 | ),
16 | qnRanks as (
17 | select
18 | id
19 | , dense_rank() over(order by %s) as rank
20 | , sampleTime
21 | , penalty
22 | from latestData join parameters
23 | )
24 | update dataset as d set qnRank = qnRanks.rank
25 | from qnRanks
26 | where d.id = qnRanks.id and d.sampleTime = qnRanks.sampleTime;
27 |
--------------------------------------------------------------------------------
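For reference, the %f and %s placeholders in this file are filled by updateQNRanks in postprocessing.go, in the same order as the parameters CTE, with the order-by expression substituted last:

```go
// Excerpt from postprocessing.go; placeholder order is
// (priorWeight, overallPriorWeight, gravity, penaltyWeight, fatigueFactor, <order-by expression>).
sql := fmt.Sprintf(qnRanksSQL,
	d.PriorWeight,
	d.OverallPriorWeight,
	d.Gravity,
	d.PenaltyWeight,
	d.FatigueFactor,
	qnRankFormulaSQL,
)
```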
/sql/random-new-voter.sql:
--------------------------------------------------------------------------------
1 | with limits as (
2 | select
3 | count(*) / 1000 as n
4 | , abs(random()) % 10 as m
5 | from dataset
6 | )
7 | , randomFrontpageSample as (
8 | select id, sampleTime, cumulativeUpvotes, cumulativeExpectedUpvotes
9 | from dataset
10 | join stories using (id)
11 | join limits
12 | where timestamp > ( select min(sampleTime) from dataset ) -- only stories submitted since we started crawling
13 | and newRank is not null
14 | and not job
15 | and ( ( dataset.rowid - (select min(rowid) from dataset) ) % n ) = m
16 | )
17 | , storiesToUpvote as (
18 | select id as storyID
19 | , min(sampleTime) as minSampleTime
20 | , min(cumulativeUpvotes) as minUpvotes
21 | , min(cumulativeExpectedUpvotes) as minExpectedUpvotes
22 | from randomFrontpageSample
23 | group by id
24 | order by sampleTime
25 | )
26 | , positions as (
27 | select
28 | 0 as userID
29 | , storiesToUpvote.storyID
30 | , 1 as direction
31 | , minSampleTime as entryTime
32 | , minUpvotes as entryUpvotes
33 | , minExpectedUpvotes as entryExpectedUpvotes
34 | , row_number() over () as positionID
35 | from storiesToUpvote
36 | -- left join votes existingVotes using (storyID)
37 | -- where existingVotes.storyID is null
38 | ) select
39 | userID
40 | , storyID
41 | , positionID
42 | , direction
43 | , entryTime
44 | , entryUpvotes
45 | , entryExpectedUpvotes
46 | , null as exitTime
47 | , null as exitUpvotes
48 | , null as exitExpectedUpvotes
49 | , cumulativeUpvotes
50 | , cumulativeExpectedUpvotes
51 | , title
52 | , url
53 | , by
54 | , unixepoch() - sampleTime + coalesce(ageApprox, sampleTime - submissionTime) ageApprox
55 | , score
56 | , descendants as comments
57 | from positions
58 | join dataset on
59 | positions.storyID = id
60 | join stories using (id)
61 | group by positionID
62 | having max(dataset.sampleTime)
63 | order by entryTime desc
64 | ;
65 |
--------------------------------------------------------------------------------
/sql/random-top-voter.sql:
--------------------------------------------------------------------------------
1 | with randomDatapoints as (
2 | select
3 | id, sampleTime , cumulativeUpvotes, cumulativeExpectedUpvotes
4 | -- , row_number() over () as
5 | , row_number() over () as i
6 | , count() over () as nIDs
7 | from dataset
8 | join stories using (id)
9 | where
10 | timestamp > ( select min(sampleTime) from dataset ) -- only stories submitted since we started crawling
11 | and sampleTime > ( select max(sampleTime) from dataset ) - 24 * 60 * 60
12 | and topRank is not null
13 | ),
14 | limits as (
15 | select abs(random()) % ( nIds / 100 ) as n
16 | from randomDatapoints
17 | where i = 1
18 | )
19 | , storiesToUpvote as (
20 | select id as storyID
21 | , min(sampleTime) as minSampleTime
22 | , min(cumulativeUpvotes) as minUpvotes
23 | , min(cumulativeExpectedUpvotes) as minExpectedUpvotes
24 | from randomDatapoints join limits
25 | -- sampleTime % nIDs = n
26 | where
27 | ( i ) % (nIDs / 100) = n
28 | group by id
29 | order by sampleTime
30 | )
31 | , positions as (
32 | select
33 | ? as userID
34 | , storiesToUpvote.storyID
35 | , 1 as direction
36 | , minSampleTime as entryTime
37 | , minUpvotes as entryUpvotes
38 | , minExpectedUpvotes as entryExpectedUpvotes
39 | , row_number() over () as positionID
40 | from storiesToUpvote
41 | -- left join votes existingVotes using (storyID)
42 | -- where existingVotes.storyID is null
43 | ) select
44 | userID
45 | , storyID
46 | , positionID
47 | , direction
48 | , entryTime
49 | , entryUpvotes
50 | , entryExpectedUpvotes
51 | , null as exitTime
52 | , null as exitUpvotes
53 | , null as exitExpectedUpvotes
54 | , cumulativeUpvotes
55 | , cumulativeExpectedUpvotes
56 | , title
57 | , url
58 | , by
59 | , unixepoch() - sampleTime + coalesce(ageApprox, sampleTime - submissionTime) ageApprox
60 | , score
61 | , descendants as comments
62 | from positions
63 | join dataset on
64 | positions.storyID = id
65 | join stories using (id)
66 | group by positionID
67 | having max(dataset.sampleTime)
68 | order by entryTime desc;
--------------------------------------------------------------------------------
/sql/raw-ranks.sql:
--------------------------------------------------------------------------------
1 | with rankingScores as (
2 | select
3 | id
4 | , sampleTime
5 | , topRank
6 | , pow(score-1, 0.8) / pow(cast(sampleTime - submissionTime as real)/3600+2, 1.8) as rankingScore -- pre-penalty HN ranking formula
7 | , ageApprox
8 | , job
9 | , score
10 | , timeStamp != submissionTime as resubmitted
11 | from dataset join stories using (id)
12 | where sampleTime = (select max(sampleTime) from dataset)
13 | -- normally a story is eligible to rank on front page once score >= 3
14 | -- but jobs can be on the front page without a score, and sometimes I see
15 | -- stories on the front page with a score of only 2. We want to calculate
16 | -- raw rank for any story that is ranked, or **should** be ranked.
17 | and (score >= 3 or topRank is not null)
18 | order by topRank asc, rankingScore desc
19 | ),
20 | rawRanks as (
21 | select
22 | id
23 | , sampleTime
24 | , job
25 | , resubmitted
26 | , topRank as rank
27 | , score
28 | , count(*) over (order by rankingScore desc) as rawRank
29 | from rankingScores
30 | order by rank nulls last
31 | )
32 | update dataset as d
33 | set rawRank = count(*) over (
34 | order by case when rawRanks.job then rawRanks.rank else rawRanks.rawRank end, rawRanks.job desc
35 | )
36 | from rawRanks
37 | where d.id = rawRanks.id
38 | and d.sampleTime = rawRanks.sampleTime
39 | ;
40 |
--------------------------------------------------------------------------------
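The rankingScore column above is the pre-penalty Hacker News ranking formula. In conventional notation:

```latex
\text{rankingScore} = \frac{(\text{score} - 1)^{0.8}}{(\text{ageHours} + 2)^{1.8}}
```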
/sql/resubmissions.sql:
--------------------------------------------------------------------------------
1 | -- ESTIMATING RESUBMISSION TIME
2 |
3 | -- THE PROBLEM
4 |
5 | -- When a story is resubmitted, its submission time is updated to the current
6 | -- time, which gives it a rankings boost.
7 |
8 | -- We want to know what this new submission time is, so our algorithm can give
9 | -- stories the same boost. Also our penalty calculation requires knowing each
10 | -- story's pre-penalty ranking score, which requires knowing their submission
11 | -- times.
12 |
13 | -- Unfortunately exact resubmission times are not currently published by HN. The API always
14 | -- gives the story's original submission time.
15 |
16 | -- Each story's submission time datestamp is also included in the HTML when
17 | -- the story is displayed: you can see it when you hover the mouse over the
18 | -- age field ("20 minutes ago").
19 |
20 | -- Unfortunately, although the approximate age field ("20 minutes ago")
21 | -- reflects the resubmission time, the datestamp in the HTML is the original
22 | -- submission time.
23 |
24 | -- So we can only estimate the resubmission time from this approximate age
25 | -- field.
26 |
27 | -- But the approximate age is neither precise nor accurate. It is always a
28 | -- whole number of minutes, hours, or days, rounded down: 1 hour 59 minutes
29 | -- is shown as "1 hour ago", and 1 day 23 hours is shown as "one day ago".
30 |
31 | -- When a story is less than an hour old, we have minute-level granularity.
32 | -- However, this number is imprecise: it can be off by a couple of minutes
33 | -- either way.
34 |
35 | -- Further, resubmitted stories don't seem to show up on the front page (at least
36 | -- not the top 90 ranks we crawl) until they are at least an hour old.
37 |
38 |
39 | -- THE SOLUTION: We wrote dang to ask if he can help us out here. But I have
40 | -- implemented a pretty accurate solution:
41 |
42 | -- We can tell a story has been resubmitted within the last 24 hours because
43 | -- the original submission time will be far earlier (typically by hours) than the
44 | -- submission time implied by the approximate age parsed from the web page (e.g. "3 hours ago").
45 |
46 | -- If the story is less than 1 day old, we can then place lower and upper
47 | -- bounds on the resubmission time. If it says "3 hours", it means anywhere
48 | -- from 3:00 h to 3:59 h ago.
49 |
50 | -- So each time we crawl, we calculate a lower bound on the story's
51 | -- resubmission time (based on an upper bound on its age),
52 | -- and then compare it to the previous lower bound and keep whichever
53 | -- is greater.
54 |
55 | -- So if a story was submitted "3 hours ago" we know the story is at most 4
56 | -- hours old. So we save the sampleTime-4 hours in the submissionTime field,
57 | -- understanding that this is a lower bound on submissionTime. Then in the
58 | -- next minute we redo the calculation. If it still says "3 hours old" then
59 | -- our new implied lower bound on submission time will be greater than the
60 | -- previous lower bound by one minute. So we move the lower bound up by a minute.
61 | -- (lower bounds always move up as we discover higher lower bounds).
62 |
63 | -- When the age string changes to "4 hours ago", we will know the story is at
64 | -- most 4 hours 59 minutes old. But the implied submission time will be about one hour less
65 | -- than the lower bound we calculated one minute before. So we keep the
66 | -- current lower bound. At this point, we have the exact resubmission time
67 | -- within a couple of minutes either way.
68 |
69 | -- Other considerations: We can't detect resubmission times for stories more than a day old
70 | -- (unless they were resubmitted several days later). It is possible that a
71 | -- resubmitted story is more than a day old, and is still on the front page.
72 | -- In that case, we cannot determine it is a resubmitted story. So we need to
73 | -- calculate the resubmission time before the story is a day old. We then
74 | -- remember this time, updating each subsequent datapoint to use this time.
75 |
76 | with latest as (
77 | -- first, get the data from the latest crawl, determine which stories have
78 | -- been resubmitted, and estimate a lower bound on submission time
79 | select
80 | *
81 | , timestamp as originalSubmissionTime
82 | , sampleTime - ageApprox - timestamp > 7200 and ageApprox < 3600*24 as resubmitted
83 | , cast(
84 | case
85 | when
86 | -- we know a story has been resubmitted if the submission time implied
87 | -- by the approximate age differs by too much. Because age is rounded
88 | -- down, the difference can be up to one hour plus a few minutes
89 | -- because of data delay. In practice, the difference is always
90 | -- several hours. Using a cutoff at two hours should be good. Also,
91 | -- we should filter out stories more than a day old: if we just saw
92 | -- these stories for the first time, we don't know if they have been
93 | -- resubmitted or not (and thus don't know how old they really are)
94 | sampleTime - ageApprox - timestamp > 3600*2 and ageApprox < 3600*24
95 | and not job then
96 | -- calculate an upper bound on age
97 | case
98 | when ageApprox < 3600 then ageApprox+59 -- e.g. if a story is "5 minutes old", it could be up to 5 minutes and 59 seconds old
99 | when ageApprox < 3600*24 then (ageApprox+59*60) -- if a story is "1 hour old" it could be up to 1:59m old
100 | end + 100 -- add another 100 seconds because the age field tends to be a little stale.
101 | else sampleTime - timestamp
102 | end
103 | as real) / 3600 as ageHours
104 | from dataset join stories using (id)
105 | where sampleTime = (select max(sampleTime) from dataset)
106 | )
107 | update dataset as d
108 | -- And use the greater of the lower-bound submission time from the last crawl, and the one we just calculated.
109 | set submissionTime = case when latest.sampleTime - ageHours*3600 > ifnull(previousCrawl.submissionTime,0) then cast(latest.sampleTime - ageHours*3600 as int) else previousCrawl.submissionTime end
110 | from latest
111 | left join previousCrawl using (id)
112 | where d.id = latest.id and d.sampleTime = latest.sampleTime;
113 |
--------------------------------------------------------------------------------
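The bound-keeping described in the comment reduces to one comparison per crawl: compute a fresh lower bound on the (re)submission time from the upper bound on age, then keep whichever lower bound is greater. A small sketch of that rule in Go (times in Unix seconds; illustrative only):

```go
package main

import "fmt"

// resubmissionLowerBound mirrors the update in resubmissions.sql:
// the new lower bound on submission time is sampleTime minus an upper bound
// on the story's age, and lower bounds only ever move up.
func resubmissionLowerBound(sampleTime, ageApprox, previousBound int64) int64 {
	var upperBoundAge int64
	if ageApprox < 3600 {
		upperBoundAge = ageApprox + 59 // "5 minutes ago" can mean up to 5m59s
	} else {
		upperBoundAge = ageApprox + 59*60 // "1 hour ago" can mean up to 1h59m
	}
	upperBoundAge += 100 // the age field tends to be slightly stale

	newBound := sampleTime - upperBoundAge
	if newBound > previousBound {
		return newBound
	}
	return previousBound
}

func main() {
	// A story that still reads "3 hours ago" one crawl later: the bound moves up by one minute.
	fmt.Println(resubmissionLowerBound(1_700_000_000, 3*3600, 0))
	fmt.Println(resubmissionLowerBound(1_700_000_060, 3*3600, 1_700_000_000-3*3600-59*60-100))
}
```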
/sql/upvote-rates.sql:
--------------------------------------------------------------------------------
1 | /*Calculate the moving average upvote rate. The moving average window is based
2 | on expected upvotes, instead of time. As a result, the length of the window
3 | in terms of number of rows of data is variable. The calculation to identify
4 | the rows that fall within the window could be very inefficient: the query
5 | will scan the entire dataset to find rows where the difference between
6 | cumulativeExpectedUpvotes and the latest cumulativeExpectedUpvotes falls
7 | within the window. So we save the sampleTime of the start of the window in
8 | the database, so the query only needs to scan rows within this window.
9 | */
10 | with parameters as (
11 | select 50 as windowSize
12 | , 2.3 as priorWeight
13 | , 0.003462767 as fatigueFactor
14 | ), latest as (
15 | select
16 | latest.id
17 | , latest.sampleTime
18 | , latest.score
19 | , latest.cumulativeUpvotes
20 | , latest.cumulativeExpectedUpvotes
21 | , ifnull(previous.upvoteRateWindow,0) as upvoteRateWindow
22 | from dataset latest join previousCrawl previous using (id)
23 | where latest.sampleTime = (select max(sampleTime) from dataset)
24 | )
25 | , windows as (
26 | select
27 | latest.id
28 | , latest.sampleTime
29 | , latest.cumulativeUpvotes as cumulativeUpvotes
30 | , latest.cumulativeExpectedUpvotes as cumulativeExpectedUpvotes
31 | , max(dataset.sampleTime) as newWindow
32 | , min(latest.cumulativeUpvotes - dataset.cumulativeUpvotes) as upvotesInWindow
33 | , min(latest.cumulativeExpectedUpvotes - dataset.cumulativeExpectedUpvotes) as expectedUpvotesInWindow
34 | , min(latest.cumulativeExpectedUpvotes - dataset.cumulativeExpectedUpvotes) - windowSize as over
35 | , parameters.*
36 | from latest
37 | join parameters
38 | left join dataset on
39 | latest.id = dataset.id
40 | and dataset.sampleTime >= latest.upvoteRateWindow
41 | and latest.cumulativeExpectedUpvotes - dataset.cumulativeExpectedUpvotes > windowSize
42 | group by latest.id
43 | )
44 | update dataset
45 | set
46 | upvoteRate = case
47 | when upvotesInWindow is null then ( dataset.cumulativeUpvotes + priorWeight ) / ( (1-exp(-fatigueFactor*dataset.cumulativeExpectedUpvotes))/fatigueFactor + priorWeight)
48 | else ( upvotesInWindow + priorWeight ) / (
49 | -- The formula for adjusting expected upvotes for fatigue comes from the assumption that expected upvote rate decays
50 | -- exponentially: fatigueAdjustedExpectedUpvoteRate = exp(-fatigueFactor*cumulativeExpectedUpvotes).
51 | -- So fatigueAdjustedExpectedUpvotes is the total area under this curve, or the integral of
52 | -- fatigueAdjustedExpectedUpvoteRate from 0 to max(cumulativeExpectedUpvotes), which is:
53 | -- ( 1-exp(-fatigueFactor*max(cumulativeExpectedUpvotes)) ) / fatigueFactor
54 | -- But now we want the area under the curve within the moving average window,
55 | -- So we integrate from max(cumulativeExpectedUpvotes) - expectedUpvotesInWindow to max(cumulativeExpectedUpvotes),
56 | -- which gives us the below formula.
57 |
58 | (
59 | exp(-fatigueFactor*(dataset.cumulativeExpectedUpvotes - expectedUpvotesInWindow))
60 | -exp(-fatigueFactor*dataset.cumulativeExpectedUpvotes)
61 | )/fatigueFactor
62 | + priorWeight)
63 | end
64 | , upvoteRateWindow = newWindow
65 | from windows
66 | where windows.id = dataset.id and windows.sampleTime = dataset.sampleTime;
67 |
68 | -- select
69 | -- id
70 | -- , sampleTime
71 | -- , newWindow
72 | -- , cumulativeUpvotes
73 | -- , cumulativeExpectedUpvotes
74 | -- , upvotesInWindow
75 | -- , expectedUpvotesInWindow
76 | -- , ( upvotesInWindow + priorWeight ) / ( expectedUpvotesInWindow + priorWeight) as movingAverageUpvoteRate
77 | -- , ( cumulativeUpvotes + priorWeight ) / ( cumulativeExpectedUpvotes + priorWeight) as upvoteRate
78 | -- from windows
79 | -- where movingAverageUpvoteRate is not null
80 | -- limit 10;
81 |
82 |
83 |
84 |
85 | -- where datset.id = windows.id
86 |
87 |
88 | -- select
89 | -- id
90 | -- , newWindow
91 | -- , cumulativeUpvotes
92 | -- , cumulativeExpectedUpvotes
93 | -- , upvotesInWindow
94 | -- , expectedUpvotesInWindow
95 | -- , ( upvotesInWindow + priorWeight ) / ( expectedUpvotesInWindow + priorWeight) as movingAverageUpvoteRate
96 | -- , ( cumulativeUpvotes + priorWeight ) / ( cumulativeExpectedUpvotes + priorWeight) as upvoteRate
97 | -- from windows join parameters
98 | -- -- where movingAverageUpvoteRate is not null
99 | -- limit 10;
100 |
101 |
102 |
--------------------------------------------------------------------------------
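The comment inside the update above derives the fatigue-adjusted expected upvotes within the moving-average window by integrating the decaying rate over that window. With $f$ the fatigueFactor, $E$ the latest cumulativeExpectedUpvotes and $W$ the expectedUpvotesInWindow:

```latex
\int_{E - W}^{E} e^{-f x}\,dx \;=\; \frac{e^{-f (E - W)} - e^{-f E}}{f}
```

so the moving-average upvote rate is (upvotesInWindow + priorWeight) divided by this quantity plus priorWeight, matching the else branch of the case expression.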
/static/android-chrome-192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/social-protocols/quality-news/17d482e0ee499116207c67aa66a787c7808c95a2/static/android-chrome-192x192.png
--------------------------------------------------------------------------------
/static/android-chrome-512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/social-protocols/quality-news/17d482e0ee499116207c67aa66a787c7808c95a2/static/android-chrome-512x512.png
--------------------------------------------------------------------------------
/static/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/social-protocols/quality-news/17d482e0ee499116207c67aa66a787c7808c95a2/static/apple-touch-icon.png
--------------------------------------------------------------------------------
/static/browserconfig.xml:
--------------------------------------------------------------------------------
1 | Quality News is a Hacker News client with: … For more details, see the Readme on GitHub.
2 |
3 | This is a collective intelligence experiment by Social Protocols. Follow us on Mastodon or Twitter, or send a mail.
4 |
5 | Definitions
6 |
7 | Upvote Rate: The ×upvoteRate quantifies how much more or less likely users are to upvote this story compared to the average story. It is calculated as the story's total upvotes divided by total expected upvotes.
8 |
9 | Expected Upvotes: The expected upvotes for a story is an estimate of the number of upvotes the average story would have received if it were shown at the same times at the same ranks.
10 |
11 | Raw Rank: The raw rank is the rank that a story would have according to the "raw" Hacker News ranking formula: upvotes^0.8 / (ageHours+2)^1.8. This formula produces a certain ranking that you can see on the raw page. But the HN ranking is further influenced by moderator actions, user flags, and other factors which boost or penalize stories.
12 |
13 | Rank Delta: The rank delta is the difference between the story's actual rank and its raw rank (described above). A value of +1 means that a story is ranked 1 position higher on the front page than if it were ranked using the raw formula only. An over-ranked page may have received a boost by HN moderators, while an under-ranked page may have received a penalty.
14 |
15 | Second-Chance Age: The second-chance age is the story's revised age after being re-posted from the second-chance queue.
16 |
17 | Ranking Algorithms