├── poetry.toml
├── docker
├── .dockerignore
├── Dockerfile
├── utils.py
├── deeplcache.py
├── requirements.txt
├── generatehtml.py
├── postslack.py
├── posttwitter.py
├── postbluesky.py
└── main.py
├── .flake8
├── .gitignore
├── LICENSE
├── pyproject.toml
├── misc
└── update_job.sh
└── README.md
/poetry.toml:
--------------------------------------------------------------------------------
1 | [virtualenvs]
2 | in-project = true
3 |
--------------------------------------------------------------------------------
/docker/.dockerignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | *.json.gz
4 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 320
3 | extend-ignore = E203
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .python-version
2 | .vscode
3 | .DS_Store
4 | __pycache__
5 | .venv
6 | poetry.lock
7 | *.ipynb
8 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2022-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
# Base image: slim Debian-based Python 3.11 (pyproject.toml requires python ^3.11).
FROM python:3.11.13-slim

# System tools used by the application code:
#   aria2                - provides `aria2c`, invoked by utils.download_arxiv_pdf
#   poppler-utils        - provides `pdftoppm`, invoked by utils.pdf_to_png
#   wkhtmltopdf          - presumably supplies the renderer that `imgkit` drives in utils.html_to_image (TODO confirm)
#   fonts-ipafont-gothic - Japanese font; presumably needed to render translated text in generated images (TODO confirm)
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
aria2 \
fonts-ipafont-gothic \
poppler-utils \
wkhtmltopdf \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install the pinned Python dependencies before copying the sources, so that
# source-only changes do not invalidate the dependency layer.
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Copy the rest of the build context (filtered by .dockerignore).
COPY . .

# -u: unbuffered stdout/stderr, so log output appears immediately.
CMD [ "python", "-u", "main.py" ]
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023-2025 Susumu OTA
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "arxiv-reddit-summary"
3 | version = "0.7.6"
4 | description = "Summarize the top 30 most popular arXiv papers on Reddit, Hacker News and Hugging Face in the last 30 days. And post them to Slack, Twitter and Bluesky."
5 | authors = ["Susumu OTA <1632335+susumuota@users.noreply.github.com>"]
6 | license = "MIT"
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = "^3.11"
11 | praw = "^7.8.1"
12 | pandas = "^2.2.3"
13 | arxiv = "^2.1.3"
14 | tweepy = "^4.15.0"
15 | python-dateutil = "^2.9.0.post0"
16 | imgkit = "^1.2.3"
17 | google-cloud-storage = "^3.0.0"
18 | deepl = "^1.21.0"
19 | pysbd = "^0.3.4"
20 | slack-sdk = "^3.34.0"
21 | nanoatp = "^0.5.1"
22 | requests = "^2.32.3"
23 | beautifulsoup4 = "^4.13.3"
24 |
25 |
26 | [tool.poetry.group.dev.dependencies]
27 | black = "^25.1.0"
28 | flake8 = "^7.1.2"
29 | isort = "^6.0.0"
30 | ipykernel = "^6.29.5"
31 | ruff = "^0.11.3"
32 |
33 | [build-system]
34 | requires = ["poetry-core"]
35 | build-backend = "poetry.core.masonry.api"
36 |
37 | [tool.black]
38 | line-length = 320
39 |
40 | [tool.isort]
41 | profile = "black"
42 |
43 | [tool.ruff]
44 | line-length = 320
45 |
--------------------------------------------------------------------------------
/docker/utils.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
4 | import os
5 | import subprocess
6 | import unicodedata
7 | from shlex import quote
8 |
9 | import imgkit
10 |
11 |
def download_arxiv_pdf(arxiv_id: str, tmp_dir: str):
    """Download the PDF of an arXiv paper into ``tmp_dir`` using ``aria2c``.

    Args:
        arxiv_id: arXiv identifier; the file is fetched from ``https://arxiv.org/pdf/{arxiv_id}.pdf``.
        tmp_dir: Directory the PDF is written to.

    Returns:
        Path of the downloaded file, ``{tmp_dir}/{arxiv_id}.pdf``.

    Raises:
        AssertionError: If ``aria2c`` exits with a non-zero status.
        FileNotFoundError: If the ``aria2c`` executable is not installed.
    """
    output = f"{arxiv_id}.pdf"
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # An argument list (no shell) means the values never pass through shell parsing, so no quoting is needed.
    result = subprocess.run(["aria2c", "-q", "-x5", "-k1M", "-d", tmp_dir, "-o", output, url])
    if result.returncode != 0:
        # Raised explicitly instead of via ``assert`` so the check survives ``python -O``;
        # the exception type is kept so existing callers see the same error.
        raise AssertionError(f"aria2c exited with status {result.returncode} for {url}")
    return os.path.join(tmp_dir, output)
19 |
20 |
def pdf_to_png(pdf_filename: str):
    """Render the first page of a PDF to a PNG file using ``pdftoppm``.

    The PDF path itself is used as the output root, so the image is written
    next to the PDF as ``{pdf_filename}.png``, scaled to a width of 1200 pixels.

    Args:
        pdf_filename: Path of the PDF file to render.

    Returns:
        Path of the generated image, ``{pdf_filename}.png``.

    Raises:
        AssertionError: If ``pdftoppm`` exits with a non-zero status.
        FileNotFoundError: If the ``pdftoppm`` executable is not installed.
    """
    # An argument list (no shell) means the path never passes through shell parsing, so no quoting is needed.
    command = ["pdftoppm", "-q", "-png", "-singlefile", "-scale-to-x", "1200", "-scale-to-y", "-1", pdf_filename, pdf_filename]
    result = subprocess.run(command)
    if result.returncode != 0:
        # Raised explicitly instead of via ``assert`` so the check survives ``python -O``.
        raise AssertionError(f"pdftoppm exited with status {result.returncode} for {pdf_filename}")
    return f"{pdf_filename}.png"
26 |
27 |
def html_to_image(html: str, image_filename: str, quality: int = 94):
    """Render an HTML string to an image file with ``imgkit``.

    Args:
        html: HTML document to render.
        image_filename: Path of the image file to create.
        quality: Value passed to the renderer's ``quality`` option.

    Returns:
        ``image_filename``, unchanged.
    """
    render_options = {"width": 1200, "quiet": "", "quality": quality}
    succeeded = imgkit.from_string(html, image_filename, options=render_options)
    assert succeeded is True  # TODO
    return image_filename
32 |
33 |
def get_char_width(c: str):
    """Return the display width of a single character.

    Characters whose East Asian Width is Fullwidth (F), Wide (W) or
    Ambiguous (A) count as 2; every other character counts as 1.
    """
    if unicodedata.east_asian_width(c) in "FWA":
        return 2
    return 1
36 |
37 |
def len_tweet(text: str):
    """Return the total display width of ``text`` as measured by ``get_char_width``."""
    total = 0
    for char in text:
        total += get_char_width(char)
    return total
40 |
41 |
def strip_tweet(text: str, max_length=280, dots="..."):
    """Truncate ``text`` so that its display width does not exceed ``max_length``.

    Width is measured per character with ``get_char_width`` (wide characters
    count as 2). Text that already fits within ``max_length`` is returned
    unchanged; otherwise it is cut so that the kept prefix plus ``dots`` fits.

    Args:
        text: Text to shorten.
        max_length: Maximum allowed display width of the result.
        dots: Suffix appended when the text is truncated; falsy for none.

    Returns:
        ``text`` itself if it fits, otherwise the truncated prefix followed by ``dots``.
    """
    suffix = dots if dots else ""
    widths = [get_char_width(c) for c in text]
    # Only truncate when the text really is too wide; previously text wider than
    # ``max_length - len(dots)`` was cut even though it still fit in ``max_length``.
    if sum(widths) <= max_length:
        return text
    budget = max_length - len(suffix)  # room left for the text once the suffix is reserved
    used = 0
    for index, width in enumerate(widths):
        if used + width > budget:
            return text[:index] + suffix
        used += width
    return text
53 |
54 |
def avoid_auto_link(text: str):
    """Replace every period with a one dot leader (U+2024) to avoid auto-linking.

    https://shkspr.mobi/blog/2015/01/how-to-stop-twitter-auto-linking-urls/
    """
    one_dot_leader = "\u2024"  # looks like "." but is a different code point
    return text.replace(".", one_dot_leader)
59 |
60 |
def strip(text: str, length: int):
    """Shorten ``text`` to at most ``length`` characters, ending in ``...`` when cut.

    Text of ``length`` characters or fewer is returned unchanged.
    """
    if len(text) <= length:
        return text
    kept = text[: length - 3]
    return kept + "..."
63 |
64 |
def get_link_type(link: str):
    """Return the display name of the site a link points to.

    The link is matched by substring, in this order: Reddit, Hacker News,
    Hugging Face, alphaXiv. An empty string is returned when nothing matches.
    """
    site_markers = (
        (("reddit.com", "redd.it"), "Reddit"),
        (("news.ycombinator.com",), "Hacker News"),
        (("huggingface.co",), "Hugging Face"),
        (("alphaxiv.org",), "alphaXiv"),
    )
    for markers, site_name in site_markers:
        if any(marker in link for marker in markers):
            return site_name
    return ""
77 |
--------------------------------------------------------------------------------
/misc/update_job.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # SPDX-FileCopyrightText: 2022 Susumu OTA <1632335+susumuota@users.noreply.github.com>
4 | # SPDX-License-Identifier: MIT
5 |
6 | export NOTIFY_TOP_N="30"
7 | export SLACK_CHANNEL="#test"
8 |
9 | export PROJECT_ID="arxiv-summary-1"
10 | export GCS_BUCKET_NAME="arxiv-summary"
11 | export REGION="us-central1"
12 | export IMAGE_NAME="arxiv-reddit-summary"
13 | export RUN_SERVICE_ACCOUNT="run-sa"
14 | export REPOSITORY="arxiv-reddit-summary"
15 | export TAG_NAME="latest"
16 | export RUN_JOB_NAME="arxiv-reddit-summary-job-1"
17 | export SCHEDULER_JOB_NAME="arxiv-reddit-summary-job-everyday-9am"
18 |
19 |
20 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
21 | gcloud artifacts repositories delete $REPOSITORY --project=$PROJECT_ID --location=$REGION --quiet
22 | gcloud artifacts repositories create $REPOSITORY \
23 | --project=$PROJECT_ID \
24 | --repository-format="docker" \
25 | --location=$REGION
26 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
27 |
28 | gcloud builds submit \
29 | --project=$PROJECT_ID \
30 | --region=$REGION \
31 | --tag="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
32 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
33 |
34 | gcloud beta run jobs list --project=$PROJECT_ID
35 | gcloud beta run jobs delete $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION --quiet
36 | gcloud beta run jobs create $RUN_JOB_NAME \
37 | --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" \
38 | --project=$PROJECT_ID \
39 | --region=$REGION \
40 | --service-account="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
41 | --set-secrets="TWITTER_BEARER_TOKEN=TWITTER_BEARER_TOKEN:1" \
42 | --set-secrets="TWITTER_API_KEY=TWITTER_API_KEY:1" \
43 | --set-secrets="TWITTER_API_KEY_SECRET=TWITTER_API_KEY_SECRET:1" \
44 | --set-secrets="TWITTER_ACCESS_TOKEN=TWITTER_ACCESS_TOKEN:1" \
45 | --set-secrets="TWITTER_ACCESS_TOKEN_SECRET=TWITTER_ACCESS_TOKEN_SECRET:1" \
46 | --set-secrets="DEEPL_AUTH_KEY=DEEPL_AUTH_KEY:1" \
47 | --set-secrets="SLACK_BOT_TOKEN=SLACK_BOT_TOKEN:1" \
48 | --set-secrets="praw_client_id=praw_client_id:1" \
49 | --set-secrets="praw_client_secret=praw_client_secret:1" \
50 | --set-secrets="praw_user_agent=praw_user_agent:1" \
51 | --set-secrets="ATP_IDENTIFIER=ATP_IDENTIFIER:1" \
52 | --set-secrets="ATP_PASSWORD=ATP_PASSWORD:1" \
53 | --set-env-vars="NOTIFY_TOP_N=${NOTIFY_TOP_N}" \
54 | --set-env-vars="SLACK_CHANNEL=${SLACK_CHANNEL}" \
55 | --set-env-vars="GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" \
56 | --max-retries=0 \
57 | --task-timeout="30m" \
58 | --memory="1024Mi"
59 | gcloud beta run jobs list --project=$PROJECT_ID
60 |
--------------------------------------------------------------------------------
/docker/deeplcache.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
4 | import gzip
5 | import json
6 | import os
7 | import tempfile
8 | from datetime import datetime, timedelta, timezone
9 |
10 | import deepl
11 |
12 |
13 | class DeepLCache:
14 | def __init__(self, translator: deepl.Translator):
15 | self.translator = translator
16 | self.cache: dict[str, tuple[list[str], str]] = {}
17 |
18 | def clear_cache(self, expire_timedelta: timedelta | None = None):
19 | if expire_timedelta is None:
20 | self.cache = {}
21 | return
22 | expire_dt = datetime.now(timezone.utc) - expire_timedelta
23 |
24 | def is_not_expire(item): # item is [arxiv_id, [texts, ts]]
25 | return datetime.fromisoformat(item[1][1]) > expire_dt
26 |
27 | self.cache = dict(filter(is_not_expire, self.cache.items()))
28 |
29 | def __repr__(self):
30 | return repr(self.cache) # TODO
31 |
32 | def load(self, filename: str):
33 | with gzip.open(filename, "rt", encoding="UTF-8") as f:
34 | self.cache = json.load(f)
35 |
36 | def save(self, filename: str):
37 | with gzip.open(filename, "wt", encoding="UTF-8") as f:
38 | json.dump(self.cache, f)
39 |
40 | def load_from_s3(self, s3_bucket, filename: str):
41 | with tempfile.TemporaryDirectory() as tmpdir:
42 | tmpfilename = os.path.join(tmpdir, filename)
43 | s3_bucket.download_file(filename, tmpfilename)
44 | self.load(tmpfilename)
45 |
46 | def save_to_s3(self, s3_bucket, filename: str):
47 | with tempfile.TemporaryDirectory() as tmpdir:
48 | tmpfilename = os.path.join(tmpdir, filename)
49 | self.save(tmpfilename)
50 | s3_bucket.upload_file(filename, tmpfilename)
51 |
52 | def load_from_gcs(self, gcs_bucket, filename: str):
53 | with tempfile.TemporaryDirectory() as tmpdir:
54 | tmpfilename = os.path.join(tmpdir, filename)
55 | gcs_bucket.blob(filename).download_to_filename(tmpfilename)
56 | self.load(tmpfilename)
57 |
58 | def save_to_gcs(self, gcs_bucket, filename: str):
59 | with tempfile.TemporaryDirectory() as tmpdir:
60 | tmpfilename = os.path.join(tmpdir, filename)
61 | self.save(tmpfilename)
62 | gcs_bucket.blob(filename).upload_from_filename(tmpfilename)
63 |
64 | def get(self, key: str, default=None):
65 | return self.cache.get(key, default)
66 |
67 | def translate_text(self, text: str | list[str], target_lang: str, key: str):
68 | trans = self.get(key, None)
69 | if trans is not None:
70 | return trans
71 | result = self.translator.translate_text(text=text, target_lang=target_lang)
72 | trans_texts = [r.text for r in result] if type(result) is list else [result.text] if type(result) is deepl.TextResult else []
73 | trans_ts = datetime.now(timezone.utc).isoformat()
74 | trans = (trans_texts, trans_ts)
75 | self.cache[key] = trans
76 | return trans
77 |
--------------------------------------------------------------------------------
/docker/requirements.txt:
--------------------------------------------------------------------------------
1 | arxiv==2.1.3 ; python_version >= "3.11" and python_version < "4.0"
2 | beautifulsoup4==4.13.3 ; python_version >= "3.11" and python_version < "4.0"
3 | cachetools==5.5.1 ; python_version >= "3.11" and python_version < "4.0"
4 | certifi==2025.1.31 ; python_version >= "3.11" and python_version < "4.0"
5 | charset-normalizer==3.4.1 ; python_version >= "3.11" and python_version < "4.0"
6 | deepl==1.21.0 ; python_version >= "3.11" and python_version < "4.0"
7 | feedparser==6.0.11 ; python_version >= "3.11" and python_version < "4.0"
8 | google-api-core==2.24.1 ; python_version >= "3.11" and python_version < "4.0"
9 | google-auth==2.38.0 ; python_version >= "3.11" and python_version < "4.0"
10 | google-cloud-core==2.4.1 ; python_version >= "3.11" and python_version < "4.0"
11 | google-cloud-storage==3.0.0 ; python_version >= "3.11" and python_version < "4.0"
12 | google-crc32c==1.6.0 ; python_version >= "3.11" and python_version < "4.0"
13 | google-resumable-media==2.7.2 ; python_version >= "3.11" and python_version < "4.0"
14 | googleapis-common-protos==1.67.0 ; python_version >= "3.11" and python_version < "4.0"
15 | idna==3.10 ; python_version >= "3.11" and python_version < "4.0"
16 | imgkit==1.2.3 ; python_version >= "3.11" and python_version < "4.0"
17 | nanoatp==0.5.1 ; python_version >= "3.11" and python_version < "4.0"
18 | numpy==2.2.3 ; python_version >= "3.11" and python_version < "4.0"
19 | oauthlib==3.2.2 ; python_version >= "3.11" and python_version < "4.0"
20 | pandas==2.2.3 ; python_version >= "3.11" and python_version < "4.0"
21 | praw==7.8.1 ; python_version >= "3.11" and python_version < "4.0"
22 | prawcore==2.4.0 ; python_version >= "3.11" and python_version < "4.0"
23 | proto-plus==1.26.0 ; python_version >= "3.11" and python_version < "4.0"
24 | protobuf==5.29.3 ; python_version >= "3.11" and python_version < "4.0"
25 | pyasn1-modules==0.4.1 ; python_version >= "3.11" and python_version < "4.0"
26 | pyasn1==0.6.1 ; python_version >= "3.11" and python_version < "4.0"
27 | pysbd==0.3.4 ; python_version >= "3.11" and python_version < "4.0"
28 | python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "4.0"
29 | pytz==2025.1 ; python_version >= "3.11" and python_version < "4.0"
30 | requests-oauthlib==2.0.0 ; python_version >= "3.11" and python_version < "4.0"
31 | requests==2.32.3 ; python_version >= "3.11" and python_version < "4.0"
32 | rsa==4.9 ; python_version >= "3.11" and python_version < "4.0"
33 | sgmllib3k==1.0.0 ; python_version >= "3.11" and python_version < "4.0"
34 | six==1.17.0 ; python_version >= "3.11" and python_version < "4.0"
35 | slack-sdk==3.34.0 ; python_version >= "3.11" and python_version < "4.0"
36 | soupsieve==2.6 ; python_version >= "3.11" and python_version < "4.0"
37 | tld==0.13 ; python_version >= "3.11" and python_version < "4.0"
38 | tweepy==4.15.0 ; python_version >= "3.11" and python_version < "4.0"
39 | typing-extensions==4.12.2 ; python_version >= "3.11" and python_version < "4.0"
40 | tzdata==2025.1 ; python_version >= "3.11" and python_version < "4.0"
41 | update-checker==0.18.0 ; python_version >= "3.11" and python_version < "4.0"
42 | urllib3==2.3.0 ; python_version >= "3.11" and python_version < "4.0"
43 | websocket-client==1.8.0 ; python_version >= "3.11" and python_version < "4.0"
44 |
--------------------------------------------------------------------------------
/docker/generatehtml.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
4 | import re
5 | from datetime import datetime, timedelta, timezone
6 | from html import escape
7 | from itertools import zip_longest
8 |
9 | import dateutil.parser
10 | import deeplcache
11 | import pandas as pd
12 |
13 | HTML_TRANS_TEMPLATE = """
14 |
15 |
16 |
17 |
29 |
30 |
31 | {url}
32 |
33 | {title}
34 |
35 |
36 | {authors}
37 |
38 |
39 | {content}
40 |
41 |
42 |
43 | """
44 |
45 | HTML_TRANS_ITEM_TEMPLATE = """
46 |
47 |
48 | {translation}
49 |
50 |
51 |
52 | {source}
53 |
54 |
55 | """
56 |
57 |
def generate_trans_html(arxiv_id: str, title: str, authors: list[str], trans_texts: list[str], summary_texts: list[str]):
    """Build the HTML page that pairs each translated sentence with its source sentence.

    ``trans_texts`` and ``summary_texts`` are paired positionally; when one list
    is shorter, the missing side is rendered as an empty string. Title, authors
    and both sentence lists are HTML-escaped before being inserted.

    Returns:
        The page as a string, rendered from ``HTML_TRANS_TEMPLATE``.
    """
    rows = [
        HTML_TRANS_ITEM_TEMPLATE.format(translation=escape(translated), source=escape(original))
        for translated, original in zip_longest(trans_texts, summary_texts, fillvalue="")
    ]
    return HTML_TRANS_TEMPLATE.format(
        title=escape(title),
        authors=escape(", ".join(authors)),
        url=f"https://arxiv.org/abs/{arxiv_id}",
        content="\n".join(rows),
    )
67 |
68 |
69 | HTML_TOP_N_TEMPLATE = """
70 |
71 |
72 |
73 |
82 |
83 |
84 | {date}
85 |
86 | {title}
87 |
88 |
89 | {content}
90 |
91 |
92 |
93 | """
94 |
95 | HTML_TOP_N_ITEM_TEMPLATE = """
96 |
97 | [{i}/{n}] {title}
98 | {stats}, {categories}, {updated}
99 |
100 | """
101 |
102 |
def generate_top_n_html(page_title: str, date: str, df: pd.DataFrame, dlc: deeplcache.DeepLCache):
    """Build the HTML ranking page for the papers in ``df``.

    Rows are rendered in reverse order of ``df``. A paper without an entry in
    ``dlc`` is skipped (its rank number is still consumed). A paper whose
    cached translation is less than 23 hours old gets a ``[New]`` prefix.

    Returns:
        The page as a string, rendered from ``HTML_TOP_N_TEMPLATE``.
    """
    page_title = escape(page_title)
    ranked = df[::-1]  # normal order (reversed reversed order)
    total = len(ranked)
    new_cutoff = datetime.now(timezone.utc) - timedelta(hours=23)
    column_names = ("arxiv_id", "updated", "title", "primary_category", "categories", "score", "num_comments", "count")
    rendered = []
    for rank, row in enumerate(zip(*(ranked[name] for name in column_names)), start=1):
        arxiv_id, updated, title, primary_category, categories, score, num_comments, count = row
        title = escape(title)
        cached = dlc.get(arxiv_id, None)
        if cached is None:
            continue
        trans_ts = cached[1]
        if new_cutoff < datetime.fromisoformat(trans_ts):
            title = f'[New] {title}'
        # Primary category first, then the other categories that look like "xx.YY".
        secondary = [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]
        categories = " | ".join([primary_category] + secondary)
        stats = f"{score} Likes, {num_comments} Comments, {count} Posts"
        updated = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
        rendered.append(HTML_TOP_N_ITEM_TEMPLATE.format(i=rank, n=total, title=title, stats=stats, categories=categories, updated=updated, arxiv_id=arxiv_id))
    return HTML_TOP_N_TEMPLATE.format(title=page_title, date=date, content="\n".join(rendered))
121 |
--------------------------------------------------------------------------------
/docker/postslack.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
4 | import re
5 | import time
6 | from datetime import datetime, timedelta, timezone
7 |
8 | import dateutil.parser
9 | import deeplcache
10 | import pandas as pd
11 | import pysbd
12 | import slack_sdk
13 | import utils
14 |
15 |
def post_to_slack_header(api: slack_sdk.WebClient, channel: str, df: pd.DataFrame):
    """Post the header message announcing how many papers follow.

    Returns:
        The response of ``api.chat_postMessage``.
    """
    headline = f"Top {len(df)} most popular arXiv papers in the last 30 days"
    header_block = {"type": "header", "text": {"type": "plain_text", "text": headline}}
    return api.chat_postMessage(channel=channel, text=headline, blocks=[header_block])
20 |
21 |
def generate_slack_title_blocks(df: pd.DataFrame, i: int, is_new: bool, title: str, score: int, num_comments: int, count: int, primary_category: str, categories: list[str], updated: str, first_summary: str):
    """Build the Slack blocks for the title message of one paper.

    The message shows the countdown rank ``[len(df) - i/len(df)]``, an optional
    ``:new:`` marker, the title, the statistics, the categories, the update
    date and the first summary sentence.

    Returns:
        A list containing a single ``section`` block.
    """
    total = len(df)
    # Primary category first, then the other categories that look like "xx.YY".
    secondary = [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]
    categories_md = utils.avoid_auto_link(" | ".join([primary_category] + secondary))
    new_md = ":new: " if is_new else ""
    title_md = utils.strip(title, 200)
    stats_md = f"_*{score}* Likes, {num_comments} Comments, {count} Posts_"
    updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
    body = f"[{total - i}/{total}] {new_md}*{title_md}*\n{stats_md}, {categories_md}, {updated_md}\n{first_summary}"
    return [{"type": "section", "text": {"type": "mrkdwn", "text": body}}]
29 |
30 |
def generate_slack_summary(dlc: "deeplcache.DeepLCache", seg: "pysbd.Segmenter", twenty_three_hours_ago: datetime, arxiv_id: str, summary: str):
    """Prepare the summary text shown for one paper on Slack.

    The summary (newlines replaced by spaces, cut to 2000 characters) is split
    into sentences with ``seg``. If ``dlc`` holds a translation for
    ``arxiv_id``, the translation is used instead of the original text.

    Args:
        dlc: Translation cache; ``dlc.get(arxiv_id)`` yields ``(texts, timestamp)`` or ``None``.
        seg: Sentence segmenter.
        twenty_three_hours_ago: Cut-off; translations newer than this mark the paper as new.
        arxiv_id: Cache key of the paper.
        summary: Original abstract text.

    Returns:
        ``(is_new, first_summary, translation_md)``: whether the translation is
        newer than the cut-off, the first sentence cut to 200 characters (empty
        when there is none), and the translated sentences joined by blank lines
        and cut to 3000 characters (``None`` when no translation is cached).
    """
    segs = seg.segment(summary.replace("\n", " ")[:2000])
    if isinstance(segs, list):
        summary_texts: list[str] = [str(s) for s in segs]
    elif isinstance(segs, str):
        summary_texts = [segs]
    else:
        summary_texts = []
    # An empty summary yields no segments; fall back to "" instead of raising IndexError.
    first_summary = summary_texts[0][:200] if summary_texts else ""  # sometimes pysbd failed to split
    translation_md = None
    is_new = False
    trans = dlc.get(arxiv_id, None)
    if trans is not None:
        trans_texts, trans_ts = trans
        if trans_texts:  # an empty translation keeps the original first sentence
            first_summary = trans_texts[0][:200]  # sometimes pysbd failed to split
        is_new = twenty_three_hours_ago < datetime.fromisoformat(trans_ts)
        # A length mismatch between source and translation is rare; it is logged, not fatal.
        if len(summary_texts) != len(trans_texts):
            print("different texts length", arxiv_id, len(summary_texts), len(trans_texts))
        translation_md = "\n\n".join(trans_texts)
        translation_md = utils.strip(translation_md, 3000)  # must be less than 3001 characters
    return is_new, first_summary, translation_md
48 |
49 |
def post_to_slack_title(api: slack_sdk.WebClient, channel: str, dlc: deeplcache.DeepLCache, df: pd.DataFrame, seg: pysbd.Segmenter, twenty_three_hours_ago: datetime, i: int, arxiv_id: str, updated: str, title: str, summary: str, primary_category: str, categories: list[str], score: int, num_comments: int, count: int):
    """Post the title message of one paper.

    Returns:
        ``(response, translation_md)``: the response of ``api.chat_postMessage``
        and the translation text produced by ``generate_slack_summary``.
    """
    is_new, first_summary, translation_md = generate_slack_summary(dlc, seg, twenty_three_hours_ago, arxiv_id, summary)
    title_blocks = generate_slack_title_blocks(df, i, is_new, title, score, num_comments, count, primary_category, categories, updated, first_summary)
    fallback_text = utils.strip(title, 200)
    return api.chat_postMessage(channel=channel, text=fallback_text, blocks=title_blocks), translation_md
56 |
57 |
def post_to_slack_translation(api: slack_sdk.WebClient, channel: str, title: str, ts: str, translation_md: str):
    """Post the translated summary as a reply in the thread identified by ``ts``.

    Returns:
        The response of ``api.chat_postMessage``.
    """
    section = {"type": "section", "text": {"type": "mrkdwn", "text": translation_md}}
    fallback_text = utils.strip(title, 200)
    return api.chat_postMessage(channel=channel, text=fallback_text, blocks=[section], thread_ts=ts)
62 |
63 |
def post_to_slack_authors(api: slack_sdk.WebClient, channel: str, title: str, ts: str, authors: list[str], comment: str, arxiv_id: str):
    """Post the links, authors and optional comment of a paper as a thread reply.

    Args:
        api: Slack client used to post the message.
        channel: Channel to post to.
        title: Paper title; cut to 200 characters and used as the fallback text.
        ts: Timestamp of the parent message; the reply is posted into its thread.
        authors: Author names; joined with ", " and cut to 1000 characters.
        comment: Comment text; when truthy it is cut to 1000 characters and appended.
        arxiv_id: Identifier of the paper (not referenced by the code as visible here).

    Returns:
        The response of ``api.chat_postMessage``.
    """
    authors_md = utils.strip(", ".join(authors), 1000)
    comment_md = f"\n\n*Comments*: {utils.strip(comment, 1000)}\n\n" if comment else ""
    # NOTE(review): every link string below is empty as seen here, so the "Links"
    # line renders only separators and `arxiv_id` is unused. Presumably the link
    # markup was lost before this text was captured - confirm against the repository.
    abs_md = f""
    pdf_md = f""
    twitter_md = f""
    reddit_md = f""
    hackernews_md = f""
    huggingface_md = f""
    alphaxiv_md = f""
    blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": f"*Links*: {abs_md}, {pdf_md}, {twitter_md}, {reddit_md}, {hackernews_md}, {huggingface_md}, {alphaxiv_md}\n\n*Authors*: {authors_md}{comment_md}"}}]
    title_md = utils.strip(title, 200)
    return api.chat_postMessage(channel=channel, text=title_md, blocks=blocks, thread_ts=ts)
77 |
78 |
def post_to_slack_documents(api: slack_sdk.WebClient, channel: str, ts: str, df: pd.DataFrame):
    """Post one thread reply per row of ``df``, pausing one second after each.

    Each reply shows the position ``(k/len(df))``, the likes and comments of
    the post, and a link to it labelled with its creation date.
    """
    total = len(df)
    rows = zip(df["id"], df["score"], df["num_comments"], df["created_at"])
    for position, (doc_url, score, num_comments, created_at) in enumerate(rows, start=1):
        stats_md = f"_*{score}* Likes, {num_comments} Comments_"
        posted_on = datetime.fromtimestamp(created_at).strftime("%d %b %Y")
        url_md = f"<{doc_url}|{posted_on}>"
        section = {"type": "section", "text": {"type": "mrkdwn", "text": f"({position}/{total}) {stats_md}, {url_md}\n"}}
        api.chat_postMessage(channel=channel, text=url_md, thread_ts=ts, blocks=[section])
        time.sleep(1)
88 |
89 |
def post_to_slack(api: slack_sdk.WebClient, channel: str, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post the whole ranking to Slack, one thread per paper.

    Papers are posted in reverse order of ``df`` after a header message. Each
    paper gets a title message and, in its thread, the translation (if any),
    the authors message and up to three related posts from ``document_df``.
    Every API call is followed by a one-second pause.
    """
    df = df[::-1]  # reverse order
    total = len(df)
    post_to_slack_header(api, channel, df)
    time.sleep(1)
    seg = pysbd.Segmenter(language="en", clean=False)
    twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
    column_names = ("arxiv_id", "updated", "title", "summary", "authors", "comment", "primary_category", "categories", "score", "num_comments", "count")
    for i, row in enumerate(zip(*(df[name] for name in column_names))):
        arxiv_id, updated, title, summary, authors, comment, primary_category, categories, score, num_comments, count = row
        response, translation_md = post_to_slack_title(api, channel, dlc, df, seg, twenty_three_hours_ago, i, arxiv_id, updated, title, summary, primary_category, categories, score, num_comments, count)
        time.sleep(1)
        ts = response["ts"]
        if not ts:
            # Without a parent timestamp there is no thread to reply into.
            continue
        if translation_md:
            post_to_slack_translation(api, channel, title, ts, translation_md)
            time.sleep(1)
        post_to_slack_authors(api, channel, title, ts, authors, comment, arxiv_id)
        time.sleep(1)
        mentions_paper = document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)
        top_n_documents = document_df[mentions_paper].head(3)  # TODO
        post_to_slack_documents(api, channel, ts, top_n_documents)
        print("post_to_slack: ", f"[{total - i}/{total}]")
110 |
--------------------------------------------------------------------------------
/docker/posttwitter.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
4 | import os
5 | import re
6 | import tempfile
7 | import time
8 | from datetime import datetime, timedelta, timezone
9 |
10 | import dateutil.parser
11 | import deeplcache
12 | import generatehtml
13 | import pandas as pd
14 | import pysbd
15 | import tweepy
16 | import utils
17 |
18 |
def upload_first_page_to_twitter(api_v1: tweepy.API, arxiv_id: str):
    """Upload an image of the first page of a paper's PDF.

    The PDF is downloaded and rendered inside a temporary directory.

    Returns:
        The media id of the uploaded image, or ``None`` when the image file is
        missing or empty, or when the upload returns nothing.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        pdf_path = utils.download_arxiv_pdf(arxiv_id, work_dir)
        png_path = utils.pdf_to_png(pdf_path)
        if not (os.path.isfile(png_path) and os.path.getsize(png_path) > 0):
            return None
        media = api_v1.media_upload(png_path)
        if media:
            return media.media_id
        return None
27 |
28 |
def generate_twitter_first_page(df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
    """Compose the text of the first tweet for one paper.

    Returns:
        ``(text, summary_text)``: the tweet text (rank, statistics, abstract
        URL, categories, update date, title and authors) and the summary
        sentences joined by single spaces.
    """
    total = len(df)
    # Primary category first, then the other categories that look like "xx.YY".
    secondary = [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]
    categories_md = utils.avoid_auto_link(" | ".join([primary_category] + secondary))
    stats_md = f"{score} Likes, {num_comments} Comments, {count} Posts"
    updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
    abs_md = f"https://arxiv.org/abs/{arxiv_id}"
    new_md = "🆕" if is_new else ""
    authors_md = ", ".join(authors)
    header = f"[{total - i}/{total}] {stats_md}\n{abs_md} {categories_md}, {updated_md}"
    text = f"{header}\n\n{new_md}{title}\n\n{authors_md}"
    return text, " ".join(summary_texts)
40 |
41 |
def post_to_twitter_first_page(api_v1: tweepy.API, api_v2: tweepy.Client, df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]) -> str:
    """Post the first tweet for one paper, with the first PDF page attached when available.

    The summary text (cut to a width of 1000) is attached to the image as
    media metadata. Errors raised while tweeting are printed, not propagated.

    Returns:
        The id of the created tweet, or ``""`` when tweeting failed.
    """
    text, summary_text = generate_twitter_first_page(df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
    attachments = []
    first_page_media_id = upload_first_page_to_twitter(api_v1, arxiv_id)
    if first_page_media_id:
        api_v1.create_media_metadata(first_page_media_id, utils.strip_tweet(summary_text, 1000))
        attachments.append(first_page_media_id)
    tweet_id: str = ""
    try:
        response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, media_ids=attachments or None)
        if type(response) is tweepy.Response and not response.errors:
            tweet_id = response.data["id"]
        else:
            tweet_id = ""
    except Exception as e:
        print(e)
    return tweet_id
56 |
57 |
def post_to_twitter_link(api_v2: tweepy.Client, prev_tweet_id: str, arxiv_id: str, link_type: str) -> str:
    """Reply to ``prev_tweet_id`` with search links for the paper.

    The reply always contains the Twitter search link; a Reddit or Hacker News
    search link is added when ``link_type`` is ``"Reddit"`` or ``"Hacker News"``.

    Returns:
        The id of the reply, ``""`` when the response reports errors, or the
        unchanged ``prev_tweet_id`` when an exception was raised (it is printed).
    """
    twitter_uri = f"https://x.com/search?q=arxiv.org%2Fabs%2F{arxiv_id}%20OR%20arxiv.org%2Fpdf%2F{arxiv_id}.pdf"
    reddit_uri = f"https://www.reddit.com/search/?q=%22{arxiv_id}%22&sort=top"
    hackernews_uri = f"https://hn.algolia.com/?query=%22{arxiv_id}%22&type=all"
    # the last uri will become a link card
    if link_type == "Hacker News":
        text = f"Twitter: {twitter_uri} \nHacker News: {hackernews_uri}"
    elif link_type == "Reddit":
        text = f"Twitter: {twitter_uri} \nReddit: {reddit_uri}"
    else:
        text = f"Twitter: {twitter_uri}"
    try:
        response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, in_reply_to_tweet_id=prev_tweet_id)
        succeeded = type(response) is tweepy.Response and not response.errors
        prev_tweet_id = response.data["id"] if succeeded else ""
    except Exception as e:
        print(e)
    return prev_tweet_id
72 |
73 |
def post_to_twitter_tweets(api_v2: tweepy.Client, prev_tweet_id: str, document_df: pd.DataFrame) -> str:
    """Reply with one tweet per row of ``document_df``, chaining each onto the previous one.

    Each tweet shows the position, the likes and comments of the post, its
    creation date, the site name (or the URL itself when the site is unknown)
    and the URL. Exceptions are printed, not propagated; a one-second pause
    follows every attempt.

    Returns:
        The id of the last tweet in the chain as tracked by the loop.
    """
    total = len(document_df)
    rows = zip(document_df["id"], document_df["score"], document_df["num_comments"], document_df["created_at"])
    for index, (doc_url, score, num_comments, created_at) in enumerate(rows, start=1):
        stats_md = f"{score} Likes, {num_comments} Comments"
        created_at_md = datetime.fromtimestamp(created_at).strftime("%d %b %Y")
        link = utils.get_link_type(doc_url) or doc_url
        text = f"({index}/{total}) {stats_md}, {created_at_md}, {link}\n{doc_url}\n"
        try:
            response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, in_reply_to_tweet_id=prev_tweet_id)
            succeeded = type(response) is tweepy.Response and not response.errors
            prev_tweet_id = response.data["id"] if succeeded else ""
        except Exception as e:
            print(e)
        time.sleep(1)
    return prev_tweet_id
91 |
92 |
def upload_html_to_twitter(api_v1: tweepy.API, filename: str, html_text: str):
    """Render ``html_text`` to an image named ``filename`` and upload it.

    The image is created inside a temporary directory.

    Returns:
        The media id of the uploaded image, or ``None`` when the image file is
        missing or empty, or when the upload returns nothing.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = utils.html_to_image(html_text, os.path.join(work_dir, filename))
        if not (os.path.isfile(image_path) and os.path.getsize(image_path) > 0):
            return None
        media = api_v1.media_upload(image_path)
        if media:
            return media.media_id
        return None
101 |
102 |
def post_to_twitter_ranking(api_v1: tweepy.API, api_v2: tweepy.Client, dlc: deeplcache.DeepLCache, df: pd.DataFrame):
    """Tweet the ranking overview, with the ranking rendered as an image when possible.

    The image metadata lists the abstract URL of every paper in reverse order
    of ``df``. Errors raised while tweeting are printed, not propagated.
    """
    total = len(df)
    title = f"Top {total} most popular arXiv papers in the last 30 days"
    date = datetime.now(timezone.utc).strftime("%d %b %Y")
    html_text = generatehtml.generate_top_n_html(title, date, df, dlc)
    attachments = []
    top_n_media_id = upload_html_to_twitter(api_v1, "top_n.jpg", html_text)
    if top_n_media_id:
        ranked_ids = df[::-1]["arxiv_id"]
        metadata = "\n".join(f"[{rank}/{total}] arxiv.org/abs/{arxiv_id}" for rank, arxiv_id in enumerate(ranked_ids, start=1))
        api_v1.create_media_metadata(top_n_media_id, utils.strip_tweet(metadata, 1000))
        attachments.append(top_n_media_id)
    try:
        api_v2.create_tweet(text=utils.strip_tweet(title, 280), user_auth=True, media_ids=attachments or None)
    except Exception as e:
        print(e)
119 |
120 |
def post_to_twitter_trans(api_v1: tweepy.API, api_v2: tweepy.Client, prev_tweet_id: str, arxiv_id: str, title: str, authors: list[str], summary_texts: list[str], trans_texts: list[str]):
    """Reply to prev_tweet_id with the translated abstract, as text and as an image.

    The image is rendered from the translation HTML; when it uploads, the
    joined translation is set as its alt text.
    """
    page_html = generatehtml.generate_trans_html(arxiv_id, title, authors, trans_texts, summary_texts)
    image_id = upload_html_to_twitter(api_v1, f"{arxiv_id}.trans.jpg", page_html)
    translation = "".join(trans_texts)
    attached = []
    if image_id:
        api_v1.create_media_metadata(image_id, utils.strip_tweet(translation, 1000))
        attached.append(image_id)
    body = f"https://arxiv.org/abs/{arxiv_id}\n{translation}"
    try:
        api_v2.create_tweet(text=utils.strip_tweet(body, 280), user_auth=True, media_ids=attached or None, in_reply_to_tweet_id=prev_tweet_id)
    except Exception as e:
        print(e)
134 |
135 |
def post_to_twitter(api_v1: tweepy.API, api_v2: tweepy.Client, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post the ranking tweet, then one thread per newly translated paper.

    Papers are visited in reversed row order. A paper is posted only when the
    cache holds a translation whose timestamp is newer than 23 hours ago. Each
    thread is: first page, link tweet, up to three source-document tweets,
    translation. A thread is abandoned as soon as a step returns no tweet id.
    """
    df = df[::-1]  # reverse order
    cutoff = datetime.now(timezone.utc) - timedelta(hours=23)
    segmenter = pysbd.Segmenter(language="en", clean=False)
    post_to_twitter_ranking(api_v1, api_v2, dlc, df)
    total = len(df)
    columns = zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["comment"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])
    for i, (arxiv_id, updated, title, summary, authors, _comment, primary_category, categories, score, num_comments, count) in enumerate(columns):
        cached = dlc.get(arxiv_id, None)
        if cached is None:
            continue
        trans_texts, trans_ts = cached
        sentences = segmenter.segment(summary.replace("\n", " ")[:2000])
        if type(sentences) is list:
            summary_texts: list[str] = [str(sentence) for sentence in sentences]
        elif type(sentences) is str:
            summary_texts = [sentences]
        else:
            summary_texts = []
        # only post new papers
        if datetime.fromisoformat(trans_ts) <= cutoff:
            continue
        thread_id = post_to_twitter_first_page(api_v1, api_v2, df, i, True, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
        time.sleep(1)
        if not thread_id:
            continue
        mentions = document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)
        top_n_documents = document_df[mentions].head(3)  # TODO
        link_type = utils.get_link_type(top_n_documents.iloc[0]["id"])
        thread_id = post_to_twitter_link(api_v2, thread_id, arxiv_id, link_type)
        time.sleep(1)
        if not thread_id:
            continue
        thread_id = post_to_twitter_tweets(api_v2, thread_id, top_n_documents)
        post_to_twitter_trans(api_v1, api_v2, thread_id, arxiv_id, title, authors, summary_texts, trans_texts)
        print("post_to_twitter: ", f"[{total - i}/{total}]")
        time.sleep(1)
166 |
--------------------------------------------------------------------------------
/docker/postbluesky.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | # SPDX-License-Identifier: MIT
3 |
4 | import os
5 | import re
6 | import tempfile
7 | import time
8 | from datetime import datetime, timedelta, timezone
9 | from typing import Any
10 |
11 | import dateutil.parser
12 | import deeplcache
13 | import generatehtml
14 | import nanoatp
15 | import pandas as pd
16 | import pysbd
17 | import utils
18 |
19 |
def generate_facets(text: str, patterns: list[tuple[str, str]]):
    """Build Bluesky rich-text link facets for the first occurrence of each pattern.

    Args:
        text: The post text the facets refer to.
        patterns: (substring, uri) pairs; each substring found in text becomes
            a link to uri. Empty substrings and substrings not present in
            text are skipped.

    Returns:
        A list of "app.bsky.richtext.facet" dicts sorted by byteStart.

    Facet indices are UTF-8 byte offsets, not character offsets, so the
    offsets are computed on the encoded text. This matters as soon as any
    non-ASCII character (emoji, Japanese text, ...) precedes the match.
    """
    # TODO: still naive: only the first occurrence of each pattern is linked
    facets: list[dict[str, Any]] = []
    for pattern, uri in patterns:
        if not pattern:
            # str.find("") returns 0, which would create a zero-length facet
            continue
        char_start = text.find(pattern)
        if char_start == -1:
            continue
        byte_start = len(text[:char_start].encode("utf-8"))
        byte_end = byte_start + len(pattern.encode("utf-8"))
        facets.append(
            {
                "$type": "app.bsky.richtext.facet",
                "index": {"byteStart": byte_start, "byteEnd": byte_end},
                "features": [{"$type": "app.bsky.richtext.facet#link", "uri": uri}],
            }
        )
    facets.sort(key=lambda facet: facet["index"]["byteStart"])
    return facets
37 |
38 |
def upload_first_page_to_bluesky(api: nanoatp.BskyAgent, arxiv_id: str, summary_text: str) -> dict[str, Any]:
    """Download the paper's PDF, convert it to a PNG and upload that image.

    summary_text (truncated to 2000) is passed along as the image alt text.
    Returns the upload result, or {} when no non-empty image file exists.
    """
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = utils.download_arxiv_pdf(arxiv_id, workdir)
        png_path = utils.pdf_to_png(pdf_path)
        if not os.path.isfile(png_path) or os.path.getsize(png_path) <= 0:
            return {}
        return api.uploadImage(png_path, utils.strip_tweet(summary_text, 2000))
46 |
47 |
def generate_bluesky_first_page(df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
    """Compose the text of a paper's first post and its joined summary.

    Returns a (post_text, summary_text) tuple. The rank shown is
    len(df) - i, i.e. i counts up while the displayed rank counts down.
    """
    total = len(df)
    # primary category first, then other categories that look like "xx.YY"
    secondary = [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]
    categories_md = utils.avoid_auto_link(" | ".join([primary_category] + secondary))
    updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
    badge = "🆕" if is_new else ""
    lines = [
        f"[{total - i}/{total}] {score} Likes, {num_comments} Comments, {count} Posts",
        f"{arxiv_id}, {categories_md}, {updated_md}",
        "",
        f"{badge}{title}",
        "",
        ", ".join(authors),
    ]
    return "\n".join(lines), "\n\n".join(summary_texts)
58 |
59 |
def post_to_bluesky_first_page(api: nanoatp.BskyAgent, df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
    """Post the paper's opening post with the PDF first-page image attached.

    Returns whatever api.post returns, or {} when posting raised.
    """
    first_page_text, summary_text = generate_bluesky_first_page(df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
    uploaded = upload_first_page_to_bluesky(api, arxiv_id, summary_text)
    images = [uploaded] if uploaded else []
    text = f"{first_page_text}"
    # turn the arXiv id inside the text into a link to the abstract page
    facets = generate_facets(text, [(arxiv_id, f"https://arxiv.org/abs/{arxiv_id}")])
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "embed": {"$type": "app.bsky.embed.images", "images": images},
    }
    try:
        return api.post(record)
    except Exception as e:
        print(e)
    empty_post: dict[str, str] = {}
    return empty_post
76 |
77 |
def generate_external(api: nanoatp.BskyAgent, uri: str, title: str, description: str):
    """Return an external-link embed for uri.

    Tries api.uploadExternal first; when that raises, the error is printed
    and a plain embed built from uri, title and description is returned.
    """
    try:
        return api.uploadExternal(uri)
    except Exception as e:
        print({"function": "uploadExternal", "uri": uri, "error": str(e)})
    return {
        "$type": "app.bsky.embed.external#external",
        "uri": uri,
        "title": title,
        "description": description,
    }
90 |
91 |
def post_to_bluesky_link(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], arxiv_id: str, title: str, summary_texts: list[str]):
    """Reply with a post linking to the paper and to searches for it on several sites.

    Returns the new post on success; when posting raises, the error is
    printed and the parent_post that was passed in is returned unchanged.
    """
    abs_url = f"https://arxiv.org/abs/{arxiv_id}"
    patterns = [
        ("abs", abs_url),
        ("pdf", f"https://arxiv.org/pdf/{arxiv_id}.pdf"),
        ("Bluesky", f"https://bsky.app/search?q={arxiv_id}"),
        ("Twitter", f"https://x.com/search?q=arxiv.org%2Fabs%2F{arxiv_id}%20OR%20arxiv.org%2Fpdf%2F{arxiv_id}.pdf"),
        ("Reddit", f"https://www.reddit.com/search/?q=%22{arxiv_id}%22&sort=top"),
        ("Hacker News", f"https://hn.algolia.com/?query=%22{arxiv_id}%22&type=all"),
        ("Hugging Face", f"https://huggingface.co/papers/{arxiv_id}"),
        ("alphaXiv", f"https://www.alphaxiv.org/abs/{arxiv_id}"),
    ]
    text = "Links: abs, pdf\nSearch: Bluesky, Twitter, Reddit, Hacker News, Hugging Face, alphaXiv"
    facets = generate_facets(text, patterns)
    # the link card points at the abstract page
    card = generate_external(api, abs_url, title, utils.strip_tweet(" ".join(summary_texts), 300))
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "reply": {"root": root_post, "parent": parent_post},
        "embed": {"$type": "app.bsky.embed.external", "external": card},
    }
    try:
        return api.post(record)
    except Exception as e:
        print(e)
    return parent_post
114 |
115 |
def post_to_bluesky_posts(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], df: pd.DataFrame):
    """Reply to the thread with one link-card post per row of df.

    Each post replies to the most recently posted one. Returns the last
    successful post, or the incoming parent_post when none succeeded.
    """
    total = len(df)
    rows = zip(df["id"], df["score"], df["num_comments"], df["created_at"], df["title"], df["description"])
    for position, (doc_url, likes, comments, posted_ts, doc_title, doc_description) in enumerate(rows, start=1):
        stats = f"{likes} Likes, {comments} Comments"
        posted_on = datetime.fromtimestamp(posted_ts).strftime("%d %b %Y")
        # fall back to the raw URL when the link type is not recognized
        label = utils.get_link_type(doc_url) or doc_url
        text = f"({position}/{total}) {stats}, {posted_on}, {label}"
        facets = generate_facets(text, [(label, doc_url)])
        card = generate_external(api, doc_url, doc_title, utils.strip_tweet(doc_description, 300))
        record = {
            "text": utils.strip_tweet(text, 300),
            "facets": facets,
            "reply": {"root": root_post, "parent": parent_post},
            "embed": {"$type": "app.bsky.embed.external", "external": card},
        }
        try:
            parent_post = api.post(record)
        except Exception as e:
            print(e)
        time.sleep(1)
    return parent_post
134 |
135 |
def upload_html_to_bluesky(api: nanoatp.BskyAgent, filename: str, html_text: str, alt_text: str, quality: int = 94) -> dict[str, Any]:
    """Render html_text to an image in a temporary directory and upload it.

    alt_text (truncated to 2000) is passed along with the image; quality is
    forwarded to the HTML-to-image helper. Returns the upload result, or {}
    when no non-empty image file was produced.
    """
    with tempfile.TemporaryDirectory() as workdir:
        image_path = utils.html_to_image(html_text, os.path.join(workdir, filename), quality)
        if not os.path.isfile(image_path) or os.path.getsize(image_path) <= 0:
            return {}
        return api.uploadImage(image_path, utils.strip_tweet(alt_text, 2000))
143 |
144 |
def post_to_bluesky_trans(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], arxiv_id: str, title: str, authors: list[str], summary_texts: list[str], trans_texts: list[str]) -> dict[str, str]:
    """Reply with the translated abstract, as text and as a rendered image.

    Returns the new post, or {} when posting raised.
    """
    page_html = generatehtml.generate_trans_html(arxiv_id, title, authors, trans_texts, summary_texts)
    translation = "".join(trans_texts)
    # the image alt text keeps the translated sentences separated by blank lines
    uploaded = upload_html_to_bluesky(api, f"{arxiv_id}.trans.jpg", page_html, "\n\n".join(trans_texts))
    images = [uploaded] if uploaded else []
    text = f"{arxiv_id}\n{translation}"
    facets = generate_facets(text, [(arxiv_id, f"https://arxiv.org/abs/{arxiv_id}")])
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "reply": {"root": root_post, "parent": parent_post},
        "embed": {"$type": "app.bsky.embed.images", "images": images},
    }
    try:
        posted = api.post(record)
    except Exception as e:
        print(e)
        posted = {}
    return posted
161 |
162 |
def post_to_bluesky_ranking(api: nanoatp.BskyAgent, dlc: deeplcache.DeepLCache, df: pd.DataFrame) -> dict[str, str]:
    """Post the ranking summary: headline, one "[rank/total]" link per paper, and the top-N image.

    Ranks are assigned over df in reversed row order. Returns the new post,
    or {} when posting raised.
    """
    total = len(df)
    title = f"Top {total} most popular arXiv papers in the last 30 days.\n"
    today = datetime.now(timezone.utc).strftime("%d %b %Y")
    ranking_html = generatehtml.generate_top_n_html(title, today, df, dlc)
    # (label, url) pairs, used both for the alt text and for the link facets
    uris = [(f"{rank}/{total}", f"https://arxiv.org/abs/{paper_id}") for rank, paper_id in enumerate(df[::-1]["arxiv_id"], start=1)]
    alt_text = "\n".join(f"{label} {url}" for label, url in uris)
    uploaded = upload_html_to_bluesky(api, "top_n.jpg", ranking_html, alt_text, 90)  # sometimes the image is too large to upload
    images = [uploaded] if uploaded else []
    text = title + " ".join(f"[{label}]" for label, _ in uris)
    facets = generate_facets(text, uris)
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "embed": {"$type": "app.bsky.embed.images", "images": images},
    }
    try:
        posted = api.post(record)
    except Exception as e:
        print(e)
        posted = {}
    return posted
181 |
182 |
def post_to_bluesky(api: nanoatp.BskyAgent, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post one thread per newly translated paper, then the ranking post.

    Papers are visited in reversed row order. A paper is posted only when the
    cache holds a translation whose timestamp is newer than 23 hours ago.
    Each thread is: first page (root), up to three source-document posts,
    the link post, and the translation post.

    Returns:
        The result of post_to_bluesky_ranking (the ranking post, or {}).
    """
    df = df[::-1]  # reverse order
    twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
    seg = pysbd.Segmenter(language="en", clean=False)
    for i, (arxiv_id, updated, title, summary, authors, primary_category, categories, score, num_comments, count) in enumerate(zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])):
        trans = dlc.get(arxiv_id, None)
        if trans is None:
            continue
        trans_texts, trans_ts = trans
        # only post new papers
        if not (twenty_three_hours_ago < datetime.fromisoformat(trans_ts)):
            continue
        segs = seg.segment(summary.replace("\n", " ")[:2000])
        summary_texts: list[str] = [str(s) for s in segs] if type(segs) is list else [segs] if type(segs) is str else []
        is_new = True
        parent_post = post_to_bluesky_first_page(api, df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
        # post_to_bluesky_first_page returns {} (never None) on failure, so test
        # for falsiness; without a root post the replies cannot be threaded.
        if not parent_post:
            continue
        root_post = parent_post
        time.sleep(1)
        top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)].head(3)  # TODO
        parent_post = post_to_bluesky_posts(api, root_post, parent_post, top_n_documents)
        parent_post = post_to_bluesky_link(api, root_post, parent_post, arxiv_id, title, summary_texts)
        time.sleep(1)
        post_to_bluesky_trans(api, root_post, parent_post, arxiv_id, title, authors, summary_texts, trans_texts)
        print("post_to_bluesky: ", f"[{len(df) - i}/{len(df)}]")
        time.sleep(1)
    return post_to_bluesky_ranking(api, dlc, df)
211 |
--------------------------------------------------------------------------------
/docker/main.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
2 | #
3 | # SPDX-License-Identifier: MIT
4 |
5 | # Those environment variables are required to use PRAW.
6 | # export praw_client_id="reddit client id"
7 | # export praw_client_secret="reddit client secret"
8 | # export praw_user_agent="reddit user agent"
9 |
10 | import os
11 | import re
12 | import time
13 | from datetime import datetime, timedelta, timezone
14 |
15 | import arxiv
16 | import deepl
17 | import deeplcache
18 | import nanoatp
19 | import pandas as pd
20 | import postbluesky
21 | import postslack
22 | import posttwitter
23 | import praw
24 | import pysbd
25 | import requests
26 | import slack_sdk
27 | import tweepy
28 | from google.cloud import storage
29 |
# https://info.arxiv.org/help/arxiv_identifier.html
# Matches an http(s) arxiv.org abs/pdf URL. Group 2 is the identifier (four digits,
# a dot, then four or five digits); group 3 is the optional "vN" version suffix.
ARXIV_URL_PATTERN = re.compile(r"https?://arxiv\.org/(abs|pdf)/([0-9]{4}\.[0-9]{4,5})(v[0-9]+)?(\.pdf)?")
# Matches a bare identifier of the same shape, with an optional "vN" version suffix.
ARXIV_ID_PATTERN = re.compile(r"([0-9]{4}\.[0-9]{4,5})(v[0-9]+)?")
33 |
34 |
def parse_arxiv_ids(text: str) -> list[str]:
    """Return the distinct arXiv ids (without version) of all arXiv URLs found in text."""
    cleaned = text.replace("\\", "")  # TODO: some text includes 2 backslashes in urls
    unique_ids = {found.group(2) for found in ARXIV_URL_PATTERN.finditer(cleaned)}
    return list(unique_ids)
38 |
39 |
def flatten(lists: list[list]):
    """Concatenate the given lists into one new list, one level deep."""
    merged = []
    for sublist in lists:
        merged.extend(sublist)
    return merged
42 |
43 |
def submission_to_dict(submission: praw.reddit.Submission):
    """https://praw.readthedocs.io/en/stable/code_overview/models/submission.html

    Convert a Reddit submission into a document record. The submission's score
    is split evenly across the arXiv ids found in its selftext.
    """
    arxiv_ids = parse_arxiv_ids(submission.selftext)
    if len(arxiv_ids) > 0:
        share = submission.score / len(arxiv_ids)
    else:
        share = submission.score
    record = {
        "id": f"https://redd.it/{submission.id}",
        "score": int(share),
        "num_comments": submission.num_comments,
        "created_at": submission.created_utc,
        "arxiv_id": arxiv_ids,
        "title": submission.title,
        "description": submission.selftext,
    }
    return record
57 |
58 |
def search_reddit(query: str, sort="relevance", syntax="lucene", time_filter="all", limit: int | None = None):
    """https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html#praw.models.Subreddit.search

    Search r/all and return the matching submissions as a DataFrame of document records.
    """
    everything = praw.Reddit().subreddit("all")
    submissions = list(everything.search(query=query, sort=sort, syntax=syntax, time_filter=time_filter, limit=limit))
    records = [submission_to_dict(submission) for submission in submissions]
    return pd.json_normalize(records)
63 |
64 |
def hit_to_dict(hit: dict):
    """https://hn.algolia.com/api

    Convert a Hacker News search hit into a document record. The hit's points
    are split evenly across the arXiv ids found in its url.
    """
    arxiv_ids = parse_arxiv_ids(hit["url"])
    if len(arxiv_ids) > 0:
        share = hit["points"] / len(arxiv_ids)
    else:
        share = hit["points"]
    record = {
        "id": f"https://news.ycombinator.com/item?id={hit['objectID']}",
        "score": int(share),
        "num_comments": hit["num_comments"],
        "created_at": hit["created_at_i"],
        "arxiv_id": arxiv_ids,
        "title": hit["title"],
        "description": hit["url"],
    }
    return record
78 |
79 |
def search_hackernews(query: str, attribute="", days=0, limit: int | None = None):
    """https://hn.algolia.com/api

    Query the Hacker News search API and return the hits as a DataFrame of
    document records. attribute, days and limit are only sent when truthy / positive.
    """
    params = {"query": query}
    if attribute:
        params["restrictSearchableAttributes"] = attribute
    if days > 0:
        # only hits created within the last `days` days
        oldest = int((datetime.now() - timedelta(days=days)).timestamp())
        params["numericFilters"] = f"created_at_i>{oldest}"
    if limit:
        params["hitsPerPage"] = str(limit)
    payload = requests.get("https://hn.algolia.com/api/v1/search", params=params).json()
    return pd.json_normalize([hit_to_dict(hit) for hit in payload["hits"]])
91 |
92 |
def article_to_dict(article: dict):
    """https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Convert a Hugging Face daily-papers entry into a document record.
    """
    paper = article["paper"]
    arxiv_id = paper["id"]
    # fromisoformat needs an explicit offset instead of the trailing "Z"
    submitted = datetime.fromisoformat(paper["submittedOnDailyAt"].replace("Z", "+00:00"))
    record = {
        "id": f"https://huggingface.co/papers/{arxiv_id}",
        "score": paper["upvotes"],
        "num_comments": article["numComments"],
        "created_at": int(submitted.timestamp()),
        "arxiv_id": [arxiv_id],
        "title": article["title"],
        "description": article["summary"],
    }
    return record
106 |
107 |
def get_huggingface(timestamp: float, wait=1):
    """https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Fetch the daily papers for the date of timestamp, after sleeping wait
    seconds. Returns a list of document records, or [] on a non-200 status or
    an unusable response body.
    """
    day = datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
    headers = {
        "Referer": f"https://huggingface.co/papers/date/{day}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    }
    time.sleep(wait)
    response = requests.get(f"https://huggingface.co/api/daily_papers?date={day}", headers=headers)
    print(f"Status code {response.status_code}, {len(response.text)} characters at {day}")
    if response.status_code != 200:
        print(f"Failed to fetch data for {day}: {response.status_code}")
        return []
    articles = response.json()
    # usable only when it is a non-empty list without an "error" entry
    if not (articles and "error" not in articles and isinstance(articles, list)):
        print(f"No articles found for {day} or error in response.")
        return []
    print(f"Got {len(articles)} articles from {day}")
    return [article_to_dict(article) for article in articles]
126 |
127 |
def search_huggingface(days=30, wait=1):
    """https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Collect the daily papers of the last `days` days (today first) and return
    them as a DataFrame with duplicate ids removed, keeping the last occurrence.
    """
    now = datetime.now()
    timestamps = []
    for days_back in range(days):
        timestamps.append((now - timedelta(days=days_back)).timestamp())
    records = []
    for ts in timestamps:
        records.extend(get_huggingface(ts, wait))
    df = pd.json_normalize(records)
    deduplicated = df.drop_duplicates(subset=["id"], keep="last")
    return deduplicated.reset_index(drop=True)
134 |
135 |
def paper_to_dict(paper: dict):
    """https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    Convert an alphaXiv trending-paper entry into a document record.

    created_at is always an int Unix timestamp: the parsed publication date,
    or the current UTC time when the date is missing or cannot be parsed.
    """
    arxiv_id = paper["universal_paper_id"]
    try:
        # fromisoformat needs an explicit offset instead of the trailing "Z"
        created_at = int(datetime.fromisoformat(paper["publication_date"].replace("Z", "+00:00")).timestamp())
    except Exception as e:
        print(f"Failed to parse publication date for {arxiv_id}: {e}")
        # keep the same type (int) as the successful path
        created_at = int(datetime.now(timezone.utc).timestamp())
    return {
        "id": f"https://www.alphaxiv.org/abs/{arxiv_id}",
        "score": paper["metrics"]["public_total_votes"],
        "num_comments": 0,  # TODO: find the number of comments
        "created_at": created_at,
        "arxiv_id": [arxiv_id],
        "title": paper["title"],
        "description": paper["abstract"],
    }
153 |
154 |
def get_alphaxiv(sort_by="Likes", interval="30+Days", page_size=10, page_num=0, wait=1):
    """https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    Fetch one page of trending papers, after sleeping wait seconds. Returns a
    list of document records, or [] on a non-200 status or an unusable
    response body.
    """
    endpoint = f"https://api.alphaxiv.org/v2/papers/trending-papers?page_num={page_num}&sort_by={sort_by}&page_size={page_size}&interval={interval}"
    headers = {
        "Referer": f"https://www.alphaxiv.org/explore?sort={sort_by}&time={interval}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    }
    time.sleep(wait)
    response = requests.get(endpoint, headers=headers)
    print(f"Status code {response.status_code}, {len(response.text)} characters at page {page_num}")
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []
    payload = response.json()
    # usable only when it has data.trending_papers and no "error" entry
    if not (payload and "error" not in payload and "data" in payload and "trending_papers" in payload["data"]):
        print("No articles found or error in response.")
        return []
    return [paper_to_dict(paper) for paper in payload["data"]["trending_papers"]]
171 |
172 |
def search_alphaxiv(sort_by="Likes", interval="30+Days", page_size=10, limit=30, wait=1):
    """https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    Fetch enough pages to cover `limit` papers and return them as a DataFrame
    with duplicate ids removed, keeping the last occurrence.
    """
    page_count = (limit + page_size - 1) // page_size  # ceiling division
    records = []
    for page_num in range(page_count):
        records.extend(get_alphaxiv(sort_by=sort_by, interval=interval, page_size=page_size, page_num=page_num, wait=wait))
    df = pd.json_normalize(records)
    deduplicated = df.drop_duplicates(subset=["id"], keep="last")
    return deduplicated.reset_index(drop=True)
178 |
179 |
def filter_invalid_arxiv_id(document_df: pd.DataFrame):
    """Filter out documents with invalid arXiv IDs using ARXIV_ID_PATTERN.

    A document is kept only when its "arxiv_id" cell is a non-empty list (or
    tuple) of strings that each match ARXIV_ID_PATTERN in full. Returns the
    remaining rows with a fresh index and prints how many rows were dropped.
    """

    def is_valid_arxiv_id_list(arxiv_id_list):
        # cells can be non-lists (e.g. NaN) when a source produced no such column
        if not isinstance(arxiv_id_list, (list, tuple)) or not arxiv_id_list:
            return False
        # fullmatch: match() only anchors the start, so "2301.00001junk" would pass
        return all(isinstance(arxiv_id, str) and ARXIV_ID_PATTERN.fullmatch(arxiv_id) is not None for arxiv_id in arxiv_id_list)

    # astype(bool) keeps the mask a boolean indexer even for an empty frame
    valid_mask = document_df["arxiv_id"].apply(is_valid_arxiv_id_list).astype(bool)
    filtered_df = document_df[valid_mask].reset_index(drop=True)

    invalid_count = len(document_df) - len(filtered_df)
    if invalid_count > 0:
        print(f"Filtered out {invalid_count} documents with invalid arXiv IDs")

    return filtered_df
196 |
197 |
def get_arxiv_stats(document_df: pd.DataFrame):
    """Aggregate document records per arXiv id.

    Returns one row per arXiv id with the summed score and num_comments, the
    number of documents, and the list of document ids, sorted descending by
    score, num_comments and count.
    """
    # one row per (document, arXiv id) pair
    per_paper = document_df.explode("arxiv_id").groupby("arxiv_id")
    totals = per_paper.agg(
        score=("score", "sum"),
        num_comments=("num_comments", "sum"),
        count=("id", "count"),
        document_id=("id", pd.Series.to_list),
    )
    ranked = totals.sort_values(by=["score", "num_comments", "count"], ascending=False)
    return ranked.reset_index()
200 |
201 |
def arxiv_result_to_dict(r: arxiv.Result):
    """Convert an arxiv.Result into a flat dict of the fields used downstream.

    "arxiv_id" is the identifier without version and "arxiv_id_v" the
    identifier with its version suffix when the entry id carries one.

    Raises:
        AssertionError: if r.entry_id does not match ARXIV_URL_PATTERN.
    """
    m = ARXIV_URL_PATTERN.match(r.entry_id)
    assert m is not None, f"unexpected arXiv entry id: {r.entry_id}"
    arxiv_id = m.group(2)
    # the version group is optional in the pattern and is None when absent
    arxiv_id_v = arxiv_id + (m.group(3) or "")
    return {
        "arxiv_id": arxiv_id,
        "arxiv_id_v": arxiv_id_v,
        "entry_id": r.entry_id,
        "updated": str(r.updated),  # TODO
        "published": str(r.published),  # TODO
        "title": r.title,
        "authors": [str(a) for a in r.authors],
        "summary": r.summary,
        "comment": r.comment,
        "journal_ref": r.journal_ref,
        "doi": r.doi,
        "primary_category": r.primary_category,
        "categories": [str(c) for c in r.categories],
        "links": [str(link) for link in r.links],
        "pdf_url": r.pdf_url,
    }
225 |
226 |
def get_arxiv_contents(id_list: list[str], chunk_size=100):
    """Look up the given arXiv ids in chunks and return their metadata as a DataFrame.

    A chunk whose lookup raises is reported with print and skipped; the
    remaining chunks are still fetched.
    """
    results: list[arxiv.Result] = []
    for chunk_index, offset in enumerate(range(0, len(id_list), chunk_size)):
        chunk = id_list[offset : offset + chunk_size]
        try:
            search = arxiv.Search(id_list=chunk, max_results=len(chunk))
            fetched = list(search.results())
            results.extend(fetched)
            print("search_arxiv_contents: ", chunk_index, len(fetched), len(results))
        except Exception as e:
            print(e)
    return pd.json_normalize([arxiv_result_to_dict(result) for result in results])
242 |
243 |
def filter_df(df: pd.DataFrame, top_n=10, days=365, count=1, num_comments=0):
    """Keep the first top_n papers that meet the count, comment and recency thresholds.

    Rows need count >= count, num_comments >= num_comments and a "published"
    value greater than the "YYYY-MM-DD" date `days` days ago (UTC). The result
    has a fresh index.
    """
    cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
    popular = df.loc[df["count"] >= count]
    discussed = popular.loc[popular["num_comments"] >= num_comments]
    recent = discussed.loc[discussed["published"] > cutoff]
    return recent.head(top_n).reset_index(drop=True)
249 |
250 |
def summarize(query, time_filter="month", days=30, limit=300):
    """Collect documents from all sources and aggregate them per arXiv paper.

    Reddit, Hacker News, Hugging Face and alphaXiv are queried in that order;
    a source that raises is reported with print and contributes an empty
    DataFrame. Returns (paper_df, document_df): per-paper statistics merged
    with arXiv metadata, and the filtered document records.
    """
    sources = [
        ("search_reddit...", "search_reddit...done: ", lambda: search_reddit(f"selftext:{query}", sort="top", time_filter=time_filter, limit=limit)),
        ("search_hackernews...", "search_hackernews...done: ", lambda: search_hackernews(query, attribute="url", days=days, limit=limit)),
        ("search_huggingface...", "search_huggingface...done: ", lambda: search_huggingface(days=days)),
        ("search_alphaxiv...", "search_alphaxiv...done: ", lambda: search_alphaxiv(limit=limit)),
    ]
    frames = []
    for start_message, done_message, fetch in sources:
        try:
            print(start_message)
            found = fetch()
            print(done_message, len(found))
        except Exception as e:
            print(e)
            found = pd.json_normalize([])
        frames.append(found)
    combined = pd.concat(frames, ignore_index=True)
    concat_df = combined.sort_values(by=["score", "num_comments"], ascending=False).reset_index(drop=True)
    document_df = filter_invalid_arxiv_id(concat_df)
    print("document_df: ", len(document_df))
    stats_df = get_arxiv_stats(document_df)
    print("stats_df: ", len(stats_df))
    contents_df = get_arxiv_contents(stats_df["arxiv_id"].tolist(), chunk_size=100)
    print("contents_df: ", len(contents_df))
    paper_df = pd.merge(stats_df, contents_df, on="arxiv_id")
    print("paper_df: ", len(paper_df))
    return paper_df, document_df
290 |
291 |
def translate_arxiv(dlc: deeplcache.DeepLCache, df: pd.DataFrame, target_lang: str):
    """Translate each paper summary in df through the cache and return the cache.

    Summaries are flattened to one line, cut to 2000 characters and split into
    sentences before translation. Cache size and translator usage are printed
    before and after.
    """
    segmenter = pysbd.Segmenter(language="en", clean=False)
    print("translate_arxiv: before: ", len(dlc.cache))
    print(dlc.translator.get_usage())
    for arxiv_id, summary in zip(df["arxiv_id"], df["summary"]):
        sentences = segmenter.segment(summary.replace("\n", " ")[:2000])
        translated, translated_at = dlc.translate_text(sentences, target_lang, arxiv_id)
        source_chars = sum(len(sentence) for sentence in sentences)
        target_chars = sum(len(piece) for piece in translated)
        print("translate_arxiv: ", arxiv_id, source_chars, target_chars, translated_at)
    print("translate_arxiv: after: ", len(dlc.cache))
    print(dlc.translator.get_usage())
    return dlc
303 |
304 |
def main():
    """Run the whole job: collect, rank, translate, then post to Slack, Bluesky and Twitter.

    Credentials and the bucket name are read from environment variables. A
    failure in one posting target is printed and does not stop the others.
    """
    # settings
    query = "arxiv.org"
    summarize_time_filter = "month"  # or "week"
    summarize_days = 30  # should be 30 if "month"
    summarize_limit = 300
    filter_days = 30
    filter_count = 1
    filter_num_comments = 1
    deepl_target_lang = "JA"
    deepl_expire_days = 90
    notify_top_n = int(os.getenv("NOTIFY_TOP_N", 10))
    cache_blob_name = "deepl_cache.json.gz"

    # prepare apis
    gcs_bucket = storage.Client().bucket(os.getenv("GCS_BUCKET_NAME"))
    deepl_api = deepl.Translator(os.getenv("DEEPL_AUTH_KEY"))  # type: ignore
    slack_api = slack_sdk.WebClient(os.getenv("SLACK_BOT_TOKEN"))
    slack_channel = os.getenv("SLACK_CHANNEL")
    twitter_user_keys = {
        "consumer_key": os.getenv("TWITTER_API_KEY"),
        "consumer_secret": os.getenv("TWITTER_API_KEY_SECRET"),
        "access_token": os.getenv("TWITTER_ACCESS_TOKEN"),
        "access_token_secret": os.getenv("TWITTER_ACCESS_TOKEN_SECRET"),
    }
    tweepy_api_v2 = tweepy.Client(bearer_token=os.getenv("TWITTER_BEARER_TOKEN"), wait_on_rate_limit=True, **twitter_user_keys)
    # because media_upload is only available on api v1.
    tweepy_api_v1 = tweepy.API(tweepy.OAuth1UserHandler(**twitter_user_keys), wait_on_rate_limit=True)
    bluesky_api = nanoatp.BskyAgent()
    bluesky_api.login(os.getenv("ATP_IDENTIFIER"), os.getenv("ATP_PASSWORD"))  # type: ignore

    # search all sources and measure popularity
    paper_df, document_df = summarize(query, time_filter=summarize_time_filter, days=summarize_days, limit=summarize_limit)

    # filter by days
    filtered_df = filter_df(paper_df, top_n=notify_top_n, days=filter_days, count=filter_count, num_comments=filter_num_comments)
    print("filtered_df: ", len(filtered_df))

    # translate summary text
    dlc = deeplcache.DeepLCache(deepl_api)
    try:
        dlc.load_from_gcs(gcs_bucket, cache_blob_name)
    except Exception as e:
        print(e)
    dlc = translate_arxiv(dlc, filtered_df, deepl_target_lang)
    dlc.clear_cache(expire_timedelta=timedelta(days=deepl_expire_days))
    dlc.save_to_gcs(gcs_bucket, cache_blob_name)

    # post; each target is attempted even when an earlier one fails
    posters = (
        lambda: postslack.post_to_slack(slack_api, slack_channel, dlc, filtered_df, document_df),
        lambda: postbluesky.post_to_bluesky(bluesky_api, dlc, filtered_df, document_df),
        lambda: posttwitter.post_to_twitter(tweepy_api_v1, tweepy_api_v2, dlc, filtered_df, document_df),
    )
    for post in posters:
        try:
            post()
        except Exception as e:
            print(e)
361 |
362 |
# Run the job only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
365 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # arXiv Reddit Summary
2 |
3 | Summarizes the top 30 most popular arXiv papers on Reddit, Hacker News, Hugging Face and alphaXiv over the last 30 days, and posts them to Slack, Twitter and Bluesky.
4 |
5 | ## Demo
6 |
7 | - https://x.com/susumuota
8 | - https://bsky.app/profile/paper.bsky.social
9 |
10 | ## Google Cloud Run
11 |
12 | This system is running on Google Cloud Run jobs.
13 |
14 | - https://cloud.google.com/build/docs/build-push-docker-image
15 | - https://cloud.google.com/run/docs/create-jobs#command-line
16 | - https://cloud.google.com/scheduler/docs/creating#gcloud
17 |
18 | ## Create a project
19 |
20 | - https://cloud.google.com/resource-manager/docs/creating-managing-projects#creating_a_project
21 |
22 | ```sh
23 | export PROJECT_ID="arxiv-summary-1"
24 | gcloud projects create $PROJECT_ID
25 | gcloud projects list
26 | # gcloud projects delete $PROJECT_ID
27 | # unset PROJECT_ID
28 | ```
29 |
30 | ## Enable billing
31 |
32 | Follow these instructions. As far as I know, there is no way to enable billing from the command line.
33 |
34 | - https://cloud.google.com/billing/docs/how-to/modify-project#how-to-enable-billing
35 | - https://console.cloud.google.com/billing/projects
36 |
37 | Then confirm it.
38 |
39 | ```sh
40 | gcloud beta billing projects describe $PROJECT_ID
41 | ```
42 |
43 | It should show `billingEnabled: true`.
44 |
45 | ## Create a bucket
46 |
47 | ```sh
48 | export GCS_BUCKET_NAME="arxiv-summary"
49 | export REGION="us-central1"
50 | gcloud storage buckets create "gs://${GCS_BUCKET_NAME}" \
51 | --project=$PROJECT_ID \
52 | --location=$REGION \
53 | --public-access-prevention \
54 | --uniform-bucket-level-access
55 | gcloud storage buckets list --project=$PROJECT_ID | grep name
56 | # gcloud storage buckets delete "gs://${GCS_BUCKET_NAME}" --project=$PROJECT_ID
57 | # unset GCS_BUCKET_NAME
58 | ```
59 |
60 | ## Build a Docker image on local machine and test it
61 |
62 | - https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login
63 | - https://stackoverflow.com/a/50826145
64 |
65 | Application settings.
66 |
67 | ```sh
68 | # export TWITTER_BEARER_TOKEN="secret info"
69 | # export TWITTER_API_KEY="secret info"
70 | # export TWITTER_API_KEY_SECRET="secret info"
71 | # export TWITTER_ACCESS_TOKEN="secret info"
72 | # export TWITTER_ACCESS_TOKEN_SECRET="secret info"
73 | # export DEEPL_AUTH_KEY="secret info"
74 | # export SLACK_BOT_TOKEN="secret info"
75 | # export praw_client_id="secret info"
76 | # export praw_client_secret="secret info"
77 | # export praw_user_agent="secret info"
78 | # export ATP_IDENTIFIER="secret info"
79 | # export ATP_PASSWORD="secret info"
80 |
81 | export NOTIFY_TOP_N="30" # 30 on production env
82 | export SLACK_CHANNEL="#test" # #anywhere on production env
83 | ```
84 |
85 | Local test.
86 |
87 | ```sh
88 | poetry export -f requirements.txt --without-hashes -o docker/requirements.txt
89 | ```
90 |
91 | ```sh
92 | gcloud auth application-default login
93 | cd docker
94 | export IMAGE_NAME="arxiv-reddit-summary"
95 | docker build -t $IMAGE_NAME .
96 | docker run --rm \
97 | -e TWITTER_BEARER_TOKEN=$TWITTER_BEARER_TOKEN \
98 | -e TWITTER_API_KEY=$TWITTER_API_KEY \
99 | -e TWITTER_API_KEY_SECRET=$TWITTER_API_KEY_SECRET \
100 | -e TWITTER_ACCESS_TOKEN=$TWITTER_ACCESS_TOKEN \
101 | -e TWITTER_ACCESS_TOKEN_SECRET=$TWITTER_ACCESS_TOKEN_SECRET \
102 | -e DEEPL_AUTH_KEY=$DEEPL_AUTH_KEY \
103 | -e SLACK_BOT_TOKEN=$SLACK_BOT_TOKEN \
104 | -e praw_client_id=$praw_client_id \
105 | -e praw_client_secret=$praw_client_secret \
106 | -e praw_user_agent=$praw_user_agent \
107 | -e ATP_IDENTIFIER=$ATP_IDENTIFIER \
108 | -e ATP_PASSWORD=$ATP_PASSWORD \
109 | -e NOTIFY_TOP_N=$NOTIFY_TOP_N \
110 | -e SLACK_CHANNEL=$SLACK_CHANNEL \
111 | -e GCS_BUCKET_NAME=$GCS_BUCKET_NAME \
112 | -e GCLOUD_PROJECT=$PROJECT_ID \
113 | -v $HOME/.config/gcloud:/root/.config/gcloud \
114 | $IMAGE_NAME
115 | docker images
116 | # docker rmi $IMAGE_NAME
117 | # unset IMAGE_NAME
118 | ```
119 |
120 | ## Create a service account for Cloud Run
121 |
122 | ```sh
123 | export RUN_SERVICE_ACCOUNT="run-sa"
124 | gcloud iam service-accounts create $RUN_SERVICE_ACCOUNT --project=$PROJECT_ID
125 | gcloud iam service-accounts list --project=$PROJECT_ID
126 | # gcloud iam service-accounts delete "${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" --project=$PROJECT_ID
127 | # unset RUN_SERVICE_ACCOUNT
128 | ```
129 |
130 | ## Add roles to service account to access GCS and to invoke Cloud Run
131 |
132 | - https://cloud.google.com/storage/docs/access-control/iam-roles
133 | - https://cloud.google.com/scheduler/docs/creating#gcloud
134 | - https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating
135 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#4
136 |
137 | ```sh
138 | gcloud projects add-iam-policy-binding $PROJECT_ID \
139 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
140 | --role="roles/storage.objectAdmin"
141 | gcloud projects add-iam-policy-binding $PROJECT_ID \
142 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
143 | --role="roles/run.invoker"
144 | gcloud projects get-iam-policy $PROJECT_ID
145 | # gcloud projects remove-iam-policy-binding $PROJECT_ID \
146 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
147 | # --role="roles/storage.objectAdmin"
148 | # gcloud projects remove-iam-policy-binding $PROJECT_ID \
149 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
150 | # --role="roles/run.invoker"
151 | ```
152 |
153 | ## Create secret data
154 |
155 | - https://cloud.google.com/secret-manager/docs/create-secret#secretmanager-quickstart-gcloud
156 | - https://cloud.google.com/run/docs/configuring/secrets
157 |
158 | ```sh
159 | gcloud services enable secretmanager.googleapis.com --project=$PROJECT_ID
160 | echo -n $TWITTER_BEARER_TOKEN | gcloud secrets create "TWITTER_BEARER_TOKEN" \
161 | --project=$PROJECT_ID \
162 | --replication-policy="automatic" \
163 | --data-file=-
164 | echo -n $TWITTER_API_KEY | gcloud secrets create "TWITTER_API_KEY" \
165 | --project=$PROJECT_ID \
166 | --replication-policy="automatic" \
167 | --data-file=-
168 | echo -n $TWITTER_API_KEY_SECRET | gcloud secrets create "TWITTER_API_KEY_SECRET" \
169 | --project=$PROJECT_ID \
170 | --replication-policy="automatic" \
171 | --data-file=-
172 | echo -n $TWITTER_ACCESS_TOKEN | gcloud secrets create "TWITTER_ACCESS_TOKEN" \
173 | --project=$PROJECT_ID \
174 | --replication-policy="automatic" \
175 | --data-file=-
176 | echo -n $TWITTER_ACCESS_TOKEN_SECRET | gcloud secrets create "TWITTER_ACCESS_TOKEN_SECRET" \
177 | --project=$PROJECT_ID \
178 | --replication-policy="automatic" \
179 | --data-file=-
180 | echo -n $DEEPL_AUTH_KEY | gcloud secrets create "DEEPL_AUTH_KEY" \
181 | --project=$PROJECT_ID \
182 | --replication-policy="automatic" \
183 | --data-file=-
184 | echo -n $SLACK_BOT_TOKEN | gcloud secrets create "SLACK_BOT_TOKEN" \
185 | --project=$PROJECT_ID \
186 | --replication-policy="automatic" \
187 | --data-file=-
188 | echo -n $praw_client_id | gcloud secrets create "praw_client_id" \
189 | --project=$PROJECT_ID \
190 | --replication-policy="automatic" \
191 | --data-file=-
192 | echo -n $praw_client_secret | gcloud secrets create "praw_client_secret" \
193 | --project=$PROJECT_ID \
194 | --replication-policy="automatic" \
195 | --data-file=-
196 | echo -n $praw_user_agent | gcloud secrets create "praw_user_agent" \
197 | --project=$PROJECT_ID \
198 | --replication-policy="automatic" \
199 | --data-file=-
200 | echo -n $ATP_IDENTIFIER | gcloud secrets create "ATP_IDENTIFIER" \
201 | --project=$PROJECT_ID \
202 | --replication-policy="automatic" \
203 | --data-file=-
204 | echo -n $ATP_PASSWORD | gcloud secrets create "ATP_PASSWORD" \
205 | --project=$PROJECT_ID \
206 | --replication-policy="automatic" \
207 | --data-file=-
208 | gcloud secrets list --project=$PROJECT_ID
209 | gcloud secrets versions access 1 --secret="TWITTER_BEARER_TOKEN" --project=$PROJECT_ID
210 | gcloud secrets versions access 1 --secret="TWITTER_API_KEY" --project=$PROJECT_ID
211 | gcloud secrets versions access 1 --secret="TWITTER_API_KEY_SECRET" --project=$PROJECT_ID
212 | gcloud secrets versions access 1 --secret="TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID
213 | gcloud secrets versions access 1 --secret="TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID
214 | gcloud secrets versions access 1 --secret="DEEPL_AUTH_KEY" --project=$PROJECT_ID
215 | gcloud secrets versions access 1 --secret="SLACK_BOT_TOKEN" --project=$PROJECT_ID
216 | gcloud secrets versions access 1 --secret="praw_client_id" --project=$PROJECT_ID
217 | gcloud secrets versions access 1 --secret="praw_client_secret" --project=$PROJECT_ID
218 | gcloud secrets versions access 1 --secret="praw_user_agent" --project=$PROJECT_ID
219 | gcloud secrets versions access 1 --secret="ATP_IDENTIFIER" --project=$PROJECT_ID
220 | gcloud secrets versions access 1 --secret="ATP_PASSWORD" --project=$PROJECT_ID
221 | # gcloud secrets delete "TWITTER_BEARER_TOKEN" --project=$PROJECT_ID
222 | # gcloud secrets delete "TWITTER_API_KEY" --project=$PROJECT_ID
223 | # gcloud secrets delete "TWITTER_API_KEY_SECRET" --project=$PROJECT_ID
224 | # gcloud secrets delete "TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID
225 | # gcloud secrets delete "TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID
226 | # gcloud secrets delete "DEEPL_AUTH_KEY" --project=$PROJECT_ID
227 | # gcloud secrets delete "SLACK_BOT_TOKEN" --project=$PROJECT_ID
228 | # gcloud secrets delete "praw_client_id" --project=$PROJECT_ID
229 | # gcloud secrets delete "praw_client_secret" --project=$PROJECT_ID
230 | # gcloud secrets delete "praw_user_agent" --project=$PROJECT_ID
231 | # gcloud secrets delete "ATP_IDENTIFIER" --project=$PROJECT_ID
232 | # gcloud secrets delete "ATP_PASSWORD" --project=$PROJECT_ID
233 | # gcloud services disable secretmanager.googleapis.com --project=$PROJECT_ID
234 | ```
235 |
236 | ## Add roles to secrets to be accessed by service account
237 |
238 | - https://cloud.google.com/secret-manager/docs/managing-secrets#secretmanager-create-secret-gcloud
239 |
240 | ```sh
241 | gcloud secrets add-iam-policy-binding "TWITTER_BEARER_TOKEN" \
242 | --project=$PROJECT_ID \
243 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
244 | --role="roles/secretmanager.secretAccessor"
245 | gcloud secrets add-iam-policy-binding "TWITTER_API_KEY" \
246 | --project=$PROJECT_ID \
247 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
248 | --role="roles/secretmanager.secretAccessor"
249 | gcloud secrets add-iam-policy-binding "TWITTER_API_KEY_SECRET" \
250 | --project=$PROJECT_ID \
251 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
252 | --role="roles/secretmanager.secretAccessor"
253 | gcloud secrets add-iam-policy-binding "TWITTER_ACCESS_TOKEN" \
254 | --project=$PROJECT_ID \
255 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
256 | --role="roles/secretmanager.secretAccessor"
257 | gcloud secrets add-iam-policy-binding "TWITTER_ACCESS_TOKEN_SECRET" \
258 | --project=$PROJECT_ID \
259 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
260 | --role="roles/secretmanager.secretAccessor"
261 | gcloud secrets add-iam-policy-binding "DEEPL_AUTH_KEY" \
262 | --project=$PROJECT_ID \
263 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
264 | --role="roles/secretmanager.secretAccessor"
265 | gcloud secrets add-iam-policy-binding "SLACK_BOT_TOKEN" \
266 | --project=$PROJECT_ID \
267 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
268 | --role="roles/secretmanager.secretAccessor"
269 | gcloud secrets add-iam-policy-binding "praw_client_id" \
270 | --project=$PROJECT_ID \
271 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
272 | --role="roles/secretmanager.secretAccessor"
273 | gcloud secrets add-iam-policy-binding "praw_client_secret" \
274 | --project=$PROJECT_ID \
275 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
276 | --role="roles/secretmanager.secretAccessor"
277 | gcloud secrets add-iam-policy-binding "praw_user_agent" \
278 | --project=$PROJECT_ID \
279 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
280 | --role="roles/secretmanager.secretAccessor"
281 | gcloud secrets add-iam-policy-binding "ATP_IDENTIFIER" \
282 | --project=$PROJECT_ID \
283 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
284 | --role="roles/secretmanager.secretAccessor"
285 | gcloud secrets add-iam-policy-binding "ATP_PASSWORD" \
286 | --project=$PROJECT_ID \
287 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
288 | --role="roles/secretmanager.secretAccessor"
289 | gcloud secrets get-iam-policy "TWITTER_BEARER_TOKEN" --project=$PROJECT_ID
290 | gcloud secrets get-iam-policy "TWITTER_API_KEY" --project=$PROJECT_ID
291 | gcloud secrets get-iam-policy "TWITTER_API_KEY_SECRET" --project=$PROJECT_ID
292 | gcloud secrets get-iam-policy "TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID
293 | gcloud secrets get-iam-policy "TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID
294 | gcloud secrets get-iam-policy "DEEPL_AUTH_KEY" --project=$PROJECT_ID
295 | gcloud secrets get-iam-policy "SLACK_BOT_TOKEN" --project=$PROJECT_ID
296 | gcloud secrets get-iam-policy "praw_client_id" --project=$PROJECT_ID
297 | gcloud secrets get-iam-policy "praw_client_secret" --project=$PROJECT_ID
298 | gcloud secrets get-iam-policy "praw_user_agent" --project=$PROJECT_ID
299 | gcloud secrets get-iam-policy "ATP_IDENTIFIER" --project=$PROJECT_ID
300 | gcloud secrets get-iam-policy "ATP_PASSWORD" --project=$PROJECT_ID
301 | # gcloud secrets remove-iam-policy-binding "TWITTER_BEARER_TOKEN" \
302 | # --project=$PROJECT_ID \
303 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
304 | # --role="roles/secretmanager.secretAccessor"
305 | # gcloud secrets remove-iam-policy-binding "TWITTER_API_KEY" \
306 | # --project=$PROJECT_ID \
307 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
308 | # --role="roles/secretmanager.secretAccessor"
309 | # gcloud secrets remove-iam-policy-binding "TWITTER_API_KEY_SECRET" \
310 | # --project=$PROJECT_ID \
311 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
312 | # --role="roles/secretmanager.secretAccessor"
313 | # gcloud secrets remove-iam-policy-binding "TWITTER_ACCESS_TOKEN" \
314 | # --project=$PROJECT_ID \
315 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
316 | # --role="roles/secretmanager.secretAccessor"
317 | # gcloud secrets remove-iam-policy-binding "TWITTER_ACCESS_TOKEN_SECRET" \
318 | # --project=$PROJECT_ID \
319 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
320 | # --role="roles/secretmanager.secretAccessor"
321 | # gcloud secrets remove-iam-policy-binding "DEEPL_AUTH_KEY" \
322 | # --project=$PROJECT_ID \
323 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
324 | # --role="roles/secretmanager.secretAccessor"
325 | # gcloud secrets remove-iam-policy-binding "SLACK_BOT_TOKEN" \
326 | # --project=$PROJECT_ID \
327 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
328 | # --role="roles/secretmanager.secretAccessor"
329 | # gcloud secrets remove-iam-policy-binding "praw_client_id" \
330 | # --project=$PROJECT_ID \
331 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
332 | # --role="roles/secretmanager.secretAccessor"
333 | # gcloud secrets remove-iam-policy-binding "praw_client_secret" \
334 | # --project=$PROJECT_ID \
335 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
336 | # --role="roles/secretmanager.secretAccessor"
337 | # gcloud secrets remove-iam-policy-binding "praw_user_agent" \
338 | # --project=$PROJECT_ID \
339 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
340 | # --role="roles/secretmanager.secretAccessor"
341 | # gcloud secrets remove-iam-policy-binding "ATP_IDENTIFIER" \
342 | # --project=$PROJECT_ID \
343 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
344 | # --role="roles/secretmanager.secretAccessor"
345 | # gcloud secrets remove-iam-policy-binding "ATP_PASSWORD" \
346 | # --project=$PROJECT_ID \
347 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
348 | # --role="roles/secretmanager.secretAccessor"
349 | ```
350 |
351 | ## Create a Docker repository
352 |
353 | - https://cloud.google.com/build/docs/build-push-docker-image
354 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#3
355 |
356 | ```sh
357 | gcloud services enable artifactregistry.googleapis.com --project=$PROJECT_ID
358 | export REPOSITORY="arxiv-reddit-summary"
359 | gcloud artifacts repositories create $REPOSITORY \
360 | --project=$PROJECT_ID \
361 | --repository-format="docker" \
362 | --location=$REGION
363 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
364 | # gcloud artifacts repositories delete $REPOSITORY --project=$PROJECT_ID --location=$REGION
365 | # gcloud services disable artifactregistry.googleapis.com --project=$PROJECT_ID
366 | # unset REPOSITORY REGION
367 | ```
368 |
369 | ## Build a Docker image
370 |
371 | - https://cloud.google.com/build/docs/build-push-docker-image
372 | - https://cloud.google.com/build/docs/building/build-containers#use-dockerfile
373 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#3
374 |
375 | ```sh
376 | gcloud services enable cloudbuild.googleapis.com --project=$PROJECT_ID
377 | export TAG_NAME="latest"
378 | gcloud builds submit \
379 | --project=$PROJECT_ID \
380 | --region=$REGION \
381 | --tag="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
382 | gcloud builds list --project=$PROJECT_ID --region=$REGION
383 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
384 | # gcloud services disable cloudbuild.googleapis.com --project=$PROJECT_ID
385 | # unset TAG_NAME
386 | ```
387 |
388 | ## Test a Docker image on local machine
389 |
390 | **This step may incur additional charges because of data transfer.**
391 |
392 | - https://cloud.google.com/build/docs/building/build-containers#run_the_docker_image
393 | - https://cloud.google.com/artifact-registry/pricing
394 | - https://support.terra.bio/hc/en-us/articles/4408985788187-How-to-configure-GCR-Artifact-Registry-to-prevent-egress-charges
395 |
396 | ```sh
397 | gcloud auth configure-docker ${REGION}-docker.pkg.dev
398 | docker run --rm \
399 | -e TWITTER_BEARER_TOKEN=$TWITTER_BEARER_TOKEN \
400 | -e TWITTER_API_KEY=$TWITTER_API_KEY \
401 | -e TWITTER_API_KEY_SECRET=$TWITTER_API_KEY_SECRET \
402 | -e TWITTER_ACCESS_TOKEN=$TWITTER_ACCESS_TOKEN \
403 | -e TWITTER_ACCESS_TOKEN_SECRET=$TWITTER_ACCESS_TOKEN_SECRET \
404 | -e DEEPL_AUTH_KEY=$DEEPL_AUTH_KEY \
405 | -e SLACK_BOT_TOKEN=$SLACK_BOT_TOKEN \
406 | -e praw_client_id=$praw_client_id \
407 | -e praw_client_secret=$praw_client_secret \
408 | -e praw_user_agent=$praw_user_agent \
409 | -e ATP_IDENTIFIER=$ATP_IDENTIFIER \
410 | -e ATP_PASSWORD=$ATP_PASSWORD \
411 | -e NOTIFY_TOP_N=$NOTIFY_TOP_N \
412 | -e SLACK_CHANNEL=$SLACK_CHANNEL \
413 | -e GCS_BUCKET_NAME=$GCS_BUCKET_NAME \
414 | -e GCLOUD_PROJECT=$PROJECT_ID \
415 | -v $HOME/.config/gcloud:/root/.config/gcloud \
416 | "${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
417 | docker images
418 | # docker rmi "${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
419 | ```
420 |
421 | ## Create a Cloud Run job
422 |
423 | - https://cloud.google.com/run/docs/create-jobs#command-line
424 |
425 | Change parameters for production env.
426 |
427 | ```sh
428 | export NOTIFY_TOP_N="30" # 10 on development env
429 | export SLACK_CHANNEL="#test" # set this to your production channel; #test on development env
430 | ```
431 |
432 | ```sh
433 | gcloud services enable run.googleapis.com --project=$PROJECT_ID
434 | export RUN_JOB_NAME="arxiv-reddit-summary-job-1"
435 | gcloud beta run jobs create $RUN_JOB_NAME \
436 | --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" \
437 | --project=$PROJECT_ID \
438 | --region=$REGION \
439 | --service-account="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
440 | --set-secrets="TWITTER_BEARER_TOKEN=TWITTER_BEARER_TOKEN:1" \
441 | --set-secrets="TWITTER_API_KEY=TWITTER_API_KEY:1" \
442 | --set-secrets="TWITTER_API_KEY_SECRET=TWITTER_API_KEY_SECRET:1" \
443 | --set-secrets="TWITTER_ACCESS_TOKEN=TWITTER_ACCESS_TOKEN:1" \
444 | --set-secrets="TWITTER_ACCESS_TOKEN_SECRET=TWITTER_ACCESS_TOKEN_SECRET:1" \
445 | --set-secrets="DEEPL_AUTH_KEY=DEEPL_AUTH_KEY:1" \
446 | --set-secrets="SLACK_BOT_TOKEN=SLACK_BOT_TOKEN:1" \
447 | --set-secrets="praw_client_id=praw_client_id:1" \
448 | --set-secrets="praw_client_secret=praw_client_secret:1" \
449 | --set-secrets="praw_user_agent=praw_user_agent:1" \
450 | --set-secrets="ATP_IDENTIFIER=ATP_IDENTIFIER:1" \
451 | --set-secrets="ATP_PASSWORD=ATP_PASSWORD:1" \
452 | --set-env-vars="NOTIFY_TOP_N=${NOTIFY_TOP_N}" \
453 | --set-env-vars="SLACK_CHANNEL=${SLACK_CHANNEL}" \
454 | --set-env-vars="GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" \
455 | --max-retries=0 \
456 | --task-timeout="30m" \
457 | --memory="1024Mi"
458 | gcloud beta run jobs list --project=$PROJECT_ID
459 | gcloud beta run jobs describe $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION
460 | # gcloud beta run jobs delete $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION
461 | # gcloud services disable run.googleapis.com --project=$PROJECT_ID
462 | # unset RUN_JOB_NAME
463 | ```
464 |
465 | ## Execute a job
466 |
467 | - https://cloud.google.com/run/docs/execute/jobs
468 |
469 | ```sh
470 | gcloud beta run jobs execute $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION
471 | gcloud beta run jobs executions list --project=$PROJECT_ID --region=$REGION
472 | ```
473 |
474 | ```sh
475 | gcloud logging read "resource.type=cloud_run_job" \
476 | --project=$PROJECT_ID \
477 | --limit 10 | egrep "textPayload|message"
478 | ```
479 |
480 | ## Create a Cloud Scheduler job
481 |
482 | - https://cloud.google.com/run/docs/execute/jobs-on-schedule#command-line
483 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#4
484 |
485 | ```sh
486 | export SCHEDULER_JOB_NAME="arxiv-reddit-summary-job-everyday-9am"
487 | gcloud services enable cloudscheduler.googleapis.com --project=$PROJECT_ID
488 | gcloud scheduler jobs create http $SCHEDULER_JOB_NAME \
489 | --project=$PROJECT_ID \
490 | --location=$REGION \
491 | --schedule="0 9 * * *" \
492 | --time-zone "Asia/Tokyo" \
493 | --uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/${RUN_JOB_NAME}:run" \
494 | --http-method="POST" \
495 | --oauth-service-account-email="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com"
496 | gcloud scheduler jobs list --project=$PROJECT_ID --location=$REGION
497 | gcloud scheduler jobs describe $SCHEDULER_JOB_NAME --project=$PROJECT_ID --location=$REGION
498 | # gcloud scheduler jobs delete $SCHEDULER_JOB_NAME --project=$PROJECT_ID --location=$REGION
499 | # gcloud services disable cloudscheduler.googleapis.com --project=$PROJECT_ID
500 | # unset SCHEDULER_JOB_NAME
501 | ```
502 |
503 | ```sh
504 | gcloud logging read "resource.type=cloud_run_job OR resource.type=cloud_scheduler_job" \
505 | --project=$PROJECT_ID \
506 | --limit 10 | egrep "textPayload|message"
507 | ```
508 |
509 | ## License
510 |
511 | MIT License. See the LICENSE file.
512 |
513 | ## Author
514 |
515 | Susumu OTA
516 |
--------------------------------------------------------------------------------