├── poetry.toml ├── docker ├── .dockerignore ├── Dockerfile ├── utils.py ├── deeplcache.py ├── requirements.txt ├── generatehtml.py ├── postslack.py ├── posttwitter.py ├── postbluesky.py └── main.py ├── .flake8 ├── .gitignore ├── LICENSE ├── pyproject.toml ├── misc └── update_job.sh └── README.md /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /docker/.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | *.json.gz 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 320 3 | extend-ignore = E203 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .python-version 2 | .vscode 3 | .DS_Store 4 | __pycache__ 5 | .venv 6 | poetry.lock 7 | *.ipynb 8 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2022-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com> 2 | # SPDX-License-Identifier: MIT 3 | 4 | FROM python:3.11.13-slim 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | aria2 \ 8 | fonts-ipafont-gothic \ 9 | poppler-utils \ 10 | wkhtmltopdf \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | WORKDIR /app 14 | 15 | COPY requirements.txt requirements.txt 16 | RUN pip install -r requirements.txt 17 | 18 | COPY . . 
19 | 20 | CMD [ "python", "-u", "main.py" ] 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2025 Susumu OTA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "arxiv-reddit-summary" 3 | version = "0.7.6" 4 | description = "Summarize the top 30 most popular arXiv papers on Reddit, Hacker News and Hugging Face in the last 30 days. And post them to Slack, Twitter and Bluesky." 
# East Asian Width classes counted as two columns (Fullwidth, Wide, Ambiguous).
_DOUBLE_WIDTH_CLASSES = frozenset({"F", "W", "A"})

# Substrings identifying where a link was posted, checked in order.
_LINK_TYPES = (
    (("reddit.com", "redd.it"), "Reddit"),
    (("news.ycombinator.com",), "Hacker News"),
    (("huggingface.co",), "Hugging Face"),
    (("alphaxiv.org",), "alphaXiv"),
)


def _run_checked(argv: list[str]) -> None:
    """Run *argv* without a shell and fail loudly on a non-zero exit status.

    Raises:
        AssertionError: if the executable is missing or exits non-zero. The
            exception type matches the ``assert`` this replaces, but it is
            raised explicitly so the check survives ``python -O``.
    """
    try:
        result = subprocess.run(argv)
    except FileNotFoundError as err:
        raise AssertionError(f"command not found: {argv[0]}") from err
    if result.returncode != 0:
        raise AssertionError(f"{argv[0]} exited with status {result.returncode}")


def download_arxiv_pdf(arxiv_id: str, tmp_dir: str) -> str:
    """Download the PDF of *arxiv_id* into *tmp_dir* with aria2c.

    Returns:
        Path of the downloaded file, ``<tmp_dir>/<arxiv_id>.pdf``.

    Raises:
        AssertionError: if aria2c is unavailable or reports a failure.
    """
    pdf_name = f"{arxiv_id}.pdf"
    # An argv list needs no shell, so no quoting of the arguments is required.
    _run_checked(["aria2c", "-q", "-x5", "-k1M", "-d", tmp_dir, "-o", pdf_name, f"https://arxiv.org/pdf/{arxiv_id}.pdf"])
    return os.path.join(tmp_dir, pdf_name)


def pdf_to_png(pdf_filename: str) -> str:
    """Render the first page of *pdf_filename* to a 1200px-wide PNG with pdftoppm.

    Returns:
        Path of the image, ``<pdf_filename>.png`` (pdftoppm appends the suffix
        to the output prefix, which is the PDF path itself).

    Raises:
        AssertionError: if pdftoppm is unavailable or reports a failure.
    """
    _run_checked(["pdftoppm", "-q", "-png", "-singlefile", "-scale-to-x", "1200", "-scale-to-y", "-1", pdf_filename, pdf_filename])
    return f"{pdf_filename}.png"


def html_to_image(html: str, image_filename: str, quality: int = 94) -> str:
    """Render *html* to *image_filename* (1200px wide) via imgkit.

    Returns:
        *image_filename*, for call chaining.

    Raises:
        AssertionError: if imgkit does not report success.
    """
    result = imgkit.from_string(html, image_filename, options={"width": 1200, "quiet": "", "quality": quality})
    if result is not True:
        raise AssertionError(f"imgkit failed to render {image_filename}")
    return image_filename


def get_char_width(c: str) -> int:
    """Return 2 for Fullwidth/Wide/Ambiguous East Asian characters, else 1."""
    return 2 if unicodedata.east_asian_width(c) in _DOUBLE_WIDTH_CLASSES else 1


def len_tweet(text: str) -> int:
    """Return the display width of *text*, counting wide characters as 2."""
    return sum(map(get_char_width, text))


def strip_tweet(text: str, max_length=280, dots="...") -> str:
    """Shorten *text* so that its width (see ``len_tweet``) fits *max_length*.

    Text that already fits is returned unchanged. Otherwise it is cut at a
    character boundary and *dots* is appended, with room reserved for it.
    """
    if len_tweet(text) <= max_length:
        # Nothing to cut: do not spend part of the budget on the ellipsis.
        return text
    suffix = dots or ""
    budget = max_length - len_tweet(suffix)
    kept = []
    used = 0
    for c in text:
        width = get_char_width(c)
        if used + width > budget:
            break
        kept.append(c)
        used += width
    return "".join(kept) + suffix


def avoid_auto_link(text: str) -> str:
    """Replace each period with a one dot leader so clients do not auto-link the text.

    https://shkspr.mobi/blog/2015/01/how-to-stop-twitter-auto-linking-urls/
    """
    return text.replace(".", "․")


def strip(text: str, length: int) -> str:
    """Shorten *text* to at most *length* characters, ending with "..." when cut."""
    if len(text) <= length:
        return text
    if length < 3:
        # No room for the ellipsis; a negative slice bound would keep too much.
        return text[: max(length, 0)]
    return text[: length - 3] + "..."


def get_link_type(link: str) -> str:
    """Return the name of the site *link* points to, or "" when it is not recognised."""
    for needles, label in _LINK_TYPES:
        if any(needle in link for needle in needles):
            return label
    return ""
run jobs delete $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION --quiet 36 | gcloud beta run jobs create $RUN_JOB_NAME \ 37 | --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" \ 38 | --project=$PROJECT_ID \ 39 | --region=$REGION \ 40 | --service-account="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 41 | --set-secrets="TWITTER_BEARER_TOKEN=TWITTER_BEARER_TOKEN:1" \ 42 | --set-secrets="TWITTER_API_KEY=TWITTER_API_KEY:1" \ 43 | --set-secrets="TWITTER_API_KEY_SECRET=TWITTER_API_KEY_SECRET:1" \ 44 | --set-secrets="TWITTER_ACCESS_TOKEN=TWITTER_ACCESS_TOKEN:1" \ 45 | --set-secrets="TWITTER_ACCESS_TOKEN_SECRET=TWITTER_ACCESS_TOKEN_SECRET:1" \ 46 | --set-secrets="DEEPL_AUTH_KEY=DEEPL_AUTH_KEY:1" \ 47 | --set-secrets="SLACK_BOT_TOKEN=SLACK_BOT_TOKEN:1" \ 48 | --set-secrets="praw_client_id=praw_client_id:1" \ 49 | --set-secrets="praw_client_secret=praw_client_secret:1" \ 50 | --set-secrets="praw_user_agent=praw_user_agent:1" \ 51 | --set-secrets="ATP_IDENTIFIER=ATP_IDENTIFIER:1" \ 52 | --set-secrets="ATP_PASSWORD=ATP_PASSWORD:1" \ 53 | --set-env-vars="NOTIFY_TOP_N=${NOTIFY_TOP_N}" \ 54 | --set-env-vars="SLACK_CHANNEL=${SLACK_CHANNEL}" \ 55 | --set-env-vars="GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" \ 56 | --max-retries=0 \ 57 | --task-timeout="30m" \ 58 | --memory="1024Mi" 59 | gcloud beta run jobs list --project=$PROJECT_ID 60 | -------------------------------------------------------------------------------- /docker/deeplcache.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com> 2 | # SPDX-License-Identifier: MIT 3 | 4 | import gzip 5 | import json 6 | import os 7 | import tempfile 8 | from datetime import datetime, timedelta, timezone 9 | 10 | import deepl 11 | 12 | 13 | class DeepLCache: 14 | def __init__(self, translator: deepl.Translator): 15 | self.translator = translator 16 | self.cache: 
dict[str, tuple[list[str], str]] = {} 17 | 18 | def clear_cache(self, expire_timedelta: timedelta | None = None): 19 | if expire_timedelta is None: 20 | self.cache = {} 21 | return 22 | expire_dt = datetime.now(timezone.utc) - expire_timedelta 23 | 24 | def is_not_expire(item): # item is [arxiv_id, [texts, ts]] 25 | return datetime.fromisoformat(item[1][1]) > expire_dt 26 | 27 | self.cache = dict(filter(is_not_expire, self.cache.items())) 28 | 29 | def __repr__(self): 30 | return repr(self.cache) # TODO 31 | 32 | def load(self, filename: str): 33 | with gzip.open(filename, "rt", encoding="UTF-8") as f: 34 | self.cache = json.load(f) 35 | 36 | def save(self, filename: str): 37 | with gzip.open(filename, "wt", encoding="UTF-8") as f: 38 | json.dump(self.cache, f) 39 | 40 | def load_from_s3(self, s3_bucket, filename: str): 41 | with tempfile.TemporaryDirectory() as tmpdir: 42 | tmpfilename = os.path.join(tmpdir, filename) 43 | s3_bucket.download_file(filename, tmpfilename) 44 | self.load(tmpfilename) 45 | 46 | def save_to_s3(self, s3_bucket, filename: str): 47 | with tempfile.TemporaryDirectory() as tmpdir: 48 | tmpfilename = os.path.join(tmpdir, filename) 49 | self.save(tmpfilename) 50 | s3_bucket.upload_file(filename, tmpfilename) 51 | 52 | def load_from_gcs(self, gcs_bucket, filename: str): 53 | with tempfile.TemporaryDirectory() as tmpdir: 54 | tmpfilename = os.path.join(tmpdir, filename) 55 | gcs_bucket.blob(filename).download_to_filename(tmpfilename) 56 | self.load(tmpfilename) 57 | 58 | def save_to_gcs(self, gcs_bucket, filename: str): 59 | with tempfile.TemporaryDirectory() as tmpdir: 60 | tmpfilename = os.path.join(tmpdir, filename) 61 | self.save(tmpfilename) 62 | gcs_bucket.blob(filename).upload_from_filename(tmpfilename) 63 | 64 | def get(self, key: str, default=None): 65 | return self.cache.get(key, default) 66 | 67 | def translate_text(self, text: str | list[str], target_lang: str, key: str): 68 | trans = self.get(key, None) 69 | if trans is not None: 70 
| return trans 71 | result = self.translator.translate_text(text=text, target_lang=target_lang) 72 | trans_texts = [r.text for r in result] if type(result) is list else [result.text] if type(result) is deepl.TextResult else [] 73 | trans_ts = datetime.now(timezone.utc).isoformat() 74 | trans = (trans_texts, trans_ts) 75 | self.cache[key] = trans 76 | return trans 77 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | arxiv==2.1.3 ; python_version >= "3.11" and python_version < "4.0" 2 | beautifulsoup4==4.13.3 ; python_version >= "3.11" and python_version < "4.0" 3 | cachetools==5.5.1 ; python_version >= "3.11" and python_version < "4.0" 4 | certifi==2025.1.31 ; python_version >= "3.11" and python_version < "4.0" 5 | charset-normalizer==3.4.1 ; python_version >= "3.11" and python_version < "4.0" 6 | deepl==1.21.0 ; python_version >= "3.11" and python_version < "4.0" 7 | feedparser==6.0.11 ; python_version >= "3.11" and python_version < "4.0" 8 | google-api-core==2.24.1 ; python_version >= "3.11" and python_version < "4.0" 9 | google-auth==2.38.0 ; python_version >= "3.11" and python_version < "4.0" 10 | google-cloud-core==2.4.1 ; python_version >= "3.11" and python_version < "4.0" 11 | google-cloud-storage==3.0.0 ; python_version >= "3.11" and python_version < "4.0" 12 | google-crc32c==1.6.0 ; python_version >= "3.11" and python_version < "4.0" 13 | google-resumable-media==2.7.2 ; python_version >= "3.11" and python_version < "4.0" 14 | googleapis-common-protos==1.67.0 ; python_version >= "3.11" and python_version < "4.0" 15 | idna==3.10 ; python_version >= "3.11" and python_version < "4.0" 16 | imgkit==1.2.3 ; python_version >= "3.11" and python_version < "4.0" 17 | nanoatp==0.5.1 ; python_version >= "3.11" and python_version < "4.0" 18 | numpy==2.2.3 ; python_version >= "3.11" and python_version < "4.0" 19 | oauthlib==3.2.2 
; python_version >= "3.11" and python_version < "4.0" 20 | pandas==2.2.3 ; python_version >= "3.11" and python_version < "4.0" 21 | praw==7.8.1 ; python_version >= "3.11" and python_version < "4.0" 22 | prawcore==2.4.0 ; python_version >= "3.11" and python_version < "4.0" 23 | proto-plus==1.26.0 ; python_version >= "3.11" and python_version < "4.0" 24 | protobuf==5.29.3 ; python_version >= "3.11" and python_version < "4.0" 25 | pyasn1-modules==0.4.1 ; python_version >= "3.11" and python_version < "4.0" 26 | pyasn1==0.6.1 ; python_version >= "3.11" and python_version < "4.0" 27 | pysbd==0.3.4 ; python_version >= "3.11" and python_version < "4.0" 28 | python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "4.0" 29 | pytz==2025.1 ; python_version >= "3.11" and python_version < "4.0" 30 | requests-oauthlib==2.0.0 ; python_version >= "3.11" and python_version < "4.0" 31 | requests==2.32.3 ; python_version >= "3.11" and python_version < "4.0" 32 | rsa==4.9 ; python_version >= "3.11" and python_version < "4.0" 33 | sgmllib3k==1.0.0 ; python_version >= "3.11" and python_version < "4.0" 34 | six==1.17.0 ; python_version >= "3.11" and python_version < "4.0" 35 | slack-sdk==3.34.0 ; python_version >= "3.11" and python_version < "4.0" 36 | soupsieve==2.6 ; python_version >= "3.11" and python_version < "4.0" 37 | tld==0.13 ; python_version >= "3.11" and python_version < "4.0" 38 | tweepy==4.15.0 ; python_version >= "3.11" and python_version < "4.0" 39 | typing-extensions==4.12.2 ; python_version >= "3.11" and python_version < "4.0" 40 | tzdata==2025.1 ; python_version >= "3.11" and python_version < "4.0" 41 | update-checker==0.18.0 ; python_version >= "3.11" and python_version < "4.0" 42 | urllib3==2.3.0 ; python_version >= "3.11" and python_version < "4.0" 43 | websocket-client==1.8.0 ; python_version >= "3.11" and python_version < "4.0" 44 | -------------------------------------------------------------------------------- /docker/generatehtml.py: 
-------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com> 2 | # SPDX-License-Identifier: MIT 3 | 4 | import re 5 | from datetime import datetime, timedelta, timezone 6 | from html import escape 7 | from itertools import zip_longest 8 | 9 | import dateutil.parser 10 | import deeplcache 11 | import pandas as pd 12 | 13 | HTML_TRANS_TEMPLATE = """ 14 | 15 | 16 | 17 | 29 | 30 | 31 | {url} 32 |

33 | {title} 34 |

35 |

36 | {authors} 37 |

38 |
39 | {content} 40 |
41 | 42 | 43 | """ 44 | 45 | HTML_TRANS_ITEM_TEMPLATE = """ 46 |

47 | 48 | {translation} 49 | 50 |
51 | 52 | {source} 53 | 54 |

def generate_trans_html(arxiv_id: str, title: str, authors: list[str], trans_texts: list[str], summary_texts: list[str]):
    """Build the HTML page that shows a paper's translation next to its source text.

    Translated and source sentences are paired positionally; when one list is
    shorter, the missing side is rendered as an empty string. Title, authors
    and every sentence are HTML-escaped before being inserted.
    """
    rendered_items = []
    for translation, source in zip_longest(trans_texts, summary_texts, fillvalue=""):
        rendered_items.append(HTML_TRANS_ITEM_TEMPLATE.format(translation=escape(translation), source=escape(source)))
    return HTML_TRANS_TEMPLATE.format(
        title=escape(title),
        authors=escape(", ".join(authors)),
        url=f"https://arxiv.org/abs/{arxiv_id}",
        content="\n".join(rendered_items),
    )

86 | {title} 87 |

88 |
89 | {content} 90 |
91 | 92 | 93 | """ 94 | 95 | HTML_TOP_N_ITEM_TEMPLATE = """ 96 |

97 | [{i}/{n}] {title}
98 | {stats}, {categories}, {updated} 99 |

100 | """ 101 | 102 | 103 | def generate_top_n_html(page_title: str, date: str, df: pd.DataFrame, dlc: deeplcache.DeepLCache): 104 | page_title = escape(page_title) 105 | df = df[::-1] # normal order (reversed reversed order) 106 | items = [] 107 | twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23) 108 | for i, (arxiv_id, updated, title, primary_category, categories, score, num_comments, count) in enumerate(zip(df["arxiv_id"], df["updated"], df["title"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])): 109 | title = escape(title) 110 | trans = dlc.get(arxiv_id, None) 111 | if trans is None: 112 | continue 113 | _, trans_ts = trans 114 | if twenty_three_hours_ago < datetime.fromisoformat(trans_ts): 115 | title = f'[New] {title}' 116 | categories = " | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]) 117 | stats = f"{score} Likes, {num_comments} Comments, {count} Posts" 118 | updated = dateutil.parser.isoparse(updated).strftime("%d %b %Y") 119 | items.append(HTML_TOP_N_ITEM_TEMPLATE.format(i=(i + 1), n=len(df), title=title, stats=stats, categories=categories, updated=updated, arxiv_id=arxiv_id)) 120 | return HTML_TOP_N_TEMPLATE.format(title=page_title, date=date, content="\n".join(items)) 121 | -------------------------------------------------------------------------------- /docker/postslack.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com> 2 | # SPDX-License-Identifier: MIT 3 | 4 | import re 5 | import time 6 | from datetime import datetime, timedelta, timezone 7 | 8 | import dateutil.parser 9 | import deeplcache 10 | import pandas as pd 11 | import pysbd 12 | import slack_sdk 13 | import utils 14 | 15 | 16 | def post_to_slack_header(api: slack_sdk.WebClient, channel: str, df: pd.DataFrame): 17 | text = f"Top {len(df)} 
most popular arXiv papers in the last 30 days" 18 | blocks = [{"type": "header", "text": {"type": "plain_text", "text": text}}] 19 | return api.chat_postMessage(channel=channel, text=text, blocks=blocks) 20 | 21 | 22 | def generate_slack_title_blocks(df: pd.DataFrame, i: int, is_new: bool, title: str, score: int, num_comments: int, count: int, primary_category: str, categories: list[str], updated: str, first_summary: str): 23 | new_md = ":new: " if is_new else "" 24 | title_md = utils.strip(title, 200) 25 | stats_md = f"_*{score}* Likes, {num_comments} Comments, {count} Posts_" 26 | categories_md = utils.avoid_auto_link(" | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)])) 27 | updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y") 28 | return [{"type": "section", "text": {"type": "mrkdwn", "text": f"[{len(df) - i}/{len(df)}] {new_md}*{title_md}*\n{stats_md}, {categories_md}, {updated_md}\n{first_summary}"}}] 29 | 30 | 31 | def generate_slack_summary(dlc: deeplcache.DeepLCache, seg: pysbd.Segmenter, twenty_three_hours_ago: datetime, arxiv_id: str, summary: str): 32 | segs = seg.segment(summary.replace("\n", " ")[:2000]) 33 | summary_texts: list[str] = [str(seg) for seg in segs] if type(segs) is list else [segs] if type(segs) is str else [] 34 | first_summary = summary_texts[0][:200] # sometimes pysbd failed to split 35 | translation_md = None 36 | is_new = False 37 | trans = dlc.get(arxiv_id, None) 38 | if trans is not None: 39 | trans_texts, trans_ts = trans 40 | first_summary = trans_texts[0][:200] # sometimes pysbd failed to split 41 | is_new = True if twenty_three_hours_ago < datetime.fromisoformat(trans_ts) else False 42 | # assert len(summary_texts) == len(trans_texts) # this rarely happen 43 | if len(summary_texts) != len(trans_texts): 44 | print("different texts length", arxiv_id, len(summary_texts), len(trans_texts)) 45 | translation_md = "\n\n".join(trans_texts) 46 | translation_md = 
utils.strip(translation_md, 3000) # must be less than 3001 characters 47 | return is_new, first_summary, translation_md 48 | 49 | 50 | def post_to_slack_title(api: slack_sdk.WebClient, channel: str, dlc: deeplcache.DeepLCache, df: pd.DataFrame, seg: pysbd.Segmenter, twenty_three_hours_ago: datetime, i: int, arxiv_id: str, updated: str, title: str, summary: str, primary_category: str, categories: list[str], score: int, num_comments: int, count: int): 51 | is_new, first_summary, translation_md = generate_slack_summary(dlc, seg, twenty_three_hours_ago, arxiv_id, summary) 52 | blocks = generate_slack_title_blocks(df, i, is_new, title, score, num_comments, count, primary_category, categories, updated, first_summary) 53 | title_md = utils.strip(title, 200) 54 | response = api.chat_postMessage(channel=channel, text=title_md, blocks=blocks) 55 | return response, translation_md 56 | 57 | 58 | def post_to_slack_translation(api: slack_sdk.WebClient, channel: str, title: str, ts: str, translation_md: str): 59 | blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": translation_md}}] 60 | title_md = utils.strip(title, 200) 61 | return api.chat_postMessage(channel=channel, text=title_md, blocks=blocks, thread_ts=ts) 62 | 63 | 64 | def post_to_slack_authors(api: slack_sdk.WebClient, channel: str, title: str, ts: str, authors: list[str], comment: str, arxiv_id: str): 65 | authors_md = utils.strip(", ".join(authors), 1000) 66 | comment_md = f"\n\n*Comments*: {utils.strip(comment, 1000)}\n\n" if comment else "" 67 | abs_md = f"" 68 | pdf_md = f"" 69 | twitter_md = f"" 70 | reddit_md = f"" 71 | hackernews_md = f"" 72 | huggingface_md = f"" 73 | alphaxiv_md = f"" 74 | blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": f"*Links*: {abs_md}, {pdf_md}, {twitter_md}, {reddit_md}, {hackernews_md}, {huggingface_md}, {alphaxiv_md}\n\n*Authors*: {authors_md}{comment_md}"}}] 75 | title_md = utils.strip(title, 200) 76 | return api.chat_postMessage(channel=channel, 
text=title_md, blocks=blocks, thread_ts=ts) 77 | 78 | 79 | def post_to_slack_documents(api: slack_sdk.WebClient, channel: str, ts: str, df: pd.DataFrame): 80 | for i, (id, score, num_comments, created_at) in enumerate(zip(df["id"], df["score"], df["num_comments"], df["created_at"])): 81 | blocks = [] 82 | stats_md = f"_*{score}* Likes, {num_comments} Comments_" 83 | created_at_md = datetime.fromtimestamp(created_at).strftime("%d %b %Y") 84 | url_md = f"<{id}|{created_at_md}>" 85 | blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": f"({i + 1}/{len(df)}) {stats_md}, {url_md}\n"}}] 86 | api.chat_postMessage(channel=channel, text=url_md, thread_ts=ts, blocks=blocks) 87 | time.sleep(1) 88 | 89 | 90 | def post_to_slack(api: slack_sdk.WebClient, channel: str, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame): 91 | df = df[::-1] # reverse order 92 | post_to_slack_header(api, channel, df) 93 | time.sleep(1) 94 | seg = pysbd.Segmenter(language="en", clean=False) 95 | twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23) 96 | for i, (arxiv_id, updated, title, summary, authors, comment, primary_category, categories, score, num_comments, count) in enumerate(zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["comment"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])): 97 | response, translation_md = post_to_slack_title(api, channel, dlc, df, seg, twenty_three_hours_ago, i, arxiv_id, updated, title, summary, primary_category, categories, score, num_comments, count) 98 | time.sleep(1) 99 | ts = response["ts"] 100 | if not ts: 101 | continue 102 | if translation_md: 103 | post_to_slack_translation(api, channel, title, ts, translation_md) 104 | time.sleep(1) 105 | post_to_slack_authors(api, channel, title, ts, authors, comment, arxiv_id) 106 | time.sleep(1) 107 | top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in 
ids)].head(3) # TODO 108 | post_to_slack_documents(api, channel, ts, top_n_documents) 109 | print("post_to_slack: ", f"[{len(df) - i}/{len(df)}]") 110 | -------------------------------------------------------------------------------- /docker/posttwitter.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com> 2 | # SPDX-License-Identifier: MIT 3 | 4 | import os 5 | import re 6 | import tempfile 7 | import time 8 | from datetime import datetime, timedelta, timezone 9 | 10 | import dateutil.parser 11 | import deeplcache 12 | import generatehtml 13 | import pandas as pd 14 | import pysbd 15 | import tweepy 16 | import utils 17 | 18 | 19 | def upload_first_page_to_twitter(api_v1: tweepy.API, arxiv_id: str): 20 | with tempfile.TemporaryDirectory() as tmp_dir: 21 | pdf_filename = utils.download_arxiv_pdf(arxiv_id, tmp_dir) 22 | first_page_filename = utils.pdf_to_png(pdf_filename) 23 | if os.path.isfile(first_page_filename) and os.path.getsize(first_page_filename) > 0: 24 | media = api_v1.media_upload(first_page_filename) 25 | return media.media_id if media else None 26 | return None 27 | 28 | 29 | def generate_twitter_first_page(df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]): 30 | summary_text = " ".join(summary_texts) 31 | new_md = "🆕" if is_new else "" 32 | authors_md = ", ".join(authors) 33 | categories_md = utils.avoid_auto_link(" | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)])) 34 | stats_md = f"{score} Likes, {num_comments} Comments, {count} Posts" 35 | updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y") 36 | title_md = title 37 | abs_md = f"https://arxiv.org/abs/{arxiv_id}" 38 | text = f"[{len(df) - 
i}/{len(df)}] {stats_md}\n{abs_md} {categories_md}, {updated_md}\n\n{new_md}{title_md}\n\n{authors_md}" 39 | return text, summary_text 40 | 41 | 42 | def post_to_twitter_first_page(api_v1: tweepy.API, api_v2: tweepy.Client, df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]) -> str: 43 | text, summary_text = generate_twitter_first_page(df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories) 44 | media_ids = [] 45 | first_page_media_id = upload_first_page_to_twitter(api_v1, arxiv_id) 46 | if first_page_media_id: 47 | api_v1.create_media_metadata(first_page_media_id, utils.strip_tweet(summary_text, 1000)) 48 | media_ids.append(first_page_media_id) 49 | prev_tweet_id: str = "" 50 | try: 51 | response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, media_ids=media_ids if len(media_ids) > 0 else None) 52 | prev_tweet_id = response.data["id"] if type(response) is tweepy.Response and not response.errors else "" 53 | except Exception as e: 54 | print(e) 55 | return prev_tweet_id 56 | 57 | 58 | def post_to_twitter_link(api_v2: tweepy.Client, prev_tweet_id: str, arxiv_id: str, link_type: str) -> str: 59 | twitter_uri = f"https://x.com/search?q=arxiv.org%2Fabs%2F{arxiv_id}%20OR%20arxiv.org%2Fpdf%2F{arxiv_id}.pdf" 60 | reddit_uri = f"https://www.reddit.com/search/?q=%22{arxiv_id}%22&sort=top" 61 | hackernews_uri = f"https://hn.algolia.com/?query=%22{arxiv_id}%22&type=all" 62 | # the last uri will become a link card 63 | text = f"Twitter: {twitter_uri}" 64 | text = f"Twitter: {twitter_uri} \nReddit: {reddit_uri}" if link_type == "Reddit" else text 65 | text = f"Twitter: {twitter_uri} \nHacker News: {hackernews_uri}" if link_type == "Hacker News" else text 66 | try: 67 | response = 
api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, in_reply_to_tweet_id=prev_tweet_id) 68 | prev_tweet_id = response.data["id"] if type(response) is tweepy.Response and not response.errors else "" 69 | except Exception as e: 70 | print(e) 71 | return prev_tweet_id 72 | 73 | 74 | def post_to_twitter_tweets(api_v2: tweepy.Client, prev_tweet_id: str, document_df: pd.DataFrame) -> str: 75 | # df = document_df[::-1] # reverse order 76 | df = document_df 77 | for i, (id, score, num_comments, created_at) in enumerate(zip(df["id"], df["score"], df["num_comments"], df["created_at"])): 78 | stats_md = f"{score} Likes, {num_comments} Comments" 79 | created_at_md = datetime.fromtimestamp(created_at).strftime("%d %b %Y") 80 | link = utils.get_link_type(id) or id 81 | # index = len(df) - i # reverse order 82 | index = i + 1 83 | text = f"({index}/{len(df)}) {stats_md}, {created_at_md}, {link}\n{id}\n" 84 | try: 85 | response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, in_reply_to_tweet_id=prev_tweet_id) 86 | prev_tweet_id = response.data["id"] if type(response) is tweepy.Response and not response.errors else "" 87 | except Exception as e: 88 | print(e) 89 | time.sleep(1) 90 | return prev_tweet_id 91 | 92 | 93 | def upload_html_to_twitter(api_v1: tweepy.API, filename: str, html_text: str): 94 | with tempfile.TemporaryDirectory() as tmp_dir: 95 | abs_path = os.path.join(tmp_dir, filename) 96 | abs_path = utils.html_to_image(html_text, abs_path) 97 | if os.path.isfile(abs_path) and os.path.getsize(abs_path) > 0: 98 | media = api_v1.media_upload(abs_path) 99 | return media.media_id if media else None 100 | return None 101 | 102 | 103 | def post_to_twitter_ranking(api_v1: tweepy.API, api_v2: tweepy.Client, dlc: deeplcache.DeepLCache, df: pd.DataFrame): 104 | title = f"Top {len(df)} most popular arXiv papers in the last 30 days" 105 | date = datetime.now(timezone.utc).strftime("%d %b %Y") 106 | media_ids = [] 107 | html_text = 
def post_to_twitter(api_v1: tweepy.API, api_v2: tweepy.Client, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post the ranking image and one thread per newly translated paper.

    A paper is posted only when ``dlc`` holds a translation for it whose
    timestamp is younger than 23 hours. Each thread consists of: the first-page
    tweet, a search-links reply, up to three source-document replies and the
    translation reply. If the first-page tweet or the links reply fails, the
    rest of that thread is skipped.

    Args:
        api_v1: v1.1 API handle, used for media uploads.
        api_v2: v2 client, used for creating tweets.
        dlc: Translation cache; ``dlc.get(arxiv_id, None)`` yields
            ``(translated sentences, ISO timestamp)`` or ``None``.
        df: Ranked papers; iterated in reverse order.
        document_df: Source documents; its ``arxiv_id`` column holds a
            collection of ids per row.
    """
    df = df[::-1]  # reverse order
    twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
    seg = pysbd.Segmenter(language="en", clean=False)
    post_to_twitter_ranking(api_v1, api_v2, dlc, df)
    rows = zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])
    for i, (arxiv_id, updated, title, summary, authors, primary_category, categories, score, num_comments, count) in enumerate(rows):
        trans = dlc.get(arxiv_id, None)
        if trans is None:
            continue
        trans_texts, trans_ts = trans
        # only post new papers; checked before segmenting so stale papers cost nothing
        if not (twenty_three_hours_ago < datetime.fromisoformat(trans_ts)):
            continue
        segs = seg.segment(summary.replace("\n", " ")[:2000])
        summary_texts: list[str] = [str(s) for s in segs] if isinstance(segs, list) else [segs] if isinstance(segs, str) else []
        is_new = True
        prev_tweet_id = post_to_twitter_first_page(api_v1, api_v2, df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
        time.sleep(1)
        if not prev_tweet_id:
            continue
        top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)].head(3)  # TODO
        # Guard against an empty selection: .iloc[0] would raise and abort the whole run.
        # An empty link type makes post_to_twitter_link fall back to its Twitter-only text.
        link_type = utils.get_link_type(top_n_documents.iloc[0]["id"]) if len(top_n_documents) > 0 else ""
        prev_tweet_id = post_to_twitter_link(api_v2, prev_tweet_id, arxiv_id, link_type)
        time.sleep(1)
        if not prev_tweet_id:
            continue
        prev_tweet_id = post_to_twitter_tweets(api_v2, prev_tweet_id, top_n_documents)
        post_to_twitter_trans(api_v1, api_v2, prev_tweet_id, arxiv_id, title, authors, summary_texts, trans_texts)
        print("post_to_twitter: ", f"[{len(df) - i}/{len(df)}]")
        time.sleep(1)
def generate_facets(text: str, patterns: list[tuple[str, str]]):
    """Build link facets for the first occurrence of each pattern in ``text``.

    Facet indices are expressed as offsets into the UTF-8 encoding of ``text``
    (``byteStart`` inclusive, ``byteEnd`` exclusive), so non-ASCII characters
    before a match are counted by their encoded length.

    Args:
        text: The post text the facets refer to.
        patterns: ``(substring, uri)`` pairs. Substrings that are empty or do
            not occur in ``text`` are skipped.

    Returns:
        A list of ``app.bsky.richtext.facet`` dicts sorted by ``byteStart``.
    """
    # TODO: still naive: only the first occurrence of each pattern is linked
    facets: list[dict[str, Any]] = []
    for pattern, uri in patterns:
        if not pattern:
            continue  # a zero-width facet would link nothing
        start = text.find(pattern)
        if start == -1:
            continue
        # str.find returns a code-point index; convert it to a byte offset.
        byte_start = len(text[:start].encode("utf-8"))
        byte_end = byte_start + len(pattern.encode("utf-8"))
        facets.append(
            {
                "$type": "app.bsky.richtext.facet",
                "index": {"byteStart": byte_start, "byteEnd": byte_end},
                "features": [{"$type": "app.bsky.richtext.facet#link", "uri": uri}],
            }
        )
    facets.sort(key=lambda facet: facet["index"]["byteStart"])
    return facets
def post_to_bluesky_first_page(api: nanoatp.BskyAgent, df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
    """Create the root post of a paper thread.

    The post text comes from ``generate_bluesky_first_page``; the arXiv id
    inside it is linked to the paper's abstract page. The image returned by
    ``upload_first_page_to_bluesky`` is embedded when that call returns a
    non-empty result.

    Returns:
        Whatever ``api.post`` returns for the created record, or ``{}`` when
        posting raised.
    """
    text, summary_text = generate_bluesky_first_page(df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)

    uploaded = upload_first_page_to_bluesky(api, arxiv_id, summary_text)
    images = [uploaded] if uploaded else []

    text = f"{text}"
    facets = generate_facets(text, [(arxiv_id, f"https://arxiv.org/abs/{arxiv_id}")])
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "embed": {"$type": "app.bsky.embed.images", "images": images},
    }
    try:
        return api.post(record)
    except Exception as e:
        # Best effort: report and hand back an empty reference.
        print(e)
        return {}
def post_to_bluesky_posts(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], df: pd.DataFrame):
    """Reply to a thread with one post per source document.

    Every row of ``df`` (columns ``id``, ``score``, ``num_comments``,
    ``created_at``, ``title``, ``description``) becomes a reply carrying an
    external-link embed for the document URL. Each successful reply becomes the
    parent of the next one; a failed reply leaves the parent unchanged.

    Returns:
        The reference of the last successfully created reply, or the incoming
        ``parent_post`` if none was created.
    """
    total = len(df)
    columns = ("id", "score", "num_comments", "created_at", "title", "description")
    for position, (doc_url, score, num_comments, created_at, title, description) in enumerate(zip(*(df[c] for c in columns)), start=1):
        posted_on = datetime.fromtimestamp(created_at).strftime("%d %b %Y")
        link = utils.get_link_type(doc_url) or doc_url
        text = f"({position}/{total}) {score} Likes, {num_comments} Comments, {posted_on}, {link}"
        facets = generate_facets(text, [(link, doc_url)])
        external = generate_external(api, doc_url, title, utils.strip_tweet(description, 300))
        record = {
            "text": utils.strip_tweet(text, 300),
            "facets": facets,
            "reply": {"root": root_post, "parent": parent_post},
            "embed": {"$type": "app.bsky.embed.external", "external": external},
        }
        try:
            parent_post = api.post(record)
        except Exception as e:
            print(e)
        time.sleep(1)  # pause between consecutive posts
    return parent_post
def post_to_bluesky(api: nanoatp.BskyAgent, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post one thread per newly translated paper, then the ranking post.

    A paper is posted only when ``dlc`` holds a translation for it whose
    timestamp is younger than 23 hours. Each thread consists of: the first-page
    root post, up to three source-document replies, a links reply and the
    translation reply. If the root post cannot be created, the paper is skipped.

    Args:
        api: Logged-in Bluesky agent.
        dlc: Translation cache; ``dlc.get(arxiv_id, None)`` yields
            ``(translated sentences, ISO timestamp)`` or ``None``.
        df: Ranked papers; iterated in reverse order.
        document_df: Source documents; its ``arxiv_id`` column holds a
            collection of ids per row.

    Returns:
        The result of ``post_to_bluesky_ranking`` for the (reversed) ranking.
    """
    df = df[::-1]  # reverse order
    twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
    seg = pysbd.Segmenter(language="en", clean=False)
    rows = zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])
    for i, (arxiv_id, updated, title, summary, authors, primary_category, categories, score, num_comments, count) in enumerate(rows):
        trans = dlc.get(arxiv_id, None)
        if trans is None:
            continue
        trans_texts, trans_ts = trans
        # only post new papers
        if not (twenty_three_hours_ago < datetime.fromisoformat(trans_ts)):
            continue
        segs = seg.segment(summary.replace("\n", " ")[:2000])
        summary_texts: list[str] = [str(s) for s in segs] if isinstance(segs, list) else [segs] if isinstance(segs, str) else []
        is_new = True
        parent_post = post_to_bluesky_first_page(api, df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
        # post_to_bluesky_first_page returns {} (never None) on failure; without a
        # root reference the replies below would be sent with an empty root/parent.
        if not parent_post:
            continue
        root_post = parent_post
        time.sleep(1)
        top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)].head(3)  # TODO
        parent_post = post_to_bluesky_posts(api, root_post, parent_post, top_n_documents)
        parent_post = post_to_bluesky_link(api, root_post, parent_post, arxiv_id, title, summary_texts)
        time.sleep(1)
        post_to_bluesky_trans(api, root_post, parent_post, arxiv_id, title, authors, summary_texts, trans_texts)
        print("post_to_bluesky: ", f"[{len(df) - i}/{len(df)}]")
        time.sleep(1)
    return post_to_bluesky_ranking(api, dlc, df)
# https://info.arxiv.org/help/arxiv_identifier.html
ARXIV_URL_PATTERN = re.compile(r"https?://arxiv\.org/(abs|pdf)/([0-9]{4}\.[0-9]{4,5})(v[0-9]+)?(\.pdf)?")
ARXIV_ID_PATTERN = re.compile(r"([0-9]{4}\.[0-9]{4,5})(v[0-9]+)?")


def parse_arxiv_ids(text: str) -> list[str]:
    """Extract the arXiv ids referenced by arxiv.org abs/pdf URLs in ``text``.

    Version suffixes (``v2``) and a trailing ``.pdf`` are not part of the
    returned id. Backslashes are removed first because some inputs contain
    escaped URLs.

    Args:
        text: Free text or a URL. Empty or ``None`` input yields ``[]``.

    Returns:
        The distinct ids in order of first appearance.
    """
    if not text:
        return []
    text = text.replace("\\", "")  # TODO: some text includes 2 backslashes in urls
    # dict.fromkeys de-duplicates while keeping a stable, first-seen order;
    # list(set(...)) would vary between runs.
    return list(dict.fromkeys(m.group(2) for m in ARXIV_URL_PATTERN.finditer(text)))
def article_to_dict(article: dict):
    """Convert one Hugging Face daily-papers entry into a document record.

    API reference: https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Reads ``paper.id``, ``paper.submittedOnDailyAt`` (ISO 8601, a trailing
    ``Z`` is accepted), ``paper.upvotes``, ``numComments``, ``title`` and
    ``summary`` from ``article``.

    Returns:
        A dict with keys ``id``, ``score``, ``num_comments``, ``created_at``
        (integer epoch seconds), ``arxiv_id`` (single-element list), ``title``
        and ``description``.
    """
    paper = article["paper"]
    arxiv_id = paper["id"]
    # Rewrite the "Z" suffix as an explicit UTC offset before parsing.
    submitted_at = datetime.fromisoformat(paper["submittedOnDailyAt"].replace("Z", "+00:00"))
    record = {"id": f"https://huggingface.co/papers/{arxiv_id}"}
    record["score"] = paper["upvotes"]
    record["num_comments"] = article["numComments"]
    record["created_at"] = int(submitted_at.timestamp())
    record["arxiv_id"] = [arxiv_id]
    record["title"] = article["title"]
    record["description"] = article["summary"]
    return record
def search_huggingface(days=30, wait=1):
    """Collect Hugging Face daily-papers records for the last ``days`` days.

    API reference: https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Calls ``get_huggingface`` once per day, newest first, and merges the
    results.

    Args:
        days: Number of days to look back, starting today.
        wait: Passed through to ``get_huggingface`` for every request.

    Returns:
        A DataFrame with one row per distinct ``id`` (the last occurrence
        wins), re-indexed from 0. When nothing was fetched, an empty DataFrame
        without columns.
    """
    now = datetime.now()
    records = []
    for d in range(days):
        records.extend(get_huggingface((now - timedelta(days=d)).timestamp(), wait))
    df = pd.json_normalize(records)
    if df.empty:
        # No records means no "id" column; drop_duplicates(subset=["id"]) would raise KeyError.
        return df
    return df.drop_duplicates(subset=["id"], keep="last").reset_index(drop=True)
def search_alphaxiv(sort_by="Likes", interval="30+Days", page_size=10, limit=30, wait=1):
    """Collect trending-paper records from alphaXiv.

    Source page: https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    Requests ``ceil(limit / page_size)`` pages through ``get_alphaxiv``,
    starting at page 0, and merges the results.

    Args:
        sort_by: Sort key passed through to ``get_alphaxiv``.
        interval: Time window passed through to ``get_alphaxiv``.
        page_size: Number of papers requested per page.
        limit: Number of papers wanted; rounded up to whole pages.
        wait: Passed through to ``get_alphaxiv`` for every request.

    Returns:
        A DataFrame with one row per distinct ``id`` (the last occurrence
        wins), re-indexed from 0. When nothing was fetched, an empty DataFrame
        without columns.
    """
    num_pages = (limit + page_size - 1) // page_size  # ceiling division
    records = []
    for page_num in range(num_pages):
        records.extend(get_alphaxiv(sort_by=sort_by, interval=interval, page_size=page_size, page_num=page_num, wait=wait))
    df = pd.json_normalize(records)
    if df.empty:
        # No records means no "id" column; drop_duplicates(subset=["id"]) would raise KeyError.
        return df
    return df.drop_duplicates(subset=["id"], keep="last").reset_index(drop=True)
def arxiv_result_to_dict(r: arxiv.Result):
    """Flatten an ``arxiv.Result`` into a plain dict.

    The arXiv id and optional version suffix are parsed from ``r.entry_id``
    with ``ARXIV_URL_PATTERN``.

    Returns:
        A dict with ``arxiv_id`` (no version), ``arxiv_id_v`` (id plus version
        suffix when present), ``entry_id``, ``updated`` and ``published`` (as
        strings), ``title``, ``authors``, ``summary``, ``comment``,
        ``journal_ref``, ``doi``, ``primary_category``, ``categories``,
        ``links`` and ``pdf_url``.

    Raises:
        ValueError: If ``r.entry_id`` is not an arxiv.org abs/pdf URL.
    """
    m = ARXIV_URL_PATTERN.match(r.entry_id)
    if m is None:
        # Explicit check instead of assert, which is stripped under -O.
        raise ValueError(f"unexpected arXiv entry_id: {r.entry_id!r}")
    arxiv_id = m.group(2)
    # The version group is optional; an unversioned id must not raise TypeError.
    arxiv_id_v = arxiv_id + (m.group(3) or "")
    return {
        "arxiv_id": arxiv_id,
        "arxiv_id_v": arxiv_id_v,
        "entry_id": r.entry_id,
        "updated": str(r.updated),  # TODO
        "published": str(r.published),  # TODO
        "title": r.title,
        "authors": [str(a) for a in r.authors],
        "summary": r.summary,
        "comment": r.comment,
        "journal_ref": r.journal_ref,
        "doi": r.doi,
        "primary_category": r.primary_category,
        "categories": [str(c) for c in r.categories],
        "links": [str(link) for link in r.links],
        "pdf_url": r.pdf_url,
    }
def filter_df(df: pd.DataFrame, top_n=10, days=365, count=1, num_comments=0):
    """Select the first ``top_n`` papers that pass the popularity and age filters.

    Args:
        df: Papers with ``count``, ``num_comments`` and ``published`` columns;
            ``published`` is compared as a string against a ``YYYY-MM-DD``
            cutoff.
        top_n: Maximum number of rows to return.
        days: Keep papers whose ``published`` value sorts after the date
            ``days`` days before now (UTC).
        count: Minimum value of the ``count`` column.
        num_comments: Minimum value of the ``num_comments`` column.

    Returns:
        The surviving rows in their original order, re-indexed from 0.
    """
    cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
    selected = df.loc[df["count"] >= count]
    selected = selected.loc[selected["num_comments"] >= num_comments]
    selected = selected.loc[selected["published"] > cutoff]
    return selected.head(top_n).reset_index(drop=True)
def translate_arxiv(dlc: deeplcache.DeepLCache, df: pd.DataFrame, target_lang: str):
    """Translate the summary of every paper in ``df`` through the DeepL cache.

    Each summary is flattened to one line, cut to 2000 characters and split
    into sentences before being handed to ``dlc.translate_text`` together with
    the paper's arXiv id. Cache size and translator usage are printed before
    and after the run, plus one progress line per paper.

    Returns:
        The same ``dlc`` object that was passed in.
    """

    def total_chars(texts):
        # Combined length of all segments, used only for the progress line.
        return sum([len(t) for t in texts])

    segmenter = pysbd.Segmenter(language="en", clean=False)
    print("translate_arxiv: before: ", len(dlc.cache))
    print(dlc.translator.get_usage())
    for arxiv_id, summary in zip(df["arxiv_id"], df["summary"]):
        flattened = summary.replace("\n", " ")[:2000]
        summary_texts = segmenter.segment(flattened)
        trans_texts, trans_ts = dlc.translate_text(summary_texts, target_lang, arxiv_id)
        print("translate_arxiv: ", arxiv_id, total_chars(summary_texts), total_chars(trans_texts), trans_ts)
    print("translate_arxiv: after: ", len(dlc.cache))
    print(dlc.translator.get_usage())
    return dlc
os.getenv("SLACK_CHANNEL") 323 | tweepy_api_v2 = tweepy.Client(bearer_token=os.getenv("TWITTER_BEARER_TOKEN"), consumer_key=os.getenv("TWITTER_API_KEY"), consumer_secret=os.getenv("TWITTER_API_KEY_SECRET"), access_token=os.getenv("TWITTER_ACCESS_TOKEN"), access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), wait_on_rate_limit=True) 324 | # because media_upload is only available on api v1. 325 | tweepy_api_v1 = tweepy.API(tweepy.OAuth1UserHandler(consumer_key=os.getenv("TWITTER_API_KEY"), consumer_secret=os.getenv("TWITTER_API_KEY_SECRET"), access_token=os.getenv("TWITTER_ACCESS_TOKEN"), access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET")), wait_on_rate_limit=True) 326 | bluesky_api = nanoatp.BskyAgent() 327 | bluesky_api.login(os.getenv("ATP_IDENTIFIER"), os.getenv("ATP_PASSWORD")) # type: ignore 328 | 329 | # search reddit and measure popularity 330 | paper_df, document_df = summarize(query, time_filter=summarize_time_filter, days=summarize_days, limit=summarize_limit) 331 | 332 | # filter by days 333 | filtered_df = filter_df(paper_df, top_n=notify_top_n, days=filter_days, count=filter_count, num_comments=filter_num_comments) 334 | print("filtered_df: ", len(filtered_df)) 335 | 336 | # translate summary text 337 | dlc = deeplcache.DeepLCache(deepl_api) 338 | try: 339 | dlc.load_from_gcs(gcs_bucket, "deepl_cache.json.gz") 340 | except Exception as e: 341 | print(e) 342 | dlc = translate_arxiv(dlc, filtered_df, deepl_target_lang) 343 | dlc.clear_cache(expire_timedelta=timedelta(days=deepl_expire_days)) 344 | dlc.save_to_gcs(gcs_bucket, "deepl_cache.json.gz") 345 | 346 | # post 347 | try: 348 | postslack.post_to_slack(slack_api, slack_channel, dlc, filtered_df, document_df) 349 | except Exception as e: 350 | print(e) 351 | 352 | try: 353 | postbluesky.post_to_bluesky(bluesky_api, dlc, filtered_df, document_df) 354 | except Exception as e: 355 | print(e) 356 | 357 | try: 358 | posttwitter.post_to_twitter(tweepy_api_v1, tweepy_api_v2, dlc, 
filtered_df, document_df) 359 | except Exception as e: 360 | print(e) 361 | 362 | 363 | if __name__ == "__main__": 364 | main() 365 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # arXiv Reddit Summary 2 | 3 | Summarize the top 30 most popular arXiv papers on Reddit, Hacker News and Hugging Face in the last 30 days, and post them to Slack, Twitter and Bluesky. 4 | 5 | ## Demo 6 | 7 | - https://x.com/susumuota 8 | - https://bsky.app/profile/paper.bsky.social 9 | 10 | ## Google Cloud Run 11 | 12 | This system is running on Google Cloud Run jobs. 13 | 14 | - https://cloud.google.com/build/docs/build-push-docker-image 15 | - https://cloud.google.com/run/docs/create-jobs#command-line 16 | - https://cloud.google.com/scheduler/docs/creating#gcloud 17 | 18 | ## Create a project 19 | 20 | - https://cloud.google.com/resource-manager/docs/creating-managing-projects#creating_a_project 21 | 22 | ```sh 23 | export PROJECT_ID="arxiv-summary-1" 24 | gcloud projects create $PROJECT_ID 25 | gcloud projects list 26 | # gcloud projects delete $PROJECT_ID 27 | # unset PROJECT_ID 28 | ``` 29 | 30 | ## Enable billing 31 | 32 | Follow the instructions linked below. As far as I know there is no way to enable billing from the command line. 33 | 34 | - https://cloud.google.com/billing/docs/how-to/modify-project#how-to-enable-billing 35 | - https://console.cloud.google.com/billing/projects 36 | 37 | Then confirm it. 38 | 39 | ```sh 40 | gcloud beta billing projects describe $PROJECT_ID 41 | ``` 42 | 43 | It should show `billingEnabled: true`.
44 | 45 | ## Create a bucket 46 | 47 | ```sh 48 | export GCS_BUCKET_NAME="arxiv-summary" 49 | export REGION="us-central1" 50 | gcloud storage buckets create "gs://${GCS_BUCKET_NAME}" \ 51 | --project=$PROJECT_ID \ 52 | --location=$REGION \ 53 | --public-access-prevention \ 54 | --uniform-bucket-level-access 55 | gcloud storage buckets list --project=$PROJECT_ID | grep name 56 | # gcloud storage buckets delete "gs://${GCS_BUCKET_NAME}" --project=$PROJECT_ID 57 | # unset GCS_BUCKET_NAME 58 | ``` 59 | 60 | ## Build a Docker image on local machine and test it 61 | 62 | - https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login 63 | - https://stackoverflow.com/a/50826145 64 | 65 | Application settings. 66 | 67 | ```sh 68 | # export TWITTER_BEARER_TOKEN="secret info" 69 | # export TWITTER_API_KEY="secret info" 70 | # export TWITTER_API_KEY_SECRET="secret info" 71 | # export TWITTER_ACCESS_TOKEN="secret info" 72 | # export TWITTER_ACCESS_TOKEN_SECRET="secret info" 73 | # export DEEPL_AUTH_KEY="secret info" 74 | # export SLACK_BOT_TOKEN="secret info" 75 | # export praw_client_id="secret info" 76 | # export praw_client_secret="secret info" 77 | # export praw_user_agent="secret info" 78 | # export ATP_IDENTIFIER="secret info" 79 | # export ATP_PASSWORD="secret info" 80 | 81 | export NOTIFY_TOP_N="30" # 30 on production env 82 | export SLACK_CHANNEL="#test" # #anywhere on production env 83 | ``` 84 | 85 | Local test. 86 | 87 | ```sh 88 | poetry export -f requirements.txt --without-hashes -o docker/requirements.txt 89 | ``` 90 | 91 | ```sh 92 | gcloud auth application-default login 93 | cd docker 94 | export IMAGE_NAME="arxiv-reddit-summary" 95 | docker build -t $IMAGE_NAME . 
96 | docker run --rm \ 97 | -e TWITTER_BEARER_TOKEN=$TWITTER_BEARER_TOKEN \ 98 | -e TWITTER_API_KEY=$TWITTER_API_KEY \ 99 | -e TWITTER_API_KEY_SECRET=$TWITTER_API_KEY_SECRET \ 100 | -e TWITTER_ACCESS_TOKEN=$TWITTER_ACCESS_TOKEN \ 101 | -e TWITTER_ACCESS_TOKEN_SECRET=$TWITTER_ACCESS_TOKEN_SECRET \ 102 | -e DEEPL_AUTH_KEY=$DEEPL_AUTH_KEY \ 103 | -e SLACK_BOT_TOKEN=$SLACK_BOT_TOKEN \ 104 | -e praw_client_id=$praw_client_id \ 105 | -e praw_client_secret=$praw_client_secret \ 106 | -e praw_user_agent=$praw_user_agent \ 107 | -e ATP_IDENTIFIER=$ATP_IDENTIFIER \ 108 | -e ATP_PASSWORD=$ATP_PASSWORD \ 109 | -e NOTIFY_TOP_N=$NOTIFY_TOP_N \ 110 | -e SLACK_CHANNEL=$SLACK_CHANNEL \ 111 | -e GCS_BUCKET_NAME=$GCS_BUCKET_NAME \ 112 | -e GCLOUD_PROJECT=$PROJECT_ID \ 113 | -v $HOME/.config/gcloud:/root/.config/gcloud \ 114 | $IMAGE_NAME 115 | docker images 116 | # docker rmi $IMAGE_NAME 117 | # unset IMAGE_NAME 118 | ``` 119 | 120 | ## Create a service account for Cloud Run 121 | 122 | ```sh 123 | export RUN_SERVICE_ACCOUNT="run-sa" 124 | gcloud iam service-accounts create $RUN_SERVICE_ACCOUNT --project=$PROJECT_ID 125 | gcloud iam service-accounts list --project=$PROJECT_ID 126 | # gcloud iam service-accounts delete "${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" --project=$PROJECT_ID 127 | # unset RUN_SERVICE_ACCOUNT 128 | ``` 129 | 130 | ## Add roles to service account to access GCS and to invoke Cloud Run 131 | 132 | - https://cloud.google.com/storage/docs/access-control/iam-roles 133 | - https://cloud.google.com/scheduler/docs/creating#gcloud 134 | - https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating 135 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#4 136 | 137 | ```sh 138 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 139 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 140 | --role="roles/storage.objectAdmin" 141 | gcloud projects add-iam-policy-binding 
$PROJECT_ID \ 142 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 143 | --role="roles/run.invoker" 144 | gcloud projects get-iam-policy $PROJECT_ID 145 | # gcloud projects remove-iam-policy-binding $PROJECT_ID \ 146 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 147 | # --role="roles/storage.objectAdmin" 148 | # gcloud projects remove-iam-policy-binding $PROJECT_ID \ 149 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 150 | # --role="roles/run.invoker" 151 | ``` 152 | 153 | ## Create secret data 154 | 155 | - https://cloud.google.com/secret-manager/docs/create-secret#secretmanager-quickstart-gcloud 156 | - https://cloud.google.com/run/docs/configuring/secrets 157 | 158 | ```sh 159 | gcloud services enable secretmanager.googleapis.com --project=$PROJECT_ID 160 | echo -n $TWITTER_BEARER_TOKEN | gcloud secrets create "TWITTER_BEARER_TOKEN" \ 161 | --project=$PROJECT_ID \ 162 | --replication-policy="automatic" \ 163 | --data-file=- 164 | echo -n $TWITTER_API_KEY | gcloud secrets create "TWITTER_API_KEY" \ 165 | --project=$PROJECT_ID \ 166 | --replication-policy="automatic" \ 167 | --data-file=- 168 | echo -n $TWITTER_API_KEY_SECRET | gcloud secrets create "TWITTER_API_KEY_SECRET" \ 169 | --project=$PROJECT_ID \ 170 | --replication-policy="automatic" \ 171 | --data-file=- 172 | echo -n $TWITTER_ACCESS_TOKEN | gcloud secrets create "TWITTER_ACCESS_TOKEN" \ 173 | --project=$PROJECT_ID \ 174 | --replication-policy="automatic" \ 175 | --data-file=- 176 | echo -n $TWITTER_ACCESS_TOKEN_SECRET | gcloud secrets create "TWITTER_ACCESS_TOKEN_SECRET" \ 177 | --project=$PROJECT_ID \ 178 | --replication-policy="automatic" \ 179 | --data-file=- 180 | echo -n $DEEPL_AUTH_KEY | gcloud secrets create "DEEPL_AUTH_KEY" \ 181 | --project=$PROJECT_ID \ 182 | --replication-policy="automatic" \ 183 | --data-file=- 184 | echo -n $SLACK_BOT_TOKEN | gcloud secrets 
create "SLACK_BOT_TOKEN" \ 185 | --project=$PROJECT_ID \ 186 | --replication-policy="automatic" \ 187 | --data-file=- 188 | echo -n $praw_client_id | gcloud secrets create "praw_client_id" \ 189 | --project=$PROJECT_ID \ 190 | --replication-policy="automatic" \ 191 | --data-file=- 192 | echo -n $praw_client_secret | gcloud secrets create "praw_client_secret" \ 193 | --project=$PROJECT_ID \ 194 | --replication-policy="automatic" \ 195 | --data-file=- 196 | echo -n $praw_user_agent | gcloud secrets create "praw_user_agent" \ 197 | --project=$PROJECT_ID \ 198 | --replication-policy="automatic" \ 199 | --data-file=- 200 | echo -n $ATP_IDENTIFIER | gcloud secrets create "ATP_IDENTIFIER" \ 201 | --project=$PROJECT_ID \ 202 | --replication-policy="automatic" \ 203 | --data-file=- 204 | echo -n $ATP_PASSWORD | gcloud secrets create "ATP_PASSWORD" \ 205 | --project=$PROJECT_ID \ 206 | --replication-policy="automatic" \ 207 | --data-file=- 208 | gcloud secrets list --project=$PROJECT_ID 209 | gcloud secrets versions access 1 --secret="TWITTER_BEARER_TOKEN" --project=$PROJECT_ID 210 | gcloud secrets versions access 1 --secret="TWITTER_API_KEY" --project=$PROJECT_ID 211 | gcloud secrets versions access 1 --secret="TWITTER_API_KEY_SECRET" --project=$PROJECT_ID 212 | gcloud secrets versions access 1 --secret="TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID 213 | gcloud secrets versions access 1 --secret="TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID 214 | gcloud secrets versions access 1 --secret="DEEPL_AUTH_KEY" --project=$PROJECT_ID 215 | gcloud secrets versions access 1 --secret="SLACK_BOT_TOKEN" --project=$PROJECT_ID 216 | gcloud secrets versions access 1 --secret="praw_client_id" --project=$PROJECT_ID 217 | gcloud secrets versions access 1 --secret="praw_client_secret" --project=$PROJECT_ID 218 | gcloud secrets versions access 1 --secret="praw_user_agent" --project=$PROJECT_ID 219 | gcloud secrets versions access 1 --secret="ATP_IDENTIFIER" --project=$PROJECT_ID 220 | 
gcloud secrets versions access 1 --secret="ATP_PASSWORD" --project=$PROJECT_ID 221 | # gcloud secrets delete "TWITTER_BEARER_TOKEN" --project=$PROJECT_ID 222 | # gcloud secrets delete "TWITTER_API_KEY" --project=$PROJECT_ID 223 | # gcloud secrets delete "TWITTER_API_KEY_SECRET" --project=$PROJECT_ID 224 | # gcloud secrets delete "TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID 225 | # gcloud secrets delete "TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID 226 | # gcloud secrets delete "DEEPL_AUTH_KEY" --project=$PROJECT_ID 227 | # gcloud secrets delete "SLACK_BOT_TOKEN" --project=$PROJECT_ID 228 | # gcloud secrets delete "praw_client_id" --project=$PROJECT_ID 229 | # gcloud secrets delete "praw_client_secret" --project=$PROJECT_ID 230 | # gcloud secrets delete "praw_user_agent" --project=$PROJECT_ID 231 | # gcloud secrets delete "ATP_IDENTIFIER" --project=$PROJECT_ID 232 | # gcloud secrets delete "ATP_PASSWORD" --project=$PROJECT_ID 233 | # gcloud services disable secretmanager.googleapis.com --project=$PROJECT_ID 234 | ``` 235 | 236 | ## Add roles to secrets to be accessed by service account 237 | 238 | - https://cloud.google.com/secret-manager/docs/managing-secrets#secretmanager-create-secret-gcloud 239 | 240 | ```sh 241 | gcloud secrets add-iam-policy-binding "TWITTER_BEARER_TOKEN" \ 242 | --project=$PROJECT_ID \ 243 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 244 | --role="roles/secretmanager.secretAccessor" 245 | gcloud secrets add-iam-policy-binding "TWITTER_API_KEY" \ 246 | --project=$PROJECT_ID \ 247 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 248 | --role="roles/secretmanager.secretAccessor" 249 | gcloud secrets add-iam-policy-binding "TWITTER_API_KEY_SECRET" \ 250 | --project=$PROJECT_ID \ 251 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 252 | --role="roles/secretmanager.secretAccessor" 253 | gcloud secrets 
add-iam-policy-binding "TWITTER_ACCESS_TOKEN" \ 254 | --project=$PROJECT_ID \ 255 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 256 | --role="roles/secretmanager.secretAccessor" 257 | gcloud secrets add-iam-policy-binding "TWITTER_ACCESS_TOKEN_SECRET" \ 258 | --project=$PROJECT_ID \ 259 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 260 | --role="roles/secretmanager.secretAccessor" 261 | gcloud secrets add-iam-policy-binding "DEEPL_AUTH_KEY" \ 262 | --project=$PROJECT_ID \ 263 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 264 | --role="roles/secretmanager.secretAccessor" 265 | gcloud secrets add-iam-policy-binding "SLACK_BOT_TOKEN" \ 266 | --project=$PROJECT_ID \ 267 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 268 | --role="roles/secretmanager.secretAccessor" 269 | gcloud secrets add-iam-policy-binding "praw_client_id" \ 270 | --project=$PROJECT_ID \ 271 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 272 | --role="roles/secretmanager.secretAccessor" 273 | gcloud secrets add-iam-policy-binding "praw_client_secret" \ 274 | --project=$PROJECT_ID \ 275 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 276 | --role="roles/secretmanager.secretAccessor" 277 | gcloud secrets add-iam-policy-binding "praw_user_agent" \ 278 | --project=$PROJECT_ID \ 279 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 280 | --role="roles/secretmanager.secretAccessor" 281 | gcloud secrets add-iam-policy-binding "ATP_IDENTIFIER" \ 282 | --project=$PROJECT_ID \ 283 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 284 | --role="roles/secretmanager.secretAccessor" 285 | gcloud secrets add-iam-policy-binding "ATP_PASSWORD" \ 286 | --project=$PROJECT_ID \ 
287 | --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 288 | --role="roles/secretmanager.secretAccessor" 289 | gcloud secrets get-iam-policy "TWITTER_BEARER_TOKEN" --project=$PROJECT_ID 290 | gcloud secrets get-iam-policy "TWITTER_API_KEY" --project=$PROJECT_ID 291 | gcloud secrets get-iam-policy "TWITTER_API_KEY_SECRET" --project=$PROJECT_ID 292 | gcloud secrets get-iam-policy "TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID 293 | gcloud secrets get-iam-policy "TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID 294 | gcloud secrets get-iam-policy "DEEPL_AUTH_KEY" --project=$PROJECT_ID 295 | gcloud secrets get-iam-policy "SLACK_BOT_TOKEN" --project=$PROJECT_ID 296 | gcloud secrets get-iam-policy "praw_client_id" --project=$PROJECT_ID 297 | gcloud secrets get-iam-policy "praw_client_secret" --project=$PROJECT_ID 298 | gcloud secrets get-iam-policy "praw_user_agent" --project=$PROJECT_ID 299 | gcloud secrets get-iam-policy "ATP_IDENTIFIER" --project=$PROJECT_ID 300 | gcloud secrets get-iam-policy "ATP_PASSWORD" --project=$PROJECT_ID 301 | # gcloud secrets remove-iam-policy-binding "TWITTER_BEARER_TOKEN" \ 302 | # --project=$PROJECT_ID \ 303 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 304 | # --role="roles/secretmanager.secretAccessor" 305 | # gcloud secrets remove-iam-policy-binding "TWITTER_API_KEY" \ 306 | # --project=$PROJECT_ID \ 307 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 308 | # --role="roles/secretmanager.secretAccessor" 309 | # gcloud secrets remove-iam-policy-binding "TWITTER_API_KEY_SECRET" \ 310 | # --project=$PROJECT_ID \ 311 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 312 | # --role="roles/secretmanager.secretAccessor" 313 | # gcloud secrets remove-iam-policy-binding "TWITTER_ACCESS_TOKEN" \ 314 | # --project=$PROJECT_ID \ 315 | # 
--member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 316 | # --role="roles/secretmanager.secretAccessor" 317 | # gcloud secrets remove-iam-policy-binding "TWITTER_ACCESS_TOKEN_SECRET" \ 318 | # --project=$PROJECT_ID \ 319 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 320 | # --role="roles/secretmanager.secretAccessor" 321 | # gcloud secrets remove-iam-policy-binding "DEEPL_AUTH_KEY" \ 322 | # --project=$PROJECT_ID \ 323 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 324 | # --role="roles/secretmanager.secretAccessor" 325 | # gcloud secrets remove-iam-policy-binding "SLACK_BOT_TOKEN" \ 326 | # --project=$PROJECT_ID \ 327 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 328 | # --role="roles/secretmanager.secretAccessor" 329 | # gcloud secrets remove-iam-policy-binding "praw_client_id" \ 330 | # --project=$PROJECT_ID \ 331 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 332 | # --role="roles/secretmanager.secretAccessor" 333 | # gcloud secrets remove-iam-policy-binding "praw_client_secret" \ 334 | # --project=$PROJECT_ID \ 335 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 336 | # --role="roles/secretmanager.secretAccessor" 337 | # gcloud secrets remove-iam-policy-binding "praw_user_agent" \ 338 | # --project=$PROJECT_ID \ 339 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 340 | # --role="roles/secretmanager.secretAccessor" 341 | # gcloud secrets remove-iam-policy-binding "ATP_IDENTIFIER" \ 342 | # --project=$PROJECT_ID \ 343 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 344 | # --role="roles/secretmanager.secretAccessor" 345 | # gcloud secrets remove-iam-policy-binding "ATP_PASSWORD" \ 346 | # --project=$PROJECT_ID 
\ 347 | # --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 348 | # --role="roles/secretmanager.secretAccessor" 349 | ``` 350 | 351 | ## Create a Docker repository 352 | 353 | - https://cloud.google.com/build/docs/build-push-docker-image 354 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#3 355 | 356 | ```sh 357 | gcloud services enable artifactregistry.googleapis.com --project=$PROJECT_ID 358 | export REPOSITORY="arxiv-reddit-summary" 359 | gcloud artifacts repositories create $REPOSITORY \ 360 | --project=$PROJECT_ID \ 361 | --repository-format="docker" \ 362 | --location=$REGION 363 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION 364 | # gcloud artifacts repositories delete $REPOSITORY --project=$PROJECT_ID --location=$REGION 365 | # gcloud services disable artifactregistry.googleapis.com --project=$PROJECT_ID 366 | # unset REPOSITORY REGION 367 | ``` 368 | 369 | ## Build a Docker image 370 | 371 | - https://cloud.google.com/build/docs/build-push-docker-image 372 | - https://cloud.google.com/build/docs/building/build-containers#use-dockerfile 373 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#3 374 | 375 | ```sh 376 | gcloud services enable cloudbuild.googleapis.com --project=$PROJECT_ID 377 | export TAG_NAME="latest" 378 | gcloud builds submit \ 379 | --project=$PROJECT_ID \ 380 | --region=$REGION \ 381 | --tag="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" 382 | gcloud builds list --project=$PROJECT_ID --region=$REGION 383 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION 384 | # gcloud services disable cloudbuild.googleapis.com --project=$PROJECT_ID 385 | # unset TAG_NAME 386 | ``` 387 | 388 | ## Test a Docker image on local machine 389 | 390 | **This process may incur additional charges because of data transfer.** 391 | 392 | - 
https://cloud.google.com/build/docs/building/build-containers#run_the_docker_image 393 | - https://cloud.google.com/artifact-registry/pricing 394 | - https://support.terra.bio/hc/en-us/articles/4408985788187-How-to-configure-GCR-Artifact-Registry-to-prevent-egress-charges 395 | 396 | ```sh 397 | gcloud auth configure-docker ${REGION}-docker.pkg.dev 398 | docker run --rm \ 399 | -e TWITTER_BEARER_TOKEN=$TWITTER_BEARER_TOKEN \ 400 | -e TWITTER_API_KEY=$TWITTER_API_KEY \ 401 | -e TWITTER_API_KEY_SECRET=$TWITTER_API_KEY_SECRET \ 402 | -e TWITTER_ACCESS_TOKEN=$TWITTER_ACCESS_TOKEN \ 403 | -e TWITTER_ACCESS_TOKEN_SECRET=$TWITTER_ACCESS_TOKEN_SECRET \ 404 | -e DEEPL_AUTH_KEY=$DEEPL_AUTH_KEY \ 405 | -e SLACK_BOT_TOKEN=$SLACK_BOT_TOKEN \ 406 | -e praw_client_id=$praw_client_id \ 407 | -e praw_client_secret=$praw_client_secret \ 408 | -e praw_user_agent=$praw_user_agent \ 409 | -e ATP_IDENTIFIER=$ATP_IDENTIFIER \ 410 | -e ATP_PASSWORD=$ATP_PASSWORD \ 411 | -e NOTIFY_TOP_N=$NOTIFY_TOP_N \ 412 | -e SLACK_CHANNEL=$SLACK_CHANNEL \ 413 | -e GCS_BUCKET_NAME=$GCS_BUCKET_NAME \ 414 | -e GCLOUD_PROJECT=$PROJECT_ID \ 415 | -v $HOME/.config/gcloud:/root/.config/gcloud \ 416 | "${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" 417 | docker images 418 | # docker rmi "${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" 419 | ``` 420 | 421 | ## Create a Cloud Run job 422 | 423 | - https://cloud.google.com/run/docs/create-jobs#command-line 424 | 425 | Change parameters for production env. 
426 | 427 | ```sh 428 | export NOTIFY_TOP_N="30" # 10 on development env 429 | export SLACK_CHANNEL="#test" # #test on development env 430 | ``` 431 | 432 | ```sh 433 | gcloud services enable run.googleapis.com --project=$PROJECT_ID 434 | export RUN_JOB_NAME="arxiv-reddit-summary-job-1" 435 | gcloud beta run jobs create $RUN_JOB_NAME \ 436 | --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" \ 437 | --project=$PROJECT_ID \ 438 | --region=$REGION \ 439 | --service-account="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \ 440 | --set-secrets="TWITTER_BEARER_TOKEN=TWITTER_BEARER_TOKEN:1" \ 441 | --set-secrets="TWITTER_API_KEY=TWITTER_API_KEY:1" \ 442 | --set-secrets="TWITTER_API_KEY_SECRET=TWITTER_API_KEY_SECRET:1" \ 443 | --set-secrets="TWITTER_ACCESS_TOKEN=TWITTER_ACCESS_TOKEN:1" \ 444 | --set-secrets="TWITTER_ACCESS_TOKEN_SECRET=TWITTER_ACCESS_TOKEN_SECRET:1" \ 445 | --set-secrets="DEEPL_AUTH_KEY=DEEPL_AUTH_KEY:1" \ 446 | --set-secrets="SLACK_BOT_TOKEN=SLACK_BOT_TOKEN:1" \ 447 | --set-secrets="praw_client_id=praw_client_id:1" \ 448 | --set-secrets="praw_client_secret=praw_client_secret:1" \ 449 | --set-secrets="praw_user_agent=praw_user_agent:1" \ 450 | --set-secrets="ATP_IDENTIFIER=ATP_IDENTIFIER:1" \ 451 | --set-secrets="ATP_PASSWORD=ATP_PASSWORD:1" \ 452 | --set-env-vars="NOTIFY_TOP_N=${NOTIFY_TOP_N}" \ 453 | --set-env-vars="SLACK_CHANNEL=${SLACK_CHANNEL}" \ 454 | --set-env-vars="GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" \ 455 | --max-retries=0 \ 456 | --task-timeout="30m" \ 457 | --memory="1024Mi" 458 | gcloud beta run jobs list --project=$PROJECT_ID 459 | gcloud beta run jobs describe $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION 460 | # gcloud beta run jobs delete $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION 461 | # gcloud services disable run.googleapis.com --project=$PROJECT_ID 462 | # unset RUN_JOB_NAME 463 | ``` 464 | 465 | ## Execute a job 466 | 467 | - 
https://cloud.google.com/run/docs/execute/jobs 468 | 469 | ```sh 470 | gcloud beta run jobs execute $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION 471 | gcloud beta run jobs executions list --project=$PROJECT_ID --region=$REGION 472 | ``` 473 | 474 | ```sh 475 | gcloud logging read "resource.type=cloud_run_job" \ 476 | --project=$PROJECT_ID \ 477 | --limit 10 | egrep "textPayload|message" 478 | ``` 479 | 480 | ## Create a Cloud Scheduler job 481 | 482 | - https://cloud.google.com/run/docs/execute/jobs-on-schedule#command-line 483 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#4 484 | 485 | ```sh 486 | export SCHEDULER_JOB_NAME="arxiv-reddit-summary-job-everyday-9am" 487 | gcloud services enable cloudscheduler.googleapis.com --project=$PROJECT_ID 488 | gcloud scheduler jobs create http $SCHEDULER_JOB_NAME \ 489 | --project=$PROJECT_ID \ 490 | --location=$REGION \ 491 | --schedule="0 9 * * *" \ 492 | --time-zone "Asia/Tokyo" \ 493 | --uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/${RUN_JOB_NAME}:run" \ 494 | --http-method="POST" \ 495 | --oauth-service-account-email="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" 496 | gcloud scheduler jobs list --project=$PROJECT_ID --location=$REGION 497 | gcloud scheduler jobs describe $SCHEDULER_JOB_NAME --project=$PROJECT_ID --location=$REGION 498 | # gcloud scheduler jobs delete $SCHEDULER_JOB_NAME --project=$PROJECT_ID --location=$REGION 499 | # gcloud services disable cloudscheduler.googleapis.com --project=$PROJECT_ID 500 | # unset SCHEDULER_JOB_NAME 501 | ``` 502 | 503 | ```sh 504 | gcloud logging read "resource.type=cloud_run_job OR resource.type=cloud_scheduler_job" \ 505 | --project=$PROJECT_ID \ 506 | --limit 10 | egrep "textPayload|message" 507 | ``` 508 | 509 | ## License 510 | 511 | MIT License. See the LICENSE file. 
512 | 513 | ## Author 514 | 515 | Susumu OTA 516 | --------------------------------------------------------------------------------