├── poetry.toml
├── docker
    ├── .dockerignore
    ├── Dockerfile
    ├── utils.py
    ├── deeplcache.py
    ├── requirements.txt
    ├── generatehtml.py
    ├── postslack.py
    ├── posttwitter.py
    ├── postbluesky.py
    └── main.py
├── .flake8
├── .gitignore
├── LICENSE
├── pyproject.toml
├── misc
    └── update_job.sh
└── README.md


/poetry.toml:
--------------------------------------------------------------------------------
1 | [virtualenvs]
2 | in-project = true
3 | 


--------------------------------------------------------------------------------
/docker/.dockerignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | *.json.gz
4 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 320
3 | extend-ignore = E203
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .python-version
2 | .vscode
3 | .DS_Store
4 | __pycache__
5 | .venv
6 | poetry.lock
7 | *.ipynb
8 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: 2022-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com>
 2 | # SPDX-License-Identifier: MIT
 3 | 
 4 | FROM python:3.11.13-slim
 5 | 
 6 | RUN apt-get update && apt-get install -y --no-install-recommends \
 7 |   aria2 \
 8 |   fonts-ipafont-gothic \
 9 |   poppler-utils \
10 |   wkhtmltopdf \
11 |   && rm -rf /var/lib/apt/lists/*
12 | 
13 | WORKDIR /app
14 | 
15 | COPY requirements.txt requirements.txt
16 | RUN pip install -r requirements.txt
17 | 
18 | COPY . .
19 | 
20 | CMD [ "python", "-u", "main.py" ]
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023-2025 Susumu OTA
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "arxiv-reddit-summary"
 3 | version = "0.7.6"
 4 | description = "Summarize the top 30 most popular arXiv papers on Reddit, Hacker News and Hugging Face in the last 30 days. And post them to Slack, Twitter and Bluesky."
 5 | authors = ["Susumu OTA <1632335+susumuota@users.noreply.github.com>"]
 6 | license = "MIT"
 7 | readme = "README.md"
 8 | 
 9 | [tool.poetry.dependencies]
10 | python = "^3.11"
11 | praw = "^7.8.1"
12 | pandas = "^2.2.3"
13 | arxiv = "^2.1.3"
14 | tweepy = "^4.15.0"
15 | python-dateutil = "^2.9.0.post0"
16 | imgkit = "^1.2.3"
17 | google-cloud-storage = "^3.0.0"
18 | deepl = "^1.21.0"
19 | pysbd = "^0.3.4"
20 | slack-sdk = "^3.34.0"
21 | nanoatp = "^0.5.1"
22 | requests = "^2.32.3"
23 | beautifulsoup4 = "^4.13.3"
24 | 
25 | 
26 | [tool.poetry.group.dev.dependencies]
27 | black = "^25.1.0"
28 | flake8 = "^7.1.2"
29 | isort = "^6.0.0"
30 | ipykernel = "^6.29.5"
31 | ruff = "^0.11.3"
32 | 
33 | [build-system]
34 | requires = ["poetry-core"]
35 | build-backend = "poetry.core.masonry.api"
36 | 
37 | [tool.black]
38 | line-length = 320
39 | 
40 | [tool.isort]
41 | profile = "black"
42 | 
43 | [tool.ruff]
44 | line-length = 320
45 | 


--------------------------------------------------------------------------------
/docker/utils.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
 2 | # SPDX-License-Identifier: MIT
 3 | 
 4 | import os
 5 | import subprocess
 6 | import unicodedata
 7 | from shlex import quote
 8 | 
 9 | import imgkit
10 | 
11 | 
def download_arxiv_pdf(arxiv_id: str, tmp_dir: str):
    """Download the PDF of an arXiv paper into ``tmp_dir`` with ``aria2c``.

    Args:
        arxiv_id: arXiv identifier used to build ``https://arxiv.org/pdf/{arxiv_id}.pdf``.
        tmp_dir: directory the file is written to.

    Returns:
        Path of the downloaded file, ``{tmp_dir}/{arxiv_id}.pdf``.

    Raises:
        AssertionError: if the ``aria2c`` command exits with a non-zero status.
    """
    pdf_name = f"{arxiv_id}.pdf"
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # The command runs through a shell, so every interpolated value is shell-quoted.
    command = f"aria2c -q -x5 -k1M -d {quote(tmp_dir)} -o {quote(pdf_name)} {quote(url)}"
    result = subprocess.run(command, shell=True)
    if result.returncode != 0:
        # Explicit raise instead of ``assert`` so the check is not stripped by ``python -O``;
        # the exception type is kept for existing callers.
        raise AssertionError(f"aria2c exited with status {result.returncode} while downloading {url}")
    return os.path.join(tmp_dir, pdf_name)
19 | 
20 | 
def pdf_to_png(pdf_filename: str):
    """Render a PDF to a single PNG, 1200 pixels wide, with ``pdftoppm``.

    The PDF path is also passed as the output root and ``-singlefile`` is set,
    so the image is expected at ``{pdf_filename}.png``.

    Args:
        pdf_filename: path of the PDF to render.

    Returns:
        ``{pdf_filename}.png``.

    Raises:
        AssertionError: if the ``pdftoppm`` command exits with a non-zero status.
    """
    # The command runs through a shell, so the path is shell-quoted.
    quoted = quote(pdf_filename)
    command = f"pdftoppm -q -png -singlefile -scale-to-x 1200 -scale-to-y -1 {quoted} {quoted}"
    result = subprocess.run(command, shell=True)
    if result.returncode != 0:
        # Explicit raise instead of ``assert`` so the check is not stripped by ``python -O``;
        # the exception type is kept for existing callers.
        raise AssertionError(f"pdftoppm exited with status {result.returncode} for {pdf_filename}")
    return f"{pdf_filename}.png"
26 | 
27 | 
def html_to_image(html: str, image_filename: str, quality: int = 94):
    """Render an HTML string to an image file, 1200 pixels wide, through ``imgkit``.

    Args:
        html: HTML document to render.
        image_filename: path of the image file to write.
        quality: value passed as the ``quality`` option.

    Returns:
        ``image_filename``.

    Raises:
        AssertionError: if ``imgkit.from_string`` does not return ``True``.
    """
    options = {"width": 1200, "quiet": "", "quality": quality}
    result = imgkit.from_string(html, image_filename, options=options)
    if result is not True:
        # Explicit raise instead of ``assert`` so the check is not stripped by ``python -O``;
        # the exception type is kept for existing callers.
        raise AssertionError(f"imgkit failed to render {image_filename}")
    return image_filename
32 | 
33 | 
def get_char_width(c: str):
    """Return the width of one character: 2 for East Asian Fullwidth, Wide or Ambiguous, otherwise 1."""
    return 2 if unicodedata.east_asian_width(c) in ("F", "W", "A") else 1


def len_tweet(text: str):
    """Return the width of ``text`` as the sum of ``get_char_width`` over its characters."""
    return sum(map(get_char_width, text))


def strip_tweet(text: str, max_length=280, dots="..."):
    """Shorten ``text`` so that its width does not exceed ``max_length``.

    Text that already fits is returned unchanged.  Otherwise the longest prefix
    whose width leaves room for ``dots`` is kept and ``dots`` is appended.

    Args:
        text: text to shorten.
        max_length: maximum width, measured with ``get_char_width``.
        dots: suffix marking a truncation; ``None`` or ``""`` disables it.

    Returns:
        ``text`` itself, or the truncated prefix followed by ``dots``.
    """
    suffix = dots if dots else ""
    widths = [get_char_width(c) for c in text]
    # Only truncate when the text really overflows; reserving room for the
    # suffix must not shorten text that fits as it is.
    if sum(widths) <= max_length:
        return text
    budget = max_length - len(suffix)
    used = 0
    for index, width in enumerate(widths):
        if used + width > budget:
            return text[:index] + suffix
        used += width
    return text
53 | 
54 | 
def avoid_auto_link(text: str):
    """Swap every period for a one dot leader so the text is not auto-linked.

    https://shkspr.mobi/blog/2015/01/how-to-stop-twitter-auto-linking-urls/"""
    one_dot_leader = "․"
    return one_dot_leader.join(text.split("."))
59 | 
60 | 
def strip(text: str, length: int):
    """Return ``text`` unchanged when it has at most ``length`` characters.

    Longer text is cut to its first ``length - 3`` characters followed by ``"..."``.
    """
    if len(text) <= length:
        return text
    cut = length - 3
    return text[:cut] + "..."
63 | 
64 | 
def get_link_type(link: str):
    """Name the site a link points to, based on the domain found in it.

    Returns "Reddit", "Hacker News", "Hugging Face" or "alphaXiv", checked in
    that order, or an empty string when no known domain occurs in ``link``.
    """
    known_sites = (
        (("reddit.com", "redd.it"), "Reddit"),
        (("news.ycombinator.com",), "Hacker News"),
        (("huggingface.co",), "Hugging Face"),
        (("alphaxiv.org",), "alphaXiv"),
    )
    for domains, label in known_sites:
        if any(domain in link for domain in domains):
            return label
    return ""
77 | 


--------------------------------------------------------------------------------
/misc/update_job.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # SPDX-FileCopyrightText: 2022 Susumu OTA <1632335+susumuota@users.noreply.github.com>
 4 | # SPDX-License-Identifier: MIT
 5 | 
 6 | export NOTIFY_TOP_N="30"
 7 | export SLACK_CHANNEL="#test"
 8 | 
 9 | export PROJECT_ID="arxiv-summary-1"
10 | export GCS_BUCKET_NAME="arxiv-summary"
11 | export REGION="us-central1"
12 | export IMAGE_NAME="arxiv-reddit-summary"
13 | export RUN_SERVICE_ACCOUNT="run-sa"
14 | export REPOSITORY="arxiv-reddit-summary"
15 | export TAG_NAME="latest"
16 | export RUN_JOB_NAME="arxiv-reddit-summary-job-1"
17 | export SCHEDULER_JOB_NAME="arxiv-reddit-summary-job-everyday-9am"
18 | 
19 | 
20 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
21 | gcloud artifacts repositories delete $REPOSITORY --project=$PROJECT_ID --location=$REGION --quiet
22 | gcloud artifacts repositories create $REPOSITORY \
23 |   --project=$PROJECT_ID \
24 |   --repository-format="docker" \
25 |   --location=$REGION
26 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
27 | 
28 | gcloud builds submit \
29 |   --project=$PROJECT_ID \
30 |   --region=$REGION \
31 |   --tag="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
32 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
33 | 
34 | gcloud beta run jobs list --project=$PROJECT_ID
35 | gcloud beta run jobs delete $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION --quiet
36 | gcloud beta run jobs create $RUN_JOB_NAME \
37 |   --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" \
38 |   --project=$PROJECT_ID \
39 |   --region=$REGION \
40 |   --service-account="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
41 |   --set-secrets="TWITTER_BEARER_TOKEN=TWITTER_BEARER_TOKEN:1" \
42 |   --set-secrets="TWITTER_API_KEY=TWITTER_API_KEY:1" \
43 |   --set-secrets="TWITTER_API_KEY_SECRET=TWITTER_API_KEY_SECRET:1" \
44 |   --set-secrets="TWITTER_ACCESS_TOKEN=TWITTER_ACCESS_TOKEN:1" \
45 |   --set-secrets="TWITTER_ACCESS_TOKEN_SECRET=TWITTER_ACCESS_TOKEN_SECRET:1" \
46 |   --set-secrets="DEEPL_AUTH_KEY=DEEPL_AUTH_KEY:1" \
47 |   --set-secrets="SLACK_BOT_TOKEN=SLACK_BOT_TOKEN:1" \
48 |   --set-secrets="praw_client_id=praw_client_id:1" \
49 |   --set-secrets="praw_client_secret=praw_client_secret:1" \
50 |   --set-secrets="praw_user_agent=praw_user_agent:1" \
51 |   --set-secrets="ATP_IDENTIFIER=ATP_IDENTIFIER:1" \
52 |   --set-secrets="ATP_PASSWORD=ATP_PASSWORD:1" \
53 |   --set-env-vars="NOTIFY_TOP_N=${NOTIFY_TOP_N}" \
54 |   --set-env-vars="SLACK_CHANNEL=${SLACK_CHANNEL}" \
55 |   --set-env-vars="GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" \
56 |   --max-retries=0 \
57 |   --task-timeout="30m" \
58 |   --memory="1024Mi"
59 | gcloud beta run jobs list --project=$PROJECT_ID
60 | 


--------------------------------------------------------------------------------
/docker/deeplcache.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
 2 | # SPDX-License-Identifier: MIT
 3 | 
 4 | import gzip
 5 | import json
 6 | import os
 7 | import tempfile
 8 | from datetime import datetime, timedelta, timezone
 9 | 
10 | import deepl
11 | 
12 | 
13 | class DeepLCache:
14 |     def __init__(self, translator: deepl.Translator):
15 |         self.translator = translator
16 |         self.cache: dict[str, tuple[list[str], str]] = {}
17 | 
18 |     def clear_cache(self, expire_timedelta: timedelta | None = None):
19 |         if expire_timedelta is None:
20 |             self.cache = {}
21 |             return
22 |         expire_dt = datetime.now(timezone.utc) - expire_timedelta
23 | 
24 |         def is_not_expire(item):  # item is [arxiv_id, [texts, ts]]
25 |             return datetime.fromisoformat(item[1][1]) > expire_dt
26 | 
27 |         self.cache = dict(filter(is_not_expire, self.cache.items()))
28 | 
29 |     def __repr__(self):
30 |         return repr(self.cache)  # TODO
31 | 
32 |     def load(self, filename: str):
33 |         with gzip.open(filename, "rt", encoding="UTF-8") as f:
34 |             self.cache = json.load(f)
35 | 
36 |     def save(self, filename: str):
37 |         with gzip.open(filename, "wt", encoding="UTF-8") as f:
38 |             json.dump(self.cache, f)
39 | 
40 |     def load_from_s3(self, s3_bucket, filename: str):
41 |         with tempfile.TemporaryDirectory() as tmpdir:
42 |             tmpfilename = os.path.join(tmpdir, filename)
43 |             s3_bucket.download_file(filename, tmpfilename)
44 |             self.load(tmpfilename)
45 | 
46 |     def save_to_s3(self, s3_bucket, filename: str):
47 |         with tempfile.TemporaryDirectory() as tmpdir:
48 |             tmpfilename = os.path.join(tmpdir, filename)
49 |             self.save(tmpfilename)
50 |             s3_bucket.upload_file(filename, tmpfilename)
51 | 
52 |     def load_from_gcs(self, gcs_bucket, filename: str):
53 |         with tempfile.TemporaryDirectory() as tmpdir:
54 |             tmpfilename = os.path.join(tmpdir, filename)
55 |             gcs_bucket.blob(filename).download_to_filename(tmpfilename)
56 |             self.load(tmpfilename)
57 | 
58 |     def save_to_gcs(self, gcs_bucket, filename: str):
59 |         with tempfile.TemporaryDirectory() as tmpdir:
60 |             tmpfilename = os.path.join(tmpdir, filename)
61 |             self.save(tmpfilename)
62 |             gcs_bucket.blob(filename).upload_from_filename(tmpfilename)
63 | 
64 |     def get(self, key: str, default=None):
65 |         return self.cache.get(key, default)
66 | 
67 |     def translate_text(self, text: str | list[str], target_lang: str, key: str):
68 |         trans = self.get(key, None)
69 |         if trans is not None:
70 |             return trans
71 |         result = self.translator.translate_text(text=text, target_lang=target_lang)
72 |         trans_texts = [r.text for r in result] if type(result) is list else [result.text] if type(result) is deepl.TextResult else []
73 |         trans_ts = datetime.now(timezone.utc).isoformat()
74 |         trans = (trans_texts, trans_ts)
75 |         self.cache[key] = trans
76 |         return trans
77 | 


--------------------------------------------------------------------------------
/docker/requirements.txt:
--------------------------------------------------------------------------------
 1 | arxiv==2.1.3 ; python_version >= "3.11" and python_version < "4.0"
 2 | beautifulsoup4==4.13.3 ; python_version >= "3.11" and python_version < "4.0"
 3 | cachetools==5.5.1 ; python_version >= "3.11" and python_version < "4.0"
 4 | certifi==2025.1.31 ; python_version >= "3.11" and python_version < "4.0"
 5 | charset-normalizer==3.4.1 ; python_version >= "3.11" and python_version < "4.0"
 6 | deepl==1.21.0 ; python_version >= "3.11" and python_version < "4.0"
 7 | feedparser==6.0.11 ; python_version >= "3.11" and python_version < "4.0"
 8 | google-api-core==2.24.1 ; python_version >= "3.11" and python_version < "4.0"
 9 | google-auth==2.38.0 ; python_version >= "3.11" and python_version < "4.0"
10 | google-cloud-core==2.4.1 ; python_version >= "3.11" and python_version < "4.0"
11 | google-cloud-storage==3.0.0 ; python_version >= "3.11" and python_version < "4.0"
12 | google-crc32c==1.6.0 ; python_version >= "3.11" and python_version < "4.0"
13 | google-resumable-media==2.7.2 ; python_version >= "3.11" and python_version < "4.0"
14 | googleapis-common-protos==1.67.0 ; python_version >= "3.11" and python_version < "4.0"
15 | idna==3.10 ; python_version >= "3.11" and python_version < "4.0"
16 | imgkit==1.2.3 ; python_version >= "3.11" and python_version < "4.0"
17 | nanoatp==0.5.1 ; python_version >= "3.11" and python_version < "4.0"
18 | numpy==2.2.3 ; python_version >= "3.11" and python_version < "4.0"
19 | oauthlib==3.2.2 ; python_version >= "3.11" and python_version < "4.0"
20 | pandas==2.2.3 ; python_version >= "3.11" and python_version < "4.0"
21 | praw==7.8.1 ; python_version >= "3.11" and python_version < "4.0"
22 | prawcore==2.4.0 ; python_version >= "3.11" and python_version < "4.0"
23 | proto-plus==1.26.0 ; python_version >= "3.11" and python_version < "4.0"
24 | protobuf==5.29.3 ; python_version >= "3.11" and python_version < "4.0"
25 | pyasn1-modules==0.4.1 ; python_version >= "3.11" and python_version < "4.0"
26 | pyasn1==0.6.1 ; python_version >= "3.11" and python_version < "4.0"
27 | pysbd==0.3.4 ; python_version >= "3.11" and python_version < "4.0"
28 | python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "4.0"
29 | pytz==2025.1 ; python_version >= "3.11" and python_version < "4.0"
30 | requests-oauthlib==2.0.0 ; python_version >= "3.11" and python_version < "4.0"
31 | requests==2.32.3 ; python_version >= "3.11" and python_version < "4.0"
32 | rsa==4.9 ; python_version >= "3.11" and python_version < "4.0"
33 | sgmllib3k==1.0.0 ; python_version >= "3.11" and python_version < "4.0"
34 | six==1.17.0 ; python_version >= "3.11" and python_version < "4.0"
35 | slack-sdk==3.34.0 ; python_version >= "3.11" and python_version < "4.0"
36 | soupsieve==2.6 ; python_version >= "3.11" and python_version < "4.0"
37 | tld==0.13 ; python_version >= "3.11" and python_version < "4.0"
38 | tweepy==4.15.0 ; python_version >= "3.11" and python_version < "4.0"
39 | typing-extensions==4.12.2 ; python_version >= "3.11" and python_version < "4.0"
40 | tzdata==2025.1 ; python_version >= "3.11" and python_version < "4.0"
41 | update-checker==0.18.0 ; python_version >= "3.11" and python_version < "4.0"
42 | urllib3==2.3.0 ; python_version >= "3.11" and python_version < "4.0"
43 | websocket-client==1.8.0 ; python_version >= "3.11" and python_version < "4.0"
44 | 


--------------------------------------------------------------------------------
/docker/generatehtml.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2023-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com>
  2 | # SPDX-License-Identifier: MIT
  3 | 
  4 | import re
  5 | from datetime import datetime, timedelta, timezone
  6 | from html import escape
  7 | from itertools import zip_longest
  8 | 
  9 | import dateutil.parser
 10 | import deeplcache
 11 | import pandas as pd
 12 | 
 13 | HTML_TRANS_TEMPLATE = """
 14 | <html>
 15 |   <head>
 16 |     <meta charset="utf-8">
 17 |     <style>
 18 |       body {{
 19 |         font-size: 24px;
 20 |         margin: 2em;
 21 |       }}
 22 |       .translation {{
 23 |         color: black;
 24 |       }}
 25 |       .source {{
 26 |         color: blue;
 27 |       }}
 28 |     </style>
 29 |   </head>
 30 |   <body>
 31 |     <span>{url}</span>
 32 |     <h2>
 33 |       {title}
 34 |     </h2>
 35 |     <h4>
 36 |       {authors}
 37 |     </h4>
 38 |     <div>
 39 |       {content}
 40 |     </div>
 41 |   </body>
 42 | </html>
 43 | """
 44 | 
 45 | HTML_TRANS_ITEM_TEMPLATE = """
 46 | <p class="item">
 47 |   <span class="translation">
 48 |     {translation}
 49 |   </span>
 50 |   <br />
 51 |   <span class="source">
 52 |     {source}
 53 |   </span>
 54 | </p>
 55 | """
 56 | 
 57 | 
 58 | def generate_trans_html(arxiv_id: str, title: str, authors: list[str], trans_texts: list[str], summary_texts: list[str]):
 59 |     authors_md = escape(", ".join(authors))
 60 |     title_md = escape(title)
 61 |     url_md = f"https://arxiv.org/abs/{arxiv_id}"
 62 |     items = map(
 63 |         lambda item: HTML_TRANS_ITEM_TEMPLATE.format(translation=escape(item[0]), source=escape(item[1])),
 64 |         zip_longest(trans_texts, summary_texts, fillvalue=""),
 65 |     )
 66 |     return HTML_TRANS_TEMPLATE.format(title=title_md, authors=authors_md, url=url_md, content="\n".join(items))
 67 | 
 68 | 
 69 | HTML_TOP_N_TEMPLATE = """
 70 | <html>
 71 |   <head>
 72 |     <meta charset="utf-8">
 73 |     <style>
 74 |       body {{
 75 |         font-size: 24px;
 76 |         margin: 2em;
 77 |       }}
 78 |       .new {{
 79 |         color: blue;
 80 |       }}
 81 |     </style>
 82 |   </head>
 83 |   <body>
 84 |     <span>{date}</span>
 85 |     <h2>
 86 |       {title}
 87 |     </h2>
 88 |     <div>
 89 |       {content}
 90 |     </div>
 91 |   </body>
 92 | </html>
 93 | """
 94 | 
 95 | HTML_TOP_N_ITEM_TEMPLATE = """
 96 | <p class="item">
 97 |   [{i}/{n}] <b>{title}</b><br />
 98 |   {stats}, {categories}, {updated}
 99 | </p>
100 | """
101 | 
102 | 
103 | def generate_top_n_html(page_title: str, date: str, df: pd.DataFrame, dlc: deeplcache.DeepLCache):
104 |     page_title = escape(page_title)
105 |     df = df[::-1]  # normal order (reversed reversed order)
106 |     items = []
107 |     twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
108 |     for i, (arxiv_id, updated, title, primary_category, categories, score, num_comments, count) in enumerate(zip(df["arxiv_id"], df["updated"], df["title"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])):
109 |         title = escape(title)
110 |         trans = dlc.get(arxiv_id, None)
111 |         if trans is None:
112 |             continue
113 |         _, trans_ts = trans
114 |         if twenty_three_hours_ago < datetime.fromisoformat(trans_ts):
115 |             title = f'<span class="new">[New] {title}</span>'
116 |         categories = " | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)])
117 |         stats = f"<b>{score}</b> Likes, {num_comments} Comments, {count} Posts"
118 |         updated = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
119 |         items.append(HTML_TOP_N_ITEM_TEMPLATE.format(i=(i + 1), n=len(df), title=title, stats=stats, categories=categories, updated=updated, arxiv_id=arxiv_id))
120 |     return HTML_TOP_N_TEMPLATE.format(title=page_title, date=date, content="\n".join(items))
121 | 


--------------------------------------------------------------------------------
/docker/postslack.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
  2 | # SPDX-License-Identifier: MIT
  3 | 
  4 | import re
  5 | import time
  6 | from datetime import datetime, timedelta, timezone
  7 | 
  8 | import dateutil.parser
  9 | import deeplcache
 10 | import pandas as pd
 11 | import pysbd
 12 | import slack_sdk
 13 | import utils
 14 | 
 15 | 
 16 | def post_to_slack_header(api: slack_sdk.WebClient, channel: str, df: pd.DataFrame):
 17 |     text = f"Top {len(df)} most popular arXiv papers in the last 30 days"
 18 |     blocks = [{"type": "header", "text": {"type": "plain_text", "text": text}}]
 19 |     return api.chat_postMessage(channel=channel, text=text, blocks=blocks)
 20 | 
 21 | 
 22 | def generate_slack_title_blocks(df: pd.DataFrame, i: int, is_new: bool, title: str, score: int, num_comments: int, count: int, primary_category: str, categories: list[str], updated: str, first_summary: str):
 23 |     new_md = ":new: " if is_new else ""
 24 |     title_md = utils.strip(title, 200)
 25 |     stats_md = f"_*{score}* Likes, {num_comments} Comments, {count} Posts_"
 26 |     categories_md = utils.avoid_auto_link(" | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]))
 27 |     updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
 28 |     return [{"type": "section", "text": {"type": "mrkdwn", "text": f"[{len(df) - i}/{len(df)}] {new_md}*{title_md}*\n{stats_md}, {categories_md}, {updated_md}\n{first_summary}"}}]
 29 | 
 30 | 
 31 | def generate_slack_summary(dlc: deeplcache.DeepLCache, seg: pysbd.Segmenter, twenty_three_hours_ago: datetime, arxiv_id: str, summary: str):
 32 |     segs = seg.segment(summary.replace("\n", " ")[:2000])
 33 |     summary_texts: list[str] = [str(seg) for seg in segs] if type(segs) is list else [segs] if type(segs) is str else []
 34 |     first_summary = summary_texts[0][:200]  # sometimes pysbd failed to split
 35 |     translation_md = None
 36 |     is_new = False
 37 |     trans = dlc.get(arxiv_id, None)
 38 |     if trans is not None:
 39 |         trans_texts, trans_ts = trans
 40 |         first_summary = trans_texts[0][:200]  # sometimes pysbd failed to split
 41 |         is_new = True if twenty_three_hours_ago < datetime.fromisoformat(trans_ts) else False
 42 |         # assert len(summary_texts) == len(trans_texts) # this rarely happen
 43 |         if len(summary_texts) != len(trans_texts):
 44 |             print("different texts length", arxiv_id, len(summary_texts), len(trans_texts))
 45 |         translation_md = "\n\n".join(trans_texts)
 46 |         translation_md = utils.strip(translation_md, 3000)  # must be less than 3001 characters
 47 |     return is_new, first_summary, translation_md
 48 | 
 49 | 
 50 | def post_to_slack_title(api: slack_sdk.WebClient, channel: str, dlc: deeplcache.DeepLCache, df: pd.DataFrame, seg: pysbd.Segmenter, twenty_three_hours_ago: datetime, i: int, arxiv_id: str, updated: str, title: str, summary: str, primary_category: str, categories: list[str], score: int, num_comments: int, count: int):
 51 |     is_new, first_summary, translation_md = generate_slack_summary(dlc, seg, twenty_three_hours_ago, arxiv_id, summary)
 52 |     blocks = generate_slack_title_blocks(df, i, is_new, title, score, num_comments, count, primary_category, categories, updated, first_summary)
 53 |     title_md = utils.strip(title, 200)
 54 |     response = api.chat_postMessage(channel=channel, text=title_md, blocks=blocks)
 55 |     return response, translation_md
 56 | 
 57 | 
 58 | def post_to_slack_translation(api: slack_sdk.WebClient, channel: str, title: str, ts: str, translation_md: str):
 59 |     blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": translation_md}}]
 60 |     title_md = utils.strip(title, 200)
 61 |     return api.chat_postMessage(channel=channel, text=title_md, blocks=blocks, thread_ts=ts)
 62 | 
 63 | 
 64 | def post_to_slack_authors(api: slack_sdk.WebClient, channel: str, title: str, ts: str, authors: list[str], comment: str, arxiv_id: str):
 65 |     authors_md = utils.strip(", ".join(authors), 1000)
 66 |     comment_md = f"\n\n*Comments*: {utils.strip(comment, 1000)}\n\n" if comment else ""
 67 |     abs_md = f"<https://arxiv.org/abs/{arxiv_id}|abs>"
 68 |     pdf_md = f"<https://arxiv.org/pdf/{arxiv_id}.pdf|pdf>"
 69 |     twitter_md = f"<https://twitter.com/search?q=arxiv.org%2Fabs%2F{arxiv_id}%20OR%20arxiv.org%2Fpdf%2F{arxiv_id}.pdf|Twitter>"
 70 |     reddit_md = f"<https://www.reddit.com/search/?q=%22{arxiv_id}%22&sort=top|Reddit>"
 71 |     hackernews_md = f"<https://hn.algolia.com/?query=%22{arxiv_id}%22&type=all|HackerNews>"
 72 |     huggingface_md = f"<https://huggingface.co/papers/{arxiv_id}|HuggingFace>"
 73 |     alphaxiv_md = f"<https://www.alphaxiv.org/abs/{arxiv_id}|alphaXiv>"
 74 |     blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": f"*Links*: {abs_md}, {pdf_md}, {twitter_md}, {reddit_md}, {hackernews_md}, {huggingface_md}, {alphaxiv_md}\n\n*Authors*: {authors_md}{comment_md}"}}]
 75 |     title_md = utils.strip(title, 200)
 76 |     return api.chat_postMessage(channel=channel, text=title_md, blocks=blocks, thread_ts=ts)
 77 | 
 78 | 
 79 | def post_to_slack_documents(api: slack_sdk.WebClient, channel: str, ts: str, df: pd.DataFrame):
 80 |     for i, (id, score, num_comments, created_at) in enumerate(zip(df["id"], df["score"], df["num_comments"], df["created_at"])):
 81 |         blocks = []
 82 |         stats_md = f"_*{score}* Likes, {num_comments} Comments_"
 83 |         created_at_md = datetime.fromtimestamp(created_at).strftime("%d %b %Y")
 84 |         url_md = f"<{id}|{created_at_md}>"
 85 |         blocks = [{"type": "section", "text": {"type": "mrkdwn", "text": f"({i + 1}/{len(df)}) {stats_md}, {url_md}\n"}}]
 86 |         api.chat_postMessage(channel=channel, text=url_md, thread_ts=ts, blocks=blocks)
 87 |         time.sleep(1)
 88 | 
 89 | 
 90 | def post_to_slack(api: slack_sdk.WebClient, channel: str, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
 91 |     df = df[::-1]  # reverse order
 92 |     post_to_slack_header(api, channel, df)
 93 |     time.sleep(1)
 94 |     seg = pysbd.Segmenter(language="en", clean=False)
 95 |     twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
 96 |     for i, (arxiv_id, updated, title, summary, authors, comment, primary_category, categories, score, num_comments, count) in enumerate(zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["comment"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])):
 97 |         response, translation_md = post_to_slack_title(api, channel, dlc, df, seg, twenty_three_hours_ago, i, arxiv_id, updated, title, summary, primary_category, categories, score, num_comments, count)
 98 |         time.sleep(1)
 99 |         ts = response["ts"]
100 |         if not ts:
101 |             continue
102 |         if translation_md:
103 |             post_to_slack_translation(api, channel, title, ts, translation_md)
104 |             time.sleep(1)
105 |         post_to_slack_authors(api, channel, title, ts, authors, comment, arxiv_id)
106 |         time.sleep(1)
107 |         top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)].head(3)  # TODO
108 |         post_to_slack_documents(api, channel, ts, top_n_documents)
109 |         print("post_to_slack: ", f"[{len(df) - i}/{len(df)}]")
110 | 


--------------------------------------------------------------------------------
/docker/posttwitter.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
  2 | # SPDX-License-Identifier: MIT
  3 | 
  4 | import os
  5 | import re
  6 | import tempfile
  7 | import time
  8 | from datetime import datetime, timedelta, timezone
  9 | 
 10 | import dateutil.parser
 11 | import deeplcache
 12 | import generatehtml
 13 | import pandas as pd
 14 | import pysbd
 15 | import tweepy
 16 | import utils
 17 | 
 18 | 
 19 | def upload_first_page_to_twitter(api_v1: tweepy.API, arxiv_id: str):
 20 |     with tempfile.TemporaryDirectory() as tmp_dir:
 21 |         pdf_filename = utils.download_arxiv_pdf(arxiv_id, tmp_dir)
 22 |         first_page_filename = utils.pdf_to_png(pdf_filename)
 23 |         if os.path.isfile(first_page_filename) and os.path.getsize(first_page_filename) > 0:
 24 |             media = api_v1.media_upload(first_page_filename)
 25 |             return media.media_id if media else None
 26 |     return None
 27 | 
 28 | 
 29 | def generate_twitter_first_page(df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
 30 |     summary_text = " ".join(summary_texts)
 31 |     new_md = "🆕" if is_new else ""
 32 |     authors_md = ", ".join(authors)
 33 |     categories_md = utils.avoid_auto_link(" | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]))
 34 |     stats_md = f"{score} Likes, {num_comments} Comments, {count} Posts"
 35 |     updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
 36 |     title_md = title
 37 |     abs_md = f"https://arxiv.org/abs/{arxiv_id}"
 38 |     text = f"[{len(df) - i}/{len(df)}] {stats_md}\n{abs_md} {categories_md}, {updated_md}\n\n{new_md}{title_md}\n\n{authors_md}"
 39 |     return text, summary_text
 40 | 
 41 | 
 42 | def post_to_twitter_first_page(api_v1: tweepy.API, api_v2: tweepy.Client, df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]) -> str:
 43 |     text, summary_text = generate_twitter_first_page(df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
 44 |     media_ids = []
 45 |     first_page_media_id = upload_first_page_to_twitter(api_v1, arxiv_id)
 46 |     if first_page_media_id:
 47 |         api_v1.create_media_metadata(first_page_media_id, utils.strip_tweet(summary_text, 1000))
 48 |         media_ids.append(first_page_media_id)
 49 |     prev_tweet_id: str = ""
 50 |     try:
 51 |         response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, media_ids=media_ids if len(media_ids) > 0 else None)
 52 |         prev_tweet_id = response.data["id"] if type(response) is tweepy.Response and not response.errors else ""
 53 |     except Exception as e:
 54 |         print(e)
 55 |     return prev_tweet_id
 56 | 
 57 | 
 58 | def post_to_twitter_link(api_v2: tweepy.Client, prev_tweet_id: str, arxiv_id: str, link_type: str) -> str:
 59 |     twitter_uri = f"https://x.com/search?q=arxiv.org%2Fabs%2F{arxiv_id}%20OR%20arxiv.org%2Fpdf%2F{arxiv_id}.pdf"
 60 |     reddit_uri = f"https://www.reddit.com/search/?q=%22{arxiv_id}%22&sort=top"
 61 |     hackernews_uri = f"https://hn.algolia.com/?query=%22{arxiv_id}%22&type=all"
 62 |     # the last uri will become a link card
 63 |     text = f"Twitter: {twitter_uri}"
 64 |     text = f"Twitter: {twitter_uri} \nReddit: {reddit_uri}" if link_type == "Reddit" else text
 65 |     text = f"Twitter: {twitter_uri} \nHacker News: {hackernews_uri}" if link_type == "Hacker News" else text
 66 |     try:
 67 |         response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, in_reply_to_tweet_id=prev_tweet_id)
 68 |         prev_tweet_id = response.data["id"] if type(response) is tweepy.Response and not response.errors else ""
 69 |     except Exception as e:
 70 |         print(e)
 71 |     return prev_tweet_id
 72 | 
 73 | 
 74 | def post_to_twitter_tweets(api_v2: tweepy.Client, prev_tweet_id: str, document_df: pd.DataFrame) -> str:
 75 |     # df = document_df[::-1]  # reverse order
 76 |     df = document_df
 77 |     for i, (id, score, num_comments, created_at) in enumerate(zip(df["id"], df["score"], df["num_comments"], df["created_at"])):
 78 |         stats_md = f"{score} Likes, {num_comments} Comments"
 79 |         created_at_md = datetime.fromtimestamp(created_at).strftime("%d %b %Y")
 80 |         link = utils.get_link_type(id) or id
 81 |         # index = len(df) - i  # reverse order
 82 |         index = i + 1
 83 |         text = f"({index}/{len(df)}) {stats_md}, {created_at_md}, {link}\n{id}\n"
 84 |         try:
 85 |             response = api_v2.create_tweet(text=utils.strip_tweet(text, 280), user_auth=True, in_reply_to_tweet_id=prev_tweet_id)
 86 |             prev_tweet_id = response.data["id"] if type(response) is tweepy.Response and not response.errors else ""
 87 |         except Exception as e:
 88 |             print(e)
 89 |         time.sleep(1)
 90 |     return prev_tweet_id
 91 | 
 92 | 
 93 | def upload_html_to_twitter(api_v1: tweepy.API, filename: str, html_text: str):
 94 |     with tempfile.TemporaryDirectory() as tmp_dir:
 95 |         abs_path = os.path.join(tmp_dir, filename)
 96 |         abs_path = utils.html_to_image(html_text, abs_path)
 97 |         if os.path.isfile(abs_path) and os.path.getsize(abs_path) > 0:
 98 |             media = api_v1.media_upload(abs_path)
 99 |             return media.media_id if media else None
100 |     return None
101 | 
102 | 
def post_to_twitter_ranking(api_v1: tweepy.API, api_v2: tweepy.Client, dlc: deeplcache.DeepLCache, df: pd.DataFrame):
    """Tweet the ranking headline, attaching the rendered top-N image when the upload succeeds.

    The image's media metadata lists ``[n/total] arxiv.org/abs/<id>`` lines for
    ``df`` iterated in reverse row order. Exceptions raised by ``create_tweet``
    are printed and swallowed.
    """
    total = len(df)
    title = f"Top {total} most popular arXiv papers in the last 30 days"
    date = datetime.now(timezone.utc).strftime("%d %b %Y")
    html_text = generatehtml.generate_top_n_html(title, date, df, dlc)
    attachments = None
    top_n_media_id = upload_html_to_twitter(api_v1, "top_n.jpg", html_text)
    if top_n_media_id:
        alt_lines = [f"[{rank}/{total}] arxiv.org/abs/{arxiv_id}" for rank, arxiv_id in enumerate(df[::-1]["arxiv_id"], start=1)]
        api_v1.create_media_metadata(top_n_media_id, utils.strip_tweet("\n".join(alt_lines), 1000))
        attachments = [top_n_media_id]
    try:
        api_v2.create_tweet(text=utils.strip_tweet(title, 280), user_auth=True, media_ids=attachments)
    except Exception as e:
        print(e)
119 | 
120 | 
def post_to_twitter_trans(api_v1: tweepy.API, api_v2: tweepy.Client, prev_tweet_id: str, arxiv_id: str, title: str, authors: list[str], summary_texts: list[str], trans_texts: list[str]):
    """Reply to ``prev_tweet_id`` with the translated abstract.

    An image rendered from ``generatehtml.generate_trans_html`` is attached
    when the upload succeeds; the concatenated translation doubles as its
    media metadata. Exceptions raised by ``create_tweet`` are printed and
    swallowed.
    """
    html_text = generatehtml.generate_trans_html(arxiv_id, title, authors, trans_texts, summary_texts)
    translation_media_id = upload_html_to_twitter(api_v1, f"{arxiv_id}.trans.jpg", html_text)
    translated = "".join(trans_texts)
    attachments = None
    if translation_media_id:
        api_v1.create_media_metadata(translation_media_id, utils.strip_tweet(translated, 1000))
        attachments = [translation_media_id]
    body = f"https://arxiv.org/abs/{arxiv_id}\n{translated}"
    try:
        api_v2.create_tweet(text=utils.strip_tweet(body, 280), user_auth=True, media_ids=attachments, in_reply_to_tweet_id=prev_tweet_id)
    except Exception as e:
        print(e)
134 | 
135 | 
def post_to_twitter(api_v1: tweepy.API, api_v2: tweepy.Client, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post the ranking tweet, then one thread per newly translated paper.

    For each paper (``df`` iterated in reverse row order) that has a cached
    translation newer than 23 hours, the thread is: first page, search links,
    up to three source posts, translation. A thread is abandoned as soon as
    one of the first two steps yields no tweet id.

    Args:
        dlc: Translation cache; ``dlc.get(arxiv_id, None)`` is expected to
            return ``(trans_texts, timestamp)`` or ``None``.
        df: Ranked papers.
        document_df: Source posts; its ``arxiv_id`` column holds a collection
            of ids per row.
    """
    df = df[::-1]  # reverse order
    twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
    seg = pysbd.Segmenter(language="en", clean=False)
    post_to_twitter_ranking(api_v1, api_v2, dlc, df)
    columns = zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])
    for i, (arxiv_id, updated, title, summary, authors, primary_category, categories, score, num_comments, count) in enumerate(columns):
        trans = dlc.get(arxiv_id, None)
        if trans is None:
            continue
        trans_texts, trans_ts = trans
        # only post new papers
        # NOTE(review): assumes the cached timestamp is timezone-aware — confirm against DeepLCache.
        if not (twenty_three_hours_ago < datetime.fromisoformat(trans_ts)):
            continue
        # Segment only after the freshness check so skipped papers cost nothing.
        segs = seg.segment(summary.replace("\n", " ")[:2000])
        summary_texts: list[str] = [str(s) for s in segs] if type(segs) is list else [segs] if type(segs) is str else []
        is_new = True
        prev_tweet_id = post_to_twitter_first_page(api_v1, api_v2, df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
        time.sleep(1)
        if not prev_tweet_id:
            continue
        top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)].head(3)  # TODO
        # With no matching source post there is no link type; an empty one
        # makes post_to_twitter_link fall back to the Twitter search link only.
        link_type = utils.get_link_type(top_n_documents.iloc[0]["id"]) if len(top_n_documents) > 0 else ""
        prev_tweet_id = post_to_twitter_link(api_v2, prev_tweet_id, arxiv_id, link_type)
        time.sleep(1)
        if not prev_tweet_id:
            continue
        prev_tweet_id = post_to_twitter_tweets(api_v2, prev_tweet_id, top_n_documents)
        post_to_twitter_trans(api_v1, api_v2, prev_tweet_id, arxiv_id, title, authors, summary_texts, trans_texts)
        print("post_to_twitter: ", f"[{len(df) - i}/{len(df)}]")
        time.sleep(1)
166 | 


--------------------------------------------------------------------------------
/docker/postbluesky.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2023-2025 Susumu OTA <1632335+susumuota@users.noreply.github.com>
  2 | # SPDX-License-Identifier: MIT
  3 | 
  4 | import os
  5 | import re
  6 | import tempfile
  7 | import time
  8 | from datetime import datetime, timedelta, timezone
  9 | from typing import Any
 10 | 
 11 | import dateutil.parser
 12 | import deeplcache
 13 | import generatehtml
 14 | import nanoatp
 15 | import pandas as pd
 16 | import pysbd
 17 | import utils
 18 | 
 19 | 
 20 | def generate_facets(text: str, patterns: list[tuple[str, str]]):
 21 |     # TODO: fix naive implementation
 22 |     facets: list[dict[str, Any]] = []
 23 |     for pattern, uri in patterns:
 24 |         start = text.find(pattern)
 25 |         if start == -1:
 26 |             continue
 27 |         end = start + len(pattern)
 28 |         facets.append(
 29 |             {
 30 |                 "$type": "app.bsky.richtext.facet",
 31 |                 "index": {"byteStart": start, "byteEnd": end},
 32 |                 "features": [{"$type": "app.bsky.richtext.facet#link", "uri": uri}],
 33 |             }
 34 |         )
 35 |     facets.sort(key=lambda facet: facet["index"]["byteStart"])
 36 |     return facets
 37 | 
 38 | 
 39 | def upload_first_page_to_bluesky(api: nanoatp.BskyAgent, arxiv_id: str, summary_text: str) -> dict[str, Any]:
 40 |     with tempfile.TemporaryDirectory() as tmp_dir:
 41 |         pdf_filename = utils.download_arxiv_pdf(arxiv_id, tmp_dir)
 42 |         first_page_filename = utils.pdf_to_png(pdf_filename)
 43 |         if os.path.isfile(first_page_filename) and os.path.getsize(first_page_filename) > 0:
 44 |             return api.uploadImage(first_page_filename, utils.strip_tweet(summary_text, 2000))
 45 |     return {}
 46 | 
 47 | 
 48 | def generate_bluesky_first_page(df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
 49 |     summary_text = "\n\n".join(summary_texts)
 50 |     new_md = "🆕" if is_new else ""
 51 |     authors_md = ", ".join(authors)
 52 |     categories_md = utils.avoid_auto_link(" | ".join([primary_category] + [c for c in categories if c != primary_category and re.match(r"\w+\.\w+$", c)]))
 53 |     stats_md = f"{score} Likes, {num_comments} Comments, {count} Posts"
 54 |     updated_md = dateutil.parser.isoparse(updated).strftime("%d %b %Y")
 55 |     title_md = title
 56 |     text = f"[{len(df) - i}/{len(df)}] {stats_md}\n{arxiv_id}, {categories_md}, {updated_md}\n\n{new_md}{title_md}\n\n{authors_md}"
 57 |     return text, summary_text
 58 | 
 59 | 
 60 | def post_to_bluesky_first_page(api: nanoatp.BskyAgent, df: pd.DataFrame, i: int, is_new: bool, arxiv_id: str, updated: str, title: str, summary_texts: list[str], authors: list[str], score: int, num_comments: int, count: int, primary_category: str, categories: list[str]):
 61 |     first_page_text, summary_text = generate_bluesky_first_page(df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
 62 |     images = []
 63 |     image = upload_first_page_to_bluesky(api, arxiv_id, summary_text)
 64 |     images.append(image) if image else None
 65 |     parent_post: dict[str, str] = {}
 66 |     text = f"{first_page_text}"
 67 |     patterns = [(arxiv_id, f"https://arxiv.org/abs/{arxiv_id}")]
 68 |     facets = generate_facets(text, patterns)
 69 |     embed = {"$type": "app.bsky.embed.images", "images": images}
 70 |     record = {"text": utils.strip_tweet(text, 300), "facets": facets, "embed": embed}
 71 |     try:
 72 |         parent_post = api.post(record)
 73 |     except Exception as e:
 74 |         print(e)
 75 |     return parent_post
 76 | 
 77 | 
 78 | def generate_external(api: nanoatp.BskyAgent, uri: str, title: str, description: str):
 79 |     try:
 80 |         external = api.uploadExternal(uri)
 81 |     except Exception as e:
 82 |         print({"function": "uploadExternal", "uri": uri, "error": str(e)})
 83 |         external = {
 84 |             "$type": "app.bsky.embed.external#external",
 85 |             "uri": uri,
 86 |             "title": title,
 87 |             "description": description,
 88 |         }
 89 |     return external
 90 | 
 91 | 
 92 | def post_to_bluesky_link(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], arxiv_id: str, title: str, summary_texts: list[str]):
 93 |     patterns = [
 94 |         ("abs", f"https://arxiv.org/abs/{arxiv_id}"),
 95 |         ("pdf", f"https://arxiv.org/pdf/{arxiv_id}.pdf"),
 96 |         ("Bluesky", f"https://bsky.app/search?q={arxiv_id}"),
 97 |         ("Twitter", f"https://x.com/search?q=arxiv.org%2Fabs%2F{arxiv_id}%20OR%20arxiv.org%2Fpdf%2F{arxiv_id}.pdf"),
 98 |         ("Reddit", f"https://www.reddit.com/search/?q=%22{arxiv_id}%22&sort=top"),
 99 |         ("Hacker News", f"https://hn.algolia.com/?query=%22{arxiv_id}%22&type=all"),
100 |         ("Hugging Face", f"https://huggingface.co/papers/{arxiv_id}"),
101 |         ("alphaXiv", f"https://www.alphaxiv.org/abs/{arxiv_id}"),
102 |     ]
103 |     text = "Links: abs, pdf\nSearch: Bluesky, Twitter, Reddit, Hacker News, Hugging Face, alphaXiv"
104 |     facets = generate_facets(text, patterns)
105 |     uri = patterns[0][1]
106 |     external = generate_external(api, uri, title, utils.strip_tweet(" ".join(summary_texts), 300))
107 |     embed = {"$type": "app.bsky.embed.external", "external": external}
108 |     record = {"text": utils.strip_tweet(text, 300), "facets": facets, "reply": {"root": root_post, "parent": parent_post}, "embed": embed}
109 |     try:
110 |         parent_post = api.post(record)
111 |     except Exception as e:
112 |         print(e)
113 |     return parent_post
114 | 
115 | 
def post_to_bluesky_posts(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], df: pd.DataFrame):
    """Reply once per row of ``df``, each reply chained to the previous successful one.

    Every reply shows the row's stats and date, links the label to the row's
    ``id`` URL, and embeds that URL as an external card. One second of sleep
    follows each attempt.

    Returns:
        The last successfully created post, or the incoming ``parent_post``
        if none succeeded.
    """
    total = len(df)
    rows = zip(df["id"], df["score"], df["num_comments"], df["created_at"], df["title"], df["description"])
    for position, (doc_url, score, num_comments, created_at, title, description) in enumerate(rows, start=1):
        posted_on = datetime.fromtimestamp(created_at).strftime("%d %b %Y")
        label = utils.get_link_type(doc_url) or doc_url
        text = f"({position}/{total}) {score} Likes, {num_comments} Comments, {posted_on}, {label}"
        facets = generate_facets(text, [(label, doc_url)])
        external = generate_external(api, doc_url, title, utils.strip_tweet(description, 300))
        record = {
            "text": utils.strip_tweet(text, 300),
            "facets": facets,
            "reply": {"root": root_post, "parent": parent_post},
            "embed": {"$type": "app.bsky.embed.external", "external": external},
        }
        try:
            parent_post = api.post(record)
        except Exception as e:
            # Keep the previous parent so later replies still attach to the thread.
            print(e)
        time.sleep(1)
    return parent_post
134 | 
135 | 
def upload_html_to_bluesky(api: nanoatp.BskyAgent, filename: str, html_text: str, alt_text: str, quality: int = 94) -> dict[str, Any]:
    """Render ``html_text`` to an image in a temporary directory and upload it to Bluesky.

    Args:
        quality: Forwarded as the third argument of ``utils.html_to_image``.
        alt_text: Truncated to 2000 and passed along with the image.

    Returns:
        Whatever ``api.uploadImage`` returns, or ``{}`` when no non-empty
        image file was produced.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = utils.html_to_image(html_text, os.path.join(work_dir, filename), quality)
        if not (os.path.isfile(image_path) and os.path.getsize(image_path) > 0):
            return {}
        return api.uploadImage(image_path, utils.strip_tweet(alt_text, 2000))
143 | 
144 | 
def post_to_bluesky_trans(api: nanoatp.BskyAgent, root_post: dict[str, str], parent_post: dict[str, str], arxiv_id: str, title: str, authors: list[str], summary_texts: list[str], trans_texts: list[str]) -> dict[str, str]:
    """Reply with the translated abstract, embedding an image rendered from it when the upload succeeds.

    The post text is the ``arxiv_id`` (linked to the abs page) followed by the
    concatenated translation; the image alt text joins the translated
    sentences with blank lines.

    Returns:
        The value returned by ``api.post``, or ``{}`` when posting raised.
    """
    html_text = generatehtml.generate_trans_html(arxiv_id, title, authors, trans_texts, summary_texts)
    image = upload_html_to_bluesky(api, f"{arxiv_id}.trans.jpg", html_text, "\n\n".join(trans_texts))
    images = [image] if image else []
    text = f"{arxiv_id}\n{''.join(trans_texts)}"
    facets = generate_facets(text, [(arxiv_id, f"https://arxiv.org/abs/{arxiv_id}")])
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "reply": {"root": root_post, "parent": parent_post},
        "embed": {"$type": "app.bsky.embed.images", "images": images},
    }
    try:
        return api.post(record)
    except Exception as e:
        print(e)
    return {}
161 | 
162 | 
def post_to_bluesky_ranking(api: nanoatp.BskyAgent, dlc: deeplcache.DeepLCache, df: pd.DataFrame) -> dict[str, str]:
    """Post the ranking summary: a headline, one ``[n/total]`` link per paper, and the rendered top-N image.

    Labels are numbered from 1 over ``df`` iterated in reverse row order; each
    label links to the paper's abs page and the same pairs form the image's
    alt text.

    Returns:
        The value returned by ``api.post``, or ``{}`` when posting raised.
    """
    total = len(df)
    title = f"Top {total} most popular arXiv papers in the last 30 days.\n"
    date = datetime.now(timezone.utc).strftime("%d %b %Y")
    html_text = generatehtml.generate_top_n_html(title, date, df, dlc)
    uris = [(f"{rank}/{total}", f"https://arxiv.org/abs/{arxiv_id}") for rank, arxiv_id in enumerate(df[::-1]["arxiv_id"], start=1)]
    alt_text = "\n".join(f"{label} {uri}" for label, uri in uris)
    image = upload_html_to_bluesky(api, "top_n.jpg", html_text, alt_text, 90)  # sometimes the image is too large to upload
    images = [image] if image else []
    text = title + " ".join(f"[{label}]" for label, _ in uris)
    facets = generate_facets(text, uris)
    record = {
        "text": utils.strip_tweet(text, 300),
        "facets": facets,
        "embed": {"$type": "app.bsky.embed.images", "images": images},
    }
    try:
        return api.post(record)
    except Exception as e:
        print(e)
    return {}
181 | 
182 | 
def post_to_bluesky(api: nanoatp.BskyAgent, dlc: deeplcache.DeepLCache, df: pd.DataFrame, document_df: pd.DataFrame):
    """Post one Bluesky thread per newly translated paper, then the ranking post.

    For each paper (``df`` iterated in reverse row order) whose cached
    translation is newer than 23 hours, the thread is: root post, up to three
    source posts, link post, translation post. If the root post cannot be
    created the paper is skipped, because every reply needs a valid root.

    Args:
        dlc: Translation cache; ``dlc.get(arxiv_id, None)`` is expected to
            return ``(trans_texts, timestamp)`` or ``None``.
        df: Ranked papers.
        document_df: Source posts; its ``arxiv_id`` column holds a collection
            of ids per row.

    Returns:
        The result of ``post_to_bluesky_ranking``.
    """
    df = df[::-1]  # reverse order
    twenty_three_hours_ago = datetime.now(timezone.utc) - timedelta(hours=23)
    seg = pysbd.Segmenter(language="en", clean=False)
    for i, (arxiv_id, updated, title, summary, authors, primary_category, categories, score, num_comments, count) in enumerate(zip(df["arxiv_id"], df["updated"], df["title"], df["summary"], df["authors"], df["primary_category"], df["categories"], df["score"], df["num_comments"], df["count"])):
        trans = dlc.get(arxiv_id, None)
        if trans is None:
            continue
        trans_texts, trans_ts = trans
        # only post new papers
        if not (twenty_three_hours_ago < datetime.fromisoformat(trans_ts)):
            continue
        segs = seg.segment(summary.replace("\n", " ")[:2000])
        summary_texts: list[str] = [str(s) for s in segs] if type(segs) is list else [segs] if type(segs) is str else []
        is_new = True
        parent_post = post_to_bluesky_first_page(api, df, i, is_new, arxiv_id, updated, title, summary_texts, authors, score, num_comments, count, primary_category, categories)
        # post_to_bluesky_first_page signals failure with an empty dict, never
        # None, so test truthiness to avoid replying to a non-existent root.
        if not parent_post:
            continue
        root_post = parent_post
        time.sleep(1)
        top_n_documents = document_df[document_df["arxiv_id"].apply(lambda ids: arxiv_id in ids)].head(3)  # TODO
        parent_post = post_to_bluesky_posts(api, root_post, parent_post, top_n_documents)
        parent_post = post_to_bluesky_link(api, root_post, parent_post, arxiv_id, title, summary_texts)
        time.sleep(1)
        post_to_bluesky_trans(api, root_post, parent_post, arxiv_id, title, authors, summary_texts, trans_texts)
        print("post_to_bluesky: ", f"[{len(df) - i}/{len(df)}]")
        time.sleep(1)
    return post_to_bluesky_ranking(api, dlc, df)
211 | 


--------------------------------------------------------------------------------
/docker/main.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2023 Susumu OTA <1632335+susumuota@users.noreply.github.com>
  2 | #
  3 | # SPDX-License-Identifier: MIT
  4 | 
  5 | # Those environment variables are required to use PRAW.
  6 | # export praw_client_id="reddit client id"
  7 | # export praw_client_secret="reddit client secret"
  8 | # export praw_user_agent="reddit user agent"
  9 | 
 10 | import os
 11 | import re
 12 | import time
 13 | from datetime import datetime, timedelta, timezone
 14 | 
 15 | import arxiv
 16 | import deepl
 17 | import deeplcache
 18 | import nanoatp
 19 | import pandas as pd
 20 | import postbluesky
 21 | import postslack
 22 | import posttwitter
 23 | import praw
 24 | import pysbd
 25 | import requests
 26 | import slack_sdk
 27 | import tweepy
 28 | from google.cloud import storage
 29 | 
# Identifier format reference: https://info.arxiv.org/help/arxiv_identifier.html
# Matches an http(s) arxiv.org "abs" or "pdf" URL. Group 2 is the identifier
# (four digits, a dot, then four or five digits); group 3 is the optional
# "vN" version suffix; group 4 is an optional ".pdf" extension.
ARXIV_URL_PATTERN = re.compile(r"https?://arxiv\.org/(abs|pdf)/([0-9]{4}\.[0-9]{4,5})(v[0-9]+)?(\.pdf)?")
# Matches a bare identifier with an optional version suffix. The pattern has
# no end anchor, so text following the identifier is not rejected.
ARXIV_ID_PATTERN = re.compile(r"([0-9]{4}\.[0-9]{4,5})(v[0-9]+)?")
 33 | 
 34 | 
 35 | def parse_arxiv_ids(text: str) -> list[str]:
 36 |     text = text.replace("\\", "")  # TODO: some text includes 2 backslashes in urls
 37 |     return list(set([m[1] for m in re.findall(ARXIV_URL_PATTERN, text)]))
 38 | 
 39 | 
 40 | def flatten(lists: list[list]):
 41 |     return [item for sublist in lists for item in sublist]
 42 | 
 43 | 
 44 | def submission_to_dict(submission: praw.reddit.Submission):
 45 |     """https://praw.readthedocs.io/en/stable/code_overview/models/submission.html"""
 46 |     arxiv_ids = parse_arxiv_ids(submission.selftext)
 47 |     score = int(submission.score / len(arxiv_ids) if len(arxiv_ids) > 0 else submission.score)
 48 |     return {
 49 |         "id": f"https://redd.it/{submission.id}",
 50 |         "score": score,
 51 |         "num_comments": submission.num_comments,
 52 |         "created_at": submission.created_utc,
 53 |         "arxiv_id": arxiv_ids,
 54 |         "title": submission.title,
 55 |         "description": submission.selftext,
 56 |     }
 57 | 
 58 | 
 59 | def search_reddit(query: str, sort="relevance", syntax="lucene", time_filter="all", limit: int | None = None):
 60 |     """https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html#praw.models.Subreddit.search"""
 61 |     rs = list(praw.Reddit().subreddit("all").search(query=query, sort=sort, syntax=syntax, time_filter=time_filter, limit=limit))
 62 |     return pd.json_normalize([submission_to_dict(r) for r in rs])
 63 | 
 64 | 
 65 | def hit_to_dict(hit: dict):
 66 |     """https://hn.algolia.com/api"""
 67 |     arxiv_ids = parse_arxiv_ids(hit["url"])
 68 |     score = int(hit["points"] / len(arxiv_ids) if len(arxiv_ids) > 0 else hit["points"])
 69 |     return {
 70 |         "id": f"https://news.ycombinator.com/item?id={hit['objectID']}",
 71 |         "score": score,
 72 |         "num_comments": hit["num_comments"],
 73 |         "created_at": hit["created_at_i"],
 74 |         "arxiv_id": arxiv_ids,
 75 |         "title": hit["title"],
 76 |         "description": hit["url"],
 77 |     }
 78 | 
 79 | 
 80 | def search_hackernews(query: str, attribute="", days=0, limit: int | None = None):
 81 |     """https://hn.algolia.com/api"""
 82 |     params = {"query": query}
 83 |     params.update({"restrictSearchableAttributes": attribute}) if attribute else None
 84 |     if days > 0:
 85 |         days_ago = int((datetime.now() - timedelta(days=days)).timestamp())
 86 |         params.update({"numericFilters": f"created_at_i>{days_ago}"})
 87 |     params.update({"hitsPerPage": str(limit)}) if limit else None
 88 |     response = requests.get("https://hn.algolia.com/api/v1/search", params=params)
 89 |     json = response.json()
 90 |     return pd.json_normalize([hit_to_dict(hit) for hit in json["hits"]])
 91 | 
 92 | 
 93 | def article_to_dict(article: dict):
 94 |     """https://huggingface.co/docs/hub/en/api#get-apidailypapers"""
 95 |     arxiv_id = article["paper"]["id"]
 96 |     created_at = int(datetime.fromisoformat(article["paper"]["submittedOnDailyAt"].replace("Z", "+00:00")).timestamp())
 97 |     return {
 98 |         "id": f"https://huggingface.co/papers/{arxiv_id}",
 99 |         "score": article["paper"]["upvotes"],
100 |         "num_comments": article["numComments"],
101 |         "created_at": created_at,
102 |         "arxiv_id": [arxiv_id],
103 |         "title": article["title"],
104 |         "description": article["summary"],
105 |     }
106 | 
107 | 
def get_huggingface(timestamp: float, wait=1):
    """Fetch the Hugging Face daily papers for the date of ``timestamp``.

    See https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Sleeps ``wait`` seconds before the request. The date is derived with
    ``datetime.fromtimestamp``, i.e. in local time.

    Returns:
        A list of document records; empty when the status is not 200 or the
        payload is empty, contains ``"error"``, or is not a list.
    """
    date = datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
    headers = {
        "Referer": f"https://huggingface.co/papers/date/{date}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    }
    time.sleep(wait)
    response = requests.get(f"https://huggingface.co/api/daily_papers?date={date}", headers=headers)
    print(f"Status code {response.status_code}, {len(response.text)} characters at {date}")
    if response.status_code != 200:
        print(f"Failed to fetch data for {date}: {response.status_code}")
        return []
    articles = response.json()
    usable = articles and "error" not in articles and isinstance(articles, list)
    if not usable:
        print(f"No articles found for {date} or error in response.")
        return []
    print(f"Got {len(articles)} articles from {date}")
    return [article_to_dict(article) for article in articles]
126 | 
127 | 
def search_huggingface(days=30, wait=1):
    """Collect Hugging Face daily papers for each of the last ``days`` days, newest day first.

    See https://huggingface.co/docs/hub/en/api#get-apidailypapers

    Returns:
        A DataFrame of document records with duplicate ``id`` values removed;
        the last occurrence of each is kept.
    """
    now = datetime.now()
    per_day = [get_huggingface((now - timedelta(days=offset)).timestamp(), wait) for offset in range(days)]
    df = pd.json_normalize(flatten(per_day))
    deduplicated = df.drop_duplicates(subset=["id"], keep="last")
    return deduplicated.reset_index(drop=True)
134 | 
135 | 
def paper_to_dict(paper: dict):
    """Convert one alphaXiv trending-paper entry into the common document record.

    See https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    ``created_at`` is the paper's ``publication_date`` as whole epoch seconds.
    If that field is missing or unparsable the current UTC time is used
    instead; the fallback is truncated to an int so the field has the same
    type on both paths.
    """
    arxiv_id = paper["universal_paper_id"]
    try:
        created_at = int(datetime.fromisoformat(paper["publication_date"].replace("Z", "+00:00")).timestamp())
    except Exception as e:
        # Best effort: a bad date must not drop the paper.
        print(f"Failed to parse publication date for {arxiv_id}: {e}")
        created_at = int(datetime.now(timezone.utc).timestamp())
    return {
        "id": f"https://www.alphaxiv.org/abs/{arxiv_id}",
        "score": paper["metrics"]["public_total_votes"],
        "num_comments": 0,  # TODO: find the number of comments
        "created_at": created_at,
        "arxiv_id": [arxiv_id],
        "title": paper["title"],
        "description": paper["abstract"],
    }
153 | 
154 | 
def get_alphaxiv(sort_by="Likes", interval="30+Days", page_size=10, page_num=0, wait=1):
    """Fetch one page of alphaXiv trending papers.

    See https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    Sleeps ``wait`` seconds before the request.

    Returns:
        A list of document records; empty when the status is not 200 or the
        payload is empty, contains ``"error"``, or lacks
        ``data.trending_papers``.
    """
    url = f"https://api.alphaxiv.org/v2/papers/trending-papers?page_num={page_num}&sort_by={sort_by}&page_size={page_size}&interval={interval}"
    headers = {
        "Referer": f"https://www.alphaxiv.org/explore?sort={sort_by}&time={interval}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    }
    time.sleep(wait)
    response = requests.get(url, headers=headers)
    print(f"Status code {response.status_code}, {len(response.text)} characters at page {page_num}")
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []
    payload = response.json()
    usable = payload and "error" not in payload and "data" in payload and "trending_papers" in payload["data"]
    if not usable:
        print("No articles found or error in response.")
        return []
    return [paper_to_dict(paper) for paper in payload["data"]["trending_papers"]]
171 | 
172 | 
def search_alphaxiv(sort_by="Likes", interval="30+Days", page_size=10, limit=30, wait=1):
    """Fetch enough alphaXiv trending pages to cover ``limit`` papers and return them as one DataFrame.

    See https://www.alphaxiv.org/explore?sort=Likes&time=30+Days

    The page count is ``limit / page_size`` rounded up. Duplicate ``id``
    values are dropped, keeping the last occurrence.
    """
    page_count = (limit + page_size - 1) // page_size
    pages = [get_alphaxiv(sort_by=sort_by, interval=interval, page_size=page_size, page_num=page_num, wait=wait) for page_num in range(page_count)]
    df = pd.json_normalize(flatten(pages))
    deduplicated = df.drop_duplicates(subset=["id"], keep="last")
    return deduplicated.reset_index(drop=True)
178 | 
179 | 
def filter_invalid_arxiv_id(document_df: pd.DataFrame):
    """Drop documents whose ``arxiv_id`` list is empty or contains an id that ARXIV_ID_PATTERN does not match.

    Prints how many rows were removed when any were.

    Returns:
        The surviving rows with a fresh 0-based index.
    """

    def is_valid_arxiv_id_list(arxiv_id_list):
        # An empty list is invalid; otherwise every entry must match.
        return bool(arxiv_id_list) and all(ARXIV_ID_PATTERN.match(arxiv_id) for arxiv_id in arxiv_id_list)

    keep = document_df["arxiv_id"].apply(is_valid_arxiv_id_list)
    kept_df = document_df[keep].reset_index(drop=True)

    dropped = len(document_df) - len(kept_df)
    if dropped > 0:
        print(f"Filtered out {dropped} documents with invalid arXiv IDs")

    return kept_df
196 | 
197 | 
def get_arxiv_stats(document_df: pd.DataFrame):
    """Aggregate document statistics per arXiv id.

    Each document row is expanded to one row per id in its ``arxiv_id`` list,
    then grouped by id.

    Returns:
        A DataFrame with columns ``arxiv_id``, ``score`` (sum),
        ``num_comments`` (sum), ``count`` (number of documents) and
        ``document_id`` (list of document ids), sorted descending by score,
        then comments, then count.
    """
    per_paper = document_df.explode("arxiv_id").groupby("arxiv_id")
    stats = per_paper.agg(
        score=("score", "sum"),
        num_comments=("num_comments", "sum"),
        count=("id", "count"),
        document_id=("id", pd.Series.to_list),
    )
    ranked = stats.sort_values(by=["score", "num_comments", "count"], ascending=False)
    return ranked.reset_index()
200 | 
201 | 
def arxiv_result_to_dict(r: arxiv.Result):
    """Convert an ``arxiv.Result`` into a flat dict of the fields used downstream.

    ``arxiv_id`` is the identifier without version and ``arxiv_id_v`` the
    identifier with its version suffix, both parsed from ``r.entry_id``. When
    the entry id has no version suffix, ``arxiv_id_v`` equals ``arxiv_id``.

    Raises:
        ValueError: If ``r.entry_id`` is not an arxiv.org abs/pdf URL
            recognised by ARXIV_URL_PATTERN.
    """
    m = ARXIV_URL_PATTERN.match(r.entry_id)
    if m is None:
        # Explicit check rather than assert, which is stripped under -O.
        raise ValueError(f"Unexpected arXiv entry_id: {r.entry_id!r}")
    arxiv_id = m.group(2)
    # The version group is optional in the pattern and is None when absent.
    arxiv_id_v = arxiv_id + (m.group(3) or "")
    return {
        "arxiv_id": arxiv_id,
        "arxiv_id_v": arxiv_id_v,
        "entry_id": r.entry_id,
        "updated": str(r.updated),  # TODO
        "published": str(r.published),  # TODO
        "title": r.title,
        "authors": [str(a) for a in r.authors],
        "summary": r.summary,
        "comment": r.comment,
        "journal_ref": r.journal_ref,
        "doi": r.doi,
        "primary_category": r.primary_category,
        "categories": [str(c) for c in r.categories],
        "links": [str(link) for link in r.links],
        "pdf_url": r.pdf_url,
    }
225 | 
226 | 
def get_arxiv_contents(id_list: list[str], chunk_size=100):
    """Look up arXiv metadata for ``id_list`` in batches of ``chunk_size`` ids.

    A batch whose lookup raises is logged and skipped; the remaining batches
    are still fetched.

    Returns:
        A DataFrame with one row per result (empty when nothing was fetched).
    """
    results = []
    for chunk_index, offset in enumerate(range(0, len(id_list), chunk_size)):
        chunk = id_list[offset : offset + chunk_size]
        try:
            search = arxiv.Search(id_list=chunk, max_results=len(chunk))
            fetched = list(search.results())
            results.extend(fetched)
            print("search_arxiv_contents: ", chunk_index, len(fetched), len(results))
        except Exception as e:
            print(e)
    return pd.json_normalize([arxiv_result_to_dict(result) for result in results])
242 | 
243 | 
def filter_df(df: pd.DataFrame, top_n=10, days=365, count=1, num_comments=0):
    """Keep the first ``top_n`` rows that meet the count, comment and recency thresholds.

    Args:
        top_n: Maximum number of rows returned.
        days: Rows whose ``published`` value sorts after the ``YYYY-MM-DD``
            date this many days ago (UTC) are kept.
        count: Minimum value of the ``count`` column.
        num_comments: Minimum value of the ``num_comments`` column.

    Returns:
        The surviving rows, in their original order, with a fresh index.
    """
    cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
    popular_enough = (df["count"] >= count) & (df["num_comments"] >= num_comments)
    recent_enough = df["published"] > cutoff
    return df[popular_enough & recent_enough].head(top_n).reset_index(drop=True)
249 | 
250 | 
def summarize(query, time_filter="month", days=30, limit=300):
    """Collect arXiv mentions from every source and join them with arXiv metadata.

    Reddit, Hacker News, Hugging Face and alphaXiv are searched in that order.
    A source that raises is printed and contributes an empty frame. The combined
    documents are sorted by ``score`` then ``num_comments`` (descending), passed
    through ``filter_invalid_arxiv_id``, aggregated with ``get_arxiv_stats`` and
    merged on ``arxiv_id`` with the result of ``get_arxiv_contents``.

    Returns:
        ``(paper_df, document_df)``: the merged per-paper frame and the filtered documents.
    """

    def fetch_or_empty(label, fetch):
        # Best effort: a failing source must not abort the whole run.
        try:
            print(f"{label}...")
            found = fetch()
            print(f"{label}...done: ", len(found))
            return found
        except Exception as e:
            print(e)
            return pd.json_normalize([])

    sources = [
        ("search_reddit", lambda: search_reddit(f"selftext:{query}", sort="top", time_filter=time_filter, limit=limit)),
        ("search_hackernews", lambda: search_hackernews(query, attribute="url", days=days, limit=limit)),
        ("search_huggingface", lambda: search_huggingface(days=days)),
        ("search_alphaxiv", lambda: search_alphaxiv(limit=limit)),
    ]
    frames = [fetch_or_empty(label, fetch) for label, fetch in sources]

    combined = pd.concat(frames, ignore_index=True)
    ranked = combined.sort_values(by=["score", "num_comments"], ascending=False).reset_index(drop=True)

    document_df = filter_invalid_arxiv_id(ranked)
    print("document_df: ", len(document_df))

    stats_df = get_arxiv_stats(document_df)
    print("stats_df: ", len(stats_df))

    contents_df = get_arxiv_contents(stats_df["arxiv_id"].tolist(), chunk_size=100)
    print("contents_df: ", len(contents_df))

    paper_df = pd.merge(stats_df, contents_df, on="arxiv_id")
    print("paper_df: ", len(paper_df))

    return paper_df, document_df
290 | 
291 | 
def translate_arxiv(dlc: deeplcache.DeepLCache, df: pd.DataFrame, target_lang: str):
    """Send every summary in ``df`` through ``dlc.translate_text`` and return ``dlc``.

    Each summary has its newlines replaced by spaces, is cut to 2000 characters
    and is split into sentences with pysbd before being passed to
    ``dlc.translate_text`` together with ``target_lang`` and the row's arXiv id.
    The cache size and ``dlc.translator.get_usage()`` are printed before and after.
    """
    segmenter = pysbd.Segmenter(language="en", clean=False)

    print("translate_arxiv: before: ", len(dlc.cache))
    print(dlc.translator.get_usage())

    for paper_id, abstract in zip(df["arxiv_id"], df["summary"]):
        single_line = abstract.replace("\n", " ")
        sentences = segmenter.segment(single_line[:2000])
        translated, ts = dlc.translate_text(sentences, target_lang, paper_id)
        # Character totals before and after translation, for the log only.
        print("translate_arxiv: ", paper_id, sum(map(len, sentences)), sum(map(len, translated)), ts)

    print("translate_arxiv: after: ", len(dlc.cache))
    print(dlc.translator.get_usage())
    return dlc
303 | 
304 | 
def main():
    """Run one job: collect popular papers, translate their summaries and post them.

    Credentials, the GCS bucket name, the Slack channel and ``NOTIFY_TOP_N`` are
    read from environment variables. The DeepL cache is loaded from and saved
    back to the bucket. Posting to Slack, Bluesky and Twitter is attempted in
    that order; a failure in one is printed and does not stop the others.
    """
    # settings
    query = "arxiv.org"
    summarize_time_filter = "month"  # or "week"
    summarize_days = 30  # should be 30 if "month"
    summarize_limit = 300
    filter_days = 30
    filter_count = 1
    filter_num_comments = 1
    deepl_target_lang = "JA"
    deepl_expire_days = 90
    deepl_cache_name = "deepl_cache.json.gz"
    notify_top_n = int(os.getenv("NOTIFY_TOP_N", 10))

    # prepare apis
    gcs_bucket = storage.Client().bucket(os.getenv("GCS_BUCKET_NAME"))
    deepl_api = deepl.Translator(os.getenv("DEEPL_AUTH_KEY"))  # type: ignore
    slack_api = slack_sdk.WebClient(os.getenv("SLACK_BOT_TOKEN"))
    slack_channel = os.getenv("SLACK_CHANNEL")
    # The same OAuth1 credentials feed both Twitter clients.
    twitter_oauth1 = {
        "consumer_key": os.getenv("TWITTER_API_KEY"),
        "consumer_secret": os.getenv("TWITTER_API_KEY_SECRET"),
        "access_token": os.getenv("TWITTER_ACCESS_TOKEN"),
        "access_token_secret": os.getenv("TWITTER_ACCESS_TOKEN_SECRET"),
    }
    tweepy_api_v2 = tweepy.Client(bearer_token=os.getenv("TWITTER_BEARER_TOKEN"), wait_on_rate_limit=True, **twitter_oauth1)
    # because media_upload is only available on api v1.
    tweepy_api_v1 = tweepy.API(tweepy.OAuth1UserHandler(**twitter_oauth1), wait_on_rate_limit=True)
    bluesky_api = nanoatp.BskyAgent()
    bluesky_api.login(os.getenv("ATP_IDENTIFIER"), os.getenv("ATP_PASSWORD"))  # type: ignore

    # search every source and measure popularity
    paper_df, document_df = summarize(query, time_filter=summarize_time_filter, days=summarize_days, limit=summarize_limit)

    # filter by days
    filtered_df = filter_df(paper_df, top_n=notify_top_n, days=filter_days, count=filter_count, num_comments=filter_num_comments)
    print("filtered_df: ", len(filtered_df))

    # translate summary text
    dlc = deeplcache.DeepLCache(deepl_api)
    try:
        dlc.load_from_gcs(gcs_bucket, deepl_cache_name)
    except Exception as e:
        # A failed cache load is only printed; translation continues with whatever dlc holds.
        print(e)
    dlc = translate_arxiv(dlc, filtered_df, deepl_target_lang)
    dlc.clear_cache(expire_timedelta=timedelta(days=deepl_expire_days))
    dlc.save_to_gcs(gcs_bucket, deepl_cache_name)

    # post
    posts = (
        lambda: postslack.post_to_slack(slack_api, slack_channel, dlc, filtered_df, document_df),
        lambda: postbluesky.post_to_bluesky(bluesky_api, dlc, filtered_df, document_df),
        lambda: posttwitter.post_to_twitter(tweepy_api_v1, tweepy_api_v2, dlc, filtered_df, document_df),
    )
    for post in posts:
        try:
            post()
        except Exception as e:
            print(e)
361 | 
362 | 
# Script entry point: run the job only when this file is executed directly.
if __name__ == "__main__":
    main()
365 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # arXiv Reddit Summary
  2 | 
Summarize the top 30 most popular arXiv papers on Reddit, Hacker News, Hugging Face and alphaXiv in the last 30 days, and post them to Slack, Twitter and Bluesky.
  4 | 
  5 | ## Demo
  6 | 
  7 | - https://x.com/susumuota
  8 | - https://bsky.app/profile/paper.bsky.social
  9 | 
 10 | ## Google Cloud Run
 11 | 
 12 | This system is running on Google Cloud Run jobs.
 13 | 
 14 | - https://cloud.google.com/build/docs/build-push-docker-image
 15 | - https://cloud.google.com/run/docs/create-jobs#command-line
 16 | - https://cloud.google.com/scheduler/docs/creating#gcloud
 17 | 
 18 | ## Create a project
 19 | 
 20 | - https://cloud.google.com/resource-manager/docs/creating-managing-projects#creating_a_project
 21 | 
 22 | ```sh
 23 | export PROJECT_ID="arxiv-summary-1"
 24 | gcloud projects create $PROJECT_ID
 25 | gcloud projects list
 26 | # gcloud projects delete $PROJECT_ID
 27 | # unset PROJECT_ID
 28 | ```
 29 | 
 30 | ## Enable billing
 31 | 
Follow these instructions. As far as I know, there is no way to enable billing from the command line.
 33 | 
 34 | - https://cloud.google.com/billing/docs/how-to/modify-project#how-to-enable-billing
 35 | - https://console.cloud.google.com/billing/projects
 36 | 
 37 | Then confirm it.
 38 | 
 39 | ```sh
 40 | gcloud beta billing projects describe $PROJECT_ID
 41 | ```
 42 | 
 43 | It should show `billingEnabled: true`.
 44 | 
 45 | ## Create a bucket
 46 | 
 47 | ```sh
 48 | export GCS_BUCKET_NAME="arxiv-summary"
 49 | export REGION="us-central1"
 50 | gcloud storage buckets create "gs://${GCS_BUCKET_NAME}" \
 51 |   --project=$PROJECT_ID \
 52 |   --location=$REGION \
 53 |   --public-access-prevention \
 54 |   --uniform-bucket-level-access
 55 | gcloud storage buckets list --project=$PROJECT_ID | grep name
 56 | # gcloud storage buckets delete "gs://${GCS_BUCKET_NAME}" --project=$PROJECT_ID
 57 | # unset GCS_BUCKET_NAME
 58 | ```
 59 | 
 60 | ## Build a Docker image on local machine and test it
 61 | 
 62 | - https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login
 63 | - https://stackoverflow.com/a/50826145
 64 | 
 65 | Application settings.
 66 | 
 67 | ```sh
 68 | # export TWITTER_BEARER_TOKEN="secret info"
 69 | # export TWITTER_API_KEY="secret info"
 70 | # export TWITTER_API_KEY_SECRET="secret info"
 71 | # export TWITTER_ACCESS_TOKEN="secret info"
 72 | # export TWITTER_ACCESS_TOKEN_SECRET="secret info"
 73 | # export DEEPL_AUTH_KEY="secret info"
 74 | # export SLACK_BOT_TOKEN="secret info"
 75 | # export praw_client_id="secret info"
 76 | # export praw_client_secret="secret info"
 77 | # export praw_user_agent="secret info"
 78 | # export ATP_IDENTIFIER="secret info"
 79 | # export ATP_PASSWORD="secret info"
 80 | 
 81 | export NOTIFY_TOP_N="30"      # 30 on production env
 82 | export SLACK_CHANNEL="#test"  # #anywhere on production env
 83 | ```
 84 | 
 85 | Local test.
 86 | 
 87 | ```sh
 88 | poetry export -f requirements.txt --without-hashes -o docker/requirements.txt
 89 | ```
 90 | 
 91 | ```sh
 92 | gcloud auth application-default login
 93 | cd docker
 94 | export IMAGE_NAME="arxiv-reddit-summary"
 95 | docker build -t $IMAGE_NAME .
 96 | docker run --rm \
 97 |   -e TWITTER_BEARER_TOKEN=$TWITTER_BEARER_TOKEN \
 98 |   -e TWITTER_API_KEY=$TWITTER_API_KEY \
 99 |   -e TWITTER_API_KEY_SECRET=$TWITTER_API_KEY_SECRET \
100 |   -e TWITTER_ACCESS_TOKEN=$TWITTER_ACCESS_TOKEN \
101 |   -e TWITTER_ACCESS_TOKEN_SECRET=$TWITTER_ACCESS_TOKEN_SECRET \
102 |   -e DEEPL_AUTH_KEY=$DEEPL_AUTH_KEY \
103 |   -e SLACK_BOT_TOKEN=$SLACK_BOT_TOKEN \
104 |   -e praw_client_id=$praw_client_id \
105 |   -e praw_client_secret=$praw_client_secret \
106 |   -e praw_user_agent=$praw_user_agent \
107 |   -e ATP_IDENTIFIER=$ATP_IDENTIFIER \
108 |   -e ATP_PASSWORD=$ATP_PASSWORD \
109 |   -e NOTIFY_TOP_N=$NOTIFY_TOP_N \
110 |   -e SLACK_CHANNEL=$SLACK_CHANNEL \
111 |   -e GCS_BUCKET_NAME=$GCS_BUCKET_NAME \
112 |   -e GCLOUD_PROJECT=$PROJECT_ID \
113 |   -v $HOME/.config/gcloud:/root/.config/gcloud \
114 |   $IMAGE_NAME
115 | docker images
116 | # docker rmi $IMAGE_NAME
117 | # unset IMAGE_NAME
118 | ```
119 | 
120 | ## Create a service account for Cloud Run
121 | 
122 | ```sh
123 | export RUN_SERVICE_ACCOUNT="run-sa"
124 | gcloud iam service-accounts create $RUN_SERVICE_ACCOUNT --project=$PROJECT_ID
125 | gcloud iam service-accounts list --project=$PROJECT_ID
126 | # gcloud iam service-accounts delete "${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" --project=$PROJECT_ID
127 | # unset RUN_SERVICE_ACCOUNT
128 | ```
129 | 
130 | ## Add roles to service account to access GCS and to invoke Cloud Run
131 | 
132 | - https://cloud.google.com/storage/docs/access-control/iam-roles
133 | - https://cloud.google.com/scheduler/docs/creating#gcloud
134 | - https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating
135 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#4
136 | 
137 | ```sh
138 | gcloud projects add-iam-policy-binding $PROJECT_ID \
139 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
140 |   --role="roles/storage.objectAdmin"
141 | gcloud projects add-iam-policy-binding $PROJECT_ID \
142 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
143 |   --role="roles/run.invoker"
144 | gcloud projects get-iam-policy $PROJECT_ID
145 | # gcloud projects remove-iam-policy-binding $PROJECT_ID \
146 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
147 | #   --role="roles/storage.objectAdmin"
148 | # gcloud projects remove-iam-policy-binding $PROJECT_ID \
149 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
150 | #   --role="roles/run.invoker"
151 | ```
152 | 
153 | ## Create secret data
154 | 
155 | - https://cloud.google.com/secret-manager/docs/create-secret#secretmanager-quickstart-gcloud
156 | - https://cloud.google.com/run/docs/configuring/secrets
157 | 
158 | ```sh
159 | gcloud services enable secretmanager.googleapis.com --project=$PROJECT_ID
160 | echo -n $TWITTER_BEARER_TOKEN | gcloud secrets create "TWITTER_BEARER_TOKEN" \
161 |   --project=$PROJECT_ID \
162 |   --replication-policy="automatic" \
163 |   --data-file=-
164 | echo -n $TWITTER_API_KEY | gcloud secrets create "TWITTER_API_KEY" \
165 |   --project=$PROJECT_ID \
166 |   --replication-policy="automatic" \
167 |   --data-file=-
168 | echo -n $TWITTER_API_KEY_SECRET | gcloud secrets create "TWITTER_API_KEY_SECRET" \
169 |   --project=$PROJECT_ID \
170 |   --replication-policy="automatic" \
171 |   --data-file=-
172 | echo -n $TWITTER_ACCESS_TOKEN | gcloud secrets create "TWITTER_ACCESS_TOKEN" \
173 |   --project=$PROJECT_ID \
174 |   --replication-policy="automatic" \
175 |   --data-file=-
176 | echo -n $TWITTER_ACCESS_TOKEN_SECRET | gcloud secrets create "TWITTER_ACCESS_TOKEN_SECRET" \
177 |   --project=$PROJECT_ID \
178 |   --replication-policy="automatic" \
179 |   --data-file=-
180 | echo -n $DEEPL_AUTH_KEY | gcloud secrets create "DEEPL_AUTH_KEY" \
181 |   --project=$PROJECT_ID \
182 |   --replication-policy="automatic" \
183 |   --data-file=-
184 | echo -n $SLACK_BOT_TOKEN | gcloud secrets create "SLACK_BOT_TOKEN" \
185 |   --project=$PROJECT_ID \
186 |   --replication-policy="automatic" \
187 |   --data-file=-
188 | echo -n $praw_client_id | gcloud secrets create "praw_client_id" \
189 |   --project=$PROJECT_ID \
190 |   --replication-policy="automatic" \
191 |   --data-file=-
192 | echo -n $praw_client_secret | gcloud secrets create "praw_client_secret" \
193 |   --project=$PROJECT_ID \
194 |   --replication-policy="automatic" \
195 |   --data-file=-
196 | echo -n $praw_user_agent | gcloud secrets create "praw_user_agent" \
197 |   --project=$PROJECT_ID \
198 |   --replication-policy="automatic" \
199 |   --data-file=-
200 | echo -n $ATP_IDENTIFIER | gcloud secrets create "ATP_IDENTIFIER" \
201 |   --project=$PROJECT_ID \
202 |   --replication-policy="automatic" \
203 |   --data-file=-
204 | echo -n $ATP_PASSWORD | gcloud secrets create "ATP_PASSWORD" \
205 |   --project=$PROJECT_ID \
206 |   --replication-policy="automatic" \
207 |   --data-file=-
208 | gcloud secrets list --project=$PROJECT_ID
209 | gcloud secrets versions access 1 --secret="TWITTER_BEARER_TOKEN" --project=$PROJECT_ID
210 | gcloud secrets versions access 1 --secret="TWITTER_API_KEY" --project=$PROJECT_ID
211 | gcloud secrets versions access 1 --secret="TWITTER_API_KEY_SECRET" --project=$PROJECT_ID
212 | gcloud secrets versions access 1 --secret="TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID
213 | gcloud secrets versions access 1 --secret="TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID
214 | gcloud secrets versions access 1 --secret="DEEPL_AUTH_KEY" --project=$PROJECT_ID
215 | gcloud secrets versions access 1 --secret="SLACK_BOT_TOKEN" --project=$PROJECT_ID
216 | gcloud secrets versions access 1 --secret="praw_client_id" --project=$PROJECT_ID
217 | gcloud secrets versions access 1 --secret="praw_client_secret" --project=$PROJECT_ID
218 | gcloud secrets versions access 1 --secret="praw_user_agent" --project=$PROJECT_ID
219 | gcloud secrets versions access 1 --secret="ATP_IDENTIFIER" --project=$PROJECT_ID
220 | gcloud secrets versions access 1 --secret="ATP_PASSWORD" --project=$PROJECT_ID
221 | # gcloud secrets delete "TWITTER_BEARER_TOKEN" --project=$PROJECT_ID
222 | # gcloud secrets delete "TWITTER_API_KEY" --project=$PROJECT_ID
223 | # gcloud secrets delete "TWITTER_API_KEY_SECRET" --project=$PROJECT_ID
224 | # gcloud secrets delete "TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID
225 | # gcloud secrets delete "TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID
226 | # gcloud secrets delete "DEEPL_AUTH_KEY" --project=$PROJECT_ID
227 | # gcloud secrets delete "SLACK_BOT_TOKEN" --project=$PROJECT_ID
228 | # gcloud secrets delete "praw_client_id" --project=$PROJECT_ID
229 | # gcloud secrets delete "praw_client_secret" --project=$PROJECT_ID
230 | # gcloud secrets delete "praw_user_agent" --project=$PROJECT_ID
231 | # gcloud secrets delete "ATP_IDENTIFIER" --project=$PROJECT_ID
232 | # gcloud secrets delete "ATP_PASSWORD" --project=$PROJECT_ID
233 | # gcloud services disable secretmanager.googleapis.com --project=$PROJECT_ID
234 | ```
235 | 
236 | ## Add roles to secrets to be accessed by service account
237 | 
238 | - https://cloud.google.com/secret-manager/docs/managing-secrets#secretmanager-create-secret-gcloud
239 | 
240 | ```sh
241 | gcloud secrets add-iam-policy-binding "TWITTER_BEARER_TOKEN" \
242 |   --project=$PROJECT_ID \
243 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
244 |   --role="roles/secretmanager.secretAccessor"
245 | gcloud secrets add-iam-policy-binding "TWITTER_API_KEY" \
246 |   --project=$PROJECT_ID \
247 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
248 |   --role="roles/secretmanager.secretAccessor"
249 | gcloud secrets add-iam-policy-binding "TWITTER_API_KEY_SECRET" \
250 |   --project=$PROJECT_ID \
251 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
252 |   --role="roles/secretmanager.secretAccessor"
253 | gcloud secrets add-iam-policy-binding "TWITTER_ACCESS_TOKEN" \
254 |   --project=$PROJECT_ID \
255 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
256 |   --role="roles/secretmanager.secretAccessor"
257 | gcloud secrets add-iam-policy-binding "TWITTER_ACCESS_TOKEN_SECRET" \
258 |   --project=$PROJECT_ID \
259 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
260 |   --role="roles/secretmanager.secretAccessor"
261 | gcloud secrets add-iam-policy-binding "DEEPL_AUTH_KEY" \
262 |   --project=$PROJECT_ID \
263 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
264 |   --role="roles/secretmanager.secretAccessor"
265 | gcloud secrets add-iam-policy-binding "SLACK_BOT_TOKEN" \
266 |   --project=$PROJECT_ID \
267 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
268 |   --role="roles/secretmanager.secretAccessor"
269 | gcloud secrets add-iam-policy-binding "praw_client_id" \
270 |   --project=$PROJECT_ID \
271 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
272 |   --role="roles/secretmanager.secretAccessor"
273 | gcloud secrets add-iam-policy-binding "praw_client_secret" \
274 |   --project=$PROJECT_ID \
275 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
276 |   --role="roles/secretmanager.secretAccessor"
277 | gcloud secrets add-iam-policy-binding "praw_user_agent" \
278 |   --project=$PROJECT_ID \
279 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
280 |   --role="roles/secretmanager.secretAccessor"
281 | gcloud secrets add-iam-policy-binding "ATP_IDENTIFIER" \
282 |   --project=$PROJECT_ID \
283 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
284 |   --role="roles/secretmanager.secretAccessor"
285 | gcloud secrets add-iam-policy-binding "ATP_PASSWORD" \
286 |   --project=$PROJECT_ID \
287 |   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
288 |   --role="roles/secretmanager.secretAccessor"
289 | gcloud secrets get-iam-policy "TWITTER_BEARER_TOKEN" --project=$PROJECT_ID
290 | gcloud secrets get-iam-policy "TWITTER_API_KEY" --project=$PROJECT_ID
291 | gcloud secrets get-iam-policy "TWITTER_API_KEY_SECRET" --project=$PROJECT_ID
292 | gcloud secrets get-iam-policy "TWITTER_ACCESS_TOKEN" --project=$PROJECT_ID
293 | gcloud secrets get-iam-policy "TWITTER_ACCESS_TOKEN_SECRET" --project=$PROJECT_ID
294 | gcloud secrets get-iam-policy "DEEPL_AUTH_KEY" --project=$PROJECT_ID
295 | gcloud secrets get-iam-policy "SLACK_BOT_TOKEN" --project=$PROJECT_ID
296 | gcloud secrets get-iam-policy "praw_client_id" --project=$PROJECT_ID
297 | gcloud secrets get-iam-policy "praw_client_secret" --project=$PROJECT_ID
298 | gcloud secrets get-iam-policy "praw_user_agent" --project=$PROJECT_ID
299 | gcloud secrets get-iam-policy "ATP_IDENTIFIER" --project=$PROJECT_ID
300 | gcloud secrets get-iam-policy "ATP_PASSWORD" --project=$PROJECT_ID
301 | # gcloud secrets remove-iam-policy-binding "TWITTER_BEARER_TOKEN" \
302 | #   --project=$PROJECT_ID \
303 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
304 | #   --role="roles/secretmanager.secretAccessor"
305 | # gcloud secrets remove-iam-policy-binding "TWITTER_API_KEY" \
306 | #   --project=$PROJECT_ID \
307 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
308 | #   --role="roles/secretmanager.secretAccessor"
309 | # gcloud secrets remove-iam-policy-binding "TWITTER_API_KEY_SECRET" \
310 | #   --project=$PROJECT_ID \
311 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
312 | #   --role="roles/secretmanager.secretAccessor"
313 | # gcloud secrets remove-iam-policy-binding "TWITTER_ACCESS_TOKEN" \
314 | #   --project=$PROJECT_ID \
315 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
316 | #   --role="roles/secretmanager.secretAccessor"
317 | # gcloud secrets remove-iam-policy-binding "TWITTER_ACCESS_TOKEN_SECRET" \
318 | #   --project=$PROJECT_ID \
319 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
320 | #   --role="roles/secretmanager.secretAccessor"
321 | # gcloud secrets remove-iam-policy-binding "DEEPL_AUTH_KEY" \
322 | #   --project=$PROJECT_ID \
323 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
324 | #   --role="roles/secretmanager.secretAccessor"
325 | # gcloud secrets remove-iam-policy-binding "SLACK_BOT_TOKEN" \
326 | #   --project=$PROJECT_ID \
327 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
328 | #   --role="roles/secretmanager.secretAccessor"
329 | # gcloud secrets remove-iam-policy-binding "praw_client_id" \
330 | #   --project=$PROJECT_ID \
331 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
332 | #   --role="roles/secretmanager.secretAccessor"
333 | # gcloud secrets remove-iam-policy-binding "praw_client_secret" \
334 | #   --project=$PROJECT_ID \
335 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
336 | #   --role="roles/secretmanager.secretAccessor"
337 | # gcloud secrets remove-iam-policy-binding "praw_user_agent" \
338 | #   --project=$PROJECT_ID \
339 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
340 | #   --role="roles/secretmanager.secretAccessor"
341 | # gcloud secrets remove-iam-policy-binding "ATP_IDENTIFIER" \
342 | #   --project=$PROJECT_ID \
343 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
344 | #   --role="roles/secretmanager.secretAccessor"
345 | # gcloud secrets remove-iam-policy-binding "ATP_PASSWORD" \
346 | #   --project=$PROJECT_ID \
347 | #   --member="serviceAccount:${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
348 | #   --role="roles/secretmanager.secretAccessor"
349 | ```
350 | 
351 | ## Create a Docker repository
352 | 
353 | - https://cloud.google.com/build/docs/build-push-docker-image
354 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#3
355 | 
356 | ```sh
357 | gcloud services enable artifactregistry.googleapis.com --project=$PROJECT_ID
358 | export REPOSITORY="arxiv-reddit-summary"
359 | gcloud artifacts repositories create $REPOSITORY \
360 |   --project=$PROJECT_ID \
361 |   --repository-format="docker" \
362 |   --location=$REGION
363 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
364 | # gcloud artifacts repositories delete $REPOSITORY --project=$PROJECT_ID --location=$REGION
365 | # gcloud services disable artifactregistry.googleapis.com --project=$PROJECT_ID
366 | # unset REPOSITORY REGION
367 | ```
368 | 
369 | ## Build a Docker image
370 | 
371 | - https://cloud.google.com/build/docs/build-push-docker-image
372 | - https://cloud.google.com/build/docs/building/build-containers#use-dockerfile
373 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#3
374 | 
375 | ```sh
376 | gcloud services enable cloudbuild.googleapis.com --project=$PROJECT_ID
377 | export TAG_NAME="latest"
378 | gcloud builds submit \
379 |   --project=$PROJECT_ID \
380 |   --region=$REGION \
381 |   --tag="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
382 | gcloud builds list --project=$PROJECT_ID --region=$REGION
383 | gcloud artifacts repositories list --project=$PROJECT_ID --location=$REGION
384 | # gcloud services disable cloudbuild.googleapis.com --project=$PROJECT_ID
385 | # unset TAG_NAME
386 | ```
387 | 
388 | ## Test a Docker image on local machine
389 | 
**This process may incur additional charges because of data transfer.**
391 | 
392 | - https://cloud.google.com/build/docs/building/build-containers#run_the_docker_image
393 | - https://cloud.google.com/artifact-registry/pricing
394 | - https://support.terra.bio/hc/en-us/articles/4408985788187-How-to-configure-GCR-Artifact-Registry-to-prevent-egress-charges
395 | 
396 | ```sh
397 | gcloud auth configure-docker ${REGION}-docker.pkg.dev
398 | docker run --rm \
399 |   -e TWITTER_BEARER_TOKEN=$TWITTER_BEARER_TOKEN \
400 |   -e TWITTER_API_KEY=$TWITTER_API_KEY \
401 |   -e TWITTER_API_KEY_SECRET=$TWITTER_API_KEY_SECRET \
402 |   -e TWITTER_ACCESS_TOKEN=$TWITTER_ACCESS_TOKEN \
403 |   -e TWITTER_ACCESS_TOKEN_SECRET=$TWITTER_ACCESS_TOKEN_SECRET \
404 |   -e DEEPL_AUTH_KEY=$DEEPL_AUTH_KEY \
405 |   -e SLACK_BOT_TOKEN=$SLACK_BOT_TOKEN \
406 |   -e praw_client_id=$praw_client_id \
407 |   -e praw_client_secret=$praw_client_secret \
408 |   -e praw_user_agent=$praw_user_agent \
409 |   -e ATP_IDENTIFIER=$ATP_IDENTIFIER \
410 |   -e ATP_PASSWORD=$ATP_PASSWORD \
411 |   -e NOTIFY_TOP_N=$NOTIFY_TOP_N \
412 |   -e SLACK_CHANNEL=$SLACK_CHANNEL \
413 |   -e GCS_BUCKET_NAME=$GCS_BUCKET_NAME \
414 |   -e GCLOUD_PROJECT=$PROJECT_ID \
415 |   -v $HOME/.config/gcloud:/root/.config/gcloud \
416 |   "${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
417 | docker images
418 | # docker rmi "${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}"
419 | ```
420 | 
421 | ## Create a Cloud Run job
422 | 
423 | - https://cloud.google.com/run/docs/create-jobs#command-line
424 | 
425 | Change parameters for production env.
426 | 
427 | ```sh
428 | export NOTIFY_TOP_N="30"          # 10 on development env
429 | export SLACK_CHANNEL="#test"      # #test on development env
430 | ```
431 | 
432 | ```sh
433 | gcloud services enable run.googleapis.com --project=$PROJECT_ID
434 | export RUN_JOB_NAME="arxiv-reddit-summary-job-1"
435 | gcloud beta run jobs create $RUN_JOB_NAME \
436 |   --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/${IMAGE_NAME}:${TAG_NAME}" \
437 |   --project=$PROJECT_ID \
438 |   --region=$REGION \
439 |   --service-account="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com" \
440 |   --set-secrets="TWITTER_BEARER_TOKEN=TWITTER_BEARER_TOKEN:1" \
441 |   --set-secrets="TWITTER_API_KEY=TWITTER_API_KEY:1" \
442 |   --set-secrets="TWITTER_API_KEY_SECRET=TWITTER_API_KEY_SECRET:1" \
443 |   --set-secrets="TWITTER_ACCESS_TOKEN=TWITTER_ACCESS_TOKEN:1" \
444 |   --set-secrets="TWITTER_ACCESS_TOKEN_SECRET=TWITTER_ACCESS_TOKEN_SECRET:1" \
445 |   --set-secrets="DEEPL_AUTH_KEY=DEEPL_AUTH_KEY:1" \
446 |   --set-secrets="SLACK_BOT_TOKEN=SLACK_BOT_TOKEN:1" \
447 |   --set-secrets="praw_client_id=praw_client_id:1" \
448 |   --set-secrets="praw_client_secret=praw_client_secret:1" \
449 |   --set-secrets="praw_user_agent=praw_user_agent:1" \
450 |   --set-secrets="ATP_IDENTIFIER=ATP_IDENTIFIER:1" \
451 |   --set-secrets="ATP_PASSWORD=ATP_PASSWORD:1" \
452 |   --set-env-vars="NOTIFY_TOP_N=${NOTIFY_TOP_N}" \
453 |   --set-env-vars="SLACK_CHANNEL=${SLACK_CHANNEL}" \
454 |   --set-env-vars="GCS_BUCKET_NAME=${GCS_BUCKET_NAME}" \
455 |   --max-retries=0 \
456 |   --task-timeout="30m" \
457 |   --memory="1024Mi"
458 | gcloud beta run jobs list --project=$PROJECT_ID
459 | gcloud beta run jobs describe $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION
460 | # gcloud beta run jobs delete $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION
461 | # gcloud services disable run.googleapis.com --project=$PROJECT_ID
462 | # unset RUN_JOB_NAME
463 | ```
464 | 
465 | ## Execute a job
466 | 
467 | - https://cloud.google.com/run/docs/execute/jobs
468 | 
469 | ```sh
470 | gcloud beta run jobs execute $RUN_JOB_NAME --project=$PROJECT_ID --region=$REGION
471 | gcloud beta run jobs executions list --project=$PROJECT_ID --region=$REGION
472 | ```
473 | 
474 | ```sh
475 | gcloud logging read "resource.type=cloud_run_job" \
476 |   --project=$PROJECT_ID \
477 |   --limit 10 | egrep "textPayload|message"
478 | ```
479 | 
480 | ## Create a Cloud Scheduler job
481 | 
482 | - https://cloud.google.com/run/docs/execute/jobs-on-schedule#command-line
483 | - https://codelabs.developers.google.com/cloud-run-jobs-and-cloud-scheduler#4
484 | 
485 | ```sh
486 | export SCHEDULER_JOB_NAME="arxiv-reddit-summary-job-everyday-9am"
487 | gcloud services enable cloudscheduler.googleapis.com --project=$PROJECT_ID
488 | gcloud scheduler jobs create http $SCHEDULER_JOB_NAME \
489 |   --project=$PROJECT_ID \
490 |   --location=$REGION \
491 |   --schedule="0 9 * * *" \
492 |   --time-zone "Asia/Tokyo" \
493 |   --uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/${RUN_JOB_NAME}:run" \
494 |   --http-method="POST" \
495 |   --oauth-service-account-email="${RUN_SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com"
496 | gcloud scheduler jobs list --project=$PROJECT_ID --location=$REGION
497 | gcloud scheduler jobs describe $SCHEDULER_JOB_NAME --project=$PROJECT_ID --location=$REGION
498 | # gcloud scheduler jobs delete $SCHEDULER_JOB_NAME --project=$PROJECT_ID --location=$REGION
499 | # gcloud services disable cloudscheduler.googleapis.com --project=$PROJECT_ID
500 | # unset SCHEDULER_JOB_NAME
501 | ```
502 | 
503 | ```sh
504 | gcloud logging read "resource.type=cloud_run_job OR resource.type=cloud_scheduler_job" \
505 |   --project=$PROJECT_ID \
506 |   --limit 10 | egrep "textPayload|message"
507 | ```
508 | 
509 | ## License
510 | 
MIT License. See the LICENSE file.
512 | 
513 | ## Author
514 | 
515 | Susumu OTA
516 | 


--------------------------------------------------------------------------------