├── LICENSE
├── Mastodon Export to Mastodon.ipynb
├── README.md
├── mod_mastodon.sh
├── sample-config.json
└── tweetjs-to-mastodon.ipynb

/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Luca Hammer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Mastodon Export to Mastodon.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Fediporter\n",
  8 |     "Jupyter notebook to migrate content like Tweets or Mastodon posts to Mastodon.\n",
  9 |     "\n",
 10 |     "Only works with instances that are patched to allow backdated posts through the API. More info: https://github.com/lucahammer/fediporter"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {
 17 |     "ExecuteTime": {
 18 |      "end_time": "2022-12-23T10:05:14.110272Z",
 19 |      "start_time": "2022-12-23T10:05:14.107618Z"
 20 |     }
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "import requests\n",
 25 |     "from bs4 import BeautifulSoup\n",
 26 |     "from tqdm.notebook import tqdm\n",
 27 |     "import json"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 163,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "name": "stdout",
 37 |      "output_type": "stream",
 38 |      "text": [
 39 |       "{\"name\":\"Imported from det.social/@Luca\",\"website\":\"https://det.social/@luca\",\"vapid_key\":\"uXL829wxgH9wYavQxkARcWv6fiNUZ94B\"}\n"
 40 |      ]
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "with open('config.json', 'r') as f:\n",
 45 |     "    config = json.loads(f.read())\n",
 46 |     "\n",
 47 |     "API_BASE_URL = config['mastodon_url']\n",
 48 |     "MASTODON_BEARER = config['mastodon_bearer']\n",
 49 |     "HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 50 |     "\n",
 51 |     "# Test Mastodon bearer token\n",
 52 |     "url = f\"{API_BASE_URL}/api/v1/apps/verify_credentials\"\n",
 53 |     "r = requests.get(url, headers=HEADERS)\n",
 54 |     "print(r.text)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 5,
 60 |    "metadata": {
 61 |     "ExecuteTime": {
 62 |      "end_time": "2022-12-23T11:57:44.037582Z",
 63 |      "start_time": "2022-12-23T11:57:43.929260Z"
 64 |     }
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "def post_status(data):\n",
 69 |     "    HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 70 |     "    url = f\"{API_BASE_URL}/api/v1/statuses\"\n",
 71 |     "    r = requests.post(url, \n",
 72 |     "                      data=data, \n",
 73 |     "                      headers=HEADERS)\n",
 74 |     "    return r.json()\n",
 75 |     "\n",
 76 |     "def load_mastodon_posts(file):\n",
 77 |     "    with open(file, 'r') as f:\n",
 78 |     "        posts = json.loads(f.read())['orderedItems']\n",
 79 |     "    return(posts)\n",
 80 |     "\n",
 81 |     "def fix_text(text, tags):\n",
 82 |     "    text = text.replace('</p><p>', '++breakme++</p><p>') #add placeholder to replace with linebreaks to keep paragraphs (it's weird)\n",
 83 |     "    text = BeautifulSoup(text, 'html.parser').get_text()\n",
 84 |     "    tags = [tag for tag in tags if tag['type'] == 'Mention']\n",
 85 |     "    for tag in tags:\n",
 86 |     "        name = f\"@{tag['name'].split('@')[1]} \"\n",
 87 |     "        text = text.replace(name,f\"{tag['name']} \")\n",
 88 |     "    text = text.replace('++breakme++', '\\n\\n')\n",
 89 |     "    return (text)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 6,
 95 |    "metadata": {},
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "posts = load_mastodon_posts('detsocial/outbox.json')"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 258,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "masto_dict = {}"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 262,
113 |    "metadata": {},
114 |    "outputs": [
115 |     {
116 |      "data": {
117 |       "application/vnd.jupyter.widget-view+json": {
118 |        "model_id": "10d15159e0b140d7a6289e8f773fbb89",
119 |        "version_major": 2,
120 |        "version_minor": 0
121 |       },
122 |       "text/plain": [
123 |        "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=574.0), HTML(value='')))"
124 |       ]
125 |      },
126 |      "metadata": {},
127 |      "output_type": "display_data"
128 |     },
129 |     {
130 |      "name": "stdout",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "for post in tqdm(posts):\n",
139 |     "    if 'contentMap' in post['object'] and post['id'] not in masto_dict and (('inReplyTo' in post and '/luca/' in post['inReplyTo']) or 'inReplyTo' not in post):\n",
140 |     "        if 'en' in post['object']['contentMap']:\n",
141 |     "            toot = {\n",
142 |     "                'status': fix_text(post['object']['contentMap']['en'], post['object']['tag']),\n",
143 |     "                'language': 'en'}\n",
144 |     "        if 'de' in post['object']['contentMap']:\n",
145 |     "            toot = {\n",
146 |     "                'status': fix_text(post['object']['contentMap']['de'], post['object']['tag']),\n",
147 |     "                'language': 'de'}\n",
148 |     "        toot['created_at'] = post['object']['published']\n",
149 |     "        if post['to'][0].endswith('/followers') and len(post['cc']) > 0 and post['cc'][0].endswith('#Public'):\n",
150 |     "            toot['visibility'] = 'unlisted'\n",
151 |     "        elif post['to'][0].endswith('#Public'):\n",
152 |     "            toot['visibility'] = 'public'\n",
153 |     "        elif post['to'][0].endswith('/followers') and len(post['cc']) > 0 and not post['cc'][0].endswith('#Public'):\n",
154 |     "            toot['visibility'] = 'private'\n",
155 |     "        else:\n",
156 |     "            toot['visibility'] = 'direct'\n",
157 |     "    \n",
158 |     "        if 'attachment' in post['object'] and len(post['object']['attachment']) > 0:\n",
159 |     "            #print(post['object']['attachment'])\n",
160 |     "            media_ids = []\n",
161 |     "            for attachment in post['object']['attachment']: \n",
162 |     "                image_path = f\"detsocial/home/det/live/public/system/{attachment['url']}\"\n",
163 |     "                file = open(image_path, 'rb')\n",
164 |     "                data = file.read()\n",
165 |     "                url = f\"{API_BASE_URL}/api/v2/media\"\n",
166 |     "                files={'file': (image_path, data, 'application/octet-stream')}\n",
167 |     "                if 'name' in attachment:\n",
168 |     "                    values = {'description' : attachment['name']}\n",
169 |     "                    r = requests.post(url, files=files, data=values, headers=HEADERS)\n",
170 |     "                else:\n",
171 |     "                    r = requests.post(url, files=files, headers=HEADERS)\n",
172 |     "                json_data = r.json()\n",
173 |     "                media_ids.append(json_data['id'])\n",
174 |     "            toot['media_ids[]'] = media_ids\n",
175 |     "    \n",
176 |     "        if 'inReplyTo' in post['object'] and post['object']['inReplyTo'] is not None and '/luca/' in post['object']['inReplyTo']:\n",
177 |     "            # if post is part of a thread, get ID of previous post\n",
178 |     "            try:\n",
179 |     "                toot['in_reply_to_id'] = masto_dict.get(post['object']['inReplyTo'])\n",
180 |     "            except:\n",
181 |     "                print(post)\n",
182 |     "\n",
183 |     "        #print(toot) \n",
184 |     "        posted = post_status(toot)\n",
185 |     "        #print(posted)\n",
186 |     "    \n",
187 |     "        masto_dict[post['object']['id']] = posted['id']\n",
188 |     "    else:\n",
189 |     "        pass\n",
190 |     "        #print('skipped')\n",
191 |     "    "
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 280,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "# Manually adding new statuses\n",
201 |     "\n",
202 |     "toot = {}\n",
203 |     "media_ids = []\n",
204 |     "\n",
205 |     "toot = {\n",
206 |     "    'status': '''xxx''',\n",
207 |     "    'language': 'en'}\n",
208 |     "\n",
209 |     "image_path = f\"xxx.png\"\n",
210 |     "file = open(image_path, 'rb')\n",
211 |     "data = file.read()\n",
212 |     "url = f\"{API_BASE_URL}/api/v2/media\"\n",
213 |     "files={'file': (image_path, data, 'application/octet-stream')}\n",
214 |     "r = requests.post(url, files=files, data={'description' : '''xxx'''}, headers=HEADERS)\n",
215 |     "json_data = r.json()\n",
216 |     "media_ids.append(json_data['id'])\n",
217 |     "toot['media_ids[]'] = media_ids\n",
218 |     "toot['in_reply_to_id'] = 'xxx'\n",
219 |     "toot['created_at'] = \"2022-12-25T20:21:00Z\""
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "post_status(toot)"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "# todo\n",
236 |     "- content warnings?\n",
237 |     "- URLs?"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 257,
243 |    "metadata": {},
244 |    "outputs": [
245 |     {
246 |      "data": {
247 |       "application/vnd.jupyter.widget-view+json": {
248 |        "model_id": "a8edcfd337e549e19b1533a1ec4f4a48",
249 |        "version_major": 2,
250 |        "version_minor": 0
251 |       },
252 |       "text/plain": [
253 |        "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))"
254 |       ]
255 |      },
256 |      "metadata": {},
257 |      "output_type": "display_data"
258 |     },
259 |     {
260 |      "name": "stdout",
261 |      "output_type": "stream",
262 |      "text": [
263 |       "\n"
264 |      ]
265 |     }
266 |    ],
267 |    "source": [
268 |     "# delete the posted posts because something went wrong\n",
269 |     "\n",
270 |     "def delete_status(status_id):\n",
271 |     "    HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
272 |     "    url = f\"{API_BASE_URL}/api/v1/statuses/{status_id}\"\n",
273 |     "    r = requests.delete(url, \n",
274 |     "                        headers=HEADERS)\n",
275 |     "    return r.json()\n",
276 |     "\n",
277 |     "for thing in tqdm(masto_dict.values()):\n",
278 |     "    x = (delete_status(thing))"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": []
287 |   }
288 |  ],
289 |  "metadata": {
290 |   "kernelspec": {
291 |    "display_name": "Python 3",
292 |    "language": "python",
293 |    "name": "python3"
294 |   },
295 |   "language_info": {
296 |    "codemirror_mode": {
297 |     "name": "ipython",
298 |     "version": 3
299 |    },
300 |    "file_extension": ".py",
301 |    "mimetype": "text/x-python",
302 |    "name": "python",
303 |    "nbconvert_exporter": "python",
304 |    "pygments_lexer": "ipython3",
305 |    "version": "3.7.8"
306 |   }
307 |  },
308 |  "nbformat": 4,
309 |  "nbformat_minor": 4
310 | }
311 | 
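Before running either notebook, it is worth confirming that the created_at patch described in the README below is actually active on your instance. A minimal sanity check could look like the following sketch (assuming a config.json laid out like sample-config.json; the test text and date are made up):

```python
import json
import requests

# Assumption: config.json has the same layout as sample-config.json.
with open('config.json', 'r') as f:
    config = json.loads(f.read())

# Post a backdated status; 'direct' visibility keeps the test out of timelines.
r = requests.post(
    f"{config['mastodon_url']}/api/v1/statuses",
    data={
        'status': 'backdating test, please ignore',
        'visibility': 'direct',
        'created_at': '2020-01-01T12:00:00Z',
    },
    headers={'Authorization': f"Bearer {config['mastodon_bearer']}"},
)

# A patched instance echoes the requested date back;
# an unpatched instance returns the current time instead.
print(r.json()['created_at'])
```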
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | > **_NOTE:_** Kevin Payravi improved the code and converted it to a regular Python script. You can find it here: https://github.com/KevinPayravi/twitter-archive-to-mastodon. Some fixes were pulled into the notebook.
  2 | 
  3 | 
  4 | # fediporter
  5 | Notes on importing content to Mastodon. Very early version. Pull requests welcome. (It's unlikely that I will work on this for long.)
  6 | 
  7 | Where it started: ~~https://det.social/@luca/109559157244375603~~ https://social.luca.run/@luca/109559157272902502
  8 | 
  9 | I run a personal Mastodon instance and wanted to import my Tweets without spamming my followers, the local timeline or other instances. In this repo you will find what I came up with. It's not meant for end users but for admins who are okay with losing all their data.
 10 | 
 11 | # Modding Mastodon
 12 | Mastodon does not allow the import of posts/toots/statuses, but as admins of our own instance, we can adapt it to our needs.
 13 | 
 14 | I decided to add a created_at parameter to the API to allow the creation of backdated posts. Two files need to be changed. My instance uses the linuxserver docker image (https://docs.linuxserver.io/images/docker-mastodon) on an Unraid server. The mod_mastodon.sh script is adapted to that setup and can be executed from the Unraid console. It changes two files: app/controllers/api/v1/statuses_controller.rb and app/services/post_status_service.rb. After changing those files, you need to restart your server. If you use a different environment, you probably need to change the paths.
 15 | 
 16 | app/controllers/api/v1/statuses_controller.rb
 17 | ```
 18 | def create
 19 |   @status = PostStatusService.new.call(
 20 |     current_user.account,
 21 |     text: status_params[:status],
 22 |     thread: @thread,
 23 |     media_ids: status_params[:media_ids],
 24 |     sensitive: status_params[:sensitive],
 25 |     spoiler_text: status_params[:spoiler_text],
 26 |     visibility: status_params[:visibility],
 27 |     language: status_params[:language],
 28 |     scheduled_at: status_params[:scheduled_at],
 29 |     created_at: status_params[:created_at],
 30 |     application: doorkeeper_token.application,
 31 |     poll: status_params[:poll],
 32 |     idempotency: request.headers['Idempotency-Key'],
 33 |     with_rate_limit: true
 34 |   )
 35 | ```
 36 | 
 37 | ```
 38 | def status_params
 39 |   params.permit(
 40 |     :status,
 41 |     :in_reply_to_id,
 42 |     :sensitive, :spoiler_text, :visibility,
 43 |     :language,
 44 |     :scheduled_at,
 45 |     :created_at,
 46 |     media_ids: [],
 47 |     poll: [
 48 |       :multiple,
 49 |       :hide_totals,
 50 |       :expires_in,
 51 |       options: [],
 52 |     ]
 53 |   )
 54 | end
 55 | ```
 56 | 
 57 | To not spam anyone, posts that carry a created_at parameter won't be pushed to other servers. That's what the changes to post_status_service.rb are for.
 58 | 
 59 | app/services/post_status_service.rb
 60 | ```
 61 | def postprocess_status!
 62 |   Trends.tags.register(@status)
 63 |   LinkCrawlWorker.perform_async(@status.id)
 64 |   if not @options[:created_at]
 65 |     DistributionWorker.perform_async(@status.id)
 66 |     ActivityPub::DistributionWorker.perform_async(@status.id)
 67 |   end
 68 |   PollExpirationNotifyWorker.perform_at(@status.poll.expires_at, @status.poll.id) if @status.poll
 69 | end
 70 | ```
 71 | ```
 72 | def status_attributes
 73 |   {
 74 |     text: @text,
 75 |     created_at: @options[:created_at],
 76 |     media_attachments: @media || [],
 77 |     ordered_media_attachment_ids: (@options[:media_ids] || []).map(&:to_i) & @media.map(&:id),
 78 |     thread: @in_reply_to,
 79 |     poll_attributes: poll_attributes,
 80 |     sensitive: @sensitive,
 81 |     spoiler_text: @options[:spoiler_text] || '',
 82 |     visibility: @visibility,
 83 |     language: valid_locale_cascade(@options[:language], @account.user&.preferred_posting_language, I18n.default_locale),
 84 |     application: @options[:application],
 85 |     rate_limit: @options[:with_rate_limit],
 86 |   }.compact
 87 | end
 88 | ```
 89 | 
 90 | # Adding content through the API
 91 | Now that the API supports a created_at parameter, we can import any content through the API. I am most familiar with Python, so I went with that. The first content I tried was my Twitter archive. That's what tweetjs-to-mastodon.ipynb does (a Jupyter Notebook that I am currently too lazy to clean up, so only fragments).
 92 | 
 93 | Already works:
 94 | - Replace @username with @username@twitter.com
 95 | - Upload media (can use higher resolution media from https://github.com/timhutton/twitter-archive-parser)
 96 | - Threads are recreated as threads (very fragile because post IDs are only stored in a variable)
 97 | - t.co URLs are replaced with the expanded versions
 98 | 
 99 | Planned:
100 | - Retweets (currently, they are skipped because they are often truncated and that would be silly)
101 | - Alt texts (they aren't included in the Twitter archive and we will need to retrieve them from the API or website)
102 | - Making import of replies optional
103 | - Edited Tweets (should all versions be imported?)
104 | 
--------------------------------------------------------------------------------
/mod_mastodon.sh:
--------------------------------------------------------------------------------
 1 | # Be careful. This only works with the linuxserver docker image on Unraid.
 2 | # It changes Mastodon files and those changes will be gone with the next update of the container.
 3 | # It should not be run multiple times because it does not test if it was run before and is stupid and destructive.
 4 | # DON'T USE IF YOU DON'T KNOW HOW TO FIX THE FILES IT CHANGES
 5 | 
 6 | # app/controllers/api/v1/statuses_controller.rb
 7 | # allow created_at parameter in API calls
 8 | docker exec mastodon sed -i 's/scheduled_at: status_params\[:scheduled_at\],/scheduled_at: status_params\[:scheduled_at\],\n created_at: status_params\[:created_at\],/g' /app/www/app/controllers/api/v1/statuses_controller.rb
 9 | docker exec mastodon sed -i 's/:scheduled_at,/:scheduled_at,\n :created_at,/g' /app/www/app/controllers/api/v1/statuses_controller.rb
10 | 
11 | 
12 | # app/services/post_status_service.rb
13 | # Stop new posts from being pushed to other servers.
14 | docker exec mastodon sed -i 's/ DistributionWorker.perform_async(@status.id)/ if not @options[:created_at]\n DistributionWorker.perform_async(@status.id)/g' /app/www/app/services/post_status_service.rb
15 | docker exec mastodon sed -i 's/ActivityPub::DistributionWorker.perform_async(@status.id)/ ActivityPub::DistributionWorker.perform_async(@status.id)\n end/g' /app/www/app/services/post_status_service.rb
16 | 
17 | # add created_at to database
18 | docker exec mastodon sed -i 's/text: @text,/text: @text,\n created_at: @options\[:created_at\],/g' /app/www/app/services/post_status_service.rb
--------------------------------------------------------------------------------
/sample-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "mastodon_url" : "https://social.luca.run",
 3 |     "mastodon_bearer" : "",
 4 |     "twitter_bearer" : "",
 5 | 
 6 |     "data_dir" : "luca/data/",
 7 | 
 8 |     "media_dir_backup" : "luca/data/tweets_media/",
 9 |     "media_dir" : "luca/parser-output/media/"
10 | }
11 | 
--------------------------------------------------------------------------------
/tweetjs-to-mastodon.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Fediporter\n",
  9 |     "\n",
 10 |     "Jupyter notebook to migrate content like Tweets or Mastodon posts to Mastodon.\n",
 11 |     "\n",
 12 |     "Only works with instances that are patched to allow backdated posts through the API. More info: https://github.com/lucahammer/fediporter\n"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": null,
 18 |    "metadata": {
 19 |     "ExecuteTime": {
 20 |      "end_time": "2022-12-23T10:05:14.110272Z",
 21 |      "start_time": "2022-12-23T10:05:14.107618Z"
 22 |     }
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "from pathlib import Path\n",
 27 |     "from tqdm.notebook import tqdm\n",
 28 |     "import datetime\n",
 29 |     "import json\n",
 30 |     "import re\n",
 31 |     "import requests"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "with open('config.json', 'r') as f:\n",
 41 |     "    config = json.loads(f.read())\n",
 42 |     "\n",
 43 |     "API_BASE_URL = config['mastodon_url']\n",
 44 |     "MASTODON_BEARER = config['mastodon_bearer']\n",
 45 |     "TWITTER_BEARER = config['twitter_bearer']\n",
 46 |     "\n",
 47 |     "DATA_DIR = config['data_dir'] # Unzipped twitter data export\n",
 48 |     "media_dir_backup = config['media_dir_backup'] # media folder of twitter data export\n",
 49 |     "media_dir = config['media_dir'] # media folder of https://github.com/timhutton/twitter-archive-parser\n",
 50 |     "HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 51 |     "# Test Mastodon bearer token\n",
 52 |     "url = f\"{API_BASE_URL}/api/v1/apps/verify_credentials\"\n",
 53 |     "r = requests.get(url, headers=HEADERS)\n",
 54 |     "print(r.text)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {
 61 |     "ExecuteTime": {
 62 |      "end_time": "2022-12-23T11:57:44.037582Z",
 63 |      "start_time": "2022-12-23T11:57:43.929260Z"
 64 |     }
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "def post_status(data):\n",
 69 |     "    HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 70 |     "    url = f\"{API_BASE_URL}/api/v1/statuses\"\n",
 71 |     "    r = requests.post(url,\n",
 72 |     "                      data=data,\n",
 73 |     "                      headers=HEADERS)\n",
 74 |     "    return r.json()\n",
 75 |     "\n",
 76 |     "\n",
 77 |     "def load_tweets():\n",
 78 |     "    with open(DATA_DIR+\"tweets.js\", 'r', encoding='utf8') as f:\n",
 79 |     "        raw = f.read()\n",
 80 |     "        raw = raw.replace(\"window.YTD.tweets.part0 = \", \"\")\n",
 81 |     "        tweets = json.loads(raw)\n",
 82 |     "        tweets = [tweet['tweet'] for tweet in tweets]\n",
 83 |     "        tweets = sorted(tweets, key=lambda d: int(d['id']))\n",
 84 |     "    return tweets\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "def to_timestamp(created_at):\n",
 88 |     "    timestamp = datetime.datetime.strptime(\n",
 89 |     "        created_at, '%a %b %d %X %z %Y').isoformat(timespec='seconds')\n",
 90 |     "    return timestamp\n",
 91 |     "\n",
 92 |     "\n",
 93 |     "def replace_urls(tweet):\n",
 94 |     "    if 'full_text' in tweet:\n",
 95 |     "        text = tweet['full_text']\n",
 96 |     "    else:\n",
 97 |     "        text = tweet['text']\n",
 98 |     "    if 'entities' in tweet and 'urls' in tweet['entities']:\n",
 99 |     "        for url in tweet['entities']['urls']:\n",
100 |     "            text = text.replace(url['url'], url['expanded_url'])\n",
101 |     "    return (text)\n",
102 |     "\n",
103 |     "\n",
104 |     "def replace_usernames(text):\n",
105 |     "    text = re.sub(r\"(\\B\\@[A-Za-z0-9_]{1,15})(\\:)?\", r\"\\1@twitter.com\\2\", text)\n",
106 |     "    return text\n",
107 |     "\n",
108 |     "\n",
109 |     "def tweet_to_toot(tweet):\n",
110 |     "    toot = {\n",
111 |     "        'status': replace_usernames(replace_urls(tweet)),\n",
112 |     "        'visibility': 'public',\n",
113 |     "        'created_at': to_timestamp(tweet['created_at']),\n",
114 |     "        'language': tweet['lang']\n",
115 |     "    }\n",
116 |     "    return toot\n",
117 |     "\n",
118 |     "\n",
119 |     "def retrieve_alt_texts(tweet_ids):\n",
120 |     "    # get alt text for specific IDs from Twitter API\n",
121 |     "    twitter_url = \"https://api.twitter.com/2/tweets\"\n",
122 |     "    twitter_headers = {\"Authorization\": f\"Bearer {TWITTER_BEARER}\"}\n",
123 |     "    twitter_params = {'ids': ','.join(tweet_ids),\n",
124 |     "                      'tweet.fields': 'text,attachments,entities',\n",
125 |     "                      'expansions': 'attachments.media_keys',\n",
126 |     "                      'media.fields': 'alt_text'\n",
127 |     "                      }\n",
128 |     "    resp = requests.get(\n",
129 |     "        twitter_url, headers=twitter_headers, params=twitter_params)\n",
130 |     "    resp_json = resp.json()\n",
131 |     "\n",
132 |     "    for media in resp_json['includes']['media']:\n",
133 |     "        if 'alt_text' in media:\n",
134 |     "            alt_texts[media['media_key']] = media['alt_text']\n",
135 |     "\n",
136 |     "\n",
137 |     "def add_alt_texts(tweets):\n",
138 |     "    # looks for Tweets with media and asks Twitter API for alt texts\n",
139 |     "    # adds those alt texts to the dict alt_texts\n",
140 |     "    tweets_with_media = [tweet for tweet in tweets[50100:]  # slice left over from a partial run\n",
141 |     "                         if 'media' in tweet['entities']]\n",
142 |     "    print(f'Found {len(tweets_with_media)} Tweets with media attachments.')\n",
143 |     "    tweet_ids = [str(tweet['id'])\n",
144 |     "                 for tweet in tweets_with_media]\n",
145 |     "    batches = [tweet_ids[idx:idx+100] for idx in range(0, len(tweet_ids), 100)]\n",
146 |     "\n",
147 |     "    for batch in tqdm(batches):\n",
148 |     "        retrieve_alt_texts(batch)\n",
149 |     "\n",
150 |     "    print(f'Found {len(alt_texts)} alt texts.')\n",
151 |     "\n",
152 |     "\n",
153 |     "def retrieve_rt_texts(tweet_ids):\n",
154 |     "    # get full text of Retweets for specific IDs from Twitter API\n",
155 |     "    twitter_url = \"https://api.twitter.com/2/tweets\"\n",
156 |     "    twitter_headers = {\"Authorization\": f\"Bearer {TWITTER_BEARER}\"}\n",
157 |     "    twitter_params = {'ids': ','.join(tweet_ids),\n",
158 |     "                      'tweet.fields': 'text,referenced_tweets,entities',\n",
159 |     "                      'expansions': 'referenced_tweets.id'\n",
160 |     "                      }\n",
161 |     "    resp = requests.get(\n",
162 |     "        twitter_url, headers=twitter_headers, params=twitter_params)\n",
163 |     "    resp_json = resp.json()\n",
164 |     "\n",
165 |     "    for tweet in resp_json['data']:\n",
166 |     "        rt = [rt for rt in resp_json['includes']['tweets']\n",
167 |     "              if rt['id'] == tweet['referenced_tweets'][0]['id']][0]\n",
168 |     "        text = replace_urls(rt)\n",
169 |     "        text = f\"{tweet['text'].split(':')[0]}: {text}\\nhttps://twitter.com/{tweet['text'].split(':')[0].split('@')[-1]}/status/{rt['id']}\"\n",
170 |     "        text = replace_usernames(text)\n",
171 |     "        rt_texts[tweet['id']] = text\n",
172 |     "\n",
173 |     "\n",
174 |     "def add_full_RT_texts(tweets):\n",
175 |     "    # looks for truncated Retweets and asks the Twitter API for their full texts\n",
176 |     "    # adds those texts to the dict rt_texts\n",
177 |     "    truncated_retweets = [tweet for tweet in tweets if tweet['full_text'].startswith(\n",
178 |     "        'RT @') and tweet['full_text'].endswith('…')]\n",
179 |     "    print(f'Found {len(truncated_retweets)} truncated Retweets.')\n",
180 |     "    tweet_ids = [str(tweet['id']) for tweet in truncated_retweets]\n",
181 |     "    batches = [tweet_ids[idx:idx+100] for idx in range(0, len(tweet_ids), 100)]\n",
182 |     "\n",
183 |     "    for batch in tqdm(batches):\n",
184 |     "        retrieve_rt_texts(batch)\n",
185 |     "\n",
186 |     "    print(f'Collected {len(rt_texts)} full texts for Retweets.')\n"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "tweets = load_tweets()\n",
196 |     "len(tweets)\n"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "alt_texts = {}\n",
206 |     "add_alt_texts(tweets)\n"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "rt_texts = {}\n",
216 |     "add_full_RT_texts(tweets)\n"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "ids_dict = {}\n"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {
232 |     "ExecuteTime": {
233 |      "end_time": "2022-12-25T08:28:46.728201Z",
234 |      "start_time": "2022-12-25T08:28:46.566070Z"
235 |     }
236 |    },
237 |    "outputs": [],
238 |    "source": [
239 |     "for tweet in tqdm(tweets[25218+4828:]):  # slice skips Tweets already imported in earlier runs\n",
240 |     "\n",
241 |     "    if tweet['id'] in ids_dict:\n",
242 |     "        # was already posted, we can skip it\n",
243 |     "        pass\n",
244 |     "    elif tweet['full_text'].startswith('RT @'):\n",
245 |     "        # Retweets are often truncated and the full data needs to be retrieved from the API\n",
246 |     "        if rt_texts.get(tweet['id']):\n",
247 |     "            toot = {'status': rt_texts.get(tweet['id']),\n",
248 |     "                    'visibility': 'public',\n",
249 |     "                    'created_at': to_timestamp(tweet['created_at']),\n",
250 |     "                    'language': tweet['lang']\n",
251 |     "                    }\n",
252 |     "        else:\n",
253 |     "            toot = tweet_to_toot(tweet)\n",
254 |     "        posted = post_status(toot)\n",
255 |     "        ids_dict[tweet['id']] = posted['id']\n",
256 |     "    else:\n",
257 |     "        toot = tweet_to_toot(tweet)\n",
258 |     "        if 'media' in tweet['entities']:\n",
259 |     "            # upload media to append to the post\n",
260 |     "            media_ids = []\n",
261 |     "            for media in tweet['extended_entities']['media']:\n",
262 |     "                image_path = f\"{media_dir}{tweet['id']}-{media['media_url_https'].split('/')[-1]}\"\n",
263 |     "                if not Path(image_path).is_file():\n",
264 |     "                    image_path = f\"{media_dir_backup}{tweet['id']}-{media['media_url_https'].split('/')[-1]}\"\n",
265 |     "                    if not Path(image_path).is_file():\n",
266 |     "                        continue\n",
267 |     "                file = open(image_path, 'rb')\n",
268 |     "                data = file.read()\n",
269 |     "                url = f\"{API_BASE_URL}/api/v2/media\"\n",
270 |     "                files = {\n",
271 |     "                    'file': (image_path, data, 'application/octet-stream')}\n",
272 |     "                if alt_texts.get('3_' + media['id']):\n",
273 |     "                    values = {'description': alt_texts.get('3_' + media['id'])}\n",
274 |     "                    r = requests.post(url, files=files,\n",
275 |     "                                      data=values, headers=HEADERS)\n",
276 |     "                else:\n",
277 |     "                    r = requests.post(url, files=files, headers=HEADERS)\n",
278 |     "                json_data = r.json()\n",
279 |     "                media_ids.append(json_data['id'])\n",
280 |     "                toot['status'] = toot['status'].replace(media['url'], '')\n",
281 |     "            toot['media_ids[]'] = media_ids\n",
282 |     "        if 'in_reply_to_screen_name' in tweet and tweet['in_reply_to_screen_name'] == 'luca':\n",
283 |     "            # if Tweet is part of a thread, get ID of previous post\n",
284 |     "            try:\n",
285 |     "                toot['in_reply_to_id'] = ids_dict.get(\n",
286 |     "                    tweet['in_reply_to_status_id'])\n",
287 |     "            except:\n",
288 |     "                print(tweet)\n",
289 |     "        # print(tweet)\n",
290 |     "        # print(toot)\n",
291 |     "        posted = post_status(toot)\n",
292 |     "        # print(posted)\n",
293 |     "        ids_dict[tweet['id']] = posted['id']\n"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "with open('ids_dict.txt', 'w') as f:\n",
303 |     "    f.write(json.dumps(ids_dict))\n"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "with open('alt_texts.txt', 'w') as f:\n",
313 |     "    f.write(json.dumps(alt_texts))\n"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": null,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "with open('rt_texts.txt', 'w') as f:\n",
323 |     "    f.write(json.dumps(rt_texts))\n"
324 |    ]
325 |   },
326 |   {
327 |    "attachments": {},
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "# Todos\n",
332 |     "\n",
333 |     "- [ ] change ids and alt_texts storage from dict to file\n",
334 |     "- [ ] check alt texts (first ones are broken and need to be replaced)\n",
335 |     "- [ ] check images (first ones weren't attached to posts and need to be replaced)\n",
336 |     "- [x] add Retweets (get full text from API and post them; haven't been posted yet)\n",
337 |     "- [ ] fix videos (they weren't uploaded, but linked. :( )\n",
338 |     "- [ ] replace self-quotes with self-posts (update url in posts)\n",
339 |     "- [ ] import mastodon data\n"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "metadata": {},
346 |    "outputs": [],
347 |    "source": []
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": null,
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": []
355 |   }
356 |  ],
357 |  "metadata": {
358 |   "kernelspec": {
359 |    "display_name": "Python 3",
360 |    "language": "python",
361 |    "name": "python3"
362 |   },
363 |   "language_info": {
364 |    "codemirror_mode": {
365 |     "name": "ipython",
366 |     "version": 3
367 |    },
368 |    "file_extension": ".py",
369 |    "mimetype": "text/x-python",
370 |    "name": "python",
371 |    "nbconvert_exporter": "python",
372 |    "pygments_lexer": "ipython3",
373 |    "version": "3.11.0 (main, Oct 25 2022, 13:57:33) [Clang 14.0.0 (clang-1400.0.29.202)]"
374 |   },
375 |   "vscode": {
376 |    "interpreter": {
377 |     "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
378 |    }
379 |   }
380 |  },
381 |  "nbformat": 4,
382 |  "nbformat_minor": 4
383 | }
384 | 
--------------------------------------------------------------------------------
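A note on the replace_usernames regex used in tweetjs-to-mastodon.ipynb: the \B anchor rejects positions that directly follow a word character, so e-mail-like strings survive while bare Twitter handles are rewritten to Mastodon-style addresses. A small self-contained check (same pattern as the notebook; the sample strings are made up):

```python
import re

def replace_usernames(text):
    # \B blocks matches right after a word character, so the @ in
    # luca@example.com is not treated as the start of a handle.
    return re.sub(r"(\B\@[A-Za-z0-9_]{1,15})(\:)?", r"\1@twitter.com\2", text)

assert replace_usernames("cc @luca: hi") == "cc @luca@twitter.com: hi"
assert replace_usernames("mail me at luca@example.com") == "mail me at luca@example.com"
```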