├── LICENSE
├── Mastodon Export to Mastodon.ipynb
├── README.md
├── mod_mastodon.sh
├── sample-config.json
└── tweetjs-to-mastodon.ipynb

/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Luca Hammer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Mastodon Export to Mastodon.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Fediporter\n",
  8 |     "Jupyter notebook to migrate content like Tweets or Mastodon posts to Mastodon.\n",
  9 |     "\n",
 10 |     "Only works with instances that are patched to allow backdated posts through the API. More info: https://github.com/lucahammer/fediporter"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {
 17 |     "ExecuteTime": {
 18 |      "end_time": "2022-12-23T10:05:14.110272Z",
 19 |      "start_time": "2022-12-23T10:05:14.107618Z"
 20 |     }
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "import requests\n",
 25 |     "from bs4 import BeautifulSoup\n",
 26 |     "from tqdm.notebook import tqdm\n",
 27 |     "import json"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 163,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "name": "stdout",
 37 |      "output_type": "stream",
 38 |      "text": [
 39 |       "{\"name\":\"Imported from det.social/@Luca\",\"website\":\"https://det.social/@luca\",\"vapid_key\":\"uXL829wxgH9wYavQxkARcWv6fiNUZ94B\"}\n"
 40 |      ]
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "with open('config.json', 'r') as f:\n",
 45 |     "    config = json.loads(f.read())\n",
 46 |     "\n",
 47 |     "API_BASE_URL = config['mastodon_url']\n",
 48 |     "MASTODON_BEARER = config['mastodon_bearer']\n",
 49 |     "HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 50 |     "\n",
 51 |     "# Test Mastodon bearer token\n",
 52 |     "url = f\"{API_BASE_URL}/api/v1/apps/verify_credentials\"\n",
 53 |     "r = requests.get(url, headers=HEADERS)\n",
 54 |     "print(r.text)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 5,
 60 |    "metadata": {
 61 |     "ExecuteTime": {
 62 |      "end_time": "2022-12-23T11:57:44.037582Z",
 63 |      "start_time": "2022-12-23T11:57:43.929260Z"
 64 |     }
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "def post_status(data):\n",
 69 |     "    HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 70 |     "    url = f\"{API_BASE_URL}/api/v1/statuses\"\n",
 71 |     "    r = requests.post(url, \n",
 72 |     "                      data=data, \n",
 73 |     "                      headers=HEADERS)\n",
 74 |     "    return r.json()\n",
 75 |     "\n",
 76 |     "def load_mastodon_posts(file):\n",
 77 |     "    with open(file, 'r') as f:\n",
 78 |     "        posts = json.loads(f.read())['orderedItems']\n",
 79 |     "    return(posts)\n",
 80 |     "\n",
 81 |     "def fix_text(text, tags):\n",
 82 |     "    text = text.replace('</p><p>', '++breakme++</p><p>') #add placeholder to replace with linebreaks to keep paragraphs (it's weird)\n",
 83 |     "    text = BeautifulSoup(text, 'html.parser').get_text()\n",
 84 |     "    tags = [tag for tag in tags if tag['type'] == 'Mention']\n",
 85 |     "    for tag in tags:\n",
 86 |     "        name = f\"@{tag['name'].split('@')[1]} \"\n",
 87 |     "        text = text.replace(name,f\"{tag['name']} \")\n",
 88 |     "    text = text.replace('++breakme++', '\\n\\n')\n",
 89 |     "    return (text)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 6,
 95 |    "metadata": {},
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "posts = load_mastodon_posts('detsocial/outbox.json')"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 258,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "masto_dict = {}"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 262,
113 |    "metadata": {},
114 |    "outputs": [
115 |     {
116 |      "data": {
117 |       "application/vnd.jupyter.widget-view+json": {
118 |        "model_id": "10d15159e0b140d7a6289e8f773fbb89",
119 |        "version_major": 2,
120 |        "version_minor": 0
121 |       },
122 |       "text/plain": [
123 |        "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=574.0), HTML(value='')))"
124 |       ]
125 |      },
126 |      "metadata": {},
127 |      "output_type": "display_data"
128 |     },
129 |     {
130 |      "name": "stdout",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "for post in tqdm(posts):\n",
139 |     "    if 'contentMap' in post['object'] and post['id'] not in masto_dict and (('inReplyTo' in post and '/luca/' in post['inReplyTo']) or 'inReplyTo' not in post):\n",
140 |     "        if 'en' in post['object']['contentMap']:\n",
141 |     "            toot = {\n",
142 |     "                'status': fix_text(post['object']['contentMap']['en'], post['object']['tag']),\n",
143 |     "                'language': 'en'}\n",
144 |     "        if 'de' in post['object']['contentMap']:\n",
145 |     "            toot = {\n",
146 |     "                'status': fix_text(post['object']['contentMap']['de'], post['object']['tag']),\n",
147 |     "                'language': 'de'}\n",
148 |     "        toot['created_at'] = post['object']['published']\n",
149 |     "        if post['to'][0].endswith('/followers') and len(post['cc']) > 0 and post['cc'][0].endswith('#Public'):\n",
150 |     "            toot['visibility'] = 'unlisted'\n",
151 |     "        elif post['to'][0].endswith('#Public'):\n",
152 |     "            toot['visibility'] = 'public'\n",
153 |     "        elif post['to'][0].endswith('/followers') and len(post['cc']) > 0 and not post['cc'][0].endswith('#Public'):\n",
154 |     "            toot['visibility'] = 'private'\n",
155 |     "        else:\n",
156 |     "            toot['visibility'] = 'direct'\n",
157 |     "    \n",
158 |     "        if 'attachment' in post['object'] and len(post['object']['attachment']) > 0:\n",
159 |     "            #print(post['object']['attachment'])\n",
160 |     "            media_ids = []\n",
161 |     "            for attachment in post['object']['attachment']: \n",
162 |     "                image_path = f\"detsocial/home/det/live/public/system/{attachment['url']}\"\n",
163 |     "                file = open(image_path, 'rb')\n",
164 |     "                data = file.read()\n",
165 |     "                url = f\"{API_BASE_URL}/api/v2/media\"\n",
166 |     "                files={'file': (image_path, data, 'application/octet-stream')}\n",
167 |     "                if 'name' in attachment:\n",
168 |     "                    values = {'description' : attachment['name']}\n",
169 |     "                    r = requests.post(url, files=files, data=values, headers=HEADERS)\n",
170 |     "                else:\n",
171 |     "                    r = requests.post(url, files=files, headers=HEADERS)\n",
172 |     "                json_data = r.json()\n",
173 |     "                media_ids.append(json_data['id'])\n",
174 |     "            toot['media_ids[]'] = media_ids\n",
175 |     "    \n",
176 |     "        if 'inReplyTo' in post['object'] and post['object']['inReplyTo'] is not None and '/luca/' in post['object']['inReplyTo']:\n",
177 |     "            # if post is part of a thread, get ID of previous post\n",
178 |     "            try:\n",
179 |     "                toot['in_reply_to_id'] = masto_dict.get(post['object']['inReplyTo'])\n",
180 |     "            except:\n",
181 |     "                print(post)\n",
182 |     "\n",
183 |     "        #print(toot) \n",
184 |     "        posted = post_status(toot)\n",
185 |     "        #print(posted)\n",
186 |     "    \n",
187 |     "        masto_dict[post['object']['id']] = posted['id']\n",
188 |     "    else:\n",
189 |     "        pass\n",
190 |     "        #print('skipped')\n",
191 |     "    "
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 280,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "# Manually adding new statuses\n",
201 |     "\n",
202 |     "toot = {}\n",
203 |     "media_ids = []\n",
204 |     "\n",
205 |     "toot = {\n",
206 |     "    'status': '''xxx''',\n",
207 |     "    'language': 'en'}\n",
208 |     "\n",
209 |     "image_path = f\"xxx.png\"\n",
210 |     "file = open(image_path, 'rb')\n",
211 |     "data = file.read()\n",
212 |     "url = f\"{API_BASE_URL}/api/v2/media\"\n",
213 |     "files={'file': (image_path, data, 'application/octet-stream')}\n",
214 |     "r = requests.post(url, files=files, data={'description' : '''xxx'''}, headers=HEADERS)\n",
215 |     "json_data = r.json()\n",
216 |     "media_ids.append(json_data['id'])\n",
217 |     "toot['media_ids[]'] = media_ids\n",
218 |     "toot['in_reply_to_id'] = 'xxx'\n",
219 |     "toot['created_at'] = \"2022-12-25T20:21:00Z\""
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "post_status(toot)"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "# todo\n",
236 |     "- content warnings?\n",
237 |     "- URLs?"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 257,
243 |    "metadata": {},
244 |    "outputs": [
245 |     {
246 |      "data": {
247 |       "application/vnd.jupyter.widget-view+json": {
248 |        "model_id": "a8edcfd337e549e19b1533a1ec4f4a48",
249 |        "version_major": 2,
250 |        "version_minor": 0
251 |       },
252 |       "text/plain": [
253 |        "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))"
254 |       ]
255 |      },
256 |      "metadata": {},
257 |      "output_type": "display_data"
258 |     },
259 |     {
260 |      "name": "stdout",
261 |      "output_type": "stream",
262 |      "text": [
263 |       "\n"
264 |      ]
265 |     }
266 |    ],
267 |    "source": [
268 |     "# delete the posted posts because something went wrong\n",
269 |     "\n",
270 |     "def delete_status(status_id):\n",
271 |     "    HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
272 |     "    url = f\"{API_BASE_URL}/api/v1/statuses/{status_id}\"\n",
273 |     "    r = requests.delete(url, \n",
274 |     "                        headers=HEADERS)\n",
275 |     "    return r.json()\n",
276 |     "\n",
277 |     "for thing in tqdm(masto_dict.values()):\n",
278 |     "    x = (delete_status(thing))"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": []
287 |   }
288 |  ],
289 |  "metadata": {
290 |   "kernelspec": {
291 |    "display_name": "Python 3",
292 |    "language": "python",
293 |    "name": "python3"
294 |   },
295 |   "language_info": {
296 |    "codemirror_mode": {
297 |     "name": "ipython",
298 |     "version": 3
299 |    },
300 |    "file_extension": ".py",
301 |    "mimetype": "text/x-python",
302 |    "name": "python",
303 |    "nbconvert_exporter": "python",
304 |    "pygments_lexer": "ipython3",
305 |    "version": "3.7.8"
306 |   }
307 |  },
308 |  "nbformat": 4,
309 |  "nbformat_minor": 4
310 | }
311 | 
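Before running either notebook, it is worth confirming that the created_at patch described in the README below is actually active on your instance. A minimal sanity check could look like the following sketch (assuming a config.json laid out like sample-config.json; the test text and date are made up):

```python
import json
import requests

# Assumption: config.json has the same layout as sample-config.json.
with open('config.json', 'r') as f:
    config = json.loads(f.read())

# Post a backdated status; 'direct' visibility keeps the test out of timelines.
r = requests.post(
    f"{config['mastodon_url']}/api/v1/statuses",
    data={
        'status': 'backdating test, please ignore',
        'visibility': 'direct',
        'created_at': '2020-01-01T12:00:00Z',
    },
    headers={'Authorization': f"Bearer {config['mastodon_bearer']}"},
)

# A patched instance echoes the requested date back;
# an unpatched instance returns the current time instead.
print(r.json()['created_at'])
```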
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | > **_NOTE:_** Kevin Payravi improved the code and converted it to a regular Python script. You can find it here: https://github.com/KevinPayravi/twitter-archive-to-mastodon. Some fixes were pulled into the notebook.
  2 | 
  3 | 
  4 | # fediporter
  5 | Notes on importing content to Mastodon. Very early version. Pull requests welcome. (It's unlikely that I will work on this for long.)
  6 | 
  7 | Where it started: ~~https://det.social/@luca/109559157244375603~~ https://social.luca.run/@luca/109559157272902502
  8 | 
  9 | I run a personal Mastodon instance and wanted to import my Tweets without spamming my followers, the local timeline or other instances. In this repo you will find what I came up with. It's not meant for end users but for admins who are okay with losing all their data.
 10 | 
 11 | # Modding Mastodon
 12 | Mastodon does not allow the import of posts/toots/statuses, but as admins of our own instance, we can adapt it to our needs.
 13 | 
 14 | I decided to add a created_at parameter to the API to allow the creation of backdated posts. Two files need to be changed. My instance uses the linuxserver docker image (https://docs.linuxserver.io/images/docker-mastodon) on an Unraid server. The mod_mastodon.sh script is adapted to that setup and can be executed from the Unraid console. It changes two files: app/controllers/api/v1/statuses_controller.rb and app/services/post_status_service.rb. After changing those files, you need to restart your server. If you use a different environment, you probably need to change the paths.
 15 | 
 16 | app/controllers/api/v1/statuses_controller.rb
 17 | ```
 18 | def create
 19 |   @status = PostStatusService.new.call(
 20 |     current_user.account,
 21 |     text: status_params[:status],
 22 |     thread: @thread,
 23 |     media_ids: status_params[:media_ids],
 24 |     sensitive: status_params[:sensitive],
 25 |     spoiler_text: status_params[:spoiler_text],
 26 |     visibility: status_params[:visibility],
 27 |     language: status_params[:language],
 28 |     scheduled_at: status_params[:scheduled_at],
 29 |     created_at: status_params[:created_at],
 30 |     application: doorkeeper_token.application,
 31 |     poll: status_params[:poll],
 32 |     idempotency: request.headers['Idempotency-Key'],
 33 |     with_rate_limit: true
 34 |   )
 35 | ```
 36 | 
 37 | ```
 38 | def status_params
 39 |   params.permit(
 40 |     :status,
 41 |     :in_reply_to_id,
 42 |     :sensitive, :spoiler_text, :visibility,
 43 |     :language,
 44 |     :scheduled_at,
 45 |     :created_at,
 46 |     media_ids: [],
 47 |     poll: [
 48 |       :multiple,
 49 |       :hide_totals,
 50 |       :expires_in,
 51 |       options: [],
 52 |     ]
 53 |   )
 54 | end
 55 | ```
 56 | 
 57 | To not spam anyone, posts that carry a created_at parameter won't be pushed to other servers. That's what the changes to post_status_service.rb are for.
 58 | 
 59 | app/services/post_status_service.rb
 60 | ```
 61 | def postprocess_status!
 62 |   Trends.tags.register(@status)
 63 |   LinkCrawlWorker.perform_async(@status.id)
 64 |   if not @options[:created_at]
 65 |     DistributionWorker.perform_async(@status.id)
 66 |     ActivityPub::DistributionWorker.perform_async(@status.id)
 67 |   end
 68 |   PollExpirationNotifyWorker.perform_at(@status.poll.expires_at, @status.poll.id) if @status.poll
 69 | end
 70 | ```
 71 | ```
 72 | def status_attributes
 73 |   {
 74 |     text: @text,
 75 |     created_at: @options[:created_at],
 76 |     media_attachments: @media || [],
 77 |     ordered_media_attachment_ids: (@options[:media_ids] || []).map(&:to_i) & @media.map(&:id),
 78 |     thread: @in_reply_to,
 79 |     poll_attributes: poll_attributes,
 80 |     sensitive: @sensitive,
 81 |     spoiler_text: @options[:spoiler_text] || '',
 82 |     visibility: @visibility,
 83 |     language: valid_locale_cascade(@options[:language], @account.user&.preferred_posting_language, I18n.default_locale),
 84 |     application: @options[:application],
 85 |     rate_limit: @options[:with_rate_limit],
 86 |   }.compact
 87 | end
 88 | ```
 89 | 
 90 | # Adding content through the API
 91 | Now that the API supports a created_at parameter, we can import any content through the API. I am most familiar with Python, so I went with that. The first content I tried was my Twitter archive. That's what tweetjs-to-mastodon.ipynb does (a Jupyter Notebook that I am currently too lazy to clean up, so only fragments).
 92 | 
 93 | Already works:
 94 | - Replace @username with @username@twitter.com
 95 | - Upload media (can use higher resolution media from https://github.com/timhutton/twitter-archive-parser)
 96 | - Threads are recreated as threads (very fragile because post IDs are only stored in a variable)
 97 | - t.co URLs are replaced with the expanded versions
 98 | 
 99 | Planned:
100 | - Retweets (currently, they are skipped because they are often truncated and that would be silly)
101 | - Alt texts (they aren't included in the Twitter archive and we will need to retrieve them from the API or website)
102 | - Making import of replies optional
103 | - Edited Tweets (should all versions be imported?)
104 | 
--------------------------------------------------------------------------------
/mod_mastodon.sh:
--------------------------------------------------------------------------------
 1 | # Be careful. This only works with the linuxserver docker image on Unraid.
 2 | # It changes Mastodon files and those changes will be gone with the next update of the container.
 3 | # It should not be run multiple times because it does not test if it was run before and is stupid and destructive.
 4 | # DON'T USE IF YOU DON'T KNOW HOW TO FIX THE FILES IT CHANGES
 5 | 
 6 | # app/controllers/api/v1/statuses_controller.rb
 7 | # allow created_at parameter in API calls
 8 | docker exec mastodon sed -i 's/scheduled_at: status_params\[:scheduled_at\],/scheduled_at: status_params\[:scheduled_at\],\n created_at: status_params\[:created_at\],/g' /app/www/app/controllers/api/v1/statuses_controller.rb
 9 | docker exec mastodon sed -i 's/:scheduled_at,/:scheduled_at,\n :created_at,/g' /app/www/app/controllers/api/v1/statuses_controller.rb
10 | 
11 | 
12 | # app/services/post_status_service.rb
13 | # Stop new posts from being pushed to other servers.
14 | docker exec mastodon sed -i 's/ DistributionWorker.perform_async(@status.id)/ if not @options[:created_at]\n DistributionWorker.perform_async(@status.id)/g' /app/www/app/services/post_status_service.rb
15 | docker exec mastodon sed -i 's/ActivityPub::DistributionWorker.perform_async(@status.id)/ ActivityPub::DistributionWorker.perform_async(@status.id)\n end/g' /app/www/app/services/post_status_service.rb
16 | 
17 | # add created_at to database
18 | docker exec mastodon sed -i 's/text: @text,/text: @text,\n created_at: @options\[:created_at\],/g' /app/www/app/services/post_status_service.rb
--------------------------------------------------------------------------------
/sample-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "mastodon_url" : "https://social.luca.run",
 3 |     "mastodon_bearer" : "",
 4 |     "twitter_bearer" : "",
 5 | 
 6 |     "data_dir" : "luca/data/",
 7 | 
 8 |     "media_dir_backup" : "luca/data/tweets_media/",
 9 |     "media_dir" : "luca/parser-output/media/"
10 | }
11 | 
--------------------------------------------------------------------------------
/tweetjs-to-mastodon.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Fediporter\n",
  9 |     "\n",
 10 |     "Jupyter notebook to migrate content like Tweets or Mastodon posts to Mastodon.\n",
 11 |     "\n",
 12 |     "Only works with instances that are patched to allow backdated posts through the API. More info: https://github.com/lucahammer/fediporter\n"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": null,
 18 |    "metadata": {
 19 |     "ExecuteTime": {
 20 |      "end_time": "2022-12-23T10:05:14.110272Z",
 21 |      "start_time": "2022-12-23T10:05:14.107618Z"
 22 |     }
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "from pathlib import Path\n",
 27 |     "from tqdm.notebook import tqdm\n",
 28 |     "import datetime\n",
 29 |     "import json\n",
 30 |     "import re\n",
 31 |     "import requests"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "with open('config.json', 'r') as f:\n",
 41 |     "    config = json.loads(f.read())\n",
 42 |     "\n",
 43 |     "API_BASE_URL = config['mastodon_url']\n",
 44 |     "MASTODON_BEARER = config['mastodon_bearer']\n",
 45 |     "TWITTER_BEARER = config['twitter_bearer']\n",
 46 |     "\n",
 47 |     "DATA_DIR = config['data_dir'] # Unzipped twitter data export\n",
 48 |     "media_dir_backup = config['media_dir_backup'] # media folder of twitter data export\n",
 49 |     "media_dir = config['media_dir'] # media folder of https://github.com/timhutton/twitter-archive-parser\n",
 50 |     "HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 51 |     "# Test Mastodon bearer token\n",
 52 |     "url = f\"{API_BASE_URL}/api/v1/apps/verify_credentials\"\n",
 53 |     "r = requests.get(url, headers=HEADERS)\n",
 54 |     "print(r.text)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {
 61 |     "ExecuteTime": {
 62 |      "end_time": "2022-12-23T11:57:44.037582Z",
 63 |      "start_time": "2022-12-23T11:57:43.929260Z"
 64 |     }
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "def post_status(data):\n",
 69 |     "    HEADERS = {'Authorization': f'Bearer {MASTODON_BEARER}'}\n",
 70 |     "    url = f\"{API_BASE_URL}/api/v1/statuses\"\n",
 71 |     "    r = requests.post(url,\n",
 72 |     "                      data=data,\n",
 73 |     "                      headers=HEADERS)\n",
 74 |     "    return r.json()\n",
 75 |     "\n",
 76 |     "\n",
 77 |     "def load_tweets():\n",
 78 |     "    with open(DATA_DIR+\"tweets.js\", 'r', encoding='utf8') as f:\n",
 79 |     "        raw = f.read()\n",
 80 |     "        raw = raw.replace(\"window.YTD.tweets.part0 = \", \"\")\n",
 81 |     "        tweets = json.loads(raw)\n",
 82 |     "        tweets = [tweet['tweet'] for tweet in tweets]\n",
 83 |     "        tweets = sorted(tweets, key=lambda d: int(d['id']))\n",
 84 |     "    return tweets\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "def to_timestamp(created_at):\n",
 88 |     "    timestamp = datetime.datetime.strptime(\n",
 89 |     "        created_at, '%a %b %d %X %z %Y').isoformat(timespec='seconds')\n",
 90 |     "    return timestamp\n",
 91 |     "\n",
 92 |     "\n",
 93 |     "def replace_urls(tweet):\n",
 94 |     "    if 'full_text' in tweet:\n",
 95 |     "        text = tweet['full_text']\n",
 96 |     "    else:\n",
 97 |     "        text = tweet['text']\n",
 98 |     "    if 'entities' in tweet and 'urls' in tweet['entities']:\n",
 99 |     "        for url in tweet['entities']['urls']:\n",
100 |     "            text = text.replace(url['url'], url['expanded_url'])\n",
101 |     "    return (text)\n",
102 |     "\n",
103 |     "\n",
104 |     "def replace_usernames(text):\n",
105 |     "    text = re.sub(r\"(\\B\\@[A-Za-z0-9_]{1,15})(\\:)?\", r\"\\1@twitter.com\\2\", text)\n",
106 |     "    return text\n",
107 |     "\n",
108 |     "\n",
109 |     "def tweet_to_toot(tweet):\n",
110 |     "    toot = {\n",
111 |     "        'status': replace_usernames(replace_urls(tweet)),\n",
112 |     "        'visibility': 'public',\n",
113 |     "        'created_at': to_timestamp(tweet['created_at']),\n",
114 |     "        'language': tweet['lang']\n",
115 |     "    }\n",
116 |     "    return toot\n",
117 |     "\n",
118 |     "\n",
119 |     "def retrieve_alt_texts(tweet_ids):\n",
120 |     "    # get alt text for specific IDs from Twitter API\n",
121 |     "    twitter_url = \"https://api.twitter.com/2/tweets\"\n",
122 |     "    twitter_headers = {\"Authorization\": f\"Bearer {TWITTER_BEARER}\"}\n",
123 |     "    twitter_params = {'ids': ','.join(tweet_ids),\n",
124 |     "                      'tweet.fields': 'text,attachments,entities',\n",
125 |     "                      'expansions': 'attachments.media_keys',\n",
126 |     "                      'media.fields': 'alt_text'\n",
127 |     "                      }\n",
128 |     "    resp = requests.get(\n",
129 |     "        twitter_url, headers=twitter_headers, params=twitter_params)\n",
130 |     "    resp_json = resp.json()\n",
131 |     "\n",
132 |     "    for media in resp_json['includes']['media']:\n",
133 |     "        if 'alt_text' in media:\n",
134 |     "            alt_texts[media['media_key']] = media['alt_text']\n",
135 |     "\n",
136 |     "\n",
137 |     "def add_alt_texts(tweets):\n",
138 |     "    # looks for Tweets with media and asks Twitter API for alt texts\n",
139 |     "    # adds those alt texts to the dict alt_texts\n",
140 |     "    tweets_with_media = [tweet for tweet in tweets[50100:]  # slice left over from a partial run\n",
141 |     "                         if 'media' in tweet['entities']]\n",
142 |     "    print(f'Found {len(tweets_with_media)} Tweets with media attachments.')\n",
143 |     "    tweet_ids = [str(tweet['id'])\n",
144 |     "                 for tweet in tweets_with_media]\n",
145 |     "    batches = [tweet_ids[idx:idx+100] for idx in range(0, len(tweet_ids), 100)]\n",
146 |     "\n",
147 |     "    for batch in tqdm(batches):\n",
148 |     "        retrieve_alt_texts(batch)\n",
149 |     "\n",
150 |     "    print(f'Found {len(alt_texts)} alt texts.')\n",
151 |     "\n",
152 |     "\n",
153 |     "def retrieve_rt_texts(tweet_ids):\n",
154 |     "    # get full text of Retweets for specific IDs from Twitter API\n",
155 |     "    twitter_url = \"https://api.twitter.com/2/tweets\"\n",
156 |     "    twitter_headers = {\"Authorization\": f\"Bearer {TWITTER_BEARER}\"}\n",
157 |     "    twitter_params = {'ids': ','.join(tweet_ids),\n",
158 |     "                      'tweet.fields': 'text,referenced_tweets,entities',\n",
159 |     "                      'expansions': 'referenced_tweets.id'\n",
160 |     "                      }\n",
161 |     "    resp = requests.get(\n",
162 |     "        twitter_url, headers=twitter_headers, params=twitter_params)\n",
163 |     "    resp_json = resp.json()\n",
164 |     "\n",
165 |     "    for tweet in resp_json['data']:\n",
166 |     "        rt = [rt for rt in resp_json['includes']['tweets']\n",
167 |     "              if rt['id'] == tweet['referenced_tweets'][0]['id']][0]\n",
168 |     "        text = replace_urls(rt)\n",
169 |     "        text = f\"{tweet['text'].split(':')[0]}: {text}\\nhttps://twitter.com/{tweet['text'].split(':')[0].split('@')[-1]}/status/{rt['id']}\"\n",
170 |     "        text = replace_usernames(text)\n",
171 |     "        rt_texts[tweet['id']] = text\n",
172 |     "\n",
173 |     "\n",
174 |     "def add_full_RT_texts(tweets):\n",
175 |     "    # looks for truncated Retweets and asks the Twitter API for their full texts\n",
176 |     "    # adds those texts to the dict rt_texts\n",
177 |     "    truncated_retweets = [tweet for tweet in tweets if tweet['full_text'].startswith(\n",
178 |     "        'RT @') and tweet['full_text'].endswith('…')]\n",
179 |     "    print(f'Found {len(truncated_retweets)} truncated Retweets.')\n",
180 |     "    tweet_ids = [str(tweet['id']) for tweet in truncated_retweets]\n",
181 |     "    batches = [tweet_ids[idx:idx+100] for idx in range(0, len(tweet_ids), 100)]\n",
182 |     "\n",
183 |     "    for batch in tqdm(batches):\n",
184 |     "        retrieve_rt_texts(batch)\n",
185 |     "\n",
186 |     "    print(f'Collected {len(rt_texts)} full texts for Retweets.')\n"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "tweets = load_tweets()\n",
196 |     "len(tweets)\n"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "alt_texts = {}\n",
206 |     "add_alt_texts(tweets)\n"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "rt_texts = {}\n",
216 |     "add_full_RT_texts(tweets)\n"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "ids_dict = {}\n"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {
232 |     "ExecuteTime": {
233 |      "end_time": "2022-12-25T08:28:46.728201Z",
234 |      "start_time": "2022-12-25T08:28:46.566070Z"
235 |     }
236 |    },
237 |    "outputs": [],
238 |    "source": [
239 |     "for tweet in tqdm(tweets[25218+4828:]):  # slice skips Tweets already imported in earlier runs\n",
240 |     "\n",
241 |     "    if tweet['id'] in ids_dict:\n",
242 |     "        # was already posted, we can skip it\n",
243 |     "        pass\n",
244 |     "    elif tweet['full_text'].startswith('RT @'):\n",
245 |     "        # Retweets are often truncated and the full data needs to be retrieved from the API\n",
246 |     "        if rt_texts.get(tweet['id']):\n",
247 |     "            toot = {'status': rt_texts.get(tweet['id']),\n",
248 |     "                    'visibility': 'public',\n",
249 |     "                    'created_at': to_timestamp(tweet['created_at']),\n",
250 |     "                    'language': tweet['lang']\n",
251 |     "                    }\n",
252 |     "        else:\n",
253 |     "            toot = tweet_to_toot(tweet)\n",
254 |     "        posted = post_status(toot)\n",
255 |     "        ids_dict[tweet['id']] = posted['id']\n",
256 |     "    else:\n",
257 |     "        toot = tweet_to_toot(tweet)\n",
258 |     "        if 'media' in tweet['entities']:\n",
259 |     "            # upload media to append to the post\n",
260 |     "            media_ids = []\n",
261 |     "            for media in tweet['extended_entities']['media']:\n",
262 |     "                image_path = f\"{media_dir}{tweet['id']}-{media['media_url_https'].split('/')[-1]}\"\n",
263 |     "                if not Path(image_path).is_file():\n",
264 |     "                    image_path = f\"{media_dir_backup}{tweet['id']}-{media['media_url_https'].split('/')[-1]}\"\n",
265 |     "                    if not Path(image_path).is_file():\n",
266 |     "                        continue\n",
267 |     "                file = open(image_path, 'rb')\n",
268 |     "                data = file.read()\n",
269 |     "                url = f\"{API_BASE_URL}/api/v2/media\"\n",
270 |     "                files = {\n",
271 |     "                    'file': (image_path, data, 'application/octet-stream')}\n",
272 |     "                if alt_texts.get('3_' + media['id']):\n",
273 |     "                    values = {'description': alt_texts.get('3_' + media['id'])}\n",
274 |     "                    r = requests.post(url, files=files,\n",
275 |     "                                      data=values, headers=HEADERS)\n",
276 |     "                else:\n",
277 |     "                    r = requests.post(url, files=files, headers=HEADERS)\n",
278 |     "                json_data = r.json()\n",
279 |     "                media_ids.append(json_data['id'])\n",
280 |     "                toot['status'] = toot['status'].replace(media['url'], '')\n",
281 |     "            toot['media_ids[]'] = media_ids\n",
282 |     "        if 'in_reply_to_screen_name' in tweet and tweet['in_reply_to_screen_name'] == 'luca':\n",
283 |     "            # if Tweet is part of a thread, get ID of previous post\n",
284 |     "            try:\n",
285 |     "                toot['in_reply_to_id'] = ids_dict.get(\n",
286 |     "                    tweet['in_reply_to_status_id'])\n",
287 |     "            except:\n",
288 |     "                print(tweet)\n",
289 |     "        # print(tweet)\n",
290 |     "        # print(toot)\n",
291 |     "        posted = post_status(toot)\n",
292 |     "        # print(posted)\n",
293 |     "        ids_dict[tweet['id']] = posted['id']\n"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "with open('ids_dict.txt', 'w') as f:\n",
303 |     "    f.write(json.dumps(ids_dict))\n"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "with open('alt_texts.txt', 'w') as f:\n",
313 |     "    f.write(json.dumps(alt_texts))\n"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": null,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "with open('rt_texts.txt', 'w') as f:\n",
323 |     "    f.write(json.dumps(rt_texts))\n"
324 |    ]
325 |   },
326 |   {
327 |    "attachments": {},
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "# Todos\n",
332 |     "\n",
333 |     "- [ ] change ids and alt_texts storage from dict to file\n",
334 |     "- [ ] check alt texts (first ones are broken and need to be replaced)\n",
335 |     "- [ ] check images (first ones weren't attached to posts and need to be replaced)\n",
336 |     "- [x] add Retweets (get full text from API and post them; haven't been posted yet)\n",
337 |     "- [ ] fix videos (they weren't uploaded, but linked. :( )\n",
338 |     "- [ ] replace self-quotes with self-posts (update url in posts)\n",
339 |     "- [ ] import mastodon data\n"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "metadata": {},
346 |    "outputs": [],
347 |    "source": []
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": null,
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": []
355 |   }
356 |  ],
357 |  "metadata": {
358 |   "kernelspec": {
359 |    "display_name": "Python 3",
360 |    "language": "python",
361 |    "name": "python3"
362 |   },
363 |   "language_info": {
364 |    "codemirror_mode": {
365 |     "name": "ipython",
366 |     "version": 3
367 |    },
368 |    "file_extension": ".py",
369 |    "mimetype": "text/x-python",
370 |    "name": "python",
371 |    "nbconvert_exporter": "python",
372 |    "pygments_lexer": "ipython3",
373 |    "version": "3.11.0 (main, Oct 25 2022, 13:57:33) [Clang 14.0.0 (clang-1400.0.29.202)]"
374 |   },
375 |   "vscode": {
376 |    "interpreter": {
377 |     "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
378 |    }
379 |   }
380 |  },
381 |  "nbformat": 4,
382 |  "nbformat_minor": 4
383 | }
384 | 
--------------------------------------------------------------------------------
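A note on the replace_usernames regex used in tweetjs-to-mastodon.ipynb: the \B anchor rejects positions that directly follow a word character, so e-mail-like strings survive while bare Twitter handles are rewritten to Mastodon-style addresses. A small self-contained check (same pattern as the notebook; the sample strings are made up):

```python
import re

def replace_usernames(text):
    # \B blocks matches right after a word character, so the @ in
    # luca@example.com is not treated as the start of a handle.
    return re.sub(r"(\B\@[A-Za-z0-9_]{1,15})(\:)?", r"\1@twitter.com\2", text)

assert replace_usernames("cc @luca: hi") == "cc @luca@twitter.com: hi"
assert replace_usernames("mail me at luca@example.com") == "mail me at luca@example.com"
```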