├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── weibo2toot.iml
├── LICENSE
├── README.md
├── conf.sample.ini
├── requirements.txt
├── run.py
└── utils
    ├── __init__.py
    ├── feed2toot.py
    ├── feed_decoder.py
    ├── feed_parser.py
    ├── get_config.py
    ├── media_downloader.py
    └── toot_poster.py
/.gitignore:
--------------------------------------------------------------------------------
1 | temp
2 | ffmpeg.exe
3 | conf.ini
4 | db.txt
5 | __pycache__
6 | venv
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/weibo2toot.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Mashiro / bili2toot contributors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Moved to
2 | The project has been refactored:
3 |
4 | # weibo2toot
5 |
6 | A simple script that transports Weibo posts to Mastodon, based on the Weibo RSS feed generated by [RSSHub](https://rsshub.app).
7 |
8 |
9 |
10 |
11 | Some notes
12 |
13 | 1. Weibo applies strict anti-hotlinking to videos, and making too many requests within a short window can return 403. So video download failures in the output are usually nothing to worry about: by default the video thumbnail is used instead, and the original video link is included in the toot (fully preserving the Weibo style). A short sketch of this fallback appears right after this README.
14 |
15 | 2. `TypeError: Cannot read property 'screen_name' of undefined`
16 |    This is an issue on the RSSHub side: some Weibo accounts' content is only visible after logging in and cannot be subscribed to. You can verify this by opening https://m.weibo.cn/u/:uid.
17 |
18 | 3. Emoji handling: there is currently no good way to index and batch-process Weibo emoji (we would first need to scrape them in bulk and upload them as Mastodon custom emoji), so emoji are not handled for now.
19 |
20 | Installation and usage:
21 | ```
22 | pip3 install -r requirements.txt
23 | cp conf.sample.ini conf.ini
24 | nano conf.ini
25 | python3 run.py
26 | ```
27 |
28 | Crontab job setup:
29 | ```
30 | crontab -e
31 | ```
32 | or, on Ubuntu 18.04, edit the system crontab and restart cron:
33 | ```
34 | nano /etc/crontab
35 | /etc/init.d/cron restart
36 | ```
37 |
38 | It is recommended to run the job every 20 minutes (RSSHub's default cache period):
39 | ```
40 | #m h dom mon dow user command
41 | */20 * * * * root cd /weibo2toot && python3 run.py
42 | ```
43 |
--------------------------------------------------------------------------------
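
A minimal sketch (not part of the repository) of the thumbnail-fallback download described in note 1 of the README above. It mirrors what utils/media_downloader.py does: send a browser User-Agent plus a weibo.com Referer, try the video first, and fall back to the poster image if the CDN refuses the request. The function name and paths are illustrative only.

```
import urllib.request

def fetch_video_or_poster(video_url, poster_url, dest_prefix='temp/video1'):
    # Weibo's video CDN checks the Referer header; requests without it often get 403.
    opener = urllib.request.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'),
        ('Referer', 'https://weibo.com/'),
    ]
    urllib.request.install_opener(opener)
    try:
        # Try the video itself first (assumes the temp/ directory already exists).
        urllib.request.urlretrieve(video_url, dest_prefix + '.mp4')
        return dest_prefix + '.mp4'
    except Exception as e:
        # Fall back to the thumbnail; the toot keeps the original video link.
        print('video download failed, using thumbnail instead:', e)
        urllib.request.urlretrieve(poster_url, dest_prefix + '.png')
        return dest_prefix + '.png'
```
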
/conf.sample.ini:
--------------------------------------------------------------------------------
1 | [PROXY]
2 | ProxyOn = false
3 | HttpProxy = http://127.0.0.1:7890
4 | HttpsProxy = https://127.0.0.1:7890
5 |
6 | [MASTODON]
7 | BaseUrl = https://hello.2heng.xin/
8 | # register your application here: https://hello.2heng.xin/settings/applications
9 | AccessToken = your_app_token
10 | # 'direct' - post will be visible only to mentioned users
11 | # 'private' - post will be visible only to followers
12 | # 'unlisted' - post will be public but not appear on the public timeline
13 | # 'public' - post will be public
14 | TootVisibility = unlisted
15 | IncludeVideo = false
16 | SourcePrefix = :icon_weibo:
17 | ExternalLinkPrefix = :sys_link:
18 | VideoSourcePrefix = :sys_video:
19 |
20 | [WEIBO]
21 | WeiboRss = https://rsshub.app/weibo/user/3264072325
--------------------------------------------------------------------------------
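
Optionally, the [MASTODON] values above can be sanity-checked before running the script. A minimal sketch (not part of the repository) using Mastodon.py: account_verify_credentials() returns the account the access token belongs to, so a wrong BaseUrl or AccessToken fails immediately.

```
import configparser
from mastodon import Mastodon

config = configparser.ConfigParser()
config.read('conf.ini')

mastodon = Mastodon(
    access_token=config['MASTODON']['AccessToken'],
    api_base_url=config['MASTODON']['BaseUrl'],
)

# Prints the username the token belongs to; an invalid token raises a MastodonError.
print(mastodon.account_verify_credentials()['username'])
```
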
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.1
2 | Pillow==9.0.1
3 | ffmpy==0.2.3
4 | feedparser==5.2.1
5 | Mastodon.py==1.5.1
6 | filetype==1.0.7
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on May 29, 2020
5 | Desc: Weibo feed to toot (based on RSSHub's feed)
6 | Author: Mashiro
7 | URL: https://2heng.xin
8 | License: MIT
9 | """
10 | from utils.feed_parser import FeedParaser
11 | from utils.feed2toot import Feed2Toot
12 | from utils.get_config import GetConfig
13 | import os
14 |
15 | config = GetConfig()
16 |
17 | if __name__ == '__main__':
18 | if config['PROXY']['ProxyOn'] == 'true':
19 | os.environ['HTTP_PROXY'] = config['PROXY']['HttpProxy']
20 | os.environ['HTTPS_PROXY'] = config['PROXY']['HttpsProxy']
21 |
22 | RSS_dict = FeedParaser(config['WEIBO']['WeiboRss'])
23 | Feed2Toot(RSS_dict)
24 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mashirozx/weibo2toot/32f0d3a4baf0bae589277b3c2b2136f0940fd5b7/utils/__init__.py
--------------------------------------------------------------------------------
/utils/feed2toot.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on May 29, 2020
5 | Desc: feed to toot
6 | Author: Mashiro
7 | URL: https://2heng.xin
8 | License: MIT
9 | """
10 | from os import path, makedirs
11 | import shutil
12 | from .feed_decoder import TweetDecoder
13 | from .media_downloader import MediaDownloader
14 | from .toot_poster import TootPoster
15 |
16 | def Feed2Toot(feed_data):
17 | if path.exists('db.txt'):
18 | historyList = [line.rstrip('\n') for line in open('db.txt')]
19 | else:
20 | historyList = []
21 |
22 | for tweet in reversed(feed_data):
23 | if not path.exists('temp'):
24 | makedirs('temp')
25 |
26 | if tweet['id'] not in historyList:
27 | print('INFO: decode ' + tweet['id'])
28 | tweet_decoded = TweetDecoder(tweet)
29 | print('INFO: download ' + tweet['id'])
30 | try:
31 | toot_content = MediaDownloader(tweet_decoded)
32 | print('INFO: download succeed ' + tweet['id'])
33 |             except Exception as e:
34 |                 print('ERRO: download failed ' + tweet['id'])
35 |                 print(e)
36 |                 continue  # skip this entry; it will be retried on the next run
37 | print('INFO: post toot ' + tweet['id'])
38 | try:
39 | TootPoster(toot_content)
40 | print('INFO: post succeed ' + tweet['id'])
41 |             except Exception as e:
42 |                 print('ERRO: post failed ' + tweet['id'], e)
43 | historyList.append(tweet['id'])
44 |
45 | if path.exists('temp'):
46 | shutil.rmtree('temp')
47 |
48 | print('INFO: save to db ' + tweet['id'])
49 | with open('db.txt', 'w+') as db:
50 | for row in historyList:
51 | db.write(str(row) + '\n')
52 |
53 | if __name__ == '__main__':
54 | test_feed = [{
55 | 'title': "content",
56 |         'summary': 'content',
57 | 'id': 'https://twitter.com/zlj517/status/1266540485973180416',
58 | 'link': 'https://twitter.com/zlj517/status/1266540485973180416',
59 | }]
60 | Feed2Toot(test_feed)
--------------------------------------------------------------------------------
/utils/feed_decoder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on May 29, 2020
5 | Desc: Weibo HTML parser
6 | Author: Mashiro
7 | URL: https://2heng.xin
8 | License: MIT
9 | """
10 | from bs4 import BeautifulSoup
11 | from html import unescape
12 | import re
13 | from .get_config import GetConfig
14 |
15 | config = GetConfig()
16 |
17 | def TweetDecoder(rss_data):
18 |     """
19 |     :param rss_data: entry dict from FeedParaser (uses 'summary' and 'link')
20 |     :return: dict with 'video', 'video_poster', 'image' and 'plain' keys
21 |     """
22 | soup = BeautifulSoup(rss_data['summary'], features='html.parser')
23 |
24 | data = {
25 | 'video': [],
26 | 'video_poster': [],
27 | 'image': [],
28 | 'plain': None
29 | }
30 |
31 | for link in soup.find_all('a'):
32 | # link.replace_with(' ' + link.get('href') + ' ')
33 | if (link.has_attr('data-url')):
34 | if ('://t.cn/' in link.get('data-url')):
35 | if ('微博视频' in link.getText()):
36 | link.replace_with(f'''[?bs4_replace_flag?] {config['MASTODON']['VideoSourcePrefix']} {link.getText()} {link.get('data-url')} [?bs4_replace_flag?]''')
37 | else:
38 | link.replace_with(f'''[?bs4_replace_flag?] {config['MASTODON']['ExternalLinkPrefix']} {link.getText()} {link.get('data-url')} [?bs4_replace_flag?]''')
39 | else:
40 | link.replace_with(f'''[?bs4_replace_flag?] {config['MASTODON']['ExternalLinkPrefix']} {link.getText()} {link.get('href')} [?bs4_replace_flag?]''')
41 | elif (link.getText()[-1] == '#'):
42 | link.replace_with(f'''[?bs4_replace_flag?] {link.getText()[:-1]} [?bs4_replace_flag?]''')
43 | else:
44 | link.replace_with('[?bs4_replace_flag?]'+link.getText()+'[?bs4_replace_flag?]')
45 |
46 | for span in soup.find_all('span'):
47 |         if span.get('class') and 'url-icon' in span.get('class'):
48 | # print(span)
49 | img = span.find('img')
50 | img.replace_with(img.get('alt'))
51 |
52 | for video in soup.find_all('video'):
53 | # print(video.get('src'))
54 | if ('://f.video.weibocdn.com' in video.get('src')):
55 | # need to add a reffer i guess.
56 | data['video'].append(video.get('src'))
57 | data['video_poster'].append(video.get('poster'))
58 | video.replace_with('')
59 |
60 | for image in soup.find_all('img'):
61 | # print(video.get('src'))
62 | data['image'].append(image.get('src'))
63 | image.replace_with('')
64 |
65 | for br in soup.find_all('br'):
66 | br.replace_with('<|n>')
67 |
68 | for span in soup.find_all('span'):
69 | span.replace_with('')
70 | # span.replace_with(span.text)
71 |
72 | for div in soup.find_all('div'):
73 | div.replace_with('')
74 |
75 | for blockquote in soup.find_all('blockquote'):
76 | blockquote.unwrap()
77 |
78 | # print(soup.prettify())
79 | # print(str(data))
80 | plain_content = unescape(soup.prettify())
81 | plain_content = plain_content.replace(' ', ' ')
82 |     plain_content = plain_content.replace('\n[?bs4_replace_flag?]',' ').replace('[?bs4_replace_flag?]\n',' ').replace('[?bs4_replace_flag?]','').replace('\n- ','\n\\- ').replace('<|n>','\n')
83 | # plain_content = re.sub(r'(#[^#]+)#', lambda m : m.group(1)+' ', plain_content)
84 | data['plain'] = plain_content + '\n'+config['MASTODON']['SourcePrefix']+' ' + rss_data['link']
85 | return data
86 |
87 | if __name__ == '__main__':
88 | test_video = """
89 | Xin Chun Kuai Le to my dear friends in China! Follow @GranityStudios to discover all the New Year gifts from me. KobeBryant的微博视频
90 | """
91 |     print(TweetDecoder({'summary': test_video, 'link': ''}))
92 |
--------------------------------------------------------------------------------
/utils/feed_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on May 29, 2020
5 | Desc: RSS feed parser
6 | Author: Mashiro
7 | URL: https://2heng.xin
8 | License: MIT
9 | """
10 | import feedparser
11 |
12 | def FeedParaser(rss_link):
13 |     """
14 |     :param rss_link: RSS feed URL
15 |     :return: list of entry dicts with 'title', 'summary', 'id' and 'link'
16 |     """
17 | RssHubFeed = feedparser.parse(rss_link)
18 |
19 | rss = []
20 |
21 | for item in RssHubFeed.entries:
22 | data={}
23 | # for detail in item.keys():
24 | # data[detail]=item[detail]
25 | data['title']=item['title']
26 | data['summary']=item['summary']
27 | data['id']=item['id']
28 | data['link']=item['link']
29 | rss.append(data)
30 |
31 | # print(rss)
32 | return rss
33 |
34 | if __name__ == '__main__':
35 | print(str(FeedParaser("https://rsshub.app/bilibili/user/dynamic/161775300")))
--------------------------------------------------------------------------------
/utils/get_config.py:
--------------------------------------------------------------------------------
1 | import configparser
2 |
3 | config = configparser.ConfigParser()
4 | config.read('conf.ini')
5 |
6 | def GetConfig():
7 |     # conf.ini is parsed once at import time; every caller shares this
8 |     # ConfigParser instance, and all values come back as strings
9 |     # (hence comparisons like config['PROXY']['ProxyOn'] == 'true').
10 |     return config
--------------------------------------------------------------------------------
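
For reference, a minimal usage sketch (not part of the repository) of how the other modules consume GetConfig(). Since configparser returns every value as a string, flags such as ProxyOn and IncludeVideo are compared against the literals 'true' / 'false'.

```
from utils.get_config import GetConfig

config = GetConfig()  # the shared ConfigParser loaded from conf.ini

# All values are plain strings, so boolean flags are compared literally.
if config['PROXY']['ProxyOn'] == 'true':
    print('proxy enabled:', config['PROXY']['HttpProxy'])

print('instance  :', config['MASTODON']['BaseUrl'])
print('visibility:', config['MASTODON']['TootVisibility'])
print('feed      :', config['WEIBO']['WeiboRss'])
```
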
/utils/media_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on May 29, 2020
5 | Desc: Media file downloader
6 | Author: Mashiro
7 | URL: https://2heng.xin
8 | License: MIT
9 | """
10 | import urllib.request
11 | # import ffmpy
12 | from .get_config import GetConfig
13 |
14 | config = GetConfig()
15 |
16 | def MediaDownloader(data):
17 |     """
18 |     :param data: dict returned by TweetDecoder
19 |     :return: {'video_count': last index + 1 or None, 'image_count': last index + 1 or None, 'plain': str, 'video_link': str or None}
20 |     """
21 | # set header
22 | opener = urllib.request.build_opener()
23 | opener.addheaders = []
24 | opener.addheaders.append(('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'))
25 | opener.addheaders.append(('Referer', 'https://weibo.com/'))
26 | urllib.request.install_opener(opener)
27 |
28 | res = {'video_count': None, 'image_count': None, 'plain': None, 'video_link': None}
29 |
30 | if data['image']:
31 | img_id = 1
32 | for url in data['image']:
33 | if (img_id <= 4):
34 | try:
35 | urllib.request.urlretrieve(url, 'temp/img'+str(img_id)+'.png')
36 | img_id = img_id+1
37 |                 except Exception as e:
38 |                     print(f'ERRO: failed[img]: {url}')
39 |                     print(e)
40 |
41 |
42 | res['image_count']=img_id
43 |
44 | if data['video']:
45 | video_id = 1
46 | for url in data['video']:
47 | if (video_id <= 1):
48 | try:
49 | if config['MASTODON']['IncludeVideo'] != 'false':
50 | urllib.request.urlretrieve(url, 'temp/video'+str(video_id)+'.mp4')
51 |
52 | urllib.request.urlretrieve(data['video_poster'][video_id-1], 'temp/video'+str(video_id)+'.png')
53 | res['video_link']=url
54 | video_id = video_id+1
55 |                 except Exception as e:
56 |                     print(f'ERRO: failed[vid]: {url}')
57 |                     print(e)
58 |
59 |
60 | res['video_count']=video_id
61 |
62 | res['plain']=data['plain']
63 |
64 | return res
65 |
66 | if __name__ == '__main__':
67 | test_data = {'gif': ['https://video.twimg.com/tweet_video/EZLxKmTUMAARbSa.mp4'], 'gif_poster': ['https://pbs.twimg.com/tweet_video_thumb/EZLxKmTUMAARbSa.jpg'], 'video': ['https://video.twimg.com/ext_tw_video/1265470079203827712/pu/vid/1280x720/B-BRCBM0djUAqJl0.mp4?tag=10'], 'video_poster': ['https://pbs.twimg.com/ext_tw_video_thumb/1265470079203827712/pu/img/VujsmqbQORfHDeCP.jpg'], 'image': ['https://pbs.twimg.com/media/EZJh5RPUMAEz4aS?format=jpg&name=orig','https://s3-view.2heng.xin/aws_cached/2019/07/14/53c2adbc381e3aa17968d5d36feee002.md.png', 'https://s3-view.2heng.xin/aws_cached/2020/05/19/b1a7d8ff391616ad152f9958c6302ba0.md.jpg', 'https://s3-view.2heng.xin/aws_cached/2020/05/18/671a82563dfe40885196166683bf6f0b.md.jpg'], 'plain': '流程图工具 Excalidraw 可以做出下面这样的图示效果,可惜中文没有手写效果。 https://excalidraw.com/ '}
68 | MediaDownloader(test_data)
--------------------------------------------------------------------------------
/utils/toot_poster.py:
--------------------------------------------------------------------------------
1 | from mastodon import Mastodon
2 | import filetype
3 | from .get_config import GetConfig
4 |
5 | config = GetConfig()
6 |
7 | mastodon = Mastodon(
8 | access_token = config['MASTODON']['AccessToken'],
9 | api_base_url = config['MASTODON']['BaseUrl']
10 | )
11 |
12 | def media_post(file):
13 | kind = filetype.guess(file)
14 | # print('File extension: %s' % kind.extension)
15 | # print('File MIME type: %s' % kind.mime)
16 | return mastodon.media_post(file, kind.mime)
17 |
18 | def TootPoster(data):
19 |     """
20 |     :param data: dict returned by MediaDownloader
21 |     :return: None
22 |     """
23 | media_ids_arr = []
24 |
25 | if data['video_count'] is not None:
26 | id=1
27 | if config['MASTODON']['IncludeVideo'] == 'false':
28 | media_ids_arr.append(media_post('temp/video%d.png' % id))
29 | # data['plain'] = data['plain'] + '\n'+config['MASTODON']['VideoSourcePrefix']+' ' + data['video_link']
30 | else:
31 | try:
32 | media_ids_arr.append(media_post('temp/video%d.mp4' % id))
33 | except Exception:
34 | media_ids_arr.append(media_post('temp/video%d.png' % id))
35 | # data['plain'] = data['plain'] + '\n'+config['MASTODON']['VideoSourcePrefix']+' ' + data['video_link']
36 |
37 | if data['image_count'] is not None:
38 | for id in range(1, min(data['image_count'], 5)):
39 | media_ids_arr.append(media_post('temp/img%d.png' % id))
40 |
41 | try:
42 | mastodon.status_post(status=data['plain'], media_ids=media_ids_arr, visibility=config['MASTODON']['TootVisibility'])
43 |     except Exception as e:
44 |         print('ERRO: failed[mastodon.status_post]')
45 |         print(e)
46 |
47 |
48 | if __name__ == '__main__':
49 | test_data = {'gif_count': 1, 'video_count': None, 'image_count': 3, 'plain': 'Tooting from python using `status_post` #mastodonpy !'}
50 | TootPoster(test_data)
--------------------------------------------------------------------------------