<条目路径> < -g (添加该参数修改全局)>")
41 | return # 糊弄下IDE
42 | elif len(param_list) == 2:
43 | prefix = param_list[0].strip().lower()
44 | api = ''
45 | url = param_list[1].strip().rstrip('/') # 防止之后拼接的时候多出来斜杠
46 | else:
47 | prefix = param_list[0].strip().lower()
48 | api = param_list[1].strip().rstrip('/')
49 | url = param_list[2].strip().rstrip('/')
50 |
51 | # check params
52 | if url.endswith('api.php'):
53 | await add_wiki.finish("参数错误!如果您只提供了一个地址,则其必须是条目路径而非api地址")
54 | if api and not re.match(r'^https?:/{2}\w.+$', api):
55 | await add_wiki.finish("非法的api地址,请重新输入!")
56 | if not re.match(r'^https?:/{2}\w.+$', url):
57 | await add_wiki.finish("非法的条目路径,请重新输入!")
58 | if prefix in reserved or ":" in prefix or ":" in prefix:
59 | await add_wiki.finish("该前缀为保留前缀或含有非法字符,请重新输入!")
60 |
61 | if api:
62 | success = False
63 | for i in range(3):
64 | try:
65 | if get_driver().config.wiki_proxy:
66 | await MediaWiki.create(url=api, timeout=10, proxies=get_driver().config.wiki_proxy)
67 | else:
68 | await MediaWiki.create(url=api, timeout=10)
69 | success = True
70 | break
71 | except (MediaWikiAPIURLError, TimeoutError):
72 | continue
73 | except Exception as e:
74 | logger.error(f"添加wiki时发生错误:{e}")
75 | await add_wiki.finish("因未知错误无法连接到api,请bot管理员检查日志")
76 | if not success:
77 | await add_wiki.finish("无法连接到wiki,请检查api地址是否正确!如果确认无误,可能是网络故障或者防火墙拦截,"
78 | "您可以不提供api地址,直接提供条目路径即可")
79 |
80 | # 进行插入操作
81 | group_id = event.group_id if isinstance(event, GroupMessageEvent) else 0
82 | config = Config(group_id=group_id)
83 | if (is_global and config.add_wiki_global(prefix, api, url)) \
84 | or (not is_global and config.add_wiki(prefix, api, url)):
85 | await add_wiki.finish(f"添加/编辑Wiki:{prefix}成功!")
86 | else:
87 | await add_wiki.finish("呜……出错了……请联系bot管理员进行处理……")
88 |
89 |
list_wiki = on_command("wiki.list")


@list_wiki.handle()
async def _list_wiki(bot: Bot, event: MessageEvent, raw_command: str = RawCommand()):
    """Reply with the bound wiki list: global list with ``-g``, otherwise this group's list."""
    message_text = str(event.message).strip()
    _, options = process_command(raw_command, message_text)

    if options.get("g"):
        # Group id 0 holds the global configuration; finish() raises, so we never fall through.
        await list_wiki.finish(Config(group_id=0).list_data[1])
    if isinstance(event, GroupMessageEvent):
        await list_wiki.finish(Config(group_id=event.group_id).list_data[0])
107 |
108 |
del_wiki = on_command("wiki.delete", permission=SUPERUSER | GROUP_ADMIN | GROUP_OWNER)


@del_wiki.handle()
async def _del_wiki(bot: Bot, event: MessageEvent, raw_command: str = RawCommand()):
    """Delete a bound wiki by prefix; ``-g`` (superuser only) targets the global table."""
    msg = str(event.message).strip()
    param_list, param_dict = process_command(raw_command, msg)

    # check if the operation targets the global table (superuser only)
    is_global = False
    if param_dict.get("g"):
        if await SUPERUSER(bot, event):
            is_global = True
        else:
            await del_wiki.finish("您没有权限使用此命令!")

    if not param_list:
        await del_wiki.finish("你似乎没有提供要删除的前缀的说……")
    # Prefixes are stored lower-cased and stripped by wiki.add, so normalize here too;
    # otherwise e.g. "wiki.delete Foo" would silently fail to match the stored "foo".
    prefix = param_list[0].strip().lower()
    group_id = event.group_id if isinstance(event, GroupMessageEvent) else 0
    config = Config(group_id=group_id)

    if (is_global and config.del_wiki_global(prefix)) or (not is_global and config.del_wiki(prefix)):
        await del_wiki.finish("删除成功")
    else:
        await del_wiki.finish("呜……删除失败了……请检查前缀是否有误")
135 |
136 |
set_default = on_command("wiki.default", permission=SUPERUSER | GROUP_ADMIN | GROUP_OWNER)


@set_default.handle()
async def _set_default(bot: Bot, event: MessageEvent, state: T_State, raw_command: str = RawCommand()):
    """Set the default wiki prefix; ``-g`` (superuser only) targets the global table."""
    msg = str(event.message).strip()
    param_list, param_dict = process_command(raw_command, msg)

    # check if the operation targets the global table (superuser only)
    is_global = False
    if param_dict.get("g"):
        if await SUPERUSER(bot, event):
            is_global = True
        else:
            await set_default.finish("您没有权限使用此命令!")

    if not param_list:
        await set_default.finish("你似乎没有提供要设置的前缀的说……")
    # Prefixes are stored lower-cased and stripped by wiki.add, so normalize here too;
    # otherwise e.g. "wiki.default Foo" would never match the stored "foo".
    prefix = param_list[0].strip().lower()
    group_id = event.group_id if isinstance(event, GroupMessageEvent) else 0
    config = Config(group_id=group_id)

    if (is_global and config.set_default_global(prefix)) or (not is_global and config.set_default(prefix)):
        await set_default.finish("设置成功")
    else:
        await set_default.finish("呜……设置失败了……请检查前缀是否有误")
163 |
--------------------------------------------------------------------------------
/nonebot_plugin_mediawiki/mediawiki/exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | MediaWiki Exceptions
3 | """
4 | from .constants import URL
5 | from .utilities import str_or_unicode
6 |
# Canned "please file a bug" message; {URL} is the project repository URL
# imported from .constants.
ODD_ERROR_MESSAGE = (
    "This should not happen. If the MediaWiki site you are "
    "querying is available, then please report this issue on "
    "GitHub: {URL}/issues".format(URL=URL)
)
12 |
13 |
class MediaWikiBaseException(Exception):
    """Common root of the exception hierarchy used by this MediaWiki package.

    Args:
        message: Human-readable description of the failure."""

    def __init__(self, message):
        self._message = message
        super().__init__(self._message)

    def __unicode__(self):
        return self.message

    def __str__(self):
        # str_or_unicode is a py2/py3 compatibility shim from .utilities
        return str_or_unicode(self.__unicode__())

    @property
    def message(self):
        """ str: The MediaWiki exception message """
        return self._message
34 |
35 |
class MediaWikiException(MediaWikiBaseException):
    """Generic error reported by the MediaWiki site.

    Args:
        error (str): The error message that the MediaWiki site returned"""

    def __init__(self, error):
        self._error = error
        super(MediaWikiException, self).__init__(
            'An unknown error occurred: "{0}". Please report it on GitHub!'.format(error)
        )

    @property
    def error(self):
        """ str: The error message that the MediaWiki site returned """
        return self._error
53 |
54 |
class PageError(MediaWikiBaseException):
    """ Exception raised when no MediaWiki page matched a query

    Args:
        title (str): Title of the page
        pageid (int): MediaWiki page id of the page"""

    def __init__(self, title=None, pageid=None):
        # Always define BOTH attributes: the original code set only one of
        # them, so reading the other property raised AttributeError instead
        # of returning None.
        self._title = title
        self._pageid = pageid
        if title:
            msg = '"{0}" does not match any pages. Try another query!'.format(
                self.title
            )
        elif pageid:
            msg = 'Page id "{0}" does not match any pages. Try another id!'.format(
                self.pageid
            )
        else:
            # Neither argument given: keep the historical empty-title message.
            self._title = ""
            msg = '"{0}" does not match any pages. Try another query!'.format(
                self.title
            )
        super(PageError, self).__init__(msg)

    @property
    def title(self):
        """ str: The title that caused the page error """
        return self._title

    @property
    def pageid(self):
        """ int: The page id that caused the page error """
        return self._pageid
89 |
90 |
class RedirectError(MediaWikiBaseException):
    """Raised when a page title unexpectedly resolves to a redirect.

    Args:
        title (str): Title of the page that redirected
    Note:
        This should only occur if both auto_suggest and redirect \
        are set to **False** """

    def __init__(self, title):
        self._title = title
        super(RedirectError, self).__init__(
            '"{0}" resulted in a redirect. Set the redirect property to True '
            "to allow automatic redirects.".format(self._title)
        )

    @property
    def title(self):
        """ str: The title that was redirected """
        return self._title
114 |
115 |
class DisambiguationError(MediaWikiBaseException):
    """Raised when a query resolves to a disambiguation page.

    Args:
        title (str): Title that resulted in a disambiguation page
        may_refer_to (list): List of possible titles
        url (str): Full URL to the disambiguation page
        details (list[dict]): A list of dictionaries with more information of \
                possible results
    Note:
        `options` only includes titles that link to valid \
        MediaWiki pages """

    def __init__(self, title, may_refer_to, url, details=None):
        self._title = title
        self._options = sorted(may_refer_to)
        self._details = details
        self._url = url
        listing = "\n ".join(self._options)
        super(DisambiguationError, self).__init__(
            '\n"{0}" may refer to: \n {1}'.format(self._title, listing)
        )

    @property
    def url(self):
        """ str: The url, if possible, of the disambiguation page """
        return self._url

    @property
    def title(self):
        """ str: The title of the page """
        return self._title

    @property
    def options(self):
        """ list: The list of possible page titles """
        return self._options

    @property
    def details(self):
        """ list: The details of the proposed non-disambigous pages """
        return self._details
158 |
159 |
class HTTPTimeoutError(MediaWikiBaseException):
    """Raised when a request to the MediaWiki site times out.

    Args:
        query (str): The query that timed out"""

    def __init__(self, query):
        self._query = query
        super(HTTPTimeoutError, self).__init__(
            'Searching for "{0}" resulted in a timeout. '
            "Try again in a few seconds, and ensure you have rate limiting "
            "set to True.".format(self._query)
        )

    @property
    def query(self):
        """ str: The query that timed out """
        return self._query
179 |
180 |
class MediaWikiAPIURLError(MediaWikiBaseException):
    """Raised when the given URL does not serve the MediaWiki API.

    Args:
        api_url (str): The API URL that was not recognized """

    def __init__(self, api_url):
        self._api_url = api_url
        super(MediaWikiAPIURLError, self).__init__(
            "{0} is not a valid MediaWiki API URL".format(self._api_url)
        )

    @property
    def api_url(self):
        """ str: The api url that raised the exception """
        return self._api_url
196 |
197 |
class MediaWikiGeoCoordError(MediaWikiBaseException):
    """Raised for errors returned by the GeoData extension.

    Args:
        error (str): Error message from the MediaWiki site related to \
            GeoCoordinates """

    def __init__(self, error):
        self._error = error
        super(MediaWikiGeoCoordError, self).__init__(
            "GeoData search resulted in the following error: {0}"
            " - Please use valid coordinates or a proper page title.".format(self._error)
        )

    @property
    def error(self):
        """ str: The error that was thrown when pulling GeoCoordinates """
        return self._error
217 |
218 |
class MediaWikiCategoryTreeError(MediaWikiBaseException):
    """Raised when a category tree cannot be completed for an unknown reason.

    Args:
        category (str): The category that threw an exception """

    def __init__(self, category):
        self._category = category
        super(MediaWikiCategoryTreeError, self).__init__(
            "Categorytree threw an exception for trying to get the "
            "same category '{}' too many times. Please try again later "
            "and perhaps use the rate limiting "
            "option.".format(self._category)
        )

    @property
    def category(self):
        """ str: The category that threw an exception during category tree \
                 generation """
        return self._category
241 |
242 |
class MediaWikiLoginError(MediaWikiBaseException):
    """Raised when logging in to the MediaWiki site fails.

    Args:
        error (str): The error message that the MediaWiki site returned """

    def __init__(self, error):
        # The site's message is passed through verbatim as the exception text.
        super(MediaWikiLoginError, self).__init__(error)
        self._error = error

    @property
    def error(self):
        """ str: The error message that the MediaWiki site returned """
        return self._error
257 |
258 |
# Exception added by KoishiMoe
class InterWikiError(MediaWikiBaseException):
    """Raised when a page resolves to an interwiki link.

    Args:
        title (str): Title that results in a interwiki link
        url (str): Full URL to the link
    """

    def __init__(self, title, url):
        self._title = title
        self._url = url
        super(InterWikiError, self).__init__(
            "{0} is an interwiki link to {1}".format(self._title, self._url)
        )

    @property
    def title(self):
        """ str: The title that resolved to an interwiki link """
        return self._title

    @property
    def url(self):
        """ str: The target URL of the interwiki link """
        return self._url
281 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | # nonebot-plugin-mediawiki
8 |
9 | _适用于 [NoneBot2](https://v2.nonebot.dev) 的 MediaWiki 查询插件_
10 |
11 |
12 |
13 | ------
14 |
15 |
16 |
17 | 
18 | 
19 | [](https://github.com/KoishiMoe/nonebot-plugin-mediawiki/blob/main/LICENSE)
20 | [](https://pypi.org/project/nonebot-plugin-mediawiki/)
21 | 
22 |
23 | [](https://github.com/KoishiMoe/nonebot-plugin-mediawiki/issues)
24 | [](https://github.com/KoishiMoe/nonebot-plugin-mediawiki/releases)
25 | 
26 | 
27 |
28 |
29 |
30 | ------
31 |
32 | 本项目是 [Flandre](https://github.com/KoishiMoe/Flandre) 的
33 | [wiki](https://github.com/KoishiMoe/Flandre/tree/main/src/plugins/wiki) 组件,经简单修改成为独立插件发布
34 |
35 | ## 旧版本用户请注意
36 | 本插件在1.0版本进行了一次重构,同时更改了设置命令的语法(查询不受影响),请阅读文档的相应部分
37 |
38 | ## 关于更新
39 |
40 | 这个插件是我很久之前写的,由于个人精力有限,目前**并未**积极跟进上游更新,也**没有**进行兼容性测试。如果你在最新版的nonebot2上使用它时出现了问题,请在issue区指出。
41 |
功能更新会在后续有时间的时候再进行。当前计划中的有:跟进最新版nonebot,解除onebot适配器依赖,添加条目跟踪等功能,以及简化命令、优化管理相关操作,并对整个项目进行重构以优化性能、提升可维护性。
43 |
44 | ETA:无
45 |
46 |
47 |
48 | ## FAQ
49 |
50 |
51 | 这个插件是干什么的?
52 |
53 | * 在**群聊**中查wiki用的。此处的wiki特指基于[mediawiki](https://mediawiki.org)开设的wiki网站
54 | * 对大多数群可能都有用,但是使用频率大概会很低。从功能上说,我是为了一些具有特定的专业性话题的讨论群设计的。
55 | 例如Linux群,成员如果问了一个Wiki上已经写的很详细的问题,其他成员就可以直接利用bot来指引提问者去查看对应页面,而非自己去wiki上找,
56 | 然后再复制链接发出来。另外,对于一些wiki项目的编辑交流群来说,这种插件也可能有助于提高编辑间的交流效率
57 | > 简单举例:
58 | >
59 | > A: dalao们,我这钓鱼怎么钓不出来附魔书啊
60 | >
61 | > B: 参考[[钓鱼#垃圾与宝藏]]
62 | >
63 | > Bot: https://zh.minecraft.wiki/w/%E9%92%93%E9%B1%BC#%E5%9E%83%E5%9C%BE%E4%B8%8E%E5%AE%9D%E8%97%8F
64 |
65 |
66 |
67 | 这个插件和其他wiki插件有什么不同吗?
68 |
69 | 在我发布这个插件时,nonebot还没有这类插件,而我在的群用得上这个,因此就顺便搓了一个发布了。现在nb市场也有些其他的wiki插件(或者包含wiki功能的bot),
70 | 其中的一些是适配特定wiki的,不具有通用性(但它们可能适配了本插件无法适配的wiki,例如scpwiki——它使用的系统是wikidot,且不开放api);
71 | 另一些(目前插件市场里只有一个,叫nonebot-plugin-wiki)同样针对mediawiki设计,意味着它们和本插件的功能在相当程度上可能是重叠的,
72 | 此时你可以比较差异功能、更新频率、兼容性、稳定性等选择适合自己的(例如前面提到的插件,接入了bwiki的api,可以获取bwiki上条目的简介之类)
73 |
74 |
75 |
76 | MediaWiki是啥?API是啥?条目路径是啥?
77 |
78 | * MediaWiki是维基媒体基金会开发的一套wiki引擎,大名鼎鼎的维基百科就是由它驱动的。
79 | 目前世界上有很多wiki网站均使用不同版本的该引擎驱动,尤其是很多游戏、动漫等的wiki
80 | * 本插件提到的API都指MediaWiki的API。利用API,bot可以与wiki站点通信,来快速从wiki站点获取想要的信息。在本插件中,bot就利用api搜索指定标题是否存在、
81 | 获取页面的链接、查询消歧义义项等。
82 | * 部分wiki出于各种原因可能不开放API,或者有非常严格的使用限制,这使得利用API获取链接的方法无法使用。所幸MediaWiki的链接格式非常稳定,
83 | 都是 `一个固定链接/条目名`,基于此,在已知这个固定链接的前提下,我们可以直接将其和条目名拼接生成链接。当然,这样bot就无法检查条目是否存在,
84 | 损失了一部分功能。在MediaWiki中,这个固定链接一般被叫做`条目路径`
85 |
86 |
87 |
88 | 我怎么知道我要查的wiki用的是不是MediaWiki?
89 |
90 | * 直接在目标wiki搜索框搜`Special:Version`就行,一般会跳到版本页面;没有搜索功能的wiki也可以直接把浏览器地址栏末尾的条目名改成前面的那一串然后回车。
91 | 如果提示没有权限访问、不存在的特殊页面之类,应该也是MediaWiki
92 | * 其实一般的MediaWiki站点不会刻意隐藏自己使用了MediaWiki,没什么意义,所以一般在网站的关于、版权信息之类的地方也能找到相关说明
93 |
94 |
95 |
96 | 查询命令怎么这么奇怪
97 |
98 | * 因为MediaWiki中,同wiki内部互相引用条目用的就是双方括号,引用模板则是双花括号,这样设计是为了和wiki保持一致。
99 | * 至于圆括号,MediaWiki中确实没有,不过上面都用了其他两种括号了,下面用圆括号也比较自然(确信)
100 | * 为了方便手机查询 (issue#1) ,本插件也有简化的条目查询命令,即 `wiki xxx`
101 |
102 |
103 |
104 | 这插件怎么用
105 |
106 | * 如果你是一般路过群友,只需要知道 `wiki 前缀:条目名` 这种查询语法一般就可以了,前缀的定义在下面有写。如果你的群只绑定了一个wiki,前缀是可以省略的
107 | * 如果你是群管理员……看下面文档的说明吧~
108 |
109 |
110 | ## 使用说明
111 |
112 | ### TL;DR
113 |
114 | 查询条目: `[[条目名]]` `[[prefix:条目名]]`
115 |
116 | 查询条目(方法2): `wiki 条目名` `wiki prefix:条目名`
117 |
118 | 查询模板: `{{模板名}}` `{{prefix:模板名}}`
119 |
120 | 绕过api查询条目: `((条目名))` `((prefix:条目名))`
121 |
122 | 页面截图: `wiki.shot prefix:条目名`
123 |
添加:wiki.add <前缀> [api地址] <通用url地址> < -g(添加该参数表示操作全局wiki)>
125 |
126 | 删除:wiki.delete <前缀> < -g >
127 |
128 | 列表:wiki.list < -g >
129 |
130 | 设置默认:wiki.default <前缀> < -g >
131 |
132 | **其中所有非全局指令均需要在目标群中进行,所有全局指令(除查询)均只有Bot管理员能执行**
133 |
134 | ### 查询功能
135 |
136 | 查询功能的语法和标准的mediawiki内链格式基本一致:
137 |
138 | 使用半角中括号包裹要查询的条目名,如 `[[帮助]]`
139 |
140 | 使用半角大括号包裹要查询的模板名,如 `{{测试模板}}`
141 |
142 | (PS:直接使用`[[Template:模板名]]`也是可行的)
143 |
144 | 此外,方便起见,也可以用`wiki 条目名` `wiki prefix:条目名`的方法查询
145 |
146 | 使用`wiki.shot prefix:条目名`可以获取对应页面的截图 **(测试版功能,使用时请注意安全风险,如调取敏感条目,泄漏服务器ip,或者使用浏览器漏洞对服务器进行攻击。如不需要本功能,请先使用pypi上的正式版)**
147 |
148 | Bot会尝试去调取目标wiki的api,并获取对应标题的页面信息(默认允许重定向、跨wiki、简繁转换)。如果未找到对应条目,或者对应页面是消歧义页面,
149 | 则会提供数字来选择。如果调用api失败或者未配置api,会回落到字符串拼接的方式生成链接。
150 |
151 | > Tip:如果api返回的结果不是你想要的,可以使用半角小括号包裹条目名以绕过api,如 ((帮助))
152 |
153 | 当绑定了多个wiki时,需要指定前缀以查询默认wiki之外的wiki,例如,假如将某个wiki的前缀设置为flan,且不是默认wiki,则查询命令对应为[[flan:帮助]]
154 |
155 | ### 管理功能
156 |
157 | * wiki列表
158 | * 权限:所有人可用
159 | * 语法:`wiki.list`
160 | * 返回:当前群绑定的wiki列表,以及全局wiki列表
161 |
162 |
163 | * 添加wiki
164 | * 语法 `wiki.add`
165 | * 参数:
166 | * 前缀:用于区分wiki的前缀,仅支持字母、数字和下划线,不能和本群已有的重复,但可以和全局已有的重复,此时本地设置优先。另外,为了防止和mediawiki的名字空间冲突,bot默认屏蔽了部分名字空间名作为前缀的能力,也请在绑定前先了解目标wiki的名字空间情况。
167 | * api地址(可选):目标wiki的mediawiki api的地址。某些wiki可能限制api调用,此时可以不设置api。该地址通常可以在目标wiki的`Special:版本#接入点URL`页面中找到。或者也可以尝试这些一般的格式:
168 |
169 | > https://www.example.org/api.php (如萌娘百科)
170 | >
171 | > https://www.example.org/w/api.php (如维基百科)
172 |
173 | * 通用url:目标wiki的条目路径。通常来讲,在该url后加上正确的条目名即可访问目标条目。可以在目标wiki的`Special:版本#接入点URL`中找到(“条目路径”中的$1即条目名)
174 |
175 | > 例如,对维基百科:https://www.example.org/wiki
176 | >
177 | > 对萌百等:https://www.example.org/
178 |
179 |
180 |
181 | * 删除wiki
182 | * 语法 `wiki.delete`
183 | * 参数:
184 | * 前缀:要删除的wiki的前缀
185 |
186 |
187 | * 设置默认wiki
188 | * 语法 `wiki.default`
189 | * 参数:
190 | * 前缀:要设置默认的wiki的前缀
191 |
> Tip:本群/全局绑定的第一个wiki将被自动设置为本地/全局的默认wiki,当本地/全局绑定的默认wiki被删除时会自动清除对应的默认wiki设置,无需手动操作。
193 |
194 |
195 | ### 附加说明
196 | #### 本地和全局
197 |
198 | bot管理员可以设置全局的wiki,全局wiki的设计意图在于回落,换句话说,本地设置无条件优先于全局设置。当且仅当在以下情况下,全局设置会被应用:
199 |
200 | 1. 本地没有绑定任何wiki
201 | 2. 本地没有设置默认前缀,而查询请求中又不包含前缀
202 |
203 | > 注意:如果本地有和全局默认前缀相同的wiki时,本地的wiki仍将被优先调用
204 |
205 | 3. 本地设置了默认前缀,但是本地不存在该wiki
206 |
207 | > 注意:当前缀在全局中也不存在时,前缀将被视为名字空间,直接和条目名一并传入api进行查询
208 |
209 | 4. 查询请求中包含的前缀在本地不存在
210 |
211 | #### API调用
212 |
213 | 为了提供更准确的结果,默认情况下bot会调用mediawiki api查询条目。当api无法正常调用时,会使用通用url和条目名拼接作为回落。
214 | 如果返回了错误的结果,可以使用小括号查询来绕过api。
215 |
216 | #### 使用代理
217 |
218 | 如果你需要使用代理来优化某些wiki的访问速度,可以在`.env`文件中设置`WIKI_PROXY`环境变量,该变量的值为代理地址,格式为`scheme://(user:password@)host:port`,例如:
219 |
220 | ```dotenv
221 | WIKI_PROXY=http://127.0.0.1:1080
222 | ```
223 |
224 | ```dotenv
225 | WIKI_PROXY=socks5://user:mysecret@example.org:11451
226 | ```
227 |
228 | 该变量会被传递给aiohttp和playwright,因此可以用于本插件中的所有网络请求。
229 |
230 | 如果`.env`文件中的配置无法正常生效,你也可以:
231 |
232 | a) 在系统环境变量中添加代理设置
233 | ```shell
234 | # windows cmd
235 | set WIKI_PROXY 'http://127.0.0.1:1080'
236 | # windows powershell
237 | $Env:WIKI_PROXY='http://127.0.0.1:1080'
238 | # linux/macOS
239 | export WIKI_PROXY='http://127.0.0.1:1080'
240 | ```
241 |
242 | b) 在bot的入口文件(通常是`bot.py`)中对`config.wiki_proxy`直接赋值:
243 | ```python
244 | import nonebot
245 |
246 | # 初始化时
247 | nonebot.init(wiki_proxy="http://127.0.0.1:1080")
248 |
249 | # 或者在初始化后
250 | config = nonebot.get_driver().config
251 | config.wiki_proxy = "socks5://user:pass@proxy.example.org"
252 | ```
253 |
254 | 请参考[nonebot文档](https://nonebot.dev/docs/appendices/config#%E9%85%8D%E7%BD%AE%E9%A1%B9%E7%9A%84%E5%8A%A0%E8%BD%BD)获取更多信息
255 |
256 | > 注意:该代理设置不支持按wiki分流,即所有wiki的请求都会使用同一个代理。如果你有此类需求,建议使用代理客户端内置的分流功能,它们通常会提供更灵活的配置选项。
257 | >
258 | > 由于众所周知的原因,传播某些需要代理才能访问的wiki内的内容可能影响帐号安全,请谨慎添加此类wiki,或者使用保证内容合法的境内镜像站(如有)
259 |
260 | #### 截图功能
261 |
262 | 本插件支持截图功能,但是需要额外安装依赖。如果你不需要截图功能,可以跳过这一节。
263 |
264 | * 如果你还没安装该插件
265 | ```shell
266 | pip install nonebot-plugin-mediawiki[shot]
267 | ```
268 | 这样,pip会在安装该插件时自动安装截图功能所需的依赖(目前仅有playwright)
269 |
270 | * 如果你已经安装了该插件,则需要在安装了该插件的虚拟环境中安装playwright
271 | ```shell
272 | # 激活虚拟环境
273 | # linux, venv
274 | source venv/bin/activate
275 | # windows, venv
276 | venv\Scripts\activate.bat
277 | # 安装playwright
278 | pip install playwright
279 | ```
280 |
281 | * 然后再安装chromium(或firefox/webkit,如需使用其他浏览器,请参考后续说明设置环境变量)
282 | ```shell
283 | # 安装chromium
284 | playwright install --with-deps chromium
285 | # 如果你的系统不被playwright官方支持,可以尝试只安装浏览器(不带上面的--with-deps参数),然后使用发行版的包管理器安装完整的chromium以满足依赖。但请注意发行版的chromium版本和playwright的可能不一致,带来兼容性问题
286 | # CentOS
287 | sudo yum install chromium
288 | # Arch
289 | sudo pacman -S chromium
290 | ```
291 |
292 | * 页面设置
293 |
294 | 如果需要调整截图时浏览器窗口的大小,可以在`.env`文件中设置`WIKI_SHOT_BROWSER_WIDTH`和`WIKI_SHOT_BROWSER_HEIGHT`环境变量,例如:
295 |
296 | ```dotenv
297 | WIKI_SHOT_BROWSER_WIDTH=408
298 | WIKI_SHOT_BROWSER_HEIGHT=785
299 | ```
300 |
301 | 将窗口大小设置为更小的值可以减少截图时的资源占用,并在支持响应式布局的wiki中获得更适合阅读的页面布局。但对优化不好的wiki来说,可能会导致部分内容无法正常显示。
302 |
303 | 如果需要调整浏览器的默认语言,可以在`.env`文件中设置`WIKI_SHOT_BROWSER_LANGUAGE`环境变量,例如:
304 |
305 | ```dotenv
306 | WIKI_SHOT_BROWSER_LANGUAGE=zh-CN
307 | ```
308 |
309 | 如果需要调整页面加载时间限制(默认30秒),可以在`.env`文件中设置`WIKI_SHOT_TIMEOUT`环境变量,单位为秒,例如:
310 |
311 | ```dotenv
312 | WIKI_SHOT_TIMEOUT=60
313 | ```
314 |
315 | 无头浏览器操作的时间内,bot不会向用户发送进度消息,因此过长的超时时间可能被认为是无响应,调整时间限制时请注意这一点。
316 |
317 | 如果需要调整何时进行截图,可以在`.env`文件中设置`WIKI_SHOT_WAIT_UNTIL`环境变量,有效值有`load`、`domcontentloaded`、`networkidle`、`commit`,例如:
318 |
319 | ```dotenv
320 | WIKI_SHOT_WAIT_UNTIL=networkidle
321 | ```
322 |
323 | 请参考[playwright文档](https://playwright.dev/python/docs/api/class-page#page-goto)获取这些值的具体含义
324 |
325 | 如果要分页截图,可以在`.env`文件中设置`WIKI_SHOT_SPLIT_PAGES`环境变量,表示最大允许的分页数量,例如:
326 |
327 | ```dotenv
328 | WIKI_SHOT_SPLIT_PAGES=20 # 允许最多20页
329 | ```
330 |
331 | 默认值是0,表示不分页截图。每个页面的长宽由上面提到的`WIKI_SHOT_BROWSER_WIDTH`和`WIKI_SHOT_BROWSER_HEIGHT`控制。
332 |
333 | > 注意:部分wiki使用的自定义皮肤可能无法正确分页,导致分页逻辑陷入死循环,因此请根据自己的需求设定一个合理的最大值
334 |
335 | * 使用其他浏览器
336 |
337 | 如果你想使用firefox或webkit进行截图,可以在`.env`文件中设置`WIKI_SHOT_BROWSER`环境变量,例如:
338 |
339 | ```dotenv
340 | WIKI_SHOT_BROWSER=firefox # 或者 webkit
341 | ```
342 |
343 | > 注意:
344 | >
345 | > 1. 切换浏览器后,需要重新安装浏览器以及其依赖。例如,使用firefox时,需要运行`playwright install --with-deps firefox`来安装firefox浏览器及其依赖
346 | >
347 | > 2. 虽然mediawiki对主流浏览器的支持都还不错,但部分wiki有自定义的脚本和样式,这些脚本和样式可能只针对chromium进行了优化,使用其他浏览器时可能会出现显示异常的情况,还请注意
348 | >
349 | > 3. 虽然chrome经常因其高占用而被诟病,但这并不代表它的竞争对手就一定更省资源。由于chrome的市场地位,它往往是各大网站优化的重点对象,使用其他浏览器时可能会遇到更高的资源占用
350 |
351 | 当前该功能**仍处于测试阶段**,不建议在生产环境中使用。以下是一些您可能需要注意的问题:
352 | * chromium会占用大量服务器资源,如果您的服务器配置较低,建议不要使用截图功能。我们建议至少使用2GB内存、且没有其他大型程序运行的服务器来启用该功能。如果您确实需要在资源受限的环境中使用该功能,可以尝试调整浏览器窗口大小、延长超时时间等参数来提高成功率,但无法保证一定能正常工作
353 | * 本插件对输出的内容没有过滤,您可能需要考虑安全性问题(例如,如果您的bot在公开群中使用,可能会被恶意利用来发送一些不适合在某些地区显示的内容)。
354 | * 某些wiki有奇怪的弹窗、广告等,也有些wiki的防火墙比较严格,或者有人机验证等,可能会导致无头浏览器无法正常获取页面。
355 | * 恶意的群成员可能会利用一些wiki的特殊页面来获取bot的服务器ip等敏感信息
356 | * 攻击者可能会利用浏览器漏洞来入侵服务器,建议定期更新playwright和chromium
357 |
358 | #### 对特定wiki的附加功能
359 |
360 | 为个别wiki添加了特殊的处理逻辑,以绕过某些限制或提升查询体验。截至目前只适配了某娘百科
361 |
362 | ##### 萌百
363 |
364 | 鉴于该wiki的新皮肤中出现大量干扰元素以及动效,影响截图效果并在某些设备上带来严重卡顿,故插件会自动在截图前,在url尾部添加`useskin=vector`参数以强制使用较为简洁的vector皮肤
365 |
366 | > 2025年暑期更新:萌百管理员已在桌面版中彻底删除了vector皮肤,但目前在移动端中,其继任者`vector-2022`皮肤仍暂时可用。由于`vector-2022`支持响应式布局,在宽屏设备使用移动端网站(`mzh`)并不会影响显示效果,因此插件会默认将所有萌百的截图请求重定向到移动端并使用`vector-2022`皮肤
367 |
368 | 如果你不希望插件对萌百进行这些特殊处理,可以添加以下环境变量来禁用这一行为:
369 |
370 | ```dotenv
371 | WIKI_MOEGIRL_USE_NEW_SKIN=true
372 | ```
373 |
374 |
375 | What can I say?
376 |
377 | [](https://mzh.moegirl.org.cn/index.php?curid=511920&oldid=7072039&useskin=vector-2022)
378 | [](https://zh.moegirl.org.cn/Special:Preferences)
379 |
380 |
--------------------------------------------------------------------------------
/nonebot_plugin_mediawiki/worker.py:
--------------------------------------------------------------------------------
1 | import re
2 | import urllib.parse
3 | from asyncio import TimeoutError
4 | from urllib import parse
5 |
6 | import nonebot
7 | from aiohttp import ContentTypeError, ClientProxyConnectionError, ClientConnectorError
8 | from nonebot import on_regex, on_command, logger
9 | from nonebot.adapters.onebot.v11 import Bot, utils, GroupMessageEvent, GROUP, MessageSegment
10 | from nonebot.internal.matcher import Matcher
11 | from nonebot.typing import T_State
12 |
13 | from .config import Config
14 | from .constants import ARTICLE_RAW, ARTICLE, RAW, TEMPLATE
15 | from .exception import NoDefaultPrefixException, NoSuchPrefixException
16 |
17 | __all__ = ['wiki_preprocess', 'wiki_parse']
18 |
19 | from .fakemwapi import DummyMediaWiki, DummyPage
20 |
21 | from .mediawiki import MediaWiki, HTTPTimeoutError, MediaWikiException, MediaWikiGeoCoordError, PageError, \
22 | DisambiguationError
23 | from .mediawiki.exceptions import InterWikiError, MediaWikiAPIURLError, MediaWikiBaseException
24 | from .utilities import ensure_url_param
25 |
# Cache of live MediaWiki API clients, keyed by API URL, so each wiki's
# client is created at most once per bot process.
wiki_instances = {}

# Lazily-initialized Playwright state shared by all screenshot requests.
playwright = None
browser = None
browser_context = None
# Sticky failure flags: once set, screenshot requests fail fast until restart.
playwright_not_installed = False
playwright_launch_error = False
34 |
35 |
@nonebot.get_driver().on_shutdown
async def shutdown():
    """Release the shared Playwright browser and driver when the bot shuts down."""
    global playwright, browser, browser_context
    if browser:
        await browser.close()
        browser = None
        browser_context = None
    if playwright:
        # Playwright.stop() is a coroutine in the async API; the original code
        # called it without `await`, which only created a never-run coroutine
        # object and released nothing.
        await playwright.stop()
        playwright = None
45 |
# Responders: one matcher per query syntax, distinguished by the "mode" state key.
# TODO: use matcher group
wiki_article = on_regex(ARTICLE_RAW, permission=GROUP, state={"mode": "article"})  # [[title]]
wiki_template = on_regex(TEMPLATE, permission=GROUP, state={"mode": "template"})  # {{template}}
wiki_raw = on_regex(RAW, permission=GROUP, state={"mode": "raw"})  # ((title)) — bypass the API
wiki_quick = on_command("wiki ", permission=GROUP, state={"mode": "quick"})  # wiki title
wiki_shot = on_command("wiki.shot ", permission=GROUP, state={"mode": "shot"})  # page screenshot
53 |
54 |
@wiki_article.handle()
@wiki_template.handle()
@wiki_raw.handle()
@wiki_quick.handle()
@wiki_shot.handle()
async def wiki_preprocess(bot: Bot, event: GroupMessageEvent, state: T_State, matcher: Matcher):
    """Extract the requested title from the message and stash it in ``state``.

    The matcher that fired determines ``state["mode"]``; for "shot" mode this
    also lazily initializes the shared Playwright browser/context.
    """
    message = utils.unescape(str(event.message).strip())
    mode = state["mode"]
    if mode == "article":
        title = re.findall(ARTICLE, message)
    elif mode == "template":
        title = re.findall(TEMPLATE, message)
        state["is_template"] = True
    elif mode == "raw":
        title = re.findall(RAW, message)
        state["is_raw"] = True
    elif mode == "quick":
        # "wiki <title>": drop the command word, keep the rest as the title.
        title = message.split(" ", maxsplit=1)
        if not title or len(title) < 2:
            await matcher.finish()
        title = title[1].lstrip()
        if not title:
            await matcher.finish()
        title = [title]
    elif mode == "shot":
        global playwright, browser, browser_context, playwright_launch_error, playwright_not_installed
        # Fail fast if a previous attempt already proved Playwright unusable.
        if playwright_not_installed:
            await matcher.finish("Playwright未安装")
        if playwright_launch_error:
            await matcher.finish("Playwright启动失败,如果您已安装Chromium,请重启Bot")
        if not playwright:
            try:
                from playwright.async_api import async_playwright, Error
                playwright = await async_playwright().start()
                if not browser:
                    try:
                        selected_browser = nonebot.get_driver().config.wiki_shot_browser.lower()
                    except AttributeError:
                        selected_browser = "chromium"
                    if selected_browser not in ["chromium", "firefox", "webkit"]:
                        # Fix: log the offending value BEFORE overwriting it —
                        # the original logged after the reset, so the warning
                        # always printed "chromium" instead of the bad option.
                        logger.warning(f"不支持的浏览器选项 {selected_browser} ,已回落到 chromium")
                        selected_browser = "chromium"
                    try:
                        launch_arguments = {
                            "headless": True,
                        }

                        p = nonebot.get_driver().config.wiki_proxy
                        if p:
                            p = urllib.parse.urlparse(p)
                            proxy = {
                                "server": f"{p.scheme}://{p.hostname}:{p.port}",
                                "username": p.username,
                                "password": p.password
                            }
                            launch_arguments["proxy"] = proxy

                        if selected_browser == "firefox":
                            browser = await playwright.firefox.launch(**launch_arguments)
                        elif selected_browser == "webkit":
                            browser = await playwright.webkit.launch(**launch_arguments)
                        else:
                            browser = await playwright.chromium.launch(**launch_arguments)

                        # Context settings fall back to sane defaults when the
                        # corresponding config entries are missing or invalid.
                        try:
                            lang = nonebot.get_driver().config.wiki_shot_browser_language
                        except AttributeError:
                            lang = "zh-CN"
                        try:
                            width = int(nonebot.get_driver().config.wiki_shot_browser_width)
                        except (AttributeError, ValueError):
                            width = 1920
                        try:
                            height = int(nonebot.get_driver().config.wiki_shot_browser_height)
                        except (AttributeError, ValueError):
                            height = 1080
                        browser_context = await browser.new_context(
                            locale=lang,
                            viewport={"width": width, "height": height},
                        )
                    except Error as e:
                        playwright_launch_error = True
                        logger.warning("Playwright启动失败,请检查是否安装了Chromium\n"
                                       "安装方法:在bot的虚拟环境中执行:playwright install chromium")
                        logger.warning("注意:对于无头服务器,您可能需要使用系统的包管理器安装完整版的Chromium以保证系统中有可用的依赖\n"
                                       "例如:在Ubuntu 20.04中,您可以使用apt安装:sudo apt install chromium-browser\n"
                                       "在Archlinux中,您可以使用pacman安装:sudo pacman -S chromium")
                        logger.warning(f"下面是Playwright的错误信息,可能对您有帮助:\n{e}")
                        await matcher.finish("Playwright启动失败,请检查是否安装了Chromium")
            except ImportError:
                playwright_not_installed = True
                await matcher.finish("Playwright未安装")

        # "wiki.shot <title>": same parsing as quick mode.
        title = message.split(" ", maxsplit=1)
        if not title or len(title) < 2:
            await matcher.finish()
        title = title[1].lstrip()
        if not title:
            await matcher.finish()
        title = [title]
        state["is_shot"] = True

    if not title:
        await matcher.finish()
    state["title"] = title[0]
    state["is_user_choice"] = False
161 |
162 |
163 | @wiki_article.got("title", "请从上面选择一项,或回复0来根据原标题直接生成链接,回复”取消“退出")
@wiki_template.got("title", "请从上面选择一项,或回复0来根据原标题直接生成链接,回复”取消“退出")
@wiki_raw.got("title", "请从上面选择一项,或回复0来根据原标题直接生成链接,回复”取消“退出")
@wiki_quick.got("title", "请从上面选择一项,或回复0来根据原标题直接生成链接,回复”取消“退出")
@wiki_shot.got("title", "请从上面选择一项,或回复0来根据原标题直接生成链接,回复”取消“退出")
async def wiki_parse(bot: Bot, event: GroupMessageEvent, state: T_State, matcher: Matcher):
    """Resolve a wiki title to a page and reply with its title and a short link.

    Shared handler for the template/raw/quick/shot matchers. On the first pass
    it parses an optional prefix from the title, picks the matching wiki config
    and queries the MediaWiki API. When the API reports a missing page or a
    disambiguation page, it rejects with a numbered option list and re-enters in
    "choice mode" (``state["is_user_choice"]``), where the user's reply selects
    an option (or ``0`` to link the original title directly). In "shot" mode a
    Playwright screenshot of the page is sent before the textual result.

    :param bot: current bot instance (unused directly, required by nonebot)
    :param event: the triggering group message event
    :param state: matcher state carrying mode flags and choice-mode data
    :param matcher: the concrete matcher, used to send/finish/reject
    """
    # Markers: `page` is set once a page object has been resolved;
    # `exception` carries a short user-facing error description (or None).
    page = None
    exception = None

    if state.get("is_user_choice"):  # choice mode: read back previously stored data
        msg = str(state["title"]).strip()
        if (not msg.isdigit()) or int(msg) not in range(len(state["options"]) + 1):  # not a number or out of range
            await matcher.finish()

        choice = int(msg)
        if not choice:  # "0" chosen: generate a link directly from the original title
            # BUGFIX: `title` used to be left unbound on this path, which made the
            # emptiness check below raise UnboundLocalError; restore it from the
            # raw title saved before the reject.
            title = state["raw_title"]
            if state.get("disambiguation"):
                page = DummyPage(state['disambiguation'].url, state['raw_title'])
            else:
                instance = state["dummy_instance"]
                page = await instance.page(state["raw_title"])
        else:
            title = state["options"][choice - 1]
            wiki_instance = state["instance"]
            dummy_instance = state["dummy_instance"]
            api = state["api"]
    else:
        config = Config(event.group_id)
        title = state["title"]
        # Match an optional "prefix:" (half- or full-width colon) at the start.
        prefix = re.match(r'\w+:|\w+:', title)
        if not prefix:
            prefix = ''
        else:
            prefix = prefix.group(0).lower().rstrip("::")
            if prefix in config.prefixes:
                # Strip the recognized prefix (and its colon) from the title.
                title = re.sub(f"{prefix}:|{prefix}:", '', title, count=1, flags=re.I)
            else:
                prefix = ''

    if title is None or title.strip() == "":
        await matcher.finish()

    # Anchor check: split off a trailing "#section" and keep it for the final link.
    anchor_list = re.split('#', title, maxsplit=1)
    title = anchor_list[0]
    state["anchor"] = anchor_list[1] if len(anchor_list) > 1 else state.get("anchor")

    if not state.get("is_user_choice"):
        if state.get("is_template"):
            title = "Template:" + title
        try:
            api, url = config.get_from_prefix(prefix)[:2]
        except NoDefaultPrefixException:
            await matcher.finish("没有找到默认前缀,请群管或bot管理员先设置默认前缀")
            return
        except NoSuchPrefixException:
            await matcher.finish("指定的默认前缀对应的wiki不存在,请管理员检查设置")
            return

        state["api"] = api  # choice mode does not re-read the config, so keep the api url for link generation

        dummy_instance = DummyMediaWiki(url)  # MediaWiki instance used to build direct links
        if state.get("is_raw"):
            wiki_instance = dummy_instance
        else:
            # Reuse cached MediaWiki instances, keyed by api url.
            global wiki_instances
            if api in wiki_instances.keys():
                wiki_instance = wiki_instances[api]
            else:
                if api:
                    try:
                        p = nonebot.get_driver().config.wiki_proxy
                        if p:
                            wiki_instance = await MediaWiki.create(url=api, proxies=p)
                        else:
                            wiki_instance = await MediaWiki.create(url=api)
                        wiki_instances[api] = wiki_instance
                    except (MediaWikiBaseException, TimeoutError, ClientProxyConnectionError, ConnectionRefusedError, AssertionError, ClientConnectorError) as e:
                        logger.info(f"连接到MediaWiki API 时发生了错误:{e}")
                        exception = "Api连接失败"
                        wiki_instance = dummy_instance
                else:  # no api address available: fall back to direct links only
                    wiki_instance = dummy_instance

    if not page:
        try:
            page = await wiki_instance.page(title=title, auto_suggest=False, convert_titles=True, iwurl=True)
        except (HTTPTimeoutError, TimeoutError):
            exception = "连接超时"
            page = await dummy_instance.page(title=title)
        except (MediaWikiException, MediaWikiGeoCoordError, ContentTypeError) as e:  # ContentTypeError: non-JSON response
            exception = "Api调用出错"
            logger.info(f"MediaWiki API 返回了错误信息:{e}")
            page = await dummy_instance.page(title=title)
        except PageError:
            # Page not found: offer search results as numbered choices.
            try:
                search = await wiki_instance.search(title)
                if search:
                    result = f"页面 {title} 不存在;你是不是想找:"
                    for k, v in enumerate(search):
                        result += f"\n{k + 1}. {v}"
                    state["is_user_choice"] = True
                    state["options"] = search
                    state["raw_title"] = title
                    state["instance"] = wiki_instance
                    state["dummy_instance"] = dummy_instance
                    state.pop("title")
                    await matcher.reject(result)
                    return  # unreachable; keeps the IDE happy
                else:
                    page = await dummy_instance.page(title=title)
            except (MediaWikiBaseException, TimeoutError):
                page = await dummy_instance.page(title=title)
                exception = "未找到页面"
        except DisambiguationError as e:
            # Disambiguation page: offer its meanings as numbered choices.
            result = f"条目 {e.title} 是一个消歧义页面,有以下含义:"
            for k, v in enumerate(e.options):
                result += f"\n{k + 1}. {v}"
            state["is_user_choice"] = True
            state["disambiguation"] = e
            state["options"] = e.options
            state["raw_title"] = title
            state["instance"] = wiki_instance
            state["dummy_instance"] = dummy_instance
            state.pop("title")
            await matcher.reject(result)
            return
        except InterWikiError as e:
            result = f"跨维基链接:{e.title}\n" \
                     f"链接:{e.url}"
            await matcher.finish(result)
            return
        except Exception as e:
            exception = "未知错误"
            logger.warning(f"MediaWiki API 发生了未知异常:{e}")
            page = await dummy_instance.page(title=title)

    if not exception and state.get("mode") == "shot":
        if browser:
            try:
                pg = await browser_context.new_page()
                try:
                    u = page.url
                    try:
                        use_vector = str(nonebot.get_driver().config.wiki_moegirl_use_new_skin).lower() != "true"
                    except AttributeError:
                        use_vector = True
                    if use_vector:
                        if re.match(r'https?://zh\.moegirl\.org\.cn/.*', u):
                            # The administrators removed legacy skin support on the
                            # desktop site, so switch to the mobile domain.
                            # BUGFIX: str.replace returns a new string and takes the
                            # count positionally; the old call discarded the result
                            # and raised TypeError on the `count=` keyword.
                            u = u.replace("zh.moegirl.org.cn", "mzh.moegirl.org.cn", 1)
                        u = ensure_url_param(u, "moegirl.org.cn", "useskin", "vector-2022")
                    try:
                        timeout = float(nonebot.get_driver().config.wiki_shot_timeout)
                    except AttributeError:
                        timeout = 30.0
                    try:
                        wait_until = nonebot.get_driver().config.wiki_shot_wait_until
                    except AttributeError:
                        wait_until = "load"
                    await pg.goto(u, timeout=timeout*1000, wait_until=wait_until)

                    split = 0
                    try:
                        split = int(nonebot.get_driver().config.wiki_shot_split_pages)
                    except AttributeError:
                        pass
                    except ValueError:
                        logger.warning("wiki_shot_split_pages 配置项格式错误,已回落到默认值 0")
                    if split > 0:
                        # Scroll viewport by viewport, sending one screenshot per
                        # "page" until the scroll position stops advancing or the
                        # configured page limit is reached.
                        page_num = 1
                        fail_count = 0

                        viewport_height = await pg.evaluate("window.innerHeight")
                        last_scroll_y = -1
                        while True:
                            current_scroll_y = await pg.evaluate("window.scrollY")
                            if current_scroll_y == last_scroll_y:
                                break  # scrolled to the bottom
                            last_scroll_y = current_scroll_y

                            try:
                                # TODO: hand the captured images to another thread for
                                # sending, and implement retries there
                                await matcher.send(MessageSegment.image(await pg.screenshot(full_page=False, type="jpeg", quality=80)))
                                page_num += 1
                                fail_count = 0

                                if page_num > split:
                                    logger.info("已达到最大分割页数,终止截图")
                                    break
                            except Exception as e:
                                logger.warning(f"截图时发生了错误:{e}")
                                fail_count += 1
                                if fail_count >= 3:
                                    logger.warning("连续三次截图失败,终止截图")
                                    raise e
                                continue
                            await pg.evaluate(f"window.scrollBy(0, {viewport_height});")
                            await pg.wait_for_timeout(250)
                    else:
                        img = await pg.screenshot(full_page=True, type="jpeg", quality=80)
                        await matcher.send(MessageSegment.image(img))
                except TimeoutError:
                    logger.warning(f"页面{page.url}加载超时")
                    exception = "截图失败:页面加载超时"
                except Exception as e:
                    logger.warning(f"截图时发生了错误:{e}")
                    exception = "截图失败:页面加载失败"
                finally:
                    await pg.close()
            except Exception as e:
                logger.warning(f"截图时发生了错误:{e}")
                exception = "截图失败"

    result = f"错误:{exception}\n" if exception else ""
    if page.title != title:
        result += f"重定向 {title} → {page.title}\n"
    else:
        result += f"标题:{page.title}\n"
    if hasattr(page, "pageid"):
        result += f"链接:{api[:-7]}index.php?curid={page.pageid}"  # use the page id to shorten the link
    else:
        result += f"链接:{page.url}"
    if state.get("anchor"):
        result += parse.quote("#" + state["anchor"])

    await matcher.finish(result)
393 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published
637 | by the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program.  If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/nonebot_plugin_mediawiki/mediawiki/mediawikipage.py:
--------------------------------------------------------------------------------
1 | """
2 | MediaWikiPage class module
3 | """
4 | # MIT License
5 | # Author: Tyler Barrus (barrust@gmail.com); KoishiMoe
6 |
7 | import re
8 | from collections import OrderedDict
9 | from decimal import Decimal
10 |
11 | from bs4 import BeautifulSoup, Tag
12 |
13 | from .exceptions import (
14 | MediaWikiBaseException,
15 | MediaWikiException,
16 | PageError,
17 | RedirectError,
18 | DisambiguationError,
19 | ODD_ERROR_MESSAGE,
20 | InterWikiError,
21 | )
22 | from .utilities import str_or_unicode, is_relative_url
23 |
24 |
class MediaWikiPage(object):
    """ MediaWiki Page Instance

    Warning:
        This should never need to be used directly! Please use \
        :func:`MediaWikiPage.create` instead.
    """
    # __slots__ fixes the attribute set and avoids a per-instance __dict__;
    # the underscore-prefixed slots hold lazily-fetched, cached API results.
    __slots__ = [
        "mediawiki",
        "url",
        "title",
        "original_title",
        "pageid",
        "_content",
        "_revision_id",
        "_parent_id",
        "_html",
        "_soup",
        "_images",
        "_references",
        "_categories",
        "_coordinates",
        "_links",
        "_redirects",
        "_backlinks",
        "_langlinks",
        "_summary",
        "_sections",
        "_table_of_contents",
        "_logos",
        "_hatnotes",
        "_wikitext",
        "_preview",
        "_converttitles",
        "_iwurl",
    ]
61 |
    def __init__(
        self,
        mediawiki,
        title=None,
        pageid=None,
        preload=False,
        original_title="",
        convert_titles=False,
        iwurl=True,
    ):
        """ Initialize page state; callers should use :func:`MediaWikiPage.create`.

        Args:
            mediawiki: MediaWiki client object used for all API requests
            title (str): page title (either this or ``pageid`` is required)
            pageid (int): MediaWiki page id, used when ``title`` is None
            preload (bool): touch common properties after construction
            original_title (str): internal; original title while following redirects
            convert_titles (bool): request title-variant conversion
            iwurl (bool): request full URLs for interwiki titles
        Raises:
            ValueError: if neither ``title`` nor ``pageid`` is provided """
        self.mediawiki = mediawiki
        self.url = None
        if title is not None:
            self.title = title
            self.original_title = original_title or title
        elif pageid is not None:
            self.pageid = pageid
        else:
            raise ValueError("Either a title or a pageid must be specified")

        self._content = None
        self._revision_id = None
        self._parent_id = None
        self._html = False  # None signifies nothing returned...
        self._images = None
        self._references = None
        self._categories = None
        self._coordinates = False  # None signifies nothing returned...
        self._links = None
        self._redirects = None
        self._backlinks = None
        self._langlinks = None
        self._summary = None
        self._sections = None
        self._table_of_contents = None
        self._logos = None
        self._hatnotes = None
        self._soup = None
        self._wikitext = None
        self._preview = None
        self._converttitles = convert_titles
        self._iwurl = iwurl

        preload_props = [
            "content",
            "summary",
            "images",
            "references",
            "links",
            "sections",
            "redirects",
            "coordinates",
            "backlinks",
            "categories",
        ]
        # NOTE(review): after the async conversion these names are coroutine
        # methods, so getattr() only fetches the bound method and never awaits
        # it — confirm whether this preload loop still has any effect.
        if preload:
            for prop in preload_props:
                getattr(self, prop)

    # end __init__
123 |
124 | @classmethod
125 | async def create(
126 | cls,
127 | mediawiki,
128 | title=None,
129 | pageid=None,
130 | redirect=True,
131 | preload=False,
132 | original_title="",
133 | convert_titles=False,
134 | iwurl=True,
135 | ):
136 | """ create a MediaWikiPage instance
137 |
138 | Args:
139 | mediawiki (MediaWiki): MediaWiki class object from which to pull
140 | title (str): Title of page to retrieve
141 | pageid (int): MediaWiki site pageid to retrieve
142 | redirect (bool): **True:** Follow redirects
143 | preload (bool): **True:** Load most properties after getting page
144 | original_title (str): Not to be used from the caller; used to \
145 | help follow redirects
146 | convert_titles (bool): Convert titles to other variants if necessary. \
147 | Only works if the wiki's content language supports variant conversion.
148 | iwurl (bool): Whether to get the full URL if the title is an interwiki link.
149 | Raises:
150 | :py:func:`mediawiki.exceptions.PageError`: if page provided does \
151 | not exist
152 | Raises:
153 | :py:func:`mediawiki.exceptions.DisambiguationError`: if page \
154 | provided is a disambiguation page
155 | Raises:
156 | :py:func:`mediawiki.exceptions.RedirectError`: if redirect is \
157 | **False** and the pageid or title provided redirects to another \
158 | page
159 | Warning:
160 | This should never need to be used directly! Please use \
161 | :func:`mediawiki.MediaWiki.page` instead."""
162 | self = MediaWikiPage(mediawiki, title, pageid, preload, original_title, convert_titles, iwurl)
163 | await self.__load(redirect=redirect, preload=preload)
164 | return self
165 |
166 | def __repr__(self):
167 | """ repr """
168 | return self.__str__()
169 |
170 | def __unicode__(self):
171 | """ python 2.7 unicode """
172 | return """""".format(self.title)
173 |
174 | def __str__(self):
175 | """ python > 3 unicode python 2.7 byte str """
176 | return str_or_unicode(self.__unicode__())
177 |
178 | def __eq__(self, other):
179 | """ base eq function """
180 | try:
181 | return (
182 | self.pageid == other.pageid
183 | and self.title == other.title
184 | and self.url == other.url
185 | )
186 | except AttributeError:
187 | return False
188 |
189 | # Properties
    async def _pull_content_revision_parent(self):
        """ Fetch content, revision id, and parent id in a single API request.

        Returns:
            tuple: (content, revision_id, parent_id)
        Raises:
            MediaWikiBaseException: if no extract came back and the wiki \
                does not have the TextExtracts extension installed """
        if self._revision_id is None:
            query_params = {
                "prop": "extracts|revisions",
                "explaintext": "",
                "rvprop": "ids",
            }
            # Target the page by title or pageid, whichever this instance has
            query_params.update(self.__title_query_param())
            request = await self.mediawiki.wiki_request(query_params)
            page_info = request["query"]["pages"][self.pageid]
            # "extract" is only present when TextExtracts produced output
            self._content = page_info.get("extract", None)
            self._revision_id = page_info["revisions"][0]["revid"]
            self._parent_id = page_info["revisions"][0]["parentid"]

        if self._content is None and 'TextExtracts' not in self.mediawiki.extensions:
            msg = "Unable to extract page content; the TextExtracts extension must be installed!"
            raise MediaWikiBaseException(msg)
        return self._content, self._revision_id, self._parent_id
210 |
211 | async def content(self):
212 | """ str: The page content in text format
213 |
214 | Note:
215 | Not settable
216 | Note:
217 | Side effect is to also get revision_id and parent_id """
218 | if self._content is None:
219 | await self._pull_content_revision_parent()
220 | return self._content
221 |
222 | async def revision_id(self):
223 | """ int: The current revision id of the page
224 |
225 | Note:
226 | Not settable
227 | Note:
228 | Side effect is to also get content and parent_id """
229 | if self._revision_id is None:
230 | await self._pull_content_revision_parent()
231 | return self._revision_id
232 |
233 | async def parent_id(self):
234 | """ int: The parent id of the page
235 |
236 | Note:
237 | Not settable
238 | Note:
239 | Side effect is to also get content and revision_id """
240 | if self._parent_id is None:
241 | await self._pull_content_revision_parent()
242 | return self._parent_id
243 |
244 | async def html(self):
245 | """ str: HTML representation of the page
246 |
247 | Note:
248 | Not settable
249 | Warning:
250 | This can be slow for very large pages """
251 | if self._html is False:
252 | self._html = None
253 | query_params = {
254 | "prop": "revisions",
255 | "rvprop": "content",
256 | "rvlimit": 1,
257 | "rvparse": "",
258 | "titles": self.title,
259 | }
260 | request = await self.mediawiki.wiki_request(query_params)
261 | page = request["query"]["pages"][self.pageid]
262 | self._html = page["revisions"][0]["*"]
263 | return self._html
264 |
265 | async def wikitext(self):
266 | """ str: Wikitext representation of the page
267 |
268 | Note:
269 | Not settable """
270 | if self._wikitext is None:
271 | query_params = {
272 | "action": "parse",
273 | "pageid": self.pageid,
274 | "prop": "wikitext",
275 | "formatversion": "latest",
276 | }
277 | request = await self.mediawiki.wiki_request(query_params)
278 | self._wikitext = request["parse"]["wikitext"]
279 | return self._wikitext
280 |
281 | async def images(self):
282 | """ list: Images on the page
283 |
284 | Note:
285 | Not settable """
286 | if self._images is None:
287 | self._images = list()
288 | params = {
289 | "generator": "images",
290 | "gimlimit": "max",
291 | "prop": "imageinfo", # this will be replaced by fileinfo
292 | "iiprop": "url",
293 | }
294 | async for page in self._continued_query(params):
295 | if "imageinfo" in page and "url" in page["imageinfo"][0]:
296 | self._images.append(page["imageinfo"][0]["url"])
297 | self._images = sorted(self._images)
298 | return self._images
299 |
300 | async def logos(self):
301 | """ list: Parse images within the infobox signifying either the main \
302 | image or logo
303 |
304 | Note:
305 | Not settable
306 | Note:
307 | Side effect is to also pull the html which can be slow
308 | Note:
309 | This is a parsing operation and not part of the standard API"""
310 | if self._logos is None:
311 | self._logos = list()
312 | # Cache the results of parsing the html, so that multiple calls happen much faster
313 | if not self._soup:
314 | self._soup = BeautifulSoup(await self.html(), "html.parser")
315 | info = self._soup.find("table", {"class": "infobox"})
316 | if info is not None:
317 | children = info.find_all("a", class_="image")
318 | for child in children:
319 | self._logos.append("https:" + child.img["src"])
320 | return self._logos
321 |
322 | async def hatnotes(self):
323 | """ list: Parse hatnotes from the HTML
324 |
325 | Note:
326 | Not settable
327 | Note:
328 | Side effect is to also pull the html which can be slow
329 | Note:
330 | This is a parsing operation and not part of the standard API"""
331 | if self._hatnotes is None:
332 | self._hatnotes = list()
333 | # Cache the results of parsing the html, so that multiple calls happen much faster
334 | if not self._soup:
335 | self._soup = BeautifulSoup(await self.html(), "html.parser")
336 | notes = self._soup.find_all("div", class_="hatnote")
337 | if notes is not None:
338 | for note in notes:
339 | tmp = list()
340 | for child in note.children:
341 | if hasattr(child, "text"):
342 | tmp.append(child.text)
343 | else:
344 | tmp.append(child)
345 | self._hatnotes.append("".join(tmp))
346 | return self._hatnotes
347 |
348 | async def references(self):
349 | """ list: External links, or references, listed anywhere on the \
350 | MediaWiki page
351 | Note:
352 | Not settable
353 | Note
354 | May include external links within page that are not \
355 | technically cited anywhere """
356 | if self._references is None:
357 | self._references = list()
358 | await self.__pull_combined_properties()
359 | return self._references
360 |
361 | async def categories(self):
362 | """ list: Non-hidden categories on the page
363 |
364 | Note:
365 | Not settable """
366 | if self._categories is None:
367 | self._categories = list()
368 | await self.__pull_combined_properties()
369 | return self._categories
370 |
371 | async def coordinates(self):
372 | """ Tuple: GeoCoordinates of the place referenced; results in \
373 | lat/long tuple or None if no geocoordinates present
374 |
375 | Note:
376 | Not settable
377 | Note:
378 | Requires the GeoData extension to be installed """
379 | if self._coordinates is False:
380 | self._coordinates = None
381 | await self.__pull_combined_properties()
382 | return self._coordinates
383 |
384 | async def links(self):
385 | """ list: List of all MediaWiki page links on the page
386 |
387 | Note:
388 | Not settable """
389 | if self._links is None:
390 | self._links = list()
391 | await self.__pull_combined_properties()
392 | return self._links
393 |
394 | async def redirects(self):
395 | """ list: List of all redirects to this page; **i.e.,** the titles \
396 | listed here will redirect to this page title
397 |
398 | Note:
399 | Not settable """
400 | if self._redirects is None:
401 | self._redirects = list()
402 | await self.__pull_combined_properties()
403 | return self._redirects
404 |
405 | async def backlinks(self):
406 | """ list: Pages that link to this page
407 |
408 | Note:
409 | Not settable """
410 | if self._backlinks is None:
411 | self._backlinks = list()
412 | params = {
413 | "action": "query",
414 | "list": "backlinks",
415 | "bltitle": self.title,
416 | "bllimit": "max",
417 | "blfilterredir": "nonredirects",
418 | "blnamespace": 0,
419 | }
420 | tmp = [link["title"] async for link in self._continued_query(params, "backlinks")]
421 | self._backlinks = sorted(tmp)
422 | return self._backlinks
423 |
424 | async def langlinks(self):
425 | """ dict: Names of the page in other languages for which page is \
426 | where the key is the language code and the page name is the name \
427 | of the page in that language.
428 |
429 | Note:
430 | Not settable
431 | Note:
432 | list of all language links from the provided pages to other \
433 | languages according to: \
434 | https://www.mediawiki.org/wiki/API:Langlinks """
435 |
436 | if self._langlinks is None:
437 | params = {"prop": "langlinks", "cllimit": "max"}
438 | query_result = self._continued_query(params)
439 |
440 | langlinks = dict()
441 | async for lang_info in query_result:
442 | langlinks[lang_info["lang"]] = lang_info["*"]
443 | self._langlinks = langlinks
444 | return self._langlinks
445 |
446 | async def preview(self):
447 | """ dict: Page preview information that builds the preview hover """
448 | if self._preview is None:
449 | params = {
450 | "action": "query",
451 | "formatversion": "2",
452 | "prop": "info|extracts|pageimages|revisions|pageterms|coordinates",
453 | "exsentences": "5",
454 | "explaintext": "true",
455 | "piprop": "thumbnail|original",
456 | "pithumbsize": "320",
457 | "pilicense": "any",
458 | "rvprop": "timestamp|ids",
459 | "wbptterms": "description",
460 | "titles": self.title,
461 | }
462 | raw = await self.mediawiki.wiki_request(params)
463 | self._preview = raw.get("query", dict()).get("pages", list())[0]
464 | return self._preview
465 |
466 | async def summary(self):
467 | """ str: Default page summary
468 |
469 | Note:
470 | Not settable """
471 | if self._summary is None:
472 | await self.__pull_combined_properties()
473 | return self._summary
474 |
475 | async def summarize(self, sentences=0, chars=0):
476 | """ Summarize page either by number of sentences, chars, or first
477 | section (**default**)
478 |
479 | Args:
480 | sentences (int): Number of sentences to use in summary \
481 | (first `x` sentences)
482 | chars (int): Number of characters to use in summary \
483 | (first `x` characters)
484 | Returns:
485 | str: The summary of the MediaWiki page
486 | Note:
487 | Precedence for parameters: sentences then chars; if both are \
488 | 0 then the entire first section is returned """
489 | query_params = {"prop": "extracts", "explaintext": "", "titles": self.title}
490 | if sentences:
491 | query_params["exsentences"] = 10 if sentences > 10 else sentences
492 | elif chars:
493 | query_params["exchars"] = 1 if chars < 1 else chars
494 | else:
495 | query_params["exintro"] = ""
496 |
497 | request = await self.mediawiki.wiki_request(query_params)
498 | summary = request["query"]["pages"][self.pageid].get("extract")
499 | return summary
500 |
501 | async def sections(self):
502 | """ list: Table of contents sections
503 |
504 | Note:
505 | Not settable """
506 | # NOTE: Due to MediaWiki sites adding superscripts or italics or bold
507 | # information in the sections, moving to regex to get the
508 | # `non-decorated` name instead of using the query api!
509 | if self._sections is None:
510 | await self._parse_sections()
511 | return self._sections
512 |
513 | async def table_of_contents(self):
514 | """ OrderedDict: Dictionary of sections and subsections
515 |
516 | Note:
517 | Leaf nodes are empty OrderedDict objects
518 | Note:
519 | Not Settable"""
520 |
521 | if self._table_of_contents is None:
522 | await self._parse_sections()
523 | return self._table_of_contents
524 |
    async def section(self, section_title):
        """ Plain text section content

        Args:
            section_title (str): Name of the section to pull or None \
                for the header section
        Returns:
            str: The content of the section
        Note:
            Use **None** if the header section is desired
        Note:
            Returns **None** if section title is not found; only text \
            between title and next section or subsection title is returned
        Note:
            Side effect is to also pull the content which can be slow
        Note:
            This is a parsing operation and not part of the standard API"""
        if not section_title:
            # Header section: everything from the top of the page
            try:
                content = await self.content()
                index = 0
            except ValueError:
                return None
            except IndexError:
                pass
        else:
            section = "== {0} ==".format(section_title)
            try:
                content = await self.content()
                # str.index raises ValueError when the heading is absent
                index = content.index(section) + len(section)

                # ensure we have the full section header...
                # (subsections use more '='; absorb any trailing '=' signs)
                while True:
                    if content[index + 1] == "=":
                        index += 1
                    else:
                        break
            except ValueError:
                return None
            except IndexError:
                # Scanned past the end of content; keep whatever index we had
                pass

        content = await self.content()
        try:
            # Section runs until the next heading marker
            next_index = content.index("==", index)
        except ValueError:
            # No further heading: section extends to the end of the page
            next_index = len(await self.content())

        return content[index:next_index].lstrip("=").strip()
574 |
    async def parse_section_links(self, section_title):
        """ Parse all links within a section

        Args:
            section_title (str): Name of the section to pull or, if \
                None is provided, the links between the main heading and \
                the first section
        Returns:
            list: List of (title, url) tuples
        Note:
            Use **None** to pull the links from the header section
        Note:
            Returns **None** if section title is not found
        Note:
            Side effect is to also pull the html which can be slow
        Note:
            This is a parsing operation and not part of the standard API"""
        # Cache the results of parsing the html, so that multiple calls happen much faster
        if not self._soup:
            self._soup = BeautifulSoup(await self.html(), "html.parser")

        # NOTE(review): _parse_section_links is called without await here —
        # presumably it is a synchronous helper; confirm against its definition.
        if not section_title:
            return self._parse_section_links(None)

        # Match the requested title against rendered headline text,
        # case-insensitively, after stripping any markup from the input
        headlines = self._soup.find_all("span", class_="mw-headline")
        tmp_soup = BeautifulSoup(section_title, "html.parser")
        tmp_sec_title = tmp_soup.get_text().lower()
        id_tag = None
        for headline in headlines:
            tmp_id = headline.text
            if tmp_id.lower() == tmp_sec_title:
                id_tag = headline.get("id")
                break

        if id_tag is not None:
            return self._parse_section_links(id_tag)
        return None
612 |
613 | # Protected Methods
    async def __load(self, redirect=True, preload=False):
        """ Load the basic page information and dispatch on the API outcome.

        Args:
            redirect (bool): follow redirects instead of raising RedirectError
            preload (bool): forwarded to the redirect handler
        Raises:
            PageError, RedirectError, DisambiguationError, or InterWikiError \
                depending on what the API reports for this title/pageid """
        query_params = {
            "prop": "info|pageprops",
            "inprop": "url",
            "ppprop": "disambiguation",
            "redirects": "",
        }
        # Target the page by title or pageid, whichever this instance has
        query_params.update(self.__title_query_param())

        # params add by KoishiMoe
        if self._converttitles:
            query_params.update({"converttitles": 1})
        if self._iwurl:
            query_params.update({"iwurl": 1})

        request = await self.mediawiki.wiki_request(query_params)

        query = request["query"]
        if query.get("pages"):
            pageid = list(query["pages"].keys())[0]
            page = query["pages"][pageid]

        # determine result of the request
        # interwiki is present in query if page is a interwiki; in this case, there's no `pages` in query
        if "interwiki" in query:
            self._handle_interwiki(query)
        # converted may be present in query if convert_titles == True
        if "converted" in query:
            self.title = query["converted"][0].get('to') or self.title
        # NOTE(review): if the response carries neither "pages" nor
        # "interwiki", `page`/`pageid` below are unbound (NameError) —
        # confirm the API always returns one of the two.
        # missing is present if the page is missing
        if "missing" in page or pageid == '-1': # sometimes it doesn't return missing, but pageid == -1
            self._raise_page_error()
        # redirects is present in query if page is a redirect
        elif "redirects" in query:
            await self._handle_redirect(redirect, preload, query, page)
        # if pageprops is returned, it must be a disambiguation error
        elif "pageprops" in page:
            await self._raise_disambiguation_error(page, pageid)
        else:
            self.pageid = pageid
            self.title = page["title"]
            self.url = page["fullurl"]
657 |
658 | def _raise_page_error(self):
659 | """ raise the correct type of page error """
660 | if hasattr(self, "title"):
661 | raise PageError(title=self.title)
662 | raise PageError(pageid=self.pageid)
663 |
664 | async def _raise_disambiguation_error(self, page, pageid):
665 | """ parse and throw a disambiguation error """
666 | query_params = {
667 | "prop": "revisions",
668 | "rvprop": "content",
669 | "rvparse": "",
670 | "rvlimit": 1,
671 | }
672 | query_params.update(self.__title_query_param())
673 | request = await self.mediawiki.wiki_request(query_params)
674 | html = request["query"]["pages"][pageid]["revisions"][0]["*"]
675 |
676 | lis = BeautifulSoup(html, "html.parser").find_all("li")
677 | filtered_lis = [
678 | li for li in lis if "tocsection" not in "".join(li.get("class", list()))
679 | ]
680 | may_refer_to = [li.a.get_text() for li in filtered_lis if li.a]
681 |
682 | disambiguation = list()
683 | for lis_item in filtered_lis:
684 | item = lis_item.find_all("a")
685 | one_disambiguation = dict()
686 | one_disambiguation["description"] = lis_item.text
687 | if item and item[0].has_attr("title"):
688 | one_disambiguation["title"] = item[0]["title"]
689 | else:
690 | # these are non-linked records so double up the text
691 | one_disambiguation["title"] = lis_item.text
692 | disambiguation.append(one_disambiguation)
693 | raise DisambiguationError(
694 | getattr(self, "title", page["title"]),
695 | may_refer_to,
696 | page["fullurl"],
697 | disambiguation,
698 | )
699 |
700 | # method add by KoishiMoe
701 | def _handle_interwiki(self, query):
702 | inter_wiki = query["interwiki"][0]
703 | title = inter_wiki.get("title", '')[len(f'{inter_wiki.get("iw", "")}:'):]
704 | url = inter_wiki.get("url", '')
705 | raise InterWikiError(title, url)
706 |
    async def _handle_redirect(self, redirect, preload, query, page):
        """ Handle a redirect result: either follow it (re-initialize this
        object with the redirect target and reload) or raise RedirectError.

        Args:
            redirect (bool): follow the redirect when True
            preload (bool): forwarded to the re-initialized object
            query (dict): the "query" portion of the API response
            page (dict): the page record from the API response
        Raises:
            RedirectError: when ``redirect`` is False
            MediaWikiException: when the redirect source does not match the \
                requested title (unexpected API response) """
        if redirect:
            redirects = query["redirects"][0]

            if "normalized" in query:
                # the API normalized the requested title; sanity-check that
                # it normalized *our* title before following the chain
                normalized = query["normalized"][0]
                if normalized["from"] != self.title:
                    raise MediaWikiException(ODD_ERROR_MESSAGE)
                from_title = normalized["to"]
            else:
                if not getattr(self, "title", None):
                    # looked up by pageid: adopt the redirect source as the
                    # title and drop the pageid so the reload uses the title
                    self.title = redirects["from"]
                    delattr(self, "pageid")
                from_title = self.title
            # the redirect record must originate at the title we asked for
            if redirects["from"] != from_title:
                raise MediaWikiException(ODD_ERROR_MESSAGE)

            # change the title and reload the whole object
            self.__init__(
                self.mediawiki,
                title=redirects["to"],
                preload=preload,
            )
            await self.__load(redirect=redirect, preload=preload)
        else:
            raise RedirectError(getattr(self, "title", page["title"]))
734 |
735 | async def _continued_query(self, query_params, key="pages"):
736 | """ Based on
737 | https://www.mediawiki.org/wiki/API:Query#Continuing_queries """
738 | query_params.update(self.__title_query_param())
739 |
740 | last_cont = dict()
741 | prop = query_params.get("prop")
742 |
743 | while True:
744 | params = query_params.copy()
745 | params.update(last_cont)
746 |
747 | request = await self.mediawiki.wiki_request(params)
748 |
749 | if "query" not in request:
750 | break
751 |
752 | pages = request["query"][key]
753 | if "generator" in query_params:
754 | for datum in pages.values():
755 | yield datum
756 | elif isinstance(pages, list):
757 | for datum in list(enumerate(pages)):
758 | yield datum[1]
759 | else:
760 | for datum in pages[self.pageid].get(prop, list()):
761 | yield datum
762 |
763 | if "continue" not in request or request["continue"] == last_cont:
764 | break
765 |
766 | last_cont = request["continue"]
767 |
768 | def _parse_section_links(self, id_tag):
769 | """ given a section id, parse the links in the unordered list """
770 | all_links = list()
771 |
772 | if id_tag is None:
773 | root = self._soup.find("div", {"class": "mw-parser-output"})
774 | if root is None:
775 | return all_links
776 | candidates = root.children
777 | else:
778 | root = self._soup.find("span", {"id": id_tag})
779 | if root is None:
780 | return all_links
781 | candidates = self._soup.find(id=id_tag).parent.next_siblings
782 |
783 | for node in candidates:
784 | if not isinstance(node, Tag):
785 | continue
786 | if node.get("role", "") == "navigation":
787 | continue
788 | elif "infobox" in node.get("class", []):
789 | continue
790 |
791 | # If the classname contains "toc", the element is a table of contents.
792 | # The comprehension is necessary because there are several possible
793 | # types of tocs: "toclevel", "toc", ...
794 | toc_classnames = [cname for cname in node.get("class", []) if "toc" in cname]
795 | if toc_classnames:
796 | continue
797 |
798 | # this is actually the child node's class...
799 | is_headline = node.find("span", {"class": "mw-headline"})
800 | if is_headline is not None:
801 | break
802 | if node.name == "a":
803 | all_links.append(self.__parse_link_info(node))
804 | else:
805 | for link in node.find_all("a"):
806 | all_links.append(self.__parse_link_info(link))
807 | return all_links
808 |
809 | def __parse_link_info(self, link):
810 | """ parse the tag for the link """
811 | href = link.get("href", "")
812 | txt = link.string or href
813 | is_rel = is_relative_url(href)
814 | if is_rel is True:
815 | tmp = "{0}{1}".format(self.mediawiki.base_url, href)
816 | elif is_rel is None:
817 | tmp = "{0}{1}".format(self.url, href)
818 | else:
819 | tmp = href
820 | return txt, tmp
821 |
    async def _parse_sections(self):
        """ Parse the section headings out of the page content and build both
        the flat section list (``self._sections``) and the nested table of
        contents (``self._table_of_contents``). """

        def _list_to_dict(_dict, path, sec):
            # walk down the nested OrderedDicts along `path` (all but the
            # last element) and insert `sec` as a new empty level
            tmp = _dict
            for elm in path[:-1]:
                tmp = tmp[elm]
            tmp[sec] = OrderedDict()

        self._sections = list()
        section_regexp = r"\n==* .* ==*\n"  # '== {STUFF_NOT_\n} =='
        found_obj = re.findall(section_regexp, await self.content())

        res = OrderedDict()
        path = list()  # chain of section titles from the top level down
        last_depth = 0
        for obj in found_obj:
            # "== x ==" has 4 '='s -> depth 0; each extra level adds 2
            depth = obj.count("=") / 2  # this gets us to the single side...
            depth -= 2  # now, we can calculate depth

            sec = obj.lstrip("\n= ").rstrip(" =\n")
            if depth == 0:
                # top-level section: restart the path
                last_depth = 0
                path = [sec]
                res[sec] = OrderedDict()
            elif depth > last_depth:
                # one level deeper: extend the path
                last_depth = depth
                path.append(sec)
                _list_to_dict(res, path, sec)
            elif depth < last_depth:
                # shallower: pop back up to the matching level, then replace
                # the sibling at that level
                while last_depth > depth:
                    path.pop()
                    last_depth -= 1
                path.pop()
                path.append(sec)
                _list_to_dict(res, path, sec)
                last_depth = depth
            else:
                # same level: replace the previous sibling in the path
                path.pop()
                path.append(sec)
                _list_to_dict(res, path, sec)
                last_depth = depth
            self._sections.append(sec)

        self._table_of_contents = res
868 |
869 | def __title_query_param(self):
870 | """ util function to determine which parameter method to use """
871 | if getattr(self, "title", None) is not None:
872 | return {"titles": self.title}
873 | return {"pageids": self.pageid}
874 |
875 | async def __pull_combined_properties(self):
876 | """ something here... """
877 |
878 | query_params = {
879 | "titles": self.title,
880 | "prop": "extracts|redirects|links|coordinates|categories|extlinks",
881 | # "continue": dict(),
882 | "continue": "",
883 | # summary
884 | "explaintext": "",
885 | "exintro": "", # full first section for the summary!
886 | # redirects
887 | "rdprop": "title",
888 | "rdlimit": "max",
889 | # links
890 | "plnamespace": 0,
891 | "pllimit": "max",
892 | # coordinates
893 | "colimit": "max",
894 | # categories
895 | "cllimit": "max",
896 | "clshow": "!hidden",
897 | # references
898 | "ellimit": "max",
899 | }
900 |
901 | last_cont = dict()
902 | results = dict()
903 | idx = 0
904 | while True:
905 | params = query_params.copy()
906 | params.update(last_cont)
907 |
908 | request = await self.mediawiki.wiki_request(params)
909 | idx += 1
910 |
911 | if "query" not in request:
912 | break
913 |
914 | keys = [
915 | "extracts",
916 | "redirects",
917 | "links",
918 | "coordinates",
919 | "categories",
920 | "extlinks",
921 | ]
922 | new_cont = request.get("continue")
923 | request = request["query"]["pages"][self.pageid]
924 | if not results:
925 | results = request
926 | else:
927 | for key in keys:
928 | if key in request and request.get(key) is not None:
929 | val = request.get(key)
930 | tmp = results.get(key)
931 | if isinstance(tmp, (list, tuple)):
932 | results[key] = results.get(key, list) + val
933 | if new_cont is None or new_cont == last_cont:
934 | break
935 |
936 | last_cont = new_cont
937 |
938 | # redirects
939 | tmp = [link["title"] for link in results.get("redirects", list())]
940 | self._redirects = sorted(tmp)
941 |
942 | # summary
943 | self._summary = results.get("extract")
944 |
945 | # links
946 | tmp = [link["title"] for link in results.get("links", list())]
947 | self._links = sorted(tmp)
948 |
949 | # categories
950 | def _get_cat(val):
951 | """ parse the category correctly """
952 | tmp = val["title"]
953 | if tmp.startswith(self.mediawiki.category_prefix):
954 | return tmp[len(self.mediawiki.category_prefix) + 1:]
955 | return tmp
956 |
957 | tmp = [_get_cat(link) for link in results.get("categories", list())]
958 | self._categories = sorted(tmp)
959 |
960 | # coordinates
961 | if "coordinates" in results:
962 | self._coordinates = (
963 | Decimal(results["coordinates"][0]["lat"]),
964 | Decimal(results["coordinates"][0]["lon"]),
965 | )
966 |
967 | # references
968 | tmp = [link["*"] for link in results.get("extlinks", list())]
969 | self._references = sorted(tmp)
970 |
--------------------------------------------------------------------------------
/nonebot_plugin_mediawiki/mediawiki/mediawiki.py:
--------------------------------------------------------------------------------
1 | """
2 | MediaWiki class module
3 | """
4 | # MIT License
5 | # Author: Tyler Barrus (barrust@gmail.com); KoishiMoe
6 |
7 | import asyncio
8 | from datetime import datetime, timedelta
9 | from decimal import Decimal, DecimalException
10 | from json import JSONDecodeError
11 |
12 | import aiohttp
13 |
14 | from .constants import VERSION, URL
15 | from .exceptions import (
16 | HTTPTimeoutError,
17 | MediaWikiAPIURLError,
18 | MediaWikiCategoryTreeError,
19 | MediaWikiException,
20 | MediaWikiGeoCoordError,
21 | MediaWikiLoginError,
22 | PageError,
23 | )
24 | from .mediawikipage import MediaWikiPage
25 | from .utilities import memoize
26 |
27 |
class MediaWiki(object):
    """ MediaWiki API Wrapper Instance

    Warning:
        This should never need to be used directly! Please use \
        :func:`MediaWiki.create` instead.
    """

    # __slots__ keeps per-instance memory down and catches typo'd attribute
    # assignments early; every instance attribute must be listed here.
    __slots__ = [
        "_version",
        "_lang",
        "_api_url",
        "_cat_prefix",
        "_timeout",
        "_user_agent",
        "_session",
        "_rate_limit",
        "_rate_limit_last_call",
        "_min_wait",
        "_extensions",
        "_api_version",
        "_api_version_str",
        "_base_url",
        "__supported_languages",
        "__available_languages",
        "_cache",
        "_refresh_interval",
        "_use_cache",
        "_is_logged_in",
        "_proxies",
    ]
59 |
60 | def __init__(self,
61 | url="https://{lang}.wikipedia.org/w/api.php",
62 | lang="en",
63 | timeout=15.0,
64 | rate_limit=False,
65 | rate_limit_wait=timedelta(milliseconds=50),
66 | cat_prefix="Category",
67 | ):
68 | """ DO NOT USE ME, USE MediaWiki.create() INSTEAD !!! """
69 | self._version = VERSION
70 | self._lang = lang.lower()
71 | self._api_url = url.format(lang=self._lang)
72 | self._cat_prefix = None
73 | self.category_prefix = cat_prefix
74 | self._timeout = None
75 | self.timeout = timeout
76 | # requests library parameters
77 | self._session = None
78 | self._user_agent = "python-mediawiki/VERSION-{0}" "/({1})/BOT".format(VERSION, URL)
79 | self._proxies = None
80 |
81 | self._rate_limit = None
82 | self.rate_limit = bool(rate_limit)
83 | self._rate_limit_last_call = None
84 | self._min_wait = rate_limit_wait
85 | self._extensions = None
86 | self._api_version = None
87 | self._api_version_str = None
88 | self._base_url = None
89 | self.__supported_languages = None
90 | self.__available_languages = None
91 |
92 | # for memoized results
93 | self._cache = dict()
94 | self._refresh_interval = None
95 | self._use_cache = True
96 |
97 | # for login information
98 | self._is_logged_in = False
99 |
100 | @classmethod
101 | async def create(
102 | cls,
103 | url="https://{lang}.wikipedia.org/w/api.php",
104 | lang="en",
105 | timeout=15.0,
106 | rate_limit=False,
107 | rate_limit_wait=timedelta(milliseconds=50),
108 | cat_prefix="Category",
109 | user_agent=None,
110 | username=None,
111 | password=None,
112 | proxies: str = None,
113 | ):
114 |
115 | """ MediaWiki API Wrapper Instance
116 |
117 | Args:
118 | url (str): API URL of the MediaWiki site; defaults to Wikipedia
119 | lang (str): Language of the MediaWiki site; used to help change API URL
120 | timeout (float): HTTP timeout setting; None means no timeout
121 | rate_limit (bool): Use rate limiting to limit calls to the site
122 | rate_limit_wait (timedelta): Amount of time to wait between requests
123 | cat_prefix (str): The prefix for categories used by the mediawiki site; defaults to Category (en)
124 | user_agent (str): The user agent string to use when making requests; defaults to a library version but \
125 | per the MediaWiki API documentation it recommends setting a unique one and not using the \
126 | library's default user-agent string
127 | username (str): The username to use to log into the MediaWiki
128 | password (str): The password to use to log into the MediaWiki
129 | proxies (str): Proxy **URL** for aiohttp library to use.
130 | It looks like 'http://your_proxy_url:your_proxy_port' or
131 | 'http://your_user:your_password@your_proxy_url:your_proxy_port' (If your proxy requires authentication)
132 | """
133 |
134 | self = MediaWiki(
135 | url=url,
136 | lang=lang,
137 | timeout=timeout,
138 | rate_limit=rate_limit,
139 | rate_limit_wait=rate_limit_wait,
140 | cat_prefix=cat_prefix,
141 | )
142 |
143 | # set library parameters
144 | if user_agent is not None:
145 | await self.set_user_agent(user_agent)
146 | await self.set_proxies(proxies) # this will call self._reset_session()
147 |
148 | if password is not None and username is not None:
149 | await self.login(username, password)
150 |
151 | try:
152 | await self._get_site_info()
153 | except MediaWikiException:
154 | raise MediaWikiAPIURLError(url)
155 |
156 | return self
157 |
158 | def __del__(self):
159 | if self._session:
160 | loop = asyncio.get_event_loop()
161 | loop.create_task(self._session.close())
162 |
163 | # non-settable properties
164 | @property
165 | def version(self):
166 | """ str: The version of the pymediawiki library
167 |
168 | Note:
169 | Not settable """
170 | return self._version
171 |
172 | @property
173 | def api_version(self):
174 | """ str: API Version of the MediaWiki site
175 |
176 | Note:
177 | Not settable """
178 | return self._api_version_str
179 |
180 | @property
181 | def base_url(self):
182 | """ str: Base URL for the MediaWiki site
183 |
184 | Note:
185 | Not settable """
186 | return self._base_url
187 |
188 | @property
189 | def extensions(self):
190 | """ list: Extensions installed on the MediaWiki site
191 |
192 | Note:
193 | Not settable """
194 | return self._extensions
195 |
196 | # settable properties
197 | @property
198 | def rate_limit(self):
199 | """ bool: Turn on or off Rate Limiting """
200 | return self._rate_limit
201 |
202 | @rate_limit.setter
203 | def rate_limit(self, rate_limit):
204 | """ Turn on or off rate limiting """
205 | self._rate_limit = bool(rate_limit)
206 | self._rate_limit_last_call = None
207 | self.clear_memoized()
208 |
209 | @property
210 | def proxies(self):
211 | return self._proxies
212 |
213 | async def set_proxies(self, proxies):
214 | """ Turn on, off, or set proxy use through the aiohttp library """
215 | if proxies and isinstance(proxies, str):
216 | self._proxies = proxies
217 | else:
218 | self._proxies = None
219 | await self._reset_session()
220 |
221 | @property
222 | def use_cache(self):
223 | """ bool: Whether caching should be used; on (**True**) or off \
224 | (**False**) """
225 | return self._use_cache
226 |
227 | @use_cache.setter
228 | def use_cache(self, use_cache):
229 | """ toggle using the cache or not """
230 | self._use_cache = bool(use_cache)
231 |
232 | @property
233 | def rate_limit_min_wait(self):
234 | """ timedelta: Time to wait between calls
235 |
236 | Note:
237 | Only used if rate_limit is **True** """
238 | return self._min_wait
239 |
240 | @rate_limit_min_wait.setter
241 | def rate_limit_min_wait(self, min_wait):
242 | """ Set minimum wait to use for rate limiting """
243 | self._min_wait = min_wait
244 | self._rate_limit_last_call = None
245 |
246 | @property
247 | def timeout(self):
248 | """ float: Response timeout for API requests
249 |
250 | Note:
251 | Use **None** for no response timeout """
252 | return self._timeout
253 |
254 | @timeout.setter
255 | def timeout(self, timeout):
256 | """ Set request timeout in seconds (or fractions of a second) """
257 |
258 | if timeout is None:
259 | self._timeout = None # no timeout
260 | return
261 | self._timeout = float(timeout) # allow the exception to be raised
262 |
263 | @property
264 | def language(self):
265 | """ str: The API URL language, if possible this will update the API \
266 | URL
267 |
268 | Note:
269 | Use correct language titles with the updated API URL
270 | Note:
271 | Some API URLs do not encode language; unable to update if \
272 | this is the case """
273 | return self._lang
274 |
275 | @language.setter
276 | def language(self, lang):
277 | """ Set the language to use; attempts to change the API URL """
278 | lang = lang.lower()
279 | if self._lang == lang:
280 | return
281 |
282 | url = self._api_url
283 | tmp = url.replace("/{0}.".format(self._lang), "/{0}.".format(lang))
284 |
285 | self._api_url = tmp
286 | self._lang = lang
287 | self.clear_memoized()
288 |
289 | @property
290 | def category_prefix(self):
291 | """ str: The category prefix to use when using category based functions
292 |
293 | Note:
294 | Use the correct category name for the language selected """
295 | return self._cat_prefix
296 |
297 | @category_prefix.setter
298 | def category_prefix(self, prefix):
299 | """ Set the category prefix correctly """
300 | if prefix[-1:] == ":":
301 | prefix = prefix[:-1]
302 | self._cat_prefix = prefix
303 |
304 | @property
305 | def user_agent(self):
306 | """ str: User agent string
307 |
308 | Note: If using in as part of another project, this should be \
309 | changed """
310 | return self._user_agent
311 |
312 | async def set_user_agent(self, user_agent):
313 | """ Set the new user agent string
314 |
315 | Note: Will need to re-log into the MediaWiki if user agent string \
316 | is changed """
317 | self._user_agent = user_agent
318 | await self._reset_session()
319 |
320 | @property
321 | def api_url(self):
322 | """ str: API URL of the MediaWiki site
323 |
324 | Note:
325 | Not settable; See :py:func:`mediawiki.MediaWiki.set_api_url`"""
326 | return self._api_url
327 |
328 | @property
329 | def memoized(self):
330 | """ dict: Return the memoize cache
331 |
332 | Note:
333 | Not settable; see
334 | :py:func:`mediawiki.MediaWiki.clear_memoized` """
335 | return self._cache
336 |
337 | @property
338 | def refresh_interval(self):
339 | """ int: The interval at which the memoize cache is to be refreshed """
340 | return self._refresh_interval
341 |
342 | @refresh_interval.setter
343 | def refresh_interval(self, refresh_interval):
344 | """ Set the new cache refresh interval """
345 | if isinstance(refresh_interval, int) and refresh_interval > 0:
346 | self._refresh_interval = refresh_interval
347 | else:
348 | self._refresh_interval = None
349 |
350 | async def login(self, username, password, strict=True):
351 | """ Login as specified user
352 |
353 | Args:
354 | username (str): The username to log in with
355 | password (str): The password for the user
356 | strict (bool): `True` to throw an error on failure
357 | Returns:
358 | bool: `True` if successfully logged in; `False` otherwise
359 | Raises:
360 | :py:func:`mediawiki.exceptions.MediaWikiLoginError`: if unable to login
361 |
362 | Note:
363 | Per the MediaWiki API, one should use the `bot password`; \
364 | see https://www.mediawiki.org/wiki/API:Login for more information """
365 | # get login token
366 | params = {
367 | "action": "query",
368 | "meta": "tokens",
369 | "type": "login",
370 | "format": "json",
371 | }
372 | token_res = await self._get_response(params)
373 | if "query" in token_res and "tokens" in token_res["query"]:
374 | token = token_res["query"]["tokens"]["logintoken"]
375 |
376 | params = {
377 | "action": "login",
378 | "lgname": username,
379 | "lgpassword": password,
380 | "lgtoken": token,
381 | "format": "json",
382 | }
383 |
384 | res = await self._post_response(params)
385 | if res["login"]["result"] == "Success":
386 | self._is_logged_in = True
387 | return True
388 | self._is_logged_in = False
389 | reason = res["login"]["reason"]
390 | if strict:
391 | msg = "MediaWiki login failure: {}".format(reason)
392 | raise MediaWikiLoginError(msg)
393 | return False
394 |
    # non-properties
    async def set_api_url(
        self, api_url="https://{lang}.wikipedia.org/w/api.php", lang="en", username=None, password=None,
    ):
        """ Set the API URL and language

        Args:
            api_url (str): API URL to use
            lang (str): Language of the API URL
            username (str): The username, if needed, to log into the MediaWiki site
            password (str): The password, if needed, to log into the MediaWiki site
        Raises:
            :py:func:`mediawiki.exceptions.MediaWikiAPIURLError`: if the \
                url is not a valid MediaWiki site or login fails

        Note:
            On failure the previous API URL and language are restored. """
        # remember the current state so we can roll back on failure
        old_api_url = self._api_url
        old_lang = self._lang
        self._lang = lang.lower()
        self._api_url = api_url.format(lang=self._lang)

        self._is_logged_in = False
        try:
            if username is not None and password is not None:
                await self.login(username, password)
            await self._get_site_info()
            self.__supported_languages = None  # reset this
            self.__available_languages = None  # reset this
        except (asyncio.TimeoutError, MediaWikiException):
            # reset api url and lang in the event that the exception was caught
            self._api_url = old_api_url
            self._lang = old_lang
            raise MediaWikiAPIURLError(api_url)
        self.clear_memoized()
427 |
428 | async def _reset_session(self):
429 | """ Set session information """
430 | if self._session:
431 | await self._session.close()
432 |
433 | headers = {"User-Agent": self._user_agent}
434 | self._session = aiohttp.ClientSession()
435 | self._session.headers.update(headers)
436 | self._is_logged_in = False
437 |
438 | def clear_memoized(self):
439 | """ Clear memoized (cached) values """
440 | if hasattr(self, "_cache"):
441 | self._cache.clear()
442 |
443 | # non-setup functions
444 | async def supported_languages(self):
445 | """ dict: All supported language prefixes on the MediaWiki site
446 |
447 | Note:
448 | Not Settable """
449 | if self.__supported_languages is None:
450 | res = await self.wiki_request({"meta": "siteinfo", "siprop": "languages"})
451 | tmp = res["query"]["languages"]
452 | supported = {lang["code"]: lang["*"] for lang in tmp}
453 | self.__supported_languages = supported
454 | return self.__supported_languages
455 |
456 | async def available_languages(self):
457 | """ dict: All available language prefixes on the MediaWiki site
458 |
459 | Note:
460 | Not Settable """
461 | if self.__available_languages is None:
462 | available = {}
463 | supported_languages = await self.supported_languages()
464 | for lang in supported_languages:
465 | try:
466 | MediaWiki(lang=lang)
467 | available[lang] = True
468 | except (aiohttp.ClientConnectionError, asyncio.TimeoutError, MediaWikiException,
469 | MediaWikiAPIURLError):
470 | available[lang] = False
471 | self.__available_languages = available
472 | return self.__available_languages
473 |
474 | @property
475 | def logged_in(self):
476 | """ bool: Returns if logged into the MediaWiki site """
477 | return self._is_logged_in
478 |
479 | async def random(self, pages=1):
480 | """ Request a random page title or list of random titles
481 |
482 | Args:
483 | pages (int): Number of random pages to return
484 | Returns:
485 | list or int: A list of random page titles or a random page title if pages = 1 """
486 | if pages is None or pages < 1:
487 | raise ValueError("Number of pages must be greater than 0")
488 |
489 | query_params = {"list": "random", "rnnamespace": 0, "rnlimit": pages}
490 |
491 | request = await self.wiki_request(query_params)
492 | titles = [page["title"] for page in request["query"]["random"]]
493 |
494 | if len(titles) == 1:
495 | return titles[0]
496 | return titles
497 |
498 | @memoize
499 | async def allpages(self, query="", results=10):
500 | """ Request all pages from mediawiki instance
501 |
502 | Args:
503 | query (str): Search string to use for pulling pages
504 | results (int): The number of pages to return
505 | Returns:
506 | list: The pages that meet the search query
507 | Note:
508 | Could add ability to continue past the limit of 500
509 | """
510 | max_pull = 500
511 | limit = min(results, max_pull) if results is not None else max_pull
512 | query_params = {"list": "allpages", "aplimit": limit, "apfrom": query}
513 |
514 | request = await self.wiki_request(query_params)
515 |
516 | self._check_error_response(request, query)
517 |
518 | titles = [page["title"] for page in request["query"]["allpages"]]
519 | return titles
520 |
521 | @memoize
522 | async def search(self, query, results=10, suggestion=False):
523 | """ Search for similar titles
524 |
525 | Args:
526 | query (str): Page title
527 | results (int): Number of pages to return
528 | suggestion (bool): Use suggestion
529 | Returns:
530 | tuple or list: tuple (list results, suggestion) if suggestion is **True**; list of results otherwise
531 | Note:
532 | Could add ability to continue past the limit of 500
533 | """
534 |
535 | self._check_query(query, "Query must be specified")
536 |
537 | max_pull = 500
538 |
539 | search_params = {
540 | "list": "search",
541 | "srprop": "",
542 | "srlimit": min(results, max_pull) if results is not None else max_pull,
543 | "srsearch": query,
544 | "sroffset": 0, # this is what will be used to pull more than the max
545 | }
546 | if suggestion:
547 | search_params["srinfo"] = "suggestion"
548 |
549 | raw_results = await self.wiki_request(search_params)
550 |
551 | self._check_error_response(raw_results, query)
552 |
553 | search_results = [d["title"] for d in raw_results["query"]["search"]]
554 |
555 | if suggestion:
556 | sug = None
557 | if raw_results["query"].get("searchinfo"):
558 | sug = raw_results["query"]["searchinfo"]["suggestion"]
559 | return search_results, sug
560 | return search_results
561 |
562 | @memoize
563 | async def suggest(self, query):
564 | """ Gather suggestions based on the provided title or None if no
565 | suggestions found
566 |
567 | Args:
568 | query (str): Page title
569 | Returns:
570 | String or None: Suggested page title or **None** if no suggestion found
571 | """
572 | res, suggest = await self.search(query, results=1, suggestion=True)
573 | try:
574 | title = res[0] or suggest
575 | except IndexError: # page doesn't exist
576 | title = None
577 | return title
578 |
579 | @memoize
580 | async def geosearch(
581 | self, latitude=None, longitude=None, radius=1000, title=None, auto_suggest=True, results=10,
582 | ):
583 | """ Search for pages that relate to the provided geocoords or near
584 | the page
585 |
586 | Args:
587 | latitude (Decimal or None): Latitude geocoord; must be coercible to decimal
588 | longitude (Decimal or None): Longitude geocoord; must be coercible to decimal
589 | radius (int): Radius around page or geocoords to pull back; in meters
590 | title (str): Page title to use as a geocoordinate; this has precedence over lat/long
591 | auto_suggest (bool): Auto-suggest the page title
592 | results (int): Number of pages within the radius to return
593 | Returns:
594 | list: A listing of page titles
595 | Note:
596 | The Geosearch API does not support pulling more than the maximum of 500
597 | Raises:
598 | ValueError: If either the passed latitude or longitude are not coercible to a Decimal
599 | """
600 |
601 | def test_lat_long(val):
602 | """ handle testing lat and long """
603 | if not isinstance(val, Decimal):
604 | error = (
605 | "Latitude and Longitude must be specified either as "
606 | "a Decimal or in formats that can be coerced into "
607 | "a Decimal."
608 | )
609 | try:
610 | return Decimal(val)
611 | except (DecimalException, TypeError):
612 | raise ValueError(error)
613 | return val
614 |
615 | # end local function
616 | max_pull = 500
617 |
618 | limit = min(results, max_pull) if results is not None else max_pull
619 | params = {"list": "geosearch", "gsradius": radius, "gslimit": limit}
620 | if title is not None:
621 | if auto_suggest:
622 | title = await self.suggest(title)
623 | params["gspage"] = title
624 | else:
625 | lat = test_lat_long(latitude)
626 | lon = test_lat_long(longitude)
627 | params["gscoord"] = "{0}|{1}".format(lat, lon)
628 |
629 | raw_results = await self.wiki_request(params)
630 |
631 | self._check_error_response(raw_results, title)
632 |
633 | return [d["title"] for d in raw_results["query"]["geosearch"]]
634 |
635 | @memoize
636 | async def opensearch(self, query, results=10, redirect=True):
637 | """ Execute a MediaWiki opensearch request, similar to search box
638 | suggestions and conforming to the OpenSearch specification
639 |
640 | Args:
641 | query (str): Title to search for
642 | results (int): Number of pages within the radius to return
643 | redirect (bool): If **False** return the redirect itself, otherwise resolve redirects
644 | Returns:
645 | List: List of results that are stored in a tuple (Title, Summary, URL)
646 | Note:
647 | The Opensearch API does not support pulling more than the maximum of 500
648 | Raises:
649 | """
650 |
651 | self._check_query(query, "Query must be specified")
652 | max_pull = 500
653 |
654 | query_params = {
655 | "action": "opensearch",
656 | "search": query,
657 | "limit": (min(results, max_pull) if results is not None else max_pull),
658 | "redirects": ("resolve" if redirect else "return"),
659 | "warningsaserror": True,
660 | "namespace": "",
661 | }
662 |
663 | results = await self.wiki_request(query_params)
664 |
665 | self._check_error_response(results, query)
666 |
667 | res = list()
668 | for i, item in enumerate(results[1]):
669 | res.append((item, results[2][i], results[3][i]))
670 | return res
671 |
672 | @memoize
673 | async def prefixsearch(self, prefix, results=10):
674 | """ Perform a prefix search using the provided prefix string
675 |
676 | Args:
677 | prefix (str): Prefix string to use for search
678 | results (int): Number of pages with the prefix to return
679 | Returns:
680 | list: List of page titles
681 | Note:
682 | **Per the documentation:** "The purpose of this module is \
683 | similar to action=opensearch: to take user input and provide \
684 | the best-matching titles. Depending on the search engine \
685 | backend, this might include typo correction, redirect \
686 | avoidance, or other heuristics."
687 | Note:
688 | Could add ability to continue past the limit of 500
689 | """
690 |
691 | self._check_query(prefix, "Prefix must be specified")
692 |
693 | query_params = {
694 | "list": "prefixsearch",
695 | "pssearch": prefix,
696 | "pslimit": ("max" if (results > 500 or results is None) else results),
697 | "psnamespace": 0,
698 | "psoffset": 0, # parameterize to skip to later in the list?
699 | }
700 |
701 | raw_results = await self.wiki_request(query_params)
702 |
703 | self._check_error_response(raw_results, prefix)
704 |
705 | return [rec["title"] for rec in raw_results["query"]["prefixsearch"]]
706 |
707 | @memoize
708 | async def summary(self, title, sentences=0, chars=0, auto_suggest=True, redirect=True):
709 | """ Get the summary for the title in question
710 |
711 | Args:
712 | title (str): Page title to summarize
713 | sentences (int): Number of sentences to return in summary
714 | chars (int): Number of characters to return in summary
715 | auto_suggest (bool): Run auto-suggest on title before summarizing
716 | redirect (bool): Use page redirect on title before summarizing
717 | Returns:
718 | str: The summarized results of the page
719 | Note:
720 | Precedence for parameters: sentences then chars; if both are \
721 | 0 then the entire first section is returned """
722 | page_info = await self.page(title, auto_suggest=auto_suggest, redirect=redirect)
723 | return await page_info.summarize(sentences, chars)
724 |
    @memoize
    async def categorymembers(self, category, results=10, subcategories=True):
        """ Get information about a category: its member pages and subcategories

            Args:
                category (str): Category name (without the category namespace prefix)
                results (int): Maximum number of members to return
                subcategories (bool): Include subcategories (**True**) or not (**False**)
            Returns:
                Tuple or List: Either a tuple ([pages], [subcategories]) or just the list of pages
            Raises:
                ValueError: when category is None or empty
            Note:
                Set results to **None** to get all results """
        self._check_query(category, "Category must be specified")

        # never request more than 500 members in a single API call
        max_pull = 500
        search_params = {
            "list": "categorymembers",
            "cmprop": "ids|title|type",
            "cmtype": ("page|subcat|file" if subcategories else "page|file"),
            "cmlimit": (min(results, max_pull) if results is not None else max_pull),
            "cmtitle": "{0}:{1}".format(self.category_prefix, category),
        }
        pages = list()
        subcats = list()
        returned_results = 0
        finished = False
        last_cont = dict()
        # follow continuation tokens until enough results are collected
        # or the category is exhausted
        while not finished:
            params = search_params.copy()
            params.update(last_cont)
            raw_res = await self.wiki_request(params)

            self._check_error_response(raw_res, category)

            current_pull = len(raw_res["query"]["categorymembers"])
            for rec in raw_res["query"]["categorymembers"]:
                if rec["type"] in ("page", "file"):
                    pages.append(rec["title"])
                elif rec["type"] == "subcat":
                    tmp = rec["title"]
                    # strip the "<category_prefix>:" namespace from subcategory titles
                    if tmp.startswith(self.category_prefix):
                        tmp = tmp[len(self.category_prefix) + 1:]
                    subcats.append(tmp)

            # continuation may be reported under "query-continue" (with a
            # nested "categorymembers" key) or as a top-level "continue"
            cont = raw_res.get("query-continue", False)
            if cont and "categorymembers" in cont:
                cont = cont["categorymembers"]
            else:
                cont = raw_res.get("continue", False)

            # stop when there is no continuation token or it did not advance
            if cont is False or last_cont == cont:
                break

            returned_results += current_pull
            if results is None or (results - returned_results > 0):
                last_cont = cont
            else:
                finished = True

            # shrink the final request so we do not overshoot the asked-for count
            if results is not None and results - returned_results < max_pull:
                search_params["cmlimit"] = results - returned_results
        # end while loop

        if subcategories:
            return pages, subcats
        return pages
791 |
792 | async def categorytree(self, category, depth=5):
793 | """ Generate the Category Tree for the given categories
794 |
795 | Args:
796 | category(str or list of strings): Category name(s)
797 | depth(int): Depth to traverse the tree
798 | Returns:
799 | dict: Category tree structure
800 | Note:
801 | Set depth to **None** to get the whole tree
802 | Note:
803 | Return Data Structure: Subcategory contains the same recursive structure
804 |
805 | >>> {
806 | 'category': {
807 | 'depth': Number,
808 | 'links': list,
809 | 'parent-categories': list,
810 | 'sub-categories': dict
811 | }
812 | }
813 |
814 | .. versionadded:: 0.3.10 """
815 |
816 | # make it simple to use both a list or a single category term
817 | cats = [category] if not isinstance(category, list) else category
818 |
819 | self.__category_parameter_verification(cats, depth, category)
820 |
821 | results = dict()
822 | categories = dict()
823 | links = dict()
824 |
825 | for cat in [x for x in cats if x]:
826 | await self.__cat_tree_rec(cat, depth, results, 0, categories, links)
827 | return results
828 |
    async def page(self, title=None, pageid=None, auto_suggest=True, redirect=True, preload=False,
                   convert_titles=False, iwurl=True):
        """ Get MediaWiki page based on the provided title or pageid

        Args:
            title (str): Page title
            pageid (int): MediaWiki page identifier
            auto_suggest (bool): **True:** Allow page title auto-suggest
            redirect (bool): **True:** Follow page redirects
            preload (bool): **True:** Load most page properties
            convert_titles (bool): **False:** Convert titles to other variants if necessary. \
            Only works if the wiki's content language supports variant conversion.
            iwurl (bool): **True:** Whether to get the full URL if the title is an interwiki link.
        Raises:
            ValueError: when title is blank or None and no pageid is provided
        Raises:
            :py:func:`mediawiki.exceptions.PageError`: if page does not exist
        Note:
            Title takes precedence over pageid if both are provided """
        if (title is None or title.strip() == "") and pageid is None:
            raise ValueError("Either a title or a pageid must be specified")
        if title:
            if auto_suggest:
                # replace the given title with the search-engine suggestion before lookup
                temp_title = await self.suggest(title)
                if temp_title is None:  # page doesn't exist
                    raise PageError(title=title)
                title = temp_title
            return await MediaWikiPage.create(self, title, redirect=redirect, preload=preload,
                                              convert_titles=convert_titles, iwurl=iwurl)
        return await MediaWikiPage.create(self, pageid=pageid, preload=preload,
                                          convert_titles=convert_titles, iwurl=iwurl)
860 |
861 | async def wiki_request(self, params):
862 | """ Make a request to the MediaWiki API using the given search
863 | parameters
864 |
865 | Args:
866 | params (dict): Request parameters
867 | Returns:
868 | A parsed dict of the JSON response
869 | Note:
870 | Useful when wanting to query the MediaWiki site for some \
871 | value that is not part of the wrapper API """
872 |
873 | params["format"] = "json"
874 | if "action" not in params:
875 | params["action"] = "query"
876 |
877 | limit = self._rate_limit
878 | last_call = self._rate_limit_last_call
879 | if limit and last_call and last_call + self._min_wait > datetime.now():
880 | # call time to quick for rate limited api requests, wait
881 | wait_time = (last_call + self._min_wait) - datetime.now()
882 | await asyncio.sleep(wait_time.total_seconds())
883 |
884 | req = await self._get_response(params)
885 |
886 | if self._rate_limit:
887 | self._rate_limit_last_call = datetime.now()
888 |
889 | return req
890 |
891 | # Protected functions
    async def _get_site_info(self):
        """ Parse out the Wikimedia site information including API Version and Extensions

        Raises:
            MediaWikiException: when the response lacks the general site \
            info or a usable server url """
        response = await self.wiki_request({"meta": "siteinfo", "siprop": "extensions|general"})

        # parse what we need out here!
        query = response.get("query", None)
        if query is None or query.get("general", None) is None:
            raise MediaWikiException("Missing query in response")

        gen = query.get("general", None)

        # "generator" is assumed to look like "MediaWiki <major.minor.patch>[-suffix]";
        # keep only the bare version number
        api_version = gen["generator"].split(" ")[1].split("-")[0]

        # store the version both as an int tuple and as a dotted string
        major_minor = api_version.split(".")
        for i, item in enumerate(major_minor):
            major_minor[i] = int(item)
        self._api_version = tuple(major_minor)
        self._api_version_str = ".".join([str(x) for x in self._api_version])

        # parse the base url out
        tmp = gen.get("server", "")
        if tmp == "":
            raise MediaWikiException("Unable to parse base url")
        if tmp.startswith("http://") or tmp.startswith("https://"):
            self._base_url = tmp
        elif gen["base"].startswith("https:"):
            # protocol-relative server url: borrow the scheme from "base"
            self._base_url = "https:{}".format(tmp)
        else:
            self._base_url = "http:{}".format(tmp)

        # de-duplicate and sort the installed extension names
        self._extensions = [ext["name"] for ext in query["extensions"]]
        self._extensions = sorted(list(set(self._extensions)))

    # end _get_site_info
926 |
927 | @staticmethod
928 | def _check_error_response(response, query):
929 | """ check for default error messages and throw correct exception """
930 | if "error" in response:
931 | http_error = ["HTTP request timed out.", "Pool queue is full"]
932 | geo_error = [
933 | "Page coordinates unknown.",
934 | "One of the parameters gscoord, gspage, gsbbox is required",
935 | "Invalid coordinate provided",
936 | ]
937 | err = response["error"]["info"]
938 | if err in http_error:
939 | raise HTTPTimeoutError(query)
940 | if err in geo_error:
941 | raise MediaWikiGeoCoordError(err)
942 | raise MediaWikiException(err)
943 |
944 | @staticmethod
945 | def _check_query(value, message):
946 | """ check if the query is 'valid' """
947 | if value is None or value.strip() == "":
948 | raise ValueError(message)
949 |
950 | @staticmethod
951 | def __category_parameter_verification(cats, depth, category):
952 | # parameter verification
953 | if len(cats) == 1 and (cats[0] is None or cats[0] == ""):
954 | msg = (
955 | "CategoryTree: Parameter 'category' must either "
956 | "be a list of one or more categories or a string; "
957 | "provided: '{}'".format(category)
958 | )
959 | raise ValueError(msg)
960 |
961 | if depth is not None and depth < 1:
962 | msg = "CategoryTree: Parameter 'depth' must be either None " "(for the full tree) or be greater than 0"
963 | raise ValueError(msg)
964 |
    async def __cat_tree_rec(self, cat, depth, tree, level, categories, links):
        """ Recursively fill *tree* with the category hierarchy rooted at *cat*

        Args:
            cat (str): Category currently being expanded
            depth (int): Maximum depth to recurse to; falsy means unbounded
            tree (dict): Output node this call writes into
            level (int): Depth of *cat* within the overall tree
            categories (dict): Cache of already-fetched category pages
            links (dict): Cache of (pages, subcategories) tuples per category """
        tree[cat] = dict()
        tree[cat]["depth"] = level
        tree[cat]["sub-categories"] = dict()
        tree[cat]["links"] = list()
        tree[cat]["parent-categories"] = list()
        parent_cats = list()

        if cat not in categories:
            # fetch the category page and its members, retrying transient
            # failures; give up after more than 10 attempts
            tries = 0
            while True:
                if tries > 10:
                    raise MediaWikiCategoryTreeError(cat)
                try:
                    pag = await self.page("{0}:{1}".format(self.category_prefix, cat))
                    categories[cat] = pag
                    parent_cats = await categories[cat].categories()
                    links[cat] = await self.categorymembers(cat, results=None, subcategories=True)
                    break
                except PageError:
                    # re-raise with the fully qualified category title
                    raise PageError("{0}:{1}".format(self.category_prefix, cat))
                except KeyboardInterrupt:
                    raise
                except Exception:
                    # presumably a transient failure; back off briefly and retry
                    tries = tries + 1
                    await asyncio.sleep(1)
        else:
            # cached category: only the parent list needs re-fetching
            parent_cats = await categories[cat].categories()

        tree[cat]["parent-categories"].extend(parent_cats)
        tree[cat]["links"].extend(links[cat][0])

        if depth and level >= depth:
            # at the depth limit: record subcategory names without expanding them
            for ctg in links[cat][1]:
                tree[cat]["sub-categories"][ctg] = None
        else:
            for ctg in links[cat][1]:
                await self.__cat_tree_rec(
                    ctg, depth, tree[cat]["sub-categories"], level + 1, categories, links,
                )
1006 |
1007 | async def _get_response(self, params):
1008 | """ wrap the call to the requests package """
1009 | try:
1010 | resp = await self._session.get(self._api_url, params=params, timeout=self._timeout, proxy=self._proxies)
1011 | return await resp.json()
1012 | except (JSONDecodeError, aiohttp.ContentTypeError):
1013 | return {}
1014 |
1015 | async def _post_response(self, params):
1016 | """ wrap a post call to the requests package """
1017 | try:
1018 | resp = await self._session.post(self._api_url, data=params, timeout=self._timeout, proxy=self._proxies)
1019 | return await resp.json()
1020 | except JSONDecodeError:
1021 | return {}
1022 |
1023 | # end MediaWiki class
1024 |
--------------------------------------------------------------------------------