.*?')
28 |
29 | def get_regex_for_title(self, escaped_title):
30 | pattern = fr'^\*+ *\[\[({escaped_title})(?:\|[^][]+)?\]\]'
31 | pattern += r' *(?:\([^)]+\))?'
32 | pattern += '(?:,| [-–]) *(.*)$'
33 | return re.compile(pattern, re.M)
34 |
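    # Illustration for get_regex_for_title above (sample wikitext invented):
    # for escaped_title 'Foo' the compiled pattern matches a list line such as
    #   '* [[Foo|foo]] (1900) – short description'
    # with match[1] == 'Foo' and match[2] == 'short description'.
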
35 | @staticmethod
36 | def handle_link(match):
37 | text = match[2]
38 | if text:
39 | return text.lstrip('|').strip()
40 | else:
41 | return match['title'].strip()
42 |
43 | def validate_description(self, desc):
44 | return (bool(desc) and len(desc.split()) >= self.opt['min_words'])
45 |
46 | def parse_description(self, text):
47 | desc = textlib.removeDisabledParts(
48 | text,
49 | ['comment', 'file', 'nowiki', 'template', self.FORMATTING_REGEX,
50 | self.REF_REGEX])
51 | desc = LINK_REGEX.sub(self.handle_link, desc)
52 |         desc = desc.replace('\xa0', ' ').strip()  # non-breaking spaces
53 | desc = re.sub(r' *\([^)]+\)$', '', desc)
54 | desc = desc.partition(';')[0]
55 | desc = re.sub(r'^.*\) [-–] +', '', desc)
56 | desc = re.sub(r'^\([^)]+\) +', '', desc)
57 | while ' ' * 2 in desc:
58 | desc = desc.replace(' ' * 2, ' ')
59 | if re.search(r'[^IVX]\.$', desc) or desc.endswith(tuple(',:')):
60 | desc = desc[:-1].rstrip()
61 | if desc.startswith(('a ', 'an ')):
62 | desc = desc.partition(' ')[2]
63 | return desc
64 |
65 | def get_summary(self, page, desc):
66 | link = page.title(as_link=True, insite=self.repo)
67 | return f'importing [{page.site.lang}] description "{desc}" from {link}'
68 |
69 |
70 | class MissingDescriptionBot(BaseDescriptionBot):
71 |
72 | use_from_page = False
73 |
74 | def __init__(self, **kwargs):
75 | self.available_options.update({
76 | 'allpages': False,
77 | })
78 | super().__init__(**kwargs)
79 | self.store = QueryStore()
80 |
81 | @property
82 | def generator(self):
83 | query = self.store.build_query(
84 | 'missing_descriptions',
85 | hostname=self.site.hostname(),
86 | lang=self.site.lang)
87 | return PreloadingEntityGenerator(
88 | WikidataSPARQLPageGenerator(query, site=self.repo))
89 |
90 | def treat_page_and_item(self, page, item):
91 | if self.site.lang in item.descriptions:
92 | return
93 | title = item.getSitelink(self.site)
94 | link_start = re.escape('[[' + title)
95 | search_query = fr'linksto:"{title}" insource:/\* *{link_start}/'
96 | regex = self.get_regex_for_title(re.escape(title))
97 | for ref_page in PreloadingGenerator(
98 | SearchPageGenerator(search_query, namespaces=[0])):
99 | # todo: first polish text
100 | match = regex.search(ref_page.text)
101 | if not match:
102 | continue
103 | if not self.opt['allpages'] and not ref_page.isDisambig():
104 | continue
105 | desc = self.parse_description(match[2])
106 | if not self.validate_description(desc):
107 | continue
108 | summary = self.get_summary(ref_page, desc)
109 | item.descriptions[self.site.lang] = desc.strip()
110 | if self.user_edit_entity(item, summary=summary):
111 | break
112 |
113 |
114 | class MappingDescriptionBot(BaseDescriptionBot):
115 |
116 | def __init__(self, **kwargs):
117 | super().__init__(**kwargs)
118 | self.regex = self.get_regex_for_title(r'[^\[\|\]]+')
119 |
120 | def get_pages_with_descriptions(self, text):
121 | data = {}
122 | for match in self.regex.finditer(text):
123 | title, desc = match.groups()
124 | page = pywikibot.Page(self.site, title)
125 | data[page] = self.parse_description(desc)
126 | return data
127 |
128 | def treat_page(self):
129 | page = self.current_page
130 | descriptions = self.get_pages_with_descriptions(page.text)
131 | for item in PreloadingEntityGenerator(descriptions.keys()):
132 | if self.site.lang in item.descriptions:
133 | continue
134 | target = pywikibot.Page(item.sitelinks[self.site])
135 | desc = descriptions.get(target)
136 | if not self.validate_description(desc):
137 | continue
138 | summary = self.get_summary(page, desc)
139 | item.descriptions[self.site.lang] = desc.strip()
140 | self.current_page = item
141 | self.user_edit_entity(item, summary=summary)
142 |
143 |
144 | def main(*args):
145 | options = {}
146 | local_args = pywikibot.handle_args(args)
147 | site = pywikibot.Site()
148 | genFactory = GeneratorFactory(site=site)
149 | for arg in genFactory.handle_args(local_args):
150 | if arg.startswith('-'):
151 | arg, sep, value = arg.partition(':')
152 | if value != '':
153 | options[arg[1:]] = int(value) if value.isdigit() else value
154 | else:
155 | options[arg[1:]] = True
156 |
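    # Illustration (hypothetical command line): arguments such as
    # "-min_words:2 -allpages" end up as
    #     options == {'min_words': 2, 'allpages': True}
    # i.e. numeric values become int, bare flags become True.
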
157 | generator = genFactory.getCombinedGenerator(preload=True)
158 | if generator:
159 | bot = MappingDescriptionBot(generator=generator, site=site, **options)
160 | else:
161 | bot = MissingDescriptionBot(site=site, **options)
162 | bot.run()
163 |
164 |
165 | if __name__ == '__main__':
166 | main()
167 |
--------------------------------------------------------------------------------
/merger.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 |
4 | from operator import attrgetter
5 |
6 | import pywikibot
7 |
8 | from pywikibot.exceptions import APIError, OtherPageSaveError
9 | from pywikibot.data.sparql import SparqlQuery
10 |
11 |
12 | class Merger:
13 |
14 | strategies = {
15 | 'id': '_sort_by_id',
16 | 'claims': '_sort_by_claims',
17 | 'revisions': '_sort_by_revisions',
18 | 'sitelinks': '_sort_by_sitelinks',
19 | }
20 | no_conflict_props = {'P17', 'P21', 'P105', 'P170', 'P171', 'P225', 'P271',
21 | 'P296', 'P495', 'P569', 'P570', 'P734', 'P856'}
22 | no_conflict_trees = {
23 | 'P19': 'P131',
24 | 'P31': 'P279',
25 | 'P131': 'P131',
26 | 'P279': 'P279',
27 | }
28 | no_conflict_types = ['external-id']
29 |
30 | @classmethod
31 | def merge(cls, item_from, item_to, **kwargs):
32 | try:
33 | item_from.mergeInto(item_to, **kwargs)
34 | except APIError as e:
35 | raise OtherPageSaveError(item_from, e)
36 |
37 | @classmethod
38 | def clean_merge(cls, item_from, item_to, safe=False, quick=True, **kwargs):
39 | kwargs.pop('asynchronous', None) # fixme
40 | if safe and not cls.can_merge(item_from, item_to, quick=quick):
41 | raise OtherPageSaveError(
42 | item_from, f'Cannot merge {item_from} with {item_to}')
43 |
44 | cls.merge(item_from, item_to, **kwargs)
45 | if not item_from.isRedirectPage():
46 | try:
47 | item_from.editEntity(
48 | {}, clear=True, summary='Clearing item to prepare for redirect')
49 | except APIError as e:
50 | raise OtherPageSaveError(item_from, e)
51 |
52 | cls.merge(item_from, item_to)
53 |
54 | @classmethod
55 | def _conflicts(cls, data1, data2):
56 | set1 = {repr(x.target) for x in data1} # hack
57 | set2 = {repr(x.target) for x in data2} # hack
58 | return not bool(set1 & set2)
59 |
60 | @classmethod
61 | def _has_dtype(cls, dtype, claims):
62 | for cl in claims:
63 | if cl.type == dtype:
64 | return True
65 | return False
66 |
67 | @classmethod
68 | def _same_tree(cls, prop, data1, data2):
69 | sparql = SparqlQuery() # fixme: dependencies
70 | pattern = ('ASK { VALUES ?x1 { wd:%s } . VALUES ?x2 { wd:%s } . '
71 | '?x1 wdt:%s* ?x2 }')
72 | item1 = ' wd:'.join(map(attrgetter('target.id'), data1))
73 | item2 = ' wd:'.join(map(attrgetter('target.id'), data2))
74 | tries = 3
75 | for ask in (pattern % (item1, item2, prop),
76 | pattern % (item2, item1, prop)):
77 | res = False
78 | while True:
79 | try:
80 | res = sparql.ask(ask)
81 | except requests.exceptions.ConnectionError:
82 | tries -= 1
83 | if tries == 0:
84 | raise
85 | time.sleep(1)
86 | continue
87 | else:
88 | break
89 | if res:
90 | return True
91 |
92 | return False
93 |
94 | @classmethod
95 | def can_merge(cls, item1, item2, quick=True):
96 | props = list(cls.no_conflict_props)
97 | if quick:
98 | props.extend(cls.no_conflict_trees.keys())
99 |
100 | for prop in props:
101 | item1.get()
102 | data1 = item1.claims.get(prop, [])
103 | if not data1:
104 | continue
105 | item2.get()
106 | data2 = item2.claims.get(prop, [])
107 | if not data2:
108 | continue
109 | if cls._conflicts(data1, data2):
110 | return False
111 |
112 | key = lambda claims: claims[0].id
113 | for dtype in cls.no_conflict_types:
114 | callback = lambda claims: claims[0].type == dtype
115 | item1.get()
116 | keys1 = set(map(key, filter(callback, item1.claims.values())))
117 | if not keys1:
118 | continue
119 | item2.get()
120 | keys2 = set(map(key, filter(callback, item2.claims.values())))
121 | if not keys2:
122 | continue
123 | for prop in keys1 & keys2:
124 | if cls._conflicts(item1.claims[prop], item2.claims[prop]):
125 | return False
126 |
127 | if not quick:
128 | for prop in cls.no_conflict_trees:
129 | item1.get()
130 | data1 = item1.claims.get(prop, [])
131 | if not data1:
132 | continue
133 | item2.get()
134 | data2 = item2.claims.get(prop, [])
135 | if not data2:
136 | continue
137 | if not cls._same_tree(cls.no_conflict_trees[prop], data1, data2):
138 | return False
139 |
140 | return True
141 |
142 | @classmethod
143 | def _sort_by_id(cls, item1, item2):
144 | id1, id2 = item1.getID(numeric=True), item2.getID(numeric=True)
145 | return (id1 < id2) - (id1 > id2)
146 |
147 | @classmethod
148 | def _sort_by_revisions(cls, item1, item2):
149 | len1, len2 = map(
150 | lambda item: len(list(item.revisions())), [item1, item2])
151 | return (len1 > len2) - (len1 < len2)
152 |
153 | @classmethod
154 | def _sort_by_claims(cls, item1, item2):
155 | callback = lambda item: sum(map(len, item.claims.values()))
156 | count1, count2 = map(callback, [item1, item2])
157 | return (count1 > count2) - (count1 < count2)
158 |
159 | @classmethod
160 | def _sort_by_sitelinks(cls, item1, item2):
161 | len1, len2 = map(lambda item: len(item.sitelinks), [item1, item2])
162 | return (len1 > len2) - (len1 < len2)
163 |
164 | @classmethod
165 |     def sort_for_merge(cls, items, key=('id',)):
166 | for strategy in key:
167 | if strategy not in cls.strategies:
168 | continue
169 | callback = getattr(cls, cls.strategies[strategy])
170 | res = callback(*items)
171 | if res == 0:
172 | continue
173 | if res == -1:
174 | items[:] = items[::-1]
175 | break
176 | target_item, from_item = items
177 | return target_item, from_item
178 |
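# Usage sketch (not part of the original module; the item IDs are invented):
#
#     import pywikibot
#     repo = pywikibot.Site('wikidata', 'wikidata')
#     items = [pywikibot.ItemPage(repo, 'Q111'), pywikibot.ItemPage(repo, 'Q222')]
#     target, source = Merger.sort_for_merge(items, key=['sitelinks', 'claims', 'id'])
#     if Merger.can_merge(source, target, quick=True):
#         Merger.clean_merge(source, target, safe=False)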
--------------------------------------------------------------------------------
/slice_externalids.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import re
3 |
4 | import pywikibot
5 |
6 | from pywikibot.data.sparql import SparqlQuery
7 | from pywikibot.pagegenerators import (
8 | PreloadingEntityGenerator,
9 | WikidataSPARQLPageGenerator,
10 | )
11 |
12 | from query_store import QueryStore
13 | from wikidata import WikidataEntityBot
14 |
15 |
16 | class ExternalIdSlicingBot(WikidataEntityBot):
17 |
18 | blacklist = {'P2013'}
19 | use_from_page = False
20 |
21 | def __init__(self, **options):
22 | self.available_options.update({
23 | 'step': 10,
24 | 'offset': 0,
25 | })
26 | super().__init__(**options)
27 | self.cache = {}
28 | self.failed = {}
29 | self.sparql = SparqlQuery(repo=self.repo)
30 | self.store = QueryStore()
31 |
32 | @property
33 | def generator(self):
34 | step = self.opt['step']
35 | opts = {
36 | # fixme: don't use this word
37 | 'blacklist': ' wd:'.join(self.blacklist),
38 | 'limit': step,
39 | }
40 | offset = self.opt['offset']
41 | while True:
42 | pywikibot.info(f'\nLoading items (offset {offset})...')
43 | opts['offset'] = offset
44 | ask = self.store.build_query('ask_externalid_props', **opts)
45 | if not self.sparql.ask(ask):
46 | break
47 | query = self.store.build_query('external-ids', **opts)
48 | gen = PreloadingEntityGenerator(
49 | WikidataSPARQLPageGenerator(query, site=self.repo))
50 | yield from gen
51 | offset += step
52 |
53 | def treat_page_and_item(self, page, item):
54 | for prop, claims in item.claims.items():
55 | if prop in self.blacklist:
56 | continue
57 | if claims[0].type != 'external-id':
58 | continue
59 | for cl in claims:
60 | if not cl.target or not cl.target.startswith('http'):
61 | continue
62 | formatter, regex = self.get_formatter_and_regex(prop)
63 | if not formatter:
64 | pywikibot.info(f"{prop} doesn't have a formatter")
65 | break
66 | value = self.find_value(cl.target, formatter)
67 | if not value:
68 | pywikibot.info(
69 | f'Value not found in "{cl.target}" for property {prop}')
70 | self.failed.setdefault(prop, set()).add(item)
71 | continue
72 | if regex:
73 | try:
74 | match = re.match(f'({regex})', value)
75 | except re.error:
76 | pywikibot.info(f'Couldn\'t apply regex "{regex}"')
77 | break
78 | if not match:
79 | pywikibot.info(
80 | f'Value "{value}" not matched by regex "{regex}"')
81 | self.failed.setdefault(prop, set()).add(item)
82 | continue
83 | value = match.group()
84 | summary = 'harvested the identifier based on [[Property:P1630]]'
85 | if regex:
86 | summary += ' and [[Property:P1793]]'
87 | cl.changeTarget(value, summary=summary)
88 |
89 | def get_formatter_and_regex(self, prop):
90 | if prop not in self.cache:
91 | formatter = regex = None
92 | ppage = pywikibot.PropertyPage(self.repo, prop)
93 | if 'P1630' in ppage.claims:
94 | if len(ppage.claims['P1630']) > 1:
95 | preferred = [cl for cl in ppage.claims['P1630']
96 | if cl.rank == 'preferred']
97 | if len(preferred) == 1:
98 | formatter = preferred[0].target
99 | else:
100 | formatter = ppage.claims['P1630'][0].target
101 |
102 | if 'P1793' in ppage.claims:
103 | if len(ppage.claims['P1793']) > 1:
104 | preferred = [cl for cl in ppage.claims['P1793']
105 | if cl.rank == 'preferred']
106 | if len(preferred) == 1:
107 | regex = preferred[0].target
108 | else:
109 | regex = ppage.claims['P1793'][0].target
110 |
111 | self.cache[prop] = (formatter, regex)
112 |
113 | return self.cache[prop]
114 |
115 | def strip_init_stuff(self, string):
116 | if string.startswith(('http://', 'https://')):
117 | string = string.partition('//')[2]
118 | if string.startswith('www.'):
119 | string = string[4:]
120 | return string
121 |
122 | def find_value(self, url, formatter):
123 | url = self.strip_init_stuff(url)
124 | formatter = self.strip_init_stuff(formatter)
125 | value = pywikibot.page.url2unicode(url)
126 | split = formatter.split('$1')
127 | if not value.startswith(split[0]):
128 | return None
129 | if not split[1]:
130 | return value[len(split[0]):].rstrip('/')
131 |
132 | value = value[:-len(split[-1])]
133 |
134 | try:
135 | index = value.index(split[1], len(split[0]))
136 | except ValueError:
137 | return None
138 | else:
139 | return value[len(split[0]):index].rstrip('/')
140 |
141 | def exit(self): # fixme: teardown
142 | if self.failed:
143 | text = ''
144 | for prop in sorted(self.failed):
145 | text += f'* [[Property:{prop}]]:\n'
146 | for item in sorted(self.failed[prop]):
147 | text += f'** [[{item.title()}]]\n'
148 | username = self.repo.username()
149 | page = pywikibot.Page(
150 | self.repo, f'User:{username}/Wrong external ids')
151 | page.put(text, summary='update')
152 | super().exit()
153 |
154 |
155 | def main(*args):
156 | options = {}
157 | for arg in pywikibot.handle_args(args):
158 | if arg.startswith('-'):
159 | arg, sep, value = arg.partition(':')
160 | if value != '':
161 | options[arg[1:]] = int(value) if value.isdigit() else value
162 | else:
163 | options[arg[1:]] = True
164 |
165 | site = pywikibot.Site('wikidata', 'wikidata')
166 | bot = ExternalIdSlicingBot(site=site, **options)
167 | bot.run()
168 |
169 |
170 | if __name__ == '__main__':
171 | main()
172 |
--------------------------------------------------------------------------------
/list_typos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import re
3 |
4 | from collections import defaultdict
5 |
6 | import pywikibot
7 |
8 | from pywikibot import textlib
9 | from pywikibot.bot import SingleSiteBot, ExistingPageBot
10 | from pywikibot.pagegenerators import PreloadingGenerator
11 | from pywikibot.tools.itertools import itergroup
12 |
13 | from typoloader import TypoRule, TyposLoader
14 |
15 |
16 | class TypoReportBot(SingleSiteBot):
17 |
18 | pattern = '# {} \u2013 {}'
19 |
20 | def __init__(self, **kwargs):
21 | self.available_options.update({
22 | 'always': True,
23 | 'anything': False,
24 | 'outputpage': None,
25 | 'typospage': None,
26 | 'whitelistpage': None,
27 | 'false_positives': None,
28 | })
29 | super().__init__(**kwargs)
30 | self.loader = TyposLoader(
31 | self.site, allrules=True, typospage=self.opt.typospage,
32 | whitelistpage=self.opt.whitelistpage)
33 | self.false_positives = set()
34 |
35 | def setup(self):
36 | super().setup()
37 | self.typoRules = self.loader.loadTypos()
38 | #self.fp_page = self.loader.getWhitelistPage()
39 | self.whitelist = self.loader.loadWhitelist()
40 | self.data = defaultdict(list)
41 | self.order = [] # remove when dictionaries are ordered
42 | self.load_false_positives()
43 |
44 | def load_false_positives(self):
45 | if not self.opt.false_positives:
46 | return
47 | page = pywikibot.Page(self.site, self.opt.false_positives)
48 | fps = self.false_positives
49 | for line in page.text.splitlines():
50 | if line.startswith(('#', '*')):
51 | fps.add(line.lstrip('#* '))
52 |
53 | @property
54 | def generator(self):
55 | for rule in self.typoRules:
56 | if rule.query is None:
57 | continue
58 |
59 | pywikibot.info(f'Query: "{rule.query}"')
60 | self.current_rule = rule
61 | yield from PreloadingGenerator(
62 | self.site.search(rule.query, namespaces=[0]))
63 |
64 | def skip_page(self, page):
65 | # TODO: better terminology
66 | if page.title() in self.whitelist:
67 | pywikibot.warning(f'Skipped {page} because it is whitelisted')
68 | return True
69 |
70 | if self.current_rule.find.search(page.title()):
71 | pywikibot.warning(
72 | f'Skipped {page} because the rule matches the title')
73 | return True
74 |
75 | return super().skip_page(page)
76 |
77 | def remove_disabled_parts(self, text):
78 | return textlib.removeDisabledParts(
79 | text, TypoRule.exceptions, site=self.site)
80 |
81 | def treat(self, page):
82 | match = self.current_rule.find.search(page.text)
83 | if not match:
84 | return
85 | text = self.remove_disabled_parts(page.text)
86 | found = set()
87 | for match in self.current_rule.find.finditer(text):
88 | match_text = match[0]
89 | if match_text in found:
90 | continue
91 | found.add(match_text)
92 | link = page.title(as_link=True)
93 | put_text = self.pattern.format(link, match_text)
94 | if put_text[2:] not in self.false_positives:
95 | pywikibot.stdout(put_text)
96 | if not self.data.get(link):
97 | self.order.append(link)
98 | self.data[link].append(match_text)
99 |
100 | def teardown(self):
101 | outputpage = self.opt.outputpage
102 | if (self.generator_completed or self.opt.anything) and outputpage:
103 | put = []
104 | for link in self.order:
105 | for match in self.data[link]:
106 | put.append(self.pattern.format(link, match))
107 | page = pywikibot.Page(self.site, outputpage)
108 | page.text = '\n'.join(put)
109 | page.save(summary='aktualizace seznamu překlepů', minor=False,
110 | bot=False, apply_cosmetic_changes=False)
111 | super().teardown()
112 |
113 |
114 | class PurgeTypoReportBot(SingleSiteBot, ExistingPageBot):
115 |
116 | def __init__(self, **kwargs):
117 | self.helper = TypoReportBot(**kwargs)
118 | super().__init__(site=self.helper.site)
119 | self.put = []
120 | self.cache = defaultdict(list)
121 |
122 | def setup(self):
123 | super().setup()
124 | self.whitelist = self.helper.loader.loadWhitelist()
125 | self.generator = [pywikibot.Page(self.site, self.helper.opt.outputpage)]
126 | self.helper.load_false_positives()
127 |
128 | def line_iterator(self, text):
129 | regex = re.compile(self.helper.pattern.format(
130 | r'\[\[([^]]+)\]\]', '(.+)'))
131 | for line in text.splitlines():
132 | match = regex.fullmatch(line)
133 | if match:
134 | title, text = match.groups()
135 | entry = pywikibot.Page(self.site, title)
136 | self.cache[entry.title()].append(text)
137 | yield entry
138 | else:
139 | self.put.append(line)
140 |
141 | def treat(self, page):
142 | pattern = self.helper.pattern
143 | for entry in PreloadingGenerator(self.line_iterator(page.text)):
144 | key = title = entry.title()
145 | if not entry.exists():
146 | self.cache.pop(key)
147 | continue
148 | while entry.isRedirectPage():
149 | entry = entry.getRedirectTarget()
150 | title = entry.title()
151 | text = self.helper.remove_disabled_parts(entry.text)
152 | for string in self.cache.pop(key):
153 | if string not in text:
154 | continue
155 | put_text = pattern.format(f'[[{title}]]', string)
156 | if put_text[2:] in self.helper.false_positives:
157 | continue
158 | self.put.append(put_text)
159 |
160 | page.text = '\n'.join(self.put)
161 | page.save(summary='odstranění vyřešených překlepů', minor=True,
162 | bot=True, apply_cosmetic_changes=False)
163 |
164 |
165 | def main(*args):
166 | options = {}
167 | cls = TypoReportBot
168 | for arg in pywikibot.handle_args(args):
169 | if arg == 'purge':
170 | cls = PurgeTypoReportBot
171 | elif arg.startswith('-'):
172 | arg, sep, value = arg.partition(':')
173 | if value != '':
174 | options[arg[1:]] = int(value) if value.isdigit() else value
175 | else:
176 | options[arg[1:]] = True
177 |
178 | bot = cls(**options)
179 | bot.run()
180 |
181 |
182 | if __name__ == '__main__':
183 | main()
184 |
--------------------------------------------------------------------------------
/cswiki/pageviews.py:
--------------------------------------------------------------------------------
1 | import heapq
2 | import json
3 | import os.path as osp
4 | from collections import defaultdict
5 | from datetime import date, datetime, timedelta
6 |
7 | import pywikibot
8 | import requests
9 | from pywikibot.comms.http import user_agent
10 | from pywikibot.pagegenerators import PreloadingGenerator
11 |
12 | pywikibot.handle_args()
13 |
14 | site = pywikibot.Site()
15 |
16 | headers = {'User-Agent': user_agent()}
17 | hostname = site.hostname()
18 | prefix = 'https://wikimedia.org/api/rest_v1/metrics/pageviews'
19 | pattern = f'{prefix}/top/{hostname}/all-access/%Y/%m/%d'
20 |
21 | check_templates = {
22 | 'Aktualizovat', 'Celkově zpochybněno', 'Globalizovat', 'Neověřeno', 'NPOV',
23 | 'Pahýl', 'Pravopis', 'Reklama', 'Sloh', 'Upravit', 'Vlastní výzkum',
24 | 'Vyhýbavá slova',
25 | }
26 | check_categories = {
27 | 'Wikipedie:Polozamčené stránky',
28 | 'Wikipedie:Rozšířeně polozamčené stránky',
29 | 'Wikipedie:Dlouhodobě zamčené stránky',
30 | 'Wikipedie:Dobré články',
31 | 'Wikipedie:Nejlepší články',
32 | 'Žijící lidé',
33 | }
34 |
35 | top = 100
36 | days = 7
37 | gamma = 0.85
38 | weights = [pow(gamma, i) for i in range(days)]
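# With days=7 and gamma=0.85 the weights are roughly
# [1.0, 0.85, 0.72, 0.61, 0.52, 0.44, 0.38]: yesterday counts fully,
# each older day about 15 % less.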
39 |
40 | today = date.today()
41 | this = today - timedelta(days=1)
42 | first = today - timedelta(days=days)
43 | min_per_day = []
44 |
45 | check_categories.add(f'Úmrtí v roce {this.year}')
46 | check_categories.add(f'Úmrtí v roce {this.year - 1}')
47 |
48 | aggregate_url = '{}/aggregate/{}/all-access/user/daily/{}/{}'.format(
49 | prefix,
50 | hostname,
51 | first.strftime('%Y%m%d'),
52 | this.strftime('%Y%m%d')
53 | )
54 | resp = requests.get(aggregate_url, headers=headers)
55 | data = resp.json()
56 | daily = [entry['views'] for entry in data['items']]
57 |
58 | index = defaultdict(lambda: [None] * days)
59 | for diff in range(days):
60 | the_day = this - timedelta(days=diff)
61 | resp = requests.get(the_day.strftime(pattern), headers=headers)
62 | data = resp.json()
63 |
64 | array = []
65 | for info in data['items'][0]['articles']:
66 | page = info['article']
67 | views = info['views']
68 | index[page][diff] = views
69 | array.append(views)
70 | min_per_day.append(min(array))
71 | del data
72 |
73 | done_heap = []
74 | stack = []
75 |
76 | for page, values in index.items():
77 | if page.startswith('Speciální:'):
78 | continue
79 | complete = True
80 | total = 0
81 | for views, at_most in zip(values, min_per_day):
82 | if views is None:
83 | complete = False
84 | total += at_most
85 | else:
86 | total += views
87 |
88 | if complete:
89 | done_heap.append((total, page, values))
90 | else:
91 | stack.append((total, page, values))
92 |
93 | done_heap.sort()
94 | del done_heap[:-top]
95 | stack.sort()
96 |
97 | while stack:
98 |     possible, page, values = stack.pop()
99 |     lowest = done_heap[0][0]
100 |     if possible < lowest:
101 |         break
102 | 
103 |     missing = [i for i, val in enumerate(values) if val is None]
104 | 
105 |     start = this - timedelta(days=max(missing))
106 |     end = this - timedelta(days=min(missing))
107 |
108 | url = f'{prefix}/per-article/{hostname}/all-access/user/'
109 | url += page.replace('/', '%2F') + '/daily/'
110 | url += start.strftime('%Y%m%d00') + '/' + end.strftime('%Y%m%d00')
111 | resp = requests.get(url, headers=headers)
112 | if resp.ok:
113 | data = resp.json()
114 | for entry in data['items']:
115 | dt = datetime.strptime(entry['timestamp'], '%Y%m%d%H')
116 | delta = this - dt.date()
117 | values[delta.days] = entry['views']
118 |
119 | for i in range(days):
120 | if values[i] is None:
121 | values[i] = 0
122 |
123 | total = sum(values)
124 | assert total <= possible
125 | if total >= lowest:
126 | heapq.heappushpop(done_heap, (total, page, values))
127 |
128 | done_heap.sort(reverse=True)
129 |
130 | lines = []
131 | lines.append(
132 | f"Nejčtenější stránky za období {first.day}. {first.month}. {first.year}"
133 | f" – {this.day}. {this.month}. {this.year}."
134 | )
135 | lines.append('')
136 | lines.append('{| class="wikitable sortable"')
137 | lines.append('! Pořadí')
138 | lines.append('! Stránka')
139 | lines.append('! Celkový<br>počet návštěv')
140 | lines.append('! Vážený<br>počet návštěv')
141 | lines.append('! Koeficient')
142 | lines.append('! Problémy')
143 | lines.append('! Příznaky')
144 | lines.append('! class="unsortable" | Graf')
145 |
146 | aggregate = sum(daily)
147 | weighted = sum(v * w for v, w in zip(daily, weights))
148 | coef = weighted / aggregate
149 |
150 | lines.append('|-')
151 | lines.append('|')
152 | lines.append("| ''vše''")
153 | lines.append(f'| {aggregate}')
154 | lines.append(f'| {weighted:.0f}')
155 | lines.append('| %s' % f'{coef:.3f}'.replace('.', ',', 1))
156 | lines.append('|')
157 | lines.append('|')
158 | lines.append(f"| [https://pageviews.wmcloud.org/siteviews/?sites={hostname}"
159 | f"&agent=user&range=latest-20 [0]]")
160 |
161 | gen = (pywikibot.Page(site, title) for _, title, _ in done_heap)
162 | for rank, (page, (total, title, values)) in enumerate(zip(
163 | site.preloadpages(gen, templates=True, categories=True, content=False),
164 | done_heap
165 | ), start=1):
166 | weighted = sum(v * w for v, w in zip(values, weights))
167 | coef = weighted / total
168 | link_title = title.replace('_', ' ')
169 | if link_title.startswith(('Soubor:', 'Kategorie:')):
170 | link_title = f':{link_title}'
171 |
172 | lines.append('|-')
173 | lines.append(f'| {rank}')
174 | lines.append(f'| [[{link_title}]]')
175 | lines.append(f'| {total}')
176 | lines.append(f'| {weighted:.0f}')
177 | lines.append('| %s' % f'{coef:.3f}'.replace('.', ',', 1))
178 |
179 | show_templates = check_templates.intersection(map(
180 | lambda p: p.title(with_ns=False), page.templates()))
181 | show_categories = check_categories.intersection(map(
182 | lambda p: p.title(with_ns=False), page.categories()))
183 |
184 | if show_templates:
185 |         lines.append('| ' + ('<br>'.join(
186 | f'[[Šablona:{t}|{t}]]' for t in sorted(show_templates))))
187 | else:
188 | lines.append('|')
189 |
190 | if show_categories:
191 |         lines.append('| ' + ('<br>'.join(
192 | f"[[:Kategorie:{c}|{c.removeprefix('Wikipedie:')}]]"
193 | for c in sorted(show_categories))))
194 | else:
195 | lines.append('|')
196 |
197 | lines.append(f"| [https://pageviews.wmcloud.org/pageviews/?project={hostname}"
198 | f"&agent=user&range=latest-20&pages={title}]")
199 |
200 | lines.append('|}')
201 |
202 | the_page = pywikibot.Page(site, f'{site.username()}/Návštěvy', ns=2)
203 | the_page.text = '\n'.join(lines)
204 | the_page.save(minor=False, bot=False, apply_cosmetic_changes=False,
205 | summary='aktualizace')
206 |
--------------------------------------------------------------------------------
/fake_references.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from contextlib import suppress
3 |
4 | import pywikibot
5 |
6 | from pywikibot import pagegenerators
7 |
8 | from query_store import QueryStore
9 | from wikidata import WikidataEntityBot
10 |
11 |
12 | class FakeReferencesBot(WikidataEntityBot):
13 |
14 | item_ids = ['Q2013']
15 | inferred_from = 'P3452'
16 | ref_props = ['P143', 'P248']
17 | url_props = ['P854']
18 | use_from_page = False
19 | whitelist_props = {'P813', 'P4656'}
20 |
21 | def __init__(self, generator, **kwargs):
22 | self.available_options.update({
23 | 'limit': None,
24 | })
25 | super().__init__(**kwargs)
26 | self.store = QueryStore()
27 | self._generator = generator or self.subgenerator()
28 | self.url_start = self.repo.base_url(self.repo.article_path)
29 |
30 | def subgenerator(self):
31 | limit = self.opt['limit']
32 | for ident in self.item_ids:
33 | from_item = pywikibot.ItemPage(self.repo, ident)
34 | for item in pagegenerators.WikibaseItemGenerator(
35 | from_item.backlinks(
36 | total=limit, filterRedirects=False, namespaces=[0])):
37 | yield item
38 | if limit is not None:
39 | limit -= 1
40 |
41 | if limit == 0:
42 | return
43 |
44 | for prop in self.url_props:
45 | ok = True
46 | while ok and limit != 0:
47 | ok = False
48 | query = self.store.build_query(
49 | 'fake_references_url',
50 | limit=500 if limit is None else min(500, limit),
51 | prop=prop)
52 | for item in pagegenerators.WikidataSPARQLPageGenerator(
53 | query, site=self.repo):
54 | ok = True
55 | yield item
56 | if limit is not None:
57 | limit -= 1
58 |
59 | for prop in self.ref_props:
60 | ok = True
61 | while ok and limit != 0:
62 | ok = False
63 | query = self.store.build_query(
64 | 'fake_references',
65 | limit=100 if limit is None else min(100, limit),
66 | prop=prop)
67 | for item in pagegenerators.WikidataSPARQLPageGenerator(
68 | query, site=self.repo):
69 | ok = True
70 | yield item
71 | if limit is not None:
72 | limit -= 1
73 |
74 | @property
75 | def generator(self):
76 | return pagegenerators.PreloadingEntityGenerator(self._generator)
77 |
78 | @property
79 | def summary(self):
80 | return ('update reference per [[Wikidata:Requests for permissions/'
81 | 'Bot/MatSuBot 8|RfPB]]')
82 |
83 | def treat_page_and_item(self, page, item):
84 | changed = False
85 | for prop, claims in item.claims.items():
86 | for claim in claims:
87 | if self.handle_claim(claim):
88 | changed = True
89 | if changed:
90 | self.user_edit_entity(item, summary=self.summary)
91 |
92 | def handle_claim(self, claim):
93 | ret = False
94 | if not claim.sources:
95 | return ret
96 | if claim.type == 'wikibase-item':
97 | if claim.id == 'P1343' and 'P805' in claim.qualifiers:
98 | target = claim.qualifiers['P805'][0].getTarget()
99 | else:
100 | target = claim.getTarget()
101 | if target:
102 | for source in claim.sources:
103 | ret = self.handle_source_item(source, target) or ret
104 | for source in claim.sources:
105 | ret = self.handle_source_url(source) or ret
106 | return ret
107 |
108 | def handle_source_item(self, source, target):
109 | ret = False
110 | for prop in self.ref_props:
111 | keys = set(source.keys())
112 | if prop not in keys:
113 | continue
114 | if keys - (self.whitelist_props | {prop}):
115 | continue
116 | if len(source[prop]) > 1:
117 | #continue?
118 | return ret
119 |
120 | fake = next(iter(source[prop]))
121 | items = list(self.item_ids) + [target]
122 | if any(fake.target_equals(tgt) for tgt in items):
123 | snak = pywikibot.Claim(
124 | self.repo, self.inferred_from, isReference=True)
125 | snak.setTarget(target)
126 | source.setdefault(self.inferred_from, []).append(snak)
127 | source.pop(prop)
128 | ret = True
129 | return ret
130 |
131 | def handle_source_url(self, source):
132 | ret = False
133 | for prop in self.url_props:
134 | keys = set(source.keys())
135 | if prop not in keys:
136 | continue
137 | if keys - (self.whitelist_props | {prop}):
138 | continue
139 | if len(source[prop]) > 1:
140 | #continue?
141 | return ret
142 |
143 | snak = next(iter(source[prop]))
144 | url = snak.getTarget()
145 | if not url:
146 | continue
147 | target = None
148 | with suppress(pywikibot.InvalidTitle, ValueError):
149 | for prefix in [self.url_start, self.repo.concept_base_uri]:
150 | target_id = url.removeprefix(prefix)
151 | if target_id != url:
152 | target = pywikibot.ItemPage(self.repo, target_id)
153 | break
154 | if target:
155 | if target.isRedirectPage():
156 | target = target.getRedirectTarget()
157 | if target != snak.on_item:
158 | snak = pywikibot.Claim(
159 | self.repo, self.inferred_from, isReference=True)
160 | snak.setTarget(target)
161 | source.setdefault(self.inferred_from, []).append(snak)
162 | source.pop(prop)
163 | ret = True
164 | return ret
165 |
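    # Example for handle_source_url above (the URL is illustrative): a source
    # consisting solely of P854 = 'https://www.wikidata.org/wiki/Q42' gets
    # replaced by an 'inferred from' (P3452) reference pointing to Q42.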
166 |
167 | def main(*args):
168 | options = {}
169 | local_args = pywikibot.handle_args(args)
170 | site = pywikibot.Site()
171 | genFactory = pagegenerators.GeneratorFactory(site=site)
172 | for arg in genFactory.handle_args(local_args):
173 | if arg.startswith('-'):
174 | arg, sep, value = arg.partition(':')
175 | if value != '':
176 | options[arg[1:]] = value if not value.isdigit() else int(value)
177 | else:
178 | options[arg[1:]] = True
179 |
180 | generator = genFactory.getCombinedGenerator()
181 | bot = FakeReferencesBot(generator=generator, site=site, **options)
182 | bot.run()
183 |
184 |
185 | if __name__ == '__main__':
186 | main()
187 |
--------------------------------------------------------------------------------
/typos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import time
3 |
4 | import pywikibot
5 | from pywikibot import pagegenerators
6 |
7 | from typoloader import TyposLoader
8 | from wikitext import WikitextFixingBot
9 |
10 |
11 | class TypoBot(WikitextFixingBot):
12 |
13 | '''
14 | Bot for typo fixing
15 |
16 | Supported parameters:
17 | * -allrules - use if you want to load rules that need user's decision
18 | * -offset:# - what typo rule do you want to start from
19 | * -quick - use if you want the bot to focus on the current rule,
20 |         i.e. skip the page if the rule couldn't be applied
21 | * -threshold:# - skip rule when loaded/replaced ratio gets over #
22 | * -typospage: - what page do you want to load typo rules from
23 | * -whitelistpage: - what page holds pages which should be skipped
24 | '''
25 |
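    # Example invocation (a sketch; page and category names are hypothetical):
    #   python typos.py -allrules -offset:5 -threshold:20 -cat:"Some category"
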
26 | def __init__(self, generator, *, offset=0, **kwargs):
27 | self.available_options.update({
28 | 'allrules': False,
29 | 'quick': False,
30 | 'threshold': 10,
31 | 'typospage': None,
32 | 'whitelistpage': None,
33 | })
34 | kwargs['typos'] = False
35 | self.own_generator = not bool(generator)
36 | if self.own_generator:
37 | self.generator = self.make_generator()
38 | else:
39 | self.generator = generator
40 |
41 | super().__init__(**kwargs)
42 | self.offset = offset
43 |
44 | def setup(self):
45 | loader = TyposLoader(
46 | self.site, allrules=self.opt['allrules'],
47 | typospage=self.opt['typospage'],
48 | whitelistpage=self.opt['whitelistpage'])
49 | self.typoRules = loader.loadTypos()
50 | self.fp_page = loader.getWhitelistPage()
51 | self.whitelist = loader.loadWhitelist()
52 |
53 | @property
54 | def is_rule_accurate(self):
55 | threshold = self.opt['threshold']
56 | result = (self.processed < threshold or
57 | self.processed / threshold < self.replaced)
58 | return result
59 |
60 | def make_generator(self):
61 | for i, rule in enumerate(self.typoRules[:]):
62 | if self.offset > i:
63 | continue
64 | if rule.query is None:
65 | continue
66 |
67 | # todo: if not allrules:...
68 | self.offset = i
69 | pywikibot.info(f'\nQuery: "{rule.query}"')
70 | old_max = rule.longest
71 | rule.longest = 0.0
72 | self.current_rule = rule
73 | self.skip_rule = False
74 | self.processed = self.replaced = 0
75 | for page in self.site.search(rule.query, namespaces=[0]):
76 | if self.skip_rule:
77 | break
78 | yield page
79 | if not self.is_rule_accurate:
80 | pywikibot.info(
81 | f'Skipped inefficient query "{rule.query}" '
82 |                     f'({self.replaced}/{self.processed})')
83 | break
84 | else:
85 | if self.processed < 1:
86 | pywikibot.info(f'No results from query "{rule.query}"')
87 | else:
88 | percent = (self.replaced / self.processed) * 100
89 | pywikibot.info(
90 |                     f'{percent:.1f}% accuracy of query "{rule.query}"')
91 |
92 | if self.processed > 0:
93 | pywikibot.info(f'Longest match: {rule.longest}s')
94 | rule.longest = max(old_max, rule.longest)
95 |
96 | def save_false_positive(self, page):
97 | link = page.title(as_link=True)
98 | self.fp_page.text += f'\n* {link}'
99 | self.fp_page.save(summary=link, asynchronous=True)
100 | self.whitelist.append(page.title())
101 |
102 | def skip_page(self, page):
103 | if page.title() in self.whitelist:
104 | pywikibot.warning(f'Skipped {page} because it is whitelisted')
105 | return True
106 |
107 | if self.own_generator and self.current_rule.find.search(page.title()):
108 | pywikibot.warning(
109 | f'Skipped {page} because the rule matches the title')
110 | return True
111 |
112 | return super().skip_page(page)
113 |
114 | def init_page(self, page):
115 | out = super().init_page(page)
116 | if self.own_generator:
117 | self.processed += 1
118 | return out
119 |
120 | def treat_page(self):
121 | page = self.current_page
122 | text = page.text
123 | done_replacements = []
124 | quickly = self.opt['quick'] is True
125 | start = time.time()
126 | if self.own_generator:
127 | text = self.current_rule.apply(page.text, done_replacements)
128 | if page.text == text:
129 | if quickly:
130 |                 pywikibot.info('Typo not found, not fixing other '
131 | 'typos in quick mode')
132 | return
133 | else:
134 | self.replaced += 1
135 |
136 | for rule in self.typoRules:
137 | if self.own_generator and rule == self.current_rule: # __eq__
138 | continue
139 | if rule.find.search(page.title()):
140 | continue
141 | if quickly and rule.needs_decision():
142 | continue
143 |
144 | text = rule.apply(text, done_replacements)
145 | stop = time.time()
146 | if quickly and stop - start > 15:
147 | pywikibot.warning('Other typos exceeded 15s, skipping')
148 | break
149 |
150 | self.put_current(
151 | text, summary='oprava překlepů: %s' % ', '.join(done_replacements))
152 |
153 | def user_confirm(self, question):
154 | if self.opt['always']:
155 | return True
156 |
157 | options = [('yes', 'y'), ('no', 'n'), ('all', 'a')]
158 | if self.fp_page.exists():
159 | options.append(('false positive', 'f'))
160 | if self.own_generator:
161 | options.append(('skip rule', 's'))
162 | options += [('open in browser', 'b'), ('quit', 'q')]
163 |
164 | choice = pywikibot.input_choice(question, options, default='N',
165 | automatic_quit=False)
166 |
167 | if choice == 'n':
168 | return False
169 |
170 | if choice == 's':
171 | self.skip_rule = True
172 | return False
173 |
174 | if choice == 'b':
175 | pywikibot.bot.open_webbrowser(self.current_page)
176 | return False
177 |
178 | if choice == 'f':
179 | self.save_false_positive(self.current_page)
180 | return False
181 |
182 | if choice == 'q':
183 | self.quit()
184 |
185 | if choice == 'a':
186 | self.options['always'] = True
187 |
188 | return True
189 |
190 | def teardown(self):
191 | rules = sorted(
192 | (rule for rule in self.typoRules if not rule.needs_decision()),
193 | key=lambda rule: rule.longest, reverse=True)[:3]
194 | pywikibot.info('\nSlowest autonomous rules:')
195 | for i, rule in enumerate(rules, start=1):
196 | pywikibot.info(f'{i}. "{rule.find.pattern}" - {rule.longest}')
197 | if self.own_generator:
198 | pywikibot.info(f'\nCurrent offset: {self.offset}\n')
199 | super().teardown()
200 |
201 |
202 | def main(*args):
203 | options = {}
204 | local_args = pywikibot.handle_args(args)
205 | genFactory = pagegenerators.GeneratorFactory()
206 | genFactory.handle_arg('-ns:0')
207 | for arg in genFactory.handle_args(local_args):
208 | if arg.startswith('-'):
209 | arg, sep, value = arg.partition(':')
210 | if value != '':
211 | options[arg[1:]] = value if not value.isdigit() else int(value)
212 | else:
213 | options[arg[1:]] = True
214 |
215 | generator = genFactory.getCombinedGenerator(preload=True)
216 | bot = TypoBot(generator, **options)
217 | bot.run()
218 |
219 |
220 | if __name__ == '__main__':
221 | main()
222 |
--------------------------------------------------------------------------------
/cswiki/iucn.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from datetime import datetime
4 |
5 | import mwparserfromhell as parser
6 | import pywikibot
7 | import pywikibot.pagegenerators as pg
8 | from pywikibot.exceptions import NoWikibaseEntityError
9 | from pywikibot.page import PropertyPage
10 |
11 | def get_revision_wrapper(item, rev_id: int):
12 | # https://github.com/matejsuchanek/wikidata-constraints/blob/11602b4050e4623c9f1e4e0b279cf2f6c14b2a53/retrieval.py#L131-L164
13 | cls = type(item)
14 | repo = item.repo
15 | entity_id = item.getID()
16 |
17 | rev = cls(repo, entity_id)
18 | data = json.loads(item.getOldVersion(rev_id))
19 | for key, val in data.items():
20 | # handle old serialization
21 | if val == []:
22 | data[key] = {}
23 |
24 | rev._content = data
25 | while True:
26 | try:
27 | rev.get()
28 | except (KeyError, NoWikibaseEntityError) as exc:
29 | # handle deleted properties
30 | if isinstance(exc, NoWikibaseEntityError):
31 | key = exc.entity.id
32 | else:
33 | key = exc.args[0]
34 | # in theory, this isn't needed
35 | if not PropertyPage.is_valid_id(key):
36 | raise
37 |
38 | if key.lower() in data['claims']:
39 | data['claims'].pop(key.lower())
40 | elif key.upper() in data['claims']:
41 | data['claims'].pop(key.upper())
42 | else:
43 | raise
44 | else:
45 | return rev
46 |
47 |
48 | def get_best_statements(statements):
49 | best = []
50 | best_rank = 'normal'
51 | for st in statements:
52 | if st.rank == best_rank:
53 | best.append(st)
54 | elif st.rank == 'preferred':
55 | best[:] = [st]
56 | best_rank = st.rank
57 | return best
58 |
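# For instance, from a statement group with ranks [normal, preferred, normal]
# get_best_statements() returns only the preferred statement; if no statement
# is preferred, it returns all normal-ranked ones.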
59 |
60 | def is_different(old, new):
61 | if old == new:
62 | return False
63 |
64 | if old.getID() == 'Q11394' and new.getID() == 'Q96377276':
65 | return False
66 |
67 | return True
68 |
69 |
70 | args = pywikibot.handle_args()
71 |
72 | site = pywikibot.Site('cs', 'wikipedia')
73 | repo = pywikibot.Site('wikidata', 'wikidata')
74 |
75 | needle = re.compile(r'\b[Pp]141\b')
76 |
77 | editions = {
78 | #'2012.1': '20120619',
79 | '2012.2': '20121017',
80 | '2013.1': '20130702',
81 | '2013.2': '20131126',
82 | '2014.1': '20140612',
83 | '2014.2': '20140724',
84 | '2014.3': '20141117',
85 | '2015.1': '20150603',
86 | '2015.2': '20150623',
87 | '2015.4': '20151119',
88 | '2016.2': '20160904',
89 | '2016.3': '20161208',
90 | '2017.2': '20170914',
91 | '2017.3': '20171205',
92 | '2018.1': '20180705',
93 | '2019.2': '20190718',
94 | '2019.3': '20191210',
95 | '2020.2': '20200709',
96 | '2020.3': '20201210',
97 | '2021.1': '20210325',
98 | '2021.2': '20210904',
99 | '2021.3': '20211209',
100 | '2022.1': '20220101',
101 | '2022.2': '20221209',
102 | '2023.1': '20231211',
103 | '2025.2': '20251010',
104 | }
105 | stat_to_label = {
106 | 'Q719675': 'téměř ohrožený',
107 | 'Q211005': 'málo dotčený',
108 | 'Q219127': 'kriticky ohrožený druh',
109 | 'Q237350': 'vyhynulý',
110 | 'Q239509': 'vyhynulý v přírodě',
111 | 'Q278113': 'zranitelný',
113 | 'Q3245245': 'chybí údaje',
114 | 'Q123509': 'vymírání',
115 | 'Q11394': 'ohrožený',
116 | 'Q96377276': 'ohrožený',
117 | }
118 | links = {
119 | pywikibot.Page(site, 'Kriticky_ohrožený_taxon'),
120 | pywikibot.Page(site, 'Málo_dotčený_taxon'),
121 | pywikibot.Page(site, 'O_taxonu_chybí_údaje'),
122 | pywikibot.Page(site, 'Nevyhodnocený_taxon'),
123 | pywikibot.Page(site, 'Ohrožený_taxon'),
124 | pywikibot.Page(site, 'Téměř_ohrožený_taxon'),
125 | pywikibot.Page(site, 'Zranitelný_taxon'),
126 | pywikibot.Page(site, 'Taxon vyhynulý v přírodě'),
127 | pywikibot.Page(site, 'Vyhynulý_taxon'),
128 | }
129 |
130 | lines = [
131 |     '<div>',
132 | '{| class="wikitable sortable"',
133 | '! Č.',
134 | '! Taxon',
135 | '! class="unsortable" | Wikidata',
136 | '! Naposled',
137 | '! class="unsortable" | Odkazuje na',
138 | ]
139 | lines.extend(f'! class="unsortable" | {ed}' for ed in editions)
140 |
141 | i = 0
142 |
143 | sparql = '''SELECT ?item WHERE {
144 |     ?article schema:about ?item; schema:isPartOf <https://cs.wikipedia.org/> .
145 | ?item wdt:P141 ?iucn .
146 | } ORDER BY ?item'''
147 |
148 | gen = pg.PreloadingEntityGenerator(
149 | pg.WikidataSPARQLPageGenerator(sparql, site=repo)
150 | )
151 |
152 | for item in gen:
153 | best = get_best_statements(item.claims.get('P141', []))
154 | if not best:
155 | continue
156 |
157 | ts_to_status = {}
158 | cur = None
159 |
160 | for rev in item.revisions(reverse=True, content=False):
161 | if not rev.parentid:
162 | continue
163 |
164 | if not needle.search(rev.comment):
165 | continue
166 |
167 | if rev.comment.startswith('/* wbsetreference-set:'):
168 | continue
169 |
170 | if 'mw-reverted' in rev.tags:
171 | continue
172 |
173 | this = get_revision_wrapper(item, rev.revid)
174 | claims = get_best_statements(this.claims.get('P141', []))
175 | if claims:
176 | new = claims[0].getTarget()
177 | if cur is None or is_different(cur, new):
178 | key = rev.timestamp.strftime('%Y%m%d%H%M%S')
179 | ts_to_status[key] = new.getID()
180 | cur = new
181 |
182 | if len(ts_to_status) < 2:
183 | continue
184 |
185 | last_change = max(ts_to_status)
186 |
187 | new = best[0].getTarget()
188 | if cur is None or is_different(cur, new):
189 | key = item.latest_revision.timestamp.strftime('%Y%m%d%H%M%S')
190 | ts_to_status[key] = new.getID()
191 |
192 | link = item.sitelinks[site]
193 | page = pywikibot.Page(link)
194 | created = page.oldest_revision.timestamp
195 | if created > datetime.strptime(last_change, '%Y%m%d%H%M%S'):
196 | continue
197 |
198 | per_edition = {}
199 | for ts, stat in ts_to_status.items(): # asc
200 | last_release_date = max(
201 | (date for date in editions.values() if date < ts),
202 |         default=''
203 | )
204 | for ed, date in editions.items():
205 | if last_release_date <= date:
206 | per_edition[ed] = stat
207 |
208 | links_to = [
209 | other.title(as_link=True)
210 | for other in page.linkedPages(
211 | namespaces=0,
212 | content=False,
213 | follow_redirects=True
214 | )
215 | if other in links
216 | ]
217 |
218 | i += 1
219 | ymd = f'{last_change[:4]}-{last_change[4:6]}-{last_change[6:8]}'
220 |
221 | lines.append('|-')
222 | lines.append(f'| {i}')
223 | lines.append(f'| {link.astext()}')
224 | lines.append(f'| [[d:{item.getID()}|{item.getID()}]]')
225 | lines.append(f'| data-sort-value="{last_change}" | {ymd}')
226 |     lines.append('| ' + ('<br>'.join(sorted(links_to))))
227 |
228 | last = '?'
229 | streak = 0
230 | for ed in editions: # asc
231 | stat = per_edition.get(ed, '?')
232 | if stat == last:
233 | streak += 1
234 | continue
235 |
236 | if streak > 1:
237 | lines.append(
238 | f'| colspan="{streak}" align="center" | {stat_to_label.get(last, last)}'
239 | )
240 | elif streak == 1:
241 | lines.append(f'| {stat_to_label.get(last, last)}')
242 |
243 | last = stat
244 | streak = 1
245 |
246 | if streak > 1:
247 | lines.append(
248 | f'| colspan="{streak}" align="center" | {stat_to_label.get(last, last)}'
249 | )
250 | elif streak == 1:
251 | lines.append(f'| {stat_to_label.get(last, last)}')
252 |
253 | lines.append('|}')
254 | lines.append('</div>')
255 |
256 | new_text = '\n'.join(lines)
257 |
258 | site.login()
259 |
260 | output_page = pywikibot.Page(site, 'Wikipedie:WikiProjekt_Biologie/Status_ohrožení/vše')
261 | code = parser.parse(output_page.text)
262 | for old in code.ifilter_tags(matches='div'):
263 | code.replace(old, new_text)
264 | output_page.text = str(code)
265 | break
266 | else:
267 | output_page.text = new_text
268 |
269 | output_page.save(
270 | summary='tabulka', apply_cosmetic_changes=False, bot=False, minor=False
271 | )
272 |
--------------------------------------------------------------------------------
/cleanup_dates.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from contextlib import suppress
3 | from datetime import datetime, timedelta
4 | from itertools import chain, combinations
5 |
6 | import pywikibot
7 |
8 | from pywikibot import Claim
9 | from pywikibot.exceptions import APIError
10 | from pywikibot.pagegenerators import (
11 | GeneratorFactory,
12 | PreloadingEntityGenerator,
13 | WikidataSPARQLPageGenerator,
14 | )
15 |
16 | from query_store import QueryStore
17 | from wikidata import WikidataEntityBot
18 |
19 |
20 | class DuplicateDatesBot(WikidataEntityBot):
21 |
22 | invalid_refs = {'P143', 'P813', 'P3452', 'P4656'}
23 | use_from_page = False
24 |
25 | def __init__(self, generator, **kwargs):
26 | self.available_options.update({
27 | 'days': 30,
28 | 'props': ['P569', 'P570', 'P2031', 'P2032'],
29 | })
30 | super().__init__(**kwargs)
31 | self.store = QueryStore()
32 | self._generator = generator or self.custom_generator()
33 |
34 | def custom_generator(self):
35 | for prop in self.opt['props']:
36 | for key in ('duplicate_dates', 'unmerged_dates'):
37 | time = datetime.now() - timedelta(days=self.opt['days'])
38 | query = self.store.build_query(
39 | key, prop=prop, date=time.isoformat(timespec='seconds'))
40 | yield from WikidataSPARQLPageGenerator(query, site=self.repo)
41 |
42 | @property
43 | def generator(self):
44 | return PreloadingEntityGenerator(self._generator)
45 |
46 | @property
47 | def summary(self):
48 | return ('remove redundant and less precise unsourced claim(s), '
49 | '[[Wikidata:Requests for permissions/Bot/MatSuBot 7|see RfPB]]')
50 |
51 | @staticmethod
52 | def first_inside_second(first, second):
53 | if first.precision > second.precision:
54 | if second.precision in {9, 10}:
55 | if first.year == second.year:
56 | if second.precision == 9:
57 | return True
58 | elif second.precision == 10:
59 | return first.month == second.month
60 | return False
61 |
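    # Example (WbTime precisions: 9 = year, 10 = month, 11 = day):
    # a day-precision date 1950-05-12 lies inside the year-precision
    # date 1950, so first_inside_second returns True; against the year
    # 1951 it returns False.
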
62 | @staticmethod
63 | def first_same_as_second(first, second):
64 | if first == second:
65 | return True
66 | if first.precision == second.precision:
67 | if first.precision in {9, 10} and first.year == second.year:
68 | if first.precision == 10:
69 | return first.month == second.month
70 | else:
71 | return True
72 | return False
73 |
74 | @classmethod
75 | def is_valid_source(cls, source):
76 | return bool(set(source) - cls.invalid_refs)
77 |
78 | @classmethod
79 | def number_of_sources(cls, claim):
80 | number = 0
81 | for source in claim.sources:
82 | number += cls.is_valid_source(source)
83 | return number
84 |
85 | @classmethod
86 | def is_sourced(cls, claim):
87 | return cls.number_of_sources(claim) > 0
88 |
89 | @classmethod
90 | def can_merge_claims(cls, claim1, claim2):
91 | if claim1.getSnakType() != claim2.getSnakType():
92 | return False
93 |
94 | if (
95 | claim1.getSnakType() == 'value'
96 | and not cls.first_same_as_second(
97 | claim1.getTarget(),
98 | claim2.getTarget()
99 | )
100 | ):
101 | return False
102 |
103 | if (
104 | claim1.qualifiers != claim2.qualifiers
105 | and not (
106 | claim1.rank != 'deprecated'
107 | and claim2.rank == 'normal'
108 | and not claim2.qualifiers
109 | and not cls.is_sourced(claim2)
110 | )
111 | and not (
112 | claim2.rank != 'deprecated'
113 | and claim1.rank == 'normal'
114 | and not claim1.qualifiers
115 | and not cls.is_sourced(claim1)
116 | )
117 | ):
118 | return False
119 |
120 | return True
121 |
122 | def treat_page_and_item(self, page, item):
123 | redundant = []
124 | unmerged = []
125 | for prop in self.opt['props']:
126 | claims = item.claims.get(prop, [])
127 | if len(claims) < 2:
128 | continue
129 |
130 | already = set()
131 | for claim1, claim2 in combinations(claims, 2):
132 | if claim1.snak in already or claim2.snak in already:
133 | continue
134 |
135 | if (claim1.rank, claim2.rank) in (
136 | ('preferred', 'deprecated'),
137 | ('deprecated', 'preferred'),
138 | ):
139 | # this would need manual intervention
140 | continue
141 |
142 | if self.can_merge_claims(claim1, claim2):
143 | # never remove preferred/deprecated claim
144 | # if either is normal
145 | if claim1.rank != claim2.rank:
146 | if claim1.rank == 'normal':
147 | claim1, claim2 = claim2, claim1
148 | elif claim2.qualifiers and not claim1.qualifiers:
149 | claim1, claim2 = claim2, claim1
150 | elif (
151 | self.number_of_sources(claim2) >
152 | self.number_of_sources(claim1)
153 | ):
154 | claim1, claim2 = claim2, claim1
155 |
156 | for source in claim2.sources:
157 | if not self.is_valid_source(source):
158 | continue
159 | sources_copy = [
160 | c.copy() for c in chain(*source.values())]
161 | with suppress(APIError): # duplicate reference present
162 | claim1.addSources(sources_copy)
163 |
164 | unmerged.append(claim2)
165 | already.add(claim2.snak)
166 | continue
167 |
168 | if not (claim1.getSnakType() == 'value' == claim2.getSnakType()):
169 | continue
170 |
171 | pairs = [(claim1, claim2), (claim2, claim1)]
172 | for first, second in pairs:
173 | if self.is_sourced(second):
174 | continue
175 | # never remove preferred/deprecated claim
176 | # if either is normal
177 | if first.rank != second.rank and second.rank != 'normal':
178 | continue
179 |
180 | if (
181 | first.qualifiers != second.qualifiers
182 | and not (
183 | first.rank == 'preferred'
184 | and second.rank == 'normal'
185 | and not second.qualifiers
186 | )
187 | ):
188 | continue
189 |
190 | if self.first_inside_second(
191 | first.getTarget(),
192 | second.getTarget()
193 | ):
194 | redundant.append(second)
195 | already.add(second.snak)
196 | break
197 |
198 | if redundant or unmerged:
199 | if redundant:
200 | summary = self.summary
201 | else:
202 | summary = 'remove redundant claim(s)'
203 | item.removeClaims(redundant + unmerged, summary=summary)
204 |
205 |
206 | def main(*args):
207 | options = {}
208 | local_args = pywikibot.handle_args(args)
209 | site = pywikibot.Site()
210 | genFactory = GeneratorFactory(site=site)
211 | for arg in genFactory.handle_args(local_args):
212 | if arg.startswith('-'):
213 | arg, sep, value = arg.partition(':')
214 | if arg == '-prop':
215 | options.setdefault('props', []).append(
216 | value or pywikibot.input('Which property should be treated?'))
217 | elif value:
218 | options[arg[1:]] = int(value) if value.isdigit() else value
219 | else:
220 | options[arg[1:]] = True
221 |
222 | generator = genFactory.getCombinedGenerator()
223 | bot = DuplicateDatesBot(generator=generator, site=site, **options)
224 | bot.run()
225 |
226 |
227 | if __name__ == '__main__':
228 | main()
229 |
--------------------------------------------------------------------------------
/typoloader.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 |
4 | import pywikibot
5 |
6 | from pywikibot import textlib
7 |
8 |
9 | class IncompleteTypoRuleException(Exception):
10 |
11 | '''Exception raised when constructing a typo rule from incomplete data'''
12 |
13 | def __init__(self, message):
14 | self.message = message
15 |
16 |
17 | class InvalidExpressionException(Exception):
18 |
19 | '''Exception raised when an expression has invalid syntax'''
20 |
21 | def __init__(self, error, aspect='regular expression'):
22 | self.message = error.msg
23 | self.aspect = aspect
24 |
25 |
26 | class TypoRule:
27 |
28 | '''Class representing one typo rule'''
29 |
30 | exceptions = [
31 | 'category', 'comment', 'header', 'hyperlink', 'interwiki', 'invoke',
32 | 'property', 'template',
33 |
34 | # tags
35 | 'blockquote', 'code', 'gallery', 'graph', 'imagemap', 'kbd',
36 | 'mapframe', 'maplink', 'math', 'nowiki', 'poem', 'pre', 'score',
37 | 'section', 'syntaxhighlight', 'timeline', 'tt', 'var',
38 |
39 | # "target-part" of a wikilink
40 | re.compile(r'\[\[([^][|]+)(\]\]\w*|([^][|]+\|)+)'),
41 |
42 |         re.compile('<[a-z]+ [^<>]+>|</[a-z]+>'),  # HTML tag
43 | re.compile(r'„[^\n"„“]+["“]|(?')
49 |     nowikiR = re.compile(r'<nowiki>(.*?)</nowiki>')
50 | def __init__(self, find, replacements, auto=False, query=None):
51 | self.find = find
52 | self.replacements = replacements
53 | self.auto = auto
54 | self.query = query
55 | self.longest = 0
56 |
57 | def __eq__(self, other):
58 | if isinstance(other, self.__class__):
59 | return self.id == other.id
60 | else:
61 | return False
62 |
63 | def __ne__(self, other):
64 | return not self.__eq__(other)
65 |
66 | def __repr__(self):
67 | return (
68 |             f'{self.__class__.__name__}({self.find!r}, {self.replacements!r}, '
69 | f'auto={self.auto!r}, query={self.query!r})'
70 | )
71 |
72 | def needs_decision(self):
73 | return not self.auto or len(self.replacements) > 1
74 |
75 | @classmethod
76 | def newFromParameters(cls, parameters):
77 | if '1' not in parameters:
78 | raise IncompleteTypoRuleException('Missing find expression')
79 |
80 | find = cls.nowikiR.sub('', parameters['1'])
81 | try:
82 | find = re.compile(find, re.M)
83 | except re.error as exc:
84 | raise InvalidExpressionException(exc)
85 |
86 | replacements = []
87 | for key in '23456':
88 | if key in parameters:
89 | replacement = re.sub(r'\$([1-9])', r'\\\1', cls.nowikiR.sub(
90 | '', parameters[key]))
91 | replacements.append(replacement)
92 |
93 | if not replacements:
94 | raise IncompleteTypoRuleException(
95 | f'No replacements found for rule "{find.pattern}"')
96 |
97 | query = None
98 | if parameters.get('hledat'):
99 | part = parameters['hledat'].replace('{{!}}', '|')
100 | if parameters.get('insource') == 'ne':
101 | query = part
102 | else:
103 | try:
104 | re.compile(part)
105 | query = f'insource:/{part}/'
106 | except re.error as exc:
107 | raise InvalidExpressionException(exc, 'query')
108 |
109 | auto = parameters.get('auto') == 'ano'
110 |
111 | return cls(find, replacements, auto, query)
112 |
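    # Replacement callback invoked for every match: asks the operator to pick
    # a replacement when the rule needs a decision, otherwise expands the first
    # replacement, and records an "old → new" fragment for the edit summary.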
113 | def summary_hook(self, match, replaced):
114 | def underscores(string):
115 | if string.startswith(' '):
116 | string = '_' + string[1:]
117 | if string.endswith(' '):
118 | string = string[:-1] + '_'
119 | return string
120 |
121 | new = old = match.group()
122 | if self.needs_decision():
123 | options = [('keep', 'k')]
124 | replacements = []
125 | for i, repl in enumerate(self.replacements, start=1):
126 | replacement = match.expand(repl)
127 | replacements.append(replacement)
128 | options.append((f'{i} {underscores(replacement)}', str(i)))
129 | text = match.string
130 | pre = text[max(0, match.start() - 30):match.start()].rpartition('\n')[2]
131 | post = text[match.end():match.end() + 30].partition('\n')[0]
132 |             pywikibot.info(f'{pre}<>{old}<>{post}')
133 | choice = pywikibot.input_choice('Choose the best replacement',
134 | options, automatic_quit=False,
135 | default='k')
136 | if choice != 'k':
137 | new = replacements[int(choice) - 1]
138 | else:
139 | new = match.expand(self.replacements[0])
140 | if old == new:
141 | pywikibot.warning(f'No replacement done in string "{old}"')
142 |
143 | if old != new:
144 | old_str = underscores(old.replace('\n', '\\n'))
145 | new_str = underscores(new.replace('\n', '\\n'))
146 | fragment = f'{old_str} → {new_str}'
147 | if fragment.lower() not in map(str.lower, replaced):
148 | replaced.append(fragment)
149 | return new
150 |
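    # Applies the rule to the given text, skipping the exception spans above;
    # keeps track of the slowest run and warns when a single rule takes more
    # than five seconds.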
151 | def apply(self, text, replaced=None):
152 | if replaced is None:
153 | replaced = []
154 | hook = lambda match: self.summary_hook(match, replaced)
155 |         start = time.perf_counter()
156 | text = textlib.replaceExcept(
157 | text, self.find, hook, self.exceptions, site=self.site)
158 |         finish = time.perf_counter()
159 | delta = finish - start
160 | self.longest = max(delta, self.longest)
161 | if delta > 5:
162 | pywikibot.warning(f'Slow typo rule "{self.find.pattern}" ({delta})')
163 | return text
164 |
165 |
166 | class TyposLoader:
167 |
168 |     '''Class loading and holding typo rules'''
169 | 
170 |     top_id = 0
171 |
172 | def __init__(self, site, *, allrules=False, typospage=None,
173 | whitelistpage=None):
174 | self.site = site
175 | self.load_all = allrules
176 | self.typos_page_name = typospage
177 | self.whitelist_page_name = whitelistpage
178 |
179 | def getWhitelistPage(self):
180 | if self.whitelist_page_name is None:
181 | self.whitelist_page_name = 'Wikipedie:WPCleaner/Typo/False'
182 |
183 | return pywikibot.Page(self.site, self.whitelist_page_name)
184 |
185 | def loadTypos(self):
186 | pywikibot.info('Loading typo rules...')
187 | self.typoRules = []
188 |
189 | if self.typos_page_name is None:
190 | self.typos_page_name = 'Wikipedie:WPCleaner/Typo'
191 | typos_page = pywikibot.Page(self.site, self.typos_page_name)
192 | if not typos_page.exists():
193 | # todo: feedback
194 | return
195 |
196 | text = textlib.removeDisabledParts(
197 | typos_page.text, include=['nowiki'], site=self.site)
198 | load_all = self.load_all is True
199 | for template, fielddict in textlib.extract_templates_and_params(
200 | text, remove_disabled_parts=False, strip=False):
201 | if template.lower() == 'typo':
202 | try:
203 | rule = TypoRule.newFromParameters(fielddict)
204 | except IncompleteTypoRuleException as exc:
205 | pywikibot.warning(exc.message) # pwb.exception?
206 | except InvalidExpressionException as exc:
207 | if 'fixed-width' not in exc.message:
208 | pywikibot.warning('Invalid {} {}: {}'.format(
209 | exc.aspect, fielddict['1'], exc.message))
210 | else:
211 |                     rule.id = self.top_id
212 |                     rule.site = self.site  # TypoRule.apply() passes this to replaceExcept()
213 |                     # fixme: cvar or ivar?
214 |                     self.top_id += 1
214 | if load_all or not rule.needs_decision():
215 | self.typoRules.append(rule)
216 |
217 | pywikibot.info(f'{len(self.typoRules)} typo rules loaded')
218 | return self.typoRules
219 |
220 | def loadWhitelist(self):
221 | self.whitelist = []
222 | self.fp_page = self.getWhitelistPage()
223 | if self.fp_page.exists():
224 | for match in re.finditer(r'\[\[([^]|]+)\]\]', self.fp_page.text):
225 | self.whitelist.append(match[1].strip())
226 | return self.whitelist
227 |
--------------------------------------------------------------------------------
/clean_dupes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from queue import Queue
3 | from threading import Lock, Thread
4 |
5 | import pywikibot
6 |
7 | from pywikibot.exceptions import NoPageError
8 | from pywikibot.pagegenerators import (
9 | GeneratorFactory,
10 | PreloadingEntityGenerator,
11 | WikidataSPARQLPageGenerator,
12 | )
13 |
14 | from merger import Merger
15 | from query_store import QueryStore
16 | from wikidata import WikidataEntityBot
17 | from scripts.revertbot import BaseRevertBot
18 |
19 |
20 | class DupesMergingBot(WikidataEntityBot):
21 |
22 | dupe_items = {'Q1263068', 'Q17362920', 'Q21528878'}
23 | use_from_page = False
24 |
25 | def __init__(self, generator, offset=0, **kwargs):
26 | self.available_options.update({
27 | 'threads': 1, # unstable
28 | })
29 | super().__init__(**kwargs)
30 | self.offset = offset
31 | self.store = QueryStore()
32 | self._generator = generator or self.custom_generator()
33 | self.save_lock = Lock()
34 | self.access_lock = Lock()
35 | self.site_locks = {}
36 |
37 | @property
38 | def generator(self):
39 | return PreloadingEntityGenerator(self._generator)
40 |
41 | def custom_generator(self):
42 | query = self.store.build_query(
43 | 'dupes', dupe=' wd:'.join(self.dupe_items), offset=self.offset)
44 | return WikidataSPARQLPageGenerator(query, site=self.repo,
45 | result_type=list)
46 |
47 | def setup(self):
48 | super().setup()
49 | count = self.opt['threads']
50 | self.workers = []
51 | if count > 1:
52 | self.queue = Queue(count)
53 | for i in range(count):
54 | thread = Thread(target=self.work)
55 | thread.start()
56 | self.workers.append(thread)
57 |
58 | def get_lock_for(self, site):
59 | with self.access_lock:
60 | return self.site_locks.setdefault(site, Lock())
61 |
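    # Worker loop for the optional thread pool; None is the sentinel pushed by
    # teardown() to stop a worker.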
62 | def work(self):
63 | while True:
64 | item = self.queue.get()
65 | if item is None:
66 | break
67 | self.process_item(item)
68 | self.queue.task_done()
69 |
70 | def init_page(self, item):
71 | self.offset += 1
72 | return super().init_page(item)
73 |
74 | def skip_page(self, item):
75 | return 'P31' not in item.claims or super().skip_page(item)
76 |
77 | def treat_page_and_item(self, page, item):
78 | if self.opt['threads'] > 1:
79 | self.queue.put(item)
80 | else:
81 | self.process_item(item)
82 |
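    # Core merge routine: collect "duplicate of" targets from P31 qualifiers
    # and P460 claims (falling back to redirect targets of the item's
    # sitelinks), bail out on zero or multiple targets, drop conflicting
    # sitelinks and claims on both items, and finally merge, reverting both
    # entities if the merge fails.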
83 | def process_item(self, item):
84 | claims = []
85 | targets = set()
86 | for claim in item.claims['P31']:
87 | if claim.snaktype != 'value':
88 | continue
89 | if claim.target.id not in self.dupe_items:
90 | continue
91 | claims.append(claim)
92 | for snak in claim.qualifiers.get('P460', []):
93 | if snak.snaktype == 'value':
94 | targets.add(snak.getTarget())
95 |
96 | for claim in item.claims.get('P460', []):
97 | if claim.snaktype == 'value':
98 | claims.append(claim)
99 | targets.add(claim.getTarget())
100 |
101 | sitelinks = []
102 | if not targets:
103 | for page in item.iterlinks():
104 | site = page.site
105 | with self.get_lock_for(site):
106 | if not page.exists():
107 | sitelinks.append(site)
108 | continue
109 | if page.isRedirectPage():
110 | try:
111 | target = page.getRedirectTarget().data_item()
112 | except NoPageError:
113 | pass
114 | else:
115 | targets.add(target)
116 |
117 | if not targets:
118 | pywikibot.info('No target found')
119 | return
120 |
121 | target = targets.pop()
122 | if targets:
123 | pywikibot.info('Multiple targets found')
124 | return
125 |
126 | while target.isRedirectPage():
127 | pywikibot.warning(f'Target {target.getID()} is redirect')
128 | target = target.getRedirectTarget()
129 |
130 | if item == target:
131 | self._save_page(item, self._save_entity, item.removeClaims, claims)
132 | return
133 |
134 | target_sitelinks = []
135 | for dbname in item.sitelinks:
136 | if dbname not in target.sitelinks:
137 | continue
138 |
139 | link = item.sitelinks[dbname]
140 | site = link.site
141 | with self.get_lock_for(site):
142 | page = pywikibot.Page(link)
143 | if not page.exists():
144 | sitelinks.append(site)
145 | continue
146 |
147 | target_link = target.sitelinks[dbname]
148 | target_page = pywikibot.Page(target_link)
149 | if not target_page.exists():
150 | target_sitelinks.append(site)
151 | continue
152 |
153 | if self.redirectsTo(page, target_page):
154 | if link.badges:
155 | sitelinks.append(site)
156 | continue
157 |
158 | if self.redirectsTo(target_page, page):
159 | if target_link.badges:
160 | target_sitelinks.append(site)
161 | continue
162 |
163 | pywikibot.info(f'Target has a conflicting sitelink: {dbname}')
164 | return
165 |
166 | target_claims = []
167 | for claim in target.claims.get('P460', []):
168 | if claim.snaktype != 'value':
169 | continue
170 | if claim.target_equals(item):
171 | target_claims.append(claim)
172 |
173 | for claim in target.claims.get('P31', []):
174 | if claim.snaktype != 'value':
175 | continue
176 | if claim.target.id not in self.dupe_items:
177 | continue
178 | for snak in claim.qualifiers.get('P460', []):
179 | if snak.snaktype == 'value' and snak.target_equals(item):
180 | target_claims.append(claim)
181 |
182 | if sitelinks:
183 | self._save_page(
184 | item, self._save_entity, item.removeSitelinks, sitelinks,
185 | summary='removing sitelink(s) to non-existing / redirected page(s)')
186 | if claims:
187 | self._save_page(item, self._save_entity, item.removeClaims, claims)
188 | if target_sitelinks:
189 | self._save_page(
190 | target, self._save_entity, target.removeSitelinks, target_sitelinks,
191 | summary='removing sitelink(s) to non-existing / redirected page(s)')
192 | if target_claims:
193 | self._save_page(
194 | target, self._save_entity, target.removeClaims, target_claims)
195 |
196 | target, item = Merger.sort_for_merge(
197 | [item, target], key=['sitelinks', 'claims', 'id'])
198 |
199 | if not self._save_page(
200 | item, self._save_entity, Merger.clean_merge, item, target,
201 | ignore_conflicts=['description']):
202 | pywikibot.info('Reverting changes...')
203 | bot = BaseRevertBot(self.site) # todo: integrate to Merger
204 | comment = 'Error occurred when attempting to merge with %s'
205 | bot.comment = comment % target.title(as_link=True)
206 | bot.revert({'title': item.title()})
207 | bot.comment = comment % item.title(as_link=True)
208 | bot.revert({'title': target.title()})
209 | return
210 |
211 | self.offset -= 1
212 |
213 | def redirectsTo(self, page, target):
214 | return page.isRedirectPage() and page.getRedirectTarget() == target
215 |
216 | def _save_entity(self, callback, *args, **kwargs):
217 | with self.save_lock:
218 | if 'asynchronous' in kwargs:
219 | kwargs.pop('asynchronous')
220 | return callback(*args, **kwargs)
221 |
222 | def teardown(self):
223 | count = len(self.workers)
224 | for i in range(count):
225 | self.queue.put(None)
226 | for worker in self.workers:
227 | worker.join()
228 | super().teardown()
229 |
230 | def exit(self):
231 | super().exit()
232 | bound = self.offset - self.offset % 50
233 | pywikibot.info(f'\nCurrent offset: {self.offset} (use {bound})\n')
234 |
235 |
236 | def main(*args):
237 | options = {}
238 | local_args = pywikibot.handle_args(args)
239 | site = pywikibot.Site()
240 | genFactory = GeneratorFactory(site=site)
241 | for arg in genFactory.handle_args(local_args):
242 | if arg.startswith('-'):
243 | arg, sep, value = arg.partition(':')
244 | if value != '':
245 | options[arg[1:]] = value if not value.isdigit() else int(value)
246 | else:
247 | options[arg[1:]] = True
248 |
249 | generator = genFactory.getCombinedGenerator()
250 | bot = DupesMergingBot(generator=generator, site=site, **options)
251 | bot.run()
252 |
253 |
254 | if __name__ == '__main__':
255 | main()
256 |
--------------------------------------------------------------------------------
/connect.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import datetime
3 |
4 | import pywikibot
5 |
6 | from pywikibot import pagegenerators, textlib
7 | from pywikibot.exceptions import APIError, NoPageError
8 | from pywikibot.tools import first_lower
9 |
10 | pywikibot.handle_args()
11 |
12 | start = datetime.datetime.now()
13 |
14 | do_only = []
15 | dont_do = []
16 |
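# Maps "lang|family" -> local sister-project template -> parameter name ->
# target project. A plain string value names the target family; a dict may
# also override the language, force a title pattern and restrict the
# namespaces in which the parameter is honoured.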
17 | tp_map = {
18 | 'cs|wikipedia': {
19 | 'commons': {
20 | '1': {
21 | 'lang': 'commons',
22 | 'family': 'commons'
23 | },
24 | },
25 | 'commonscat': {
26 | '1': {
27 | 'lang': 'commons',
28 | 'family': 'commons',
29 | 'pattern': 'Category:%s',
30 | 'namespaces': [14]
31 | },
32 | },
33 | 'wikicitáty': {
34 | 'dílo': {
35 | 'family': 'wikiquote',
36 | 'pattern': 'Dílo:%s'
37 | },
38 | 'kategorie': {
39 | 'family': 'wikiquote',
40 | 'pattern': 'Kategorie:%s'
41 | },
42 | 'osoba': 'wikiquote',
43 | 'téma': 'wikiquote'
44 | },
45 | 'wikizdroje': {
46 | 'dílo': 'wikisource',
47 | 'autor': {
48 | 'family': 'wikisource',
49 | 'pattern': 'Autor:%s'
50 | },
51 | 'kategorie': {
52 |                 'family': 'wikisource',
53 | 'pattern': 'Kategorie:%s'
54 | },
55 | },
56 | 'wikidruhy': {
57 | 'taxon': {
58 | 'family': 'species',
59 | 'lang': 'species',
60 | },
61 | },
62 | },
63 | 'cs|wikiquote': {
64 | 'commons': {
65 | 'galerie': {
66 | 'lang': 'commons',
67 | 'family': 'commons'
68 | },
69 | 'kategorie': {
70 | 'lang': 'commons',
71 | 'family': 'commons',
72 | 'pattern': 'Category:%s',
73 | 'namespaces': [14]
74 | },
75 | },
76 | 'wikipedie': {
77 | 'článek': 'wikipedia'
78 | },
79 | },
80 | 'cs|wikisource': {
81 | 'commons': {
82 | 'galerie': {
83 | 'lang': 'commons',
84 | 'family': 'commons'
85 | },
86 | 'kategorie': {
87 | 'lang': 'commons',
88 | 'family': 'commons',
89 | 'pattern': 'Category:%s',
90 | 'namespaces': [14]
91 | },
92 | },
93 | 'autorinfo': {
94 | 'BiografieWiki': 'wikipedia',
95 | 'WikiquoteCS': 'wikiquote'
96 | },
97 | },
98 | 'de|wikiquote': {
99 | 'wikipedia': {
100 | '1': 'wikipedia'
101 | },
102 | },
103 | 'es|wikiquote': {
104 | 'wikipedia': {
105 | '1': 'wikipedia'
106 | },
107 | },
108 | 'fi|wikiquote': {
109 | 'wikipedia': {
110 | '1': 'wikipedia'
111 | },
112 | },
113 | 'fr|wikiquote': {
114 | 'autres projets': {
115 | 'w': 'wikipedia',
116 | 's': 'wikisource',
117 | 'species': {
118 | 'family': 'species',
119 | 'lang': 'species'
120 | },
121 | 'wikispecies': {
122 | 'family': 'species',
123 | 'lang': 'species'
124 | },
125 | 'commons': {
126 | 'lang': 'commons',
127 | 'family': 'commons'
128 | },
129 | '1': {
130 | 'lang': 'commons',
131 | 'family': 'commons'
132 | },
133 | },
134 |         'wikipedia': {
135 |             '1': 'wikipedia'
136 |         },
137 |     },
140 | 'id|wikiquote': {
141 | 'wikipedia': {
142 | '1': 'wikipedia'
143 | },
144 | },
145 | 'pl|wikiquote': {
146 | 'commons': {
147 | '1': {
148 | 'lang': 'commons',
149 | 'family': 'commons'
150 | }
151 | },
152 | 'wikinews': {str(i): 'wikinews' for i in range(1, 10)},
153 | 'wikipediakat': {
154 | '1': {
155 | 'lang': 'pl',
156 | 'family': 'wikipedia',
157 | 'pattern': 'Category:%s',
158 | 'namespaces': [14],
159 | },
160 | },
161 | 'wikisource': {}, # todo
162 | },
163 | 'pt|wikiquote': {
164 | 'autor': {
165 | 'Wikinoticias': 'wikinews',
166 | 'Wikipedia': 'wikipedia',
167 | 'Wikisource': 'wikisource'
168 | },
169 | 'wikipédia': {
170 | '1': 'wikipedia'
171 | },
172 | 'wikisource': {
173 | '1': 'wikisource'
174 | },
175 | },
176 | 'ru|wikiquote': {
177 | 'википедия': {
178 | '1': 'wikipedia'
179 | },
180 | 'wikipedia': {
181 | '1': 'wikipedia'
182 | },
183 | 'навигация': {
184 | 'Википедия': 'wikipedia',
185 | 'Викитека': 'wikisource',
186 | 'Викивиды': {
187 | 'family': 'species',
188 | 'lang': 'species'
189 | },
190 | 'Викисклад': {
191 | 'lang': 'commons',
192 | 'family': 'commons'
193 | },
194 | 'Викигид': 'wikivoyage',
195 | },
196 | },
197 | 'sk|wikiquote': {
198 | 'wikipedia': {
199 | '1': 'wikipedia'
200 | },
201 | },
202 | 'sv|wikiquote': {
203 | 'wikipedia': {
204 | '1': 'wikipedia'
205 | },
206 | },
207 | }
208 |
209 | for project in tp_map.keys():
210 | lang, family = project.split('|', 1)
211 | if len(do_only) > 0 and lang + family not in do_only and family not in do_only:
212 | continue
213 | if lang + family in dont_do or family in dont_do:
214 | continue
215 |
216 | site = pywikibot.Site(lang, family)
217 | pywikibot.info(f'Doing {lang}{family}')
218 | site.login()
219 |
220 | genFactory = pagegenerators.GeneratorFactory(site=site)
221 | for ns in (0, 14, 100):
222 | if family != 'wikisource' and ns == 100: # fixme: cswikiquote
223 | continue
224 | if family == 'wikisource' and ns == 0:
225 | continue
226 | genFactory.handle_arg(f'-ns:{ns}')
227 | genFactory.handle_arg('-unconnectedpages')
228 | generator = genFactory.getCombinedGenerator(preload=True)
229 |
230 | for page in generator:
231 | if page.namespace() != 14 and page.isDisambig():
232 | continue
233 |
234 | for template, fields in textlib.extract_templates_and_params(page.text):
235 | if first_lower(template) not in tp_map[project]:
236 | continue
237 |
238 | params = tp_map[project][first_lower(template)]
239 | for key in fields:
240 | if key not in params:
241 | continue
242 |
243 | title = fields[key].strip()
244 | if not title:
245 | continue
246 |
247 | target_lang = lang
248 | target_family = family
249 | if isinstance(params[key], dict):
250 | if params[key].get('namespaces', []) and page.namespace() not in params[key]['namespaces']:
251 | continue
252 | if 'pattern' in params[key].keys():
253 | title = params[key]['pattern'] % title
254 | if 'family' in params[key].keys():
255 | target_family = params[key]['family']
256 | if 'lang' in params[key].keys():
257 | target_lang = params[key]['lang']
258 | else:
259 | target_family = params[key]
260 |
261 | target_site = pywikibot.Site(target_lang, target_family)
262 | if '{{' in title:
263 | title = site.expand_text(title, page.title())
264 | target_page = pywikibot.Page(target_site, title)
265 | if not target_page.exists():
266 |                     pywikibot.info(f"{target_page} doesn't exist")
267 | continue
268 | while target_page.isRedirectPage():
269 | target_page = target_page.getRedirectTarget()
270 | if target_page.isDisambig():
271 | pywikibot.info(f'{target_page} is a disambiguation')
272 | continue
273 |
274 | try:
275 | item = target_page.data_item()
276 | except NoPageError:
277 | repo = site.data_repository()
278 | # fixme: unused return value
279 | data = repo.linkTitles(page, target_page)
280 | pywikibot.info('Item created')
281 | pywikibot.info(data) # todo
282 | break
283 | if site.dbName() in item.sitelinks:
284 | pywikibot.info(page)
285 | pywikibot.info('%s already has sitelink to %s%s' % (
286 | item, lang, family))
287 | continue
288 |
289 | try:
290 | item.setSitelink(
291 | page, summary='Adding sitelink %s' % page.title(
292 |                             as_link=True, insite=item.site))
293 | except APIError:
294 | pass
295 | else:
296 | page.purge()
297 | break
298 |
299 | end = datetime.datetime.now()
300 |
301 | pywikibot.info('Complete! Took %d seconds' % (end - start).total_seconds())
302 |
--------------------------------------------------------------------------------
/manage_duos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import pywikibot
3 |
4 | from pywikibot import pagegenerators
5 | from pywikibot.data.sparql import SparqlQuery
6 |
7 | from query_store import QueryStore
8 | from wikidata import WikidataEntityBot
9 |
10 |
11 | class DuosManagingBot(WikidataEntityBot):
12 |
13 | conj = {
14 | 'af': ' en ',
15 | 'az': ' və ',
16 | 'be': ' і ',
17 | 'be-tarask': ' і ',
18 | 'bg': ' и ',
19 | 'br': ' ha ',
20 | 'ca': ' i ',
21 | 'cs': ' a ',
22 | 'cy': ' a ',
23 | 'da': ' og ',
24 | 'de': ' und ',
25 | 'de-at': ' und ',
26 | 'el': ' και ',
27 | 'eo': ' kaj ',
28 | 'es': ' y ',
29 | 'et': ' ja ',
30 | 'eu': ' eta ',
31 | 'fi': ' ja ',
32 | 'fr': ' et ',
33 | 'fy': ' en ',
34 | 'gl': ' e ',
35 | 'hr': ' i ',
36 | 'hu': ' és ',
37 | 'id': ' dan ',
38 | 'it': ' e ',
39 | 'ka': ' და ',
40 | 'la': ' et ',
41 | 'lt': ' ir ',
42 | 'lv': ' un ',
43 | 'ms': ' dan ',
44 | 'nb': ' og ',
45 | 'nl': ' en ',
46 | 'nn': ' og ',
47 | 'oc': ' e ',
48 | 'pl': ' i ',
49 | 'pt': ' e ',
50 | 'ro': ' și ',
51 | 'ru': ' и ',
52 | 'sk': ' a ',
53 | 'sl': ' in ',
54 | 'sr': ' и ',
55 | 'sr-ec': ' и ',
56 | 'sr-el': ' i ',
57 | 'sv': ' och ',
58 | 'sw': ' na ',
59 | 'tr': ' ve ',
60 | 'uk': ' і ',
61 | 'vi': ' và ',
62 | 'war': ' ngan ',
63 | }
64 | distribute_properties = [
65 | 'P22', 'P25', 'P27', 'P40', 'P53', 'P106', 'P1412',
66 | ]
67 | class_to_relation = [
68 | ('Q132776479', 'twin-sisters'),
69 | ('Q132776456', 'twin-brothers'),
70 | ('Q14756018', 'twin'),
71 | ('Q14073567', 'sibling'),
72 | ('Q3046146', 'spouse'),
73 | ('Q106925878', 'father-son'),
74 | ('Q1313923', 'relative'),
75 | # TODO: ('Q1141470', 'comedians'), not a "relation by blood"
76 | ]
77 | relation_map = {
78 | #'comedians': 'P1327',
79 | #'father-son': '', we don't know who is who
80 | # TODO: 'partner': 'P451',
81 | 'relative': 'P1038',
82 | 'sibling': 'P3373',
83 | 'spouse': 'P26',
84 | 'twin': 'P3373/P1039/Q131440579',
85 | 'twin-brothers': 'P3373/P1039/Q108714555',
86 | 'twin-sisters': 'P3373/P1039/Q108714611',
87 | }
88 | use_from_page = False
89 |
90 | def __init__(self, generator, **kwargs):
91 | self.available_options.update({
92 | 'always': True,
93 | 'class': 'Q10648343',
94 | 'min_labels': 1,
95 | })
96 | super().__init__(**kwargs)
97 | self.store = QueryStore()
98 | self.sparql = SparqlQuery(repo=self.repo)
99 | self._generator = generator or self.custom_generator()
100 |
101 | def skip_page(self, item):
102 | if super().skip_page(item):
103 | return True
104 | if 'P31' not in item.claims:
105 | pywikibot.info(f'{item} is missing P31 property')
106 | return True
107 | if 'P527' in item.claims:
108 | pywikibot.info(f'{item} already has P527 property')
109 | return True
110 | return False
111 |
112 | def custom_generator(self):
113 | kwargs = {'class': self.opt['class']}
114 | query = self.store.build_query('duos', **kwargs)
115 | return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo)
116 |
117 | @property
118 | def generator(self):
119 | return pagegenerators.PreloadingEntityGenerator(self._generator)
120 |
121 | def get_relation(self, item):
122 | ask_pattern = 'ASK { wd:%s wdt:P31/wdt:P279* wd:%%s }' % item.id
123 | for key, rel in self.class_to_relation:
124 | if self.sparql.ask(ask_pattern % key):
125 | return rel
126 | return None
127 |
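    # Splits every label of the duo item on a language-specific conjunction
    # (falling back to " and " / " & ") into labels for the two individual
    # items; when the two are related, a shared surname is copied onto the
    # first name if it is missing there.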
128 | def get_labels(self, item, relation):
129 | labels = [{}, {}]
130 | for lang, value in item.labels.items():
131 | delim = []
132 | if lang in self.conj:
133 | delim.append(self.conj[lang])
134 | delim.append(' and ')
135 | delim.append(' & ')
136 | for conj in delim:
137 | label = value.partition(' (')[0]
138 | if ', ' in label:
139 | continue
140 | split = label.split(conj)
141 | if len(split) != 2:
142 | continue
143 | split0 = split[0].split()
144 | split1 = split[1].split()
145 | if split1[0].islower():
146 | continue
147 | # TODO: if len(split1) > 1 and split1[0][-1] == '.':
148 | if len(split1) > len(split0):
149 | if len(split1) > 2 and split1[-2].islower():
150 | split1[-2:] = [' '.join(split1[-2:])]
151 | if len(split1) - len(split0) == 1:
152 | # if items are in a relation, then
153 | # they probably share their surname
154 | if relation:
155 | split[0] += ' %s' % split1[-1]
156 | split0.append(split1[-1])
157 | if len(split0) > 1 or len(split1) == 1:
158 | labels[0][lang] = split[0]
159 | labels[1][lang] = split[1]
160 | break
161 |
162 | return labels
163 |
164 | def treat_page_and_item(self, page, item):
165 | relation = self.get_relation(item)
166 | labels = self.get_labels(item, relation)
167 | count = max(map(len, labels))
168 | if count == 0:
169 | pywikibot.info('No labels, skipping...')
170 | return
171 |
172 | if count < self.opt['min_labels']:
173 | pywikibot.info(f'Too few labels ({count}), skipping...')
174 | return
175 |
176 | to_add = []
177 | to_remove = []
178 | if relation and relation.startswith('twin'):
179 | distribute = self.distribute_properties + ['P569', 'P19']
180 | if relation.startswith('twin-'):
181 | distribute.append('P21')
182 | else:
183 | distribute = self.distribute_properties
184 |
185 | for prop in distribute:
186 | for claim in item.claims.get(prop, []):
187 | if claim.getTarget():
188 | to_remove.append(claim)
189 | json = claim.toJSON()
190 | json.pop('id')
191 | to_add.append(json)
192 |
193 | items = [self.create_item(item, data, relation, to_add)
194 | for data in labels]
195 | if self.relation_map.get(relation):
196 | recipe = self.relation_map[relation].split('/')
197 | if len(recipe) == 3:
198 | prop, qprop, qval = recipe
199 | else:
200 | prop, qprop, qval = recipe[0], None, None
201 | for it, target in zip(items, reversed(items)):
202 | claim = pywikibot.Claim(self.repo, prop)
203 | claim.setTarget(target)
204 | if qprop:
205 | qualifier = pywikibot.Claim(self.repo, qprop, is_qualifier=True)
206 | qualifier.setTarget(pywikibot.ItemPage(self.repo, qval))
207 | claim.addQualifier(qualifier)
208 | source = pywikibot.Claim(self.repo, 'P3452', is_reference=True)
209 | source.setTarget(item)
210 | claim.addSource(source)
211 | self.user_add_claim(it, claim, asynchronous=False)
212 |
213 | for it in items:
214 | claim = pywikibot.Claim(self.repo, 'P527')
215 | claim.setTarget(it)
216 | self.user_add_claim(item, claim, asynchronous=False)
217 |
218 | for claim in to_remove:
219 | pywikibot.info(f'Removing {claim.id} --> {claim.getTarget()}')
220 | json = claim.toJSON()
221 | json['remove'] = ''
222 | self.user_edit_entity(
223 | item,
224 | {'claims': [json]},
225 | asynchronous=False,
226 | summary='moved [[Property:{}]] to {} & {}'.format(
227 | claim.id,
228 | items[0].title(as_link=True, insite=self.repo),
229 | items[1].title(as_link=True, insite=self.repo)
230 | )
231 | )
232 |
233 | def create_item(self, item, labels, relation, to_add):
234 | instance_of = pywikibot.Claim(self.repo, 'P31')
235 | instance_of.setTarget(pywikibot.ItemPage(self.repo, 'Q5'))
236 | part_of = pywikibot.Claim(self.repo, 'P361')
237 | part_of.setTarget(item)
238 |
239 | pywikibot.info(f'Creating item (relation "{relation}")...')
240 | new_item = pywikibot.ItemPage(self.repo)
241 | self.user_edit_entity(
242 | new_item,
243 | {
244 | 'labels': labels,
245 | 'claims': [instance_of.toJSON(), part_of.toJSON()] + to_add,
246 | },
247 | asynchronous=False,
248 | summary='based on data in {}'.format(
249 | item.title(as_link=True, insite=self.repo)
250 | )
251 | )
252 |
253 | return new_item
254 |
255 |
256 | def main(*args):
257 | options = {}
258 | local_args = pywikibot.handle_args(args)
259 | site = pywikibot.Site()
260 | genFactory = pagegenerators.GeneratorFactory(site=site)
261 | for arg in genFactory.handle_args(local_args):
262 | if arg.startswith('-'):
263 | arg, sep, value = arg.partition(':')
264 | if value != '':
265 | options[arg[1:]] = value if not value.isdigit() else int(value)
266 | else:
267 | options[arg[1:]] = True
268 |
269 | generator = genFactory.getCombinedGenerator()
270 | bot = DuosManagingBot(generator=generator, site=site, **options)
271 | bot.run()
272 |
273 |
274 | if __name__ == '__main__':
275 | main()
276 |
--------------------------------------------------------------------------------
/checkwiki.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import re
3 | 
4 | import pywikibot
5 | import requests
5 |
6 | from pywikibot import pagegenerators
7 | from pywikibot.exceptions import UnknownExtension
8 |
9 | from checkwiki_errors import *
10 | from wikitext import WikitextFixingBot
11 |
12 |
13 | class CheckWikiSettings:
14 |
15 | prio_map = {
16 | '0': '',
17 | '1': 'high',
18 | '2': 'medium',
19 | '3': 'low'
20 | }
21 |
22 | def __init__(self, data):
23 | self.data = data
24 |
25 | def get_priority(self, error):
26 | return self.data[error]['priority']
27 |
28 | def get_errors_by_priority(self, priority):
29 | for error, data in self.data.items():
30 | if data['priority'] == priority:
31 | yield error
32 |
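    # Parses the on-wiki settings page: each "name = value ... END" block is
    # collected, and keys of the form error_<number>_prio_<project> or
    # error_<number>_whitelistpage_<project> feed the per-error priority and
    # whitelist data.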
33 | @classmethod
34 | def new_from_text(cls, text, dbName):
35 | data = {}
36 | inside_setting = False
37 | setting = None
38 | setting_text = ''
39 | parsed_settings = {}
40 | for line in text.splitlines():
41 | if inside_setting is False:
42 | match = re.match(' *([a-z0-9_]+) *=', line)
43 | if match is not None:
44 | setting = match[1]
45 | setting_text = ''
46 | inside_setting = True
47 | line = line[match.end():]
48 |
49 | if inside_setting is True:
50 | if 'END' in line:
51 | setting_text += line[:line.index('END')].strip()
52 | inside_setting = False
53 | parsed_settings[setting] = setting_text
54 | else:
55 | setting_text += line.strip() + '\n'
56 |
57 | project = parsed_settings.pop('project', dbName)
58 | for setting, text in parsed_settings.items():
59 | split = setting.split('_')
60 | if len(split) != 4:
61 | continue
62 | if split[0] != 'error':
63 | continue
64 | if split[-1] != project:
65 | continue
66 | if not split[1].isdigit():
67 | continue
68 | num = int(split[1])
69 | if num > 500:
70 | continue
71 | data.setdefault(num, {})
72 | if split[2] == 'prio':
73 | text = text.strip()
74 | if text in cls.prio_map.keys():
75 | data[num]['priority'] = cls.prio_map[text]
76 | elif split[2] == 'whitelistpage':
77 | data[num].setdefault('whitelists', []).append(text)
78 | return cls(data)
79 |
80 | @classmethod
81 | def new_from_site(cls, site):
82 | try:
83 | page = site.page_from_repository('Q10784379')
84 | except (NotImplementedError, UnknownExtension) as e:
85 | pywikibot.error(e)
86 | return None
87 | return cls.new_from_text(page.text, site.dbName())
88 |
89 |
90 | class CheckWikiErrorGenerator:
91 |
92 | def __init__(self, checkwiki, priorities=None, ids=None):
93 | self.checkwiki = checkwiki
94 | self.priorities = priorities or []
95 | self.ids = ids or []
96 |
97 | def __iter__(self):
98 | for error in self.ids:
99 | yield from self.checkwiki.iter_pages(error)
100 | already = set(self.ids)
101 | for prio in self.priorities:
102 | for error in self.checkwiki.settings.get_errors_by_priority(prio):
103 | if error not in already:
104 | yield from self.checkwiki.iter_pages(error)
105 |
106 |
107 | class CheckWiki:
108 |
109 | url = 'https://tools.wmflabs.org/checkwiki/cgi-bin/checkwiki_bots.cgi'
110 |
111 | errorMap = {
112 | 1: PrefixedTemplate,
113 | 2: BrokenHTMLTag,
114 | 7: LowHeadersLevel,
115 | 8: MissingEquation,
116 | 9: SingleLineCategories,
117 | #10: NoEndSquareBrackets,
118 | 11: HTMLEntity,
119 | 16: InvisibleChars,
120 | 17: DuplicateCategory,
121 | 18: LowerCaseCategory,
122 | 19: SingleEquationHeader,
123 | 20: Dagger,
124 | 21: EnglishCategory,
125 | 22: CategoryWithSpace,
126 | 25: HeaderHierarchy,
127 | 26: Bold,
128 | #27: Unicode,
129 | 32: MultiplePipes,
130 | 34: MagicWords,
131 | 38: Italics,
132 | 42: StrikedText,
133 | 44: BoldHeader,
134 | 48: SelfLink,
135 | 49: HTMLHeader,
136 | 50: EntitesAsDashes,
137 | 51: InterwikiBeforeHeader,
138 | 52: CategoriesBeforeHeader,
139 | 53: InterwikiBeforeCategory,
140 | 54: ListWithBreak,
141 | 57: HeaderWithColon,
142 | 59: ParameterWithBreak,
143 | 61: RefBeforePunctuation,
144 | 63: SmallInsideTags,
145 | #75: BadListStructure,
146 | #76: NoSpace,
147 | 80: BrokenExternalLink,
148 | 81: DuplicateReferences,
149 | 85: EmptyTag,
150 | 86: ExternalLinkLikeInternal,
151 | 88: DefaultsortSpace,
152 | 89: DefaultsortComma,
153 | 93: DoubleHttp,
154 | 101: Ordinals,
155 | 103: SuperfluousPipe,
156 | 104: ReferenceQuotes,
157 | }
158 |
159 | def __init__(self, site):
160 | self.site = site
161 |
162 | def purge(self):
163 | self.__cache = {}
164 |
165 | @property
166 | def site(self):
167 | return self._site
168 |
169 | @site.setter
170 | def site(self, value):
171 | self._site = value
172 | self.purge()
173 | self.load_settings()
174 |
175 | def load_settings(self):
176 | pywikibot.info('Loading CheckWiki settings...')
177 | self._settings = CheckWikiSettings.new_from_site(self.site)
178 |
179 | @property
180 | def settings(self):
181 | if not hasattr(self, '_settings'):
182 | self.load_settings()
183 | return self._settings
184 |
185 | def get_error(self, number):
186 | return self.__cache.setdefault(number, self.errorMap[number](self))
187 |
188 | def iter_errors(self, numbers=None, only_for_fixes=False, priorities=None):
189 | for num in self.errorMap:
190 | if numbers and num not in numbers:
191 | continue
192 | if priorities and self.settings.get_priority(num) not in priorities:
193 | continue
194 |
195 | error = self.get_error(num)
196 | if only_for_fixes and not error.isForFixes():
197 | continue
198 |
199 | yield error
200 |
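    # Applies the requested error fixes to the text in sequence; an error that
    # declares prerequisites in needsFirst is deferred until those have run.
    # Numbers of the errors that changed the text are appended to `fixed` and
    # their summaries to `replaced`.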
201 |     def apply(self, text, page, replaced=None, fixed=None, errors=None,
202 |               **kwargs):
203 |         # todo: use a graph algorithm
204 |         replaced = [] if replaced is None else replaced
205 |         fixed = [] if fixed is None else fixed
206 |         errors = list(self.iter_errors(set(errors or [])))
204 | while errors:
205 | error = errors.pop(0)
206 | if error.needsDecision() or error.handledByCC(): # todo
207 | continue
208 |
209 |             numbers = [err.number for err in errors]
210 |             indexes = [numbers.index(num) for num in error.needsFirst
211 |                        if num in numbers]
212 |             if indexes:
213 |                 # defer this error until those it depends on have run
214 |                 errors.insert(max(indexes) + 1, error)
215 |                 continue
215 |
216 | new_text = error.apply(text, page)
217 | if new_text != text:
218 | text = new_text
219 | summary = error.summary
220 | fixed.append(error.number)
221 | if summary not in replaced:
222 | replaced.append(summary)
223 |
224 | return text
225 |
226 | def iter_titles(self, num, **kwargs):
227 | data = {
228 | 'action': 'list',
229 | 'id': num,
230 | 'project': self.site.dbName(),
231 | }
232 | for line in self.get(data, **kwargs).iter_lines():
233 | yield line.decode().replace('title=', '') # fixme: b/c
234 |
235 | def iter_pages(self, num, **kwargs):
236 | for title in self.iter_titles(num, **kwargs):
237 | yield pywikibot.Page(self.site, title)
238 |
239 | def get(self, data, **kwargs):
240 | return requests.get(self.url, data, **kwargs)
241 |
242 | def post(self, data, **kwargs):
243 | return requests.post(self.url, data, **kwargs)
244 |
245 | def mark_as_fixed(self, page, error):
246 | data = {
247 | 'action': 'mark',
248 | 'id': error,
249 | 'project': page.site.dbName(),
250 | 'title': page.title(),
251 | }
252 | return self.post(data)
253 |
254 | def mark_as_fixed_multiple(self, page, errors):
255 | for error in errors:
256 | self.mark_as_fixed(page, error)
257 |
258 | @staticmethod
259 | def parse_option(option):
260 | ids = []
261 | priorities = []
262 | for part in option.split(','):
263 | if part.isdigit():
264 | ids.append(int(part))
265 | elif part in CheckWikiSettings.prio_map.values():
266 | priorities.append(part)
267 | return ids, priorities
268 |
269 |
270 | class CheckWikiBot(WikitextFixingBot):
271 |
272 | def __init__(self, checkwiki, numbers, **kwargs):
273 | kwargs['checkwiki'] = False
274 | super().__init__(**kwargs)
275 | self.checkwiki = checkwiki
276 | self.numbers = numbers
277 |
278 | def treat_page(self):
279 | page = self.current_page
280 | replaced = []
281 | fixed = []
282 | text = self.checkwiki.apply(
283 | page.text, page, replaced, fixed, self.numbers)
284 | summary = 'opravy dle [[WP:WCW|CheckWiki]]: %s' % ', '.join(replaced)
285 | self.put_current(
286 | text, summary=summary,
287 | callback=lambda *args: self.mark_as_fixed_on_success(fixed, *args))
288 |
289 | def mark_as_fixed_on_success(self, numbers, page, exc=None):
290 | if exc is not None:
291 | return
292 | self.checkwiki.mark_as_fixed_multiple(page, numbers)
293 |
294 |
295 | def main(*args):
296 | options = {}
297 | local_args = pywikibot.handle_args(args)
298 | site = pywikibot.Site()
299 | checkwiki = CheckWiki(site)
300 | genFactory = pagegenerators.GeneratorFactory(site=site)
301 | numbers = []
302 | gens = []
303 | for arg in genFactory.handle_args(local_args):
304 | if arg.startswith('-checkwiki:'):
305 | ids, priorities = checkwiki.parse_option(arg.partition(':')[2])
306 | gen = CheckWikiErrorGenerator(
307 | checkwiki, ids=ids, priorities=priorities)
308 | gens.append(gen)
309 | continue
310 | if arg.startswith('-'):
311 | arg, sep, value = arg.partition(':')
312 | if value != '':
313 | options[arg[1:]] = int(value) if value.isdigit() else value
314 | else:
315 | options[arg[1:]] = True
316 | else:
317 | numbers.extend(checkwiki.parse_option(arg)[0])
318 |
319 | if gens:
320 | genFactory.gens.extend(gens)
321 | generator = genFactory.getCombinedGenerator(preload=True)
322 | if not generator:
323 | genFactory.gens.append(CheckWikiErrorGenerator(checkwiki, ids=numbers))
324 | generator = genFactory.getCombinedGenerator(preload=True)
325 |
326 | bot = CheckWikiBot(checkwiki, numbers, generator=generator,
327 | site=site, **options)
328 | bot.run()
329 |
330 |
331 | if __name__ == '__main__':
332 | main()
333 |
--------------------------------------------------------------------------------