├── __init__.py ├── queries ├── captions.txt ├── duos.txt ├── shift_ranks.txt ├── fake_references.txt ├── missing_descriptions.txt ├── fake_references_url.txt ├── commons_labels.txt ├── ask_externalid_props.txt ├── external-ids.txt ├── dupes.txt ├── qualifiers.txt ├── redirects.txt ├── mixed_claims.txt ├── unmerged_dates.txt ├── units.txt └── duplicate_dates.txt ├── README.md ├── query_store.py ├── deferred.py ├── .gitignore ├── cswiki ├── klementinum.py ├── heritage_lists_diff.py ├── sync_heritage_lists.py ├── sync_tree_lists.py ├── pageviews.py └── iucn.py ├── fix_commons_labels.py ├── wikidata_cleanup.py ├── error_reporting.py ├── import_displaytitle.py ├── wikidata └── list_of_wikis.py ├── wikidata.py ├── importdata.py ├── tools.py ├── lua_formatter.py ├── shift_ranks.py ├── update_deathdate.py ├── wikitext.py ├── nounit.py ├── cleanup_redirects.py ├── fix_qualifiers.py ├── captiontoimage.py ├── check_disambigs.py ├── split_names_and_titles.py ├── wikidata_fix_redirects.py ├── clean_commonscat.py ├── split_claims.py ├── import_descriptions.py ├── merger.py ├── slice_externalids.py ├── list_typos.py ├── fake_references.py ├── typos.py ├── cleanup_dates.py ├── typoloader.py ├── clean_dupes.py ├── connect.py ├── manage_duos.py └── checkwiki.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /queries/captions.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item WHERE { ?item wdt:%(prop)s [] } -------------------------------------------------------------------------------- /queries/duos.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item WHERE { 2 | ?item wdt:P31/wdt:P279* wd:%(class)s . 3 | MINUS { ?item wdt:P527 [] } . 4 | } -------------------------------------------------------------------------------- /queries/shift_ranks.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | ?statement wikibase:rank wikibase:DeprecatedRank; pq:%(prop)s [] . 3 | ?item ?p ?statement; wikibase:sitelinks [] . 4 | } LIMIT %(limit)i -------------------------------------------------------------------------------- /queries/fake_references.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | ?item ?p [ prov:wasDerivedFrom/pr:%(prop)s ?target; ?ps ?target ] . 3 | ?ps ^wikibase:statementProperty [] . 4 | ?target ?p [ ?ps ?item ] . 5 | } LIMIT %(limit)i -------------------------------------------------------------------------------- /queries/missing_descriptions.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | ?item ^schema:about [ 3 | schema:isPartOf ; schema:name ?title 4 | ]; wdt:P31 wd:Q5 . 5 | MINUS { ?item schema:description ?desc FILTER( LANG( ?desc ) = '%(lang)s' ) } . 6 | } -------------------------------------------------------------------------------- /queries/fake_references_url.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | VALUES ?host { } . 3 | ?item ?p [ prov:wasDerivedFrom/pr:%(prop)s ?url ] . 4 | FILTER( STRSTARTS( STR( ?url ), STR( ?host ) ) ) . 
5 | } LIMIT %(limit)i -------------------------------------------------------------------------------- /queries/commons_labels.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | ?item ^schema:about [ 3 | schema:isPartOf/^wdt:P856 wd:Q565; schema:name ?name 4 | ]; rdfs:label ?label FILTER( LANG( ?label ) = 'en' ) . 5 | FILTER( STRSTARTS( STR( ?name ), 'Category:' ) ) . 6 | FILTER( STRSTARTS( STR( ?label ), 'Category:' ) ) . 7 | MINUS { ?item wdt:P31/wdt:P279* wd:Q4167836 } . 8 | } LIMIT %(limit)s -------------------------------------------------------------------------------- /queries/ask_externalid_props.txt: -------------------------------------------------------------------------------- 1 | ASK { 2 | { 3 | SELECT * { 4 | ?prop wikibase:propertyType wikibase:ExternalId; 5 | wikibase:directClaim []; 6 | wdt:P1630 [] . 7 | FILTER( ?prop NOT IN ( wd:%(blacklist)s ) ) . 8 | } 9 | ORDER BY xsd:integer( STRAFTER( STR( ?prop ), STR( wd:P ) ) ) 10 | OFFSET %(offset)i LIMIT %(limit)i 11 | } 12 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pywikibot-scripts 2 | Own pywikibot scripts (for Wikimedia projects) 3 | 4 | ## Requirements 5 | Python 3.6.1 or newer. 6 | Pywikibot version [4d6e674](https://github.com/wikimedia/pywikibot/commit/4d6e674bf1385961a27b3ddf9acc16bcb32373b0). 7 | 8 | ## Usage 9 | Checkout or download to "myscripts" directory inside "core/scripts/userscripts". 10 | Then add to your `user-config.py`: 11 | ``` 12 | user_script_paths = ['scripts.userscripts.myscripts'] 13 | ``` -------------------------------------------------------------------------------- /queries/external-ids.txt: -------------------------------------------------------------------------------- 1 | SELECT ?item WITH { 2 | SELECT DISTINCT ?wdt { 3 | ?prop wikibase:propertyType wikibase:ExternalId; 4 | wikibase:directClaim ?wdt; 5 | wdt:P1630 [] . 6 | FILTER( ?prop NOT IN ( wd:%(blacklist)s ) ) . 7 | } 8 | ORDER BY xsd:integer( STRAFTER( STR( ?prop ), STR( wd:P ) ) ) 9 | OFFSET %(offset)i LIMIT %(limit)i 10 | } AS %%predicates WHERE { 11 | INCLUDE %%predicates . 12 | ?item ?wdt ?value . 13 | FILTER( STRSTARTS( ?value, 'http' ) ) . 14 | } -------------------------------------------------------------------------------- /queries/dupes.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item WHERE { 2 | VALUES ?dupe { wd:%(dupe)s } . 3 | ?item p:P31 ?statement . 4 | ?statement ps:P31 ?dupe . 5 | { 6 | ?statement (pq:P460|pq:P642) ?target . 7 | } UNION { 8 | ?item wdt:P460 ?target . 9 | } . 10 | MINUS { ?item wdt:P1889|^wdt:P1889 ?target } . 11 | MINUS { 12 | ?target wdt:P31/wdt:P279* wd:Q16521 . 13 | ?item wikibase:sitelinks 0 . 14 | } . 15 | ?item schema:dateModified ?mod . 16 | } ORDER BY ?mod OFFSET %(offset)i -------------------------------------------------------------------------------- /queries/qualifiers.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item WHERE { 2 | ?prop wikibase:propertyType [] . 3 | { 4 | ?prop p:P31/ps:P31 wd:%(item)s . 5 | MINUS { ?prop wikibase:propertyType wikibase:ExternalId } . 6 | } UNION { 7 | FILTER( ?prop IN ( wd:%(good)s ) ) . 8 | } . 9 | FILTER( ?prop NOT IN ( wd:%(bad)s ) ) . 10 | MINUS { ?prop p:P31/ps:P31 wd:Q18608359 } . 11 | ?prop wikibase:reference ?pr . 
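  # now pick up statements whose references use such a property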
12 | ?ref ?pr ?value . 13 | ?statement prov:wasDerivedFrom ?ref . 14 | ?item ?p ?statement . 15 | [] wikibase:claim ?p . 16 | } -------------------------------------------------------------------------------- /queries/redirects.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | ?item owl:sameAs ?target; schema:dateModified ?date . 3 | { 4 | ?entity ?p [ ?pred ?item; wikibase:rank [] ] . 5 | } UNION { 6 | ?entity ?p1 [ ?predv [ wikibase:quantityUnit ?item ]; wikibase:rank [] ] . 7 | } UNION { 8 | ?ref ?pr ?item . 9 | ?st2 prov:wasDerivedFrom ?ref . 10 | ?entity ?p2 ?st2 . 11 | } UNION { 12 | ?ref1 ?prv [ wikibase:quantityUnit ?item ] . 13 | ?st3 prov:wasDerivedFrom ?ref . 14 | ?entity ?p3 ?st3 . 15 | } . 16 | FILTER( NOW() - ?date > %(days)d ) . 17 | } ORDER BY ?date -------------------------------------------------------------------------------- /queries/mixed_claims.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | { ?st pq:P580 ?date1, ?date2 } UNION { ?st pq:P582 ?date1, ?date2 } . 3 | MINUS { ?st prov:wasDerivedFrom [] } . 4 | MINUS { 5 | ?st ?pq [] . 6 | FILTER( ?pq NOT IN ( pq:P580, pq:P582 ) ) . 7 | ?pq ^wikibase:qualifier [] . 8 | } . 9 | MINUS { ?item ?p [ pq:P580|pq:P582 ?date ] . FILTER( YEAR( ?date ) < 1 ) } . 10 | FILTER( !ISBLANK( ?date1 ) && !ISBLANK( ?date2 ) ) . 11 | MINUS { ?date1 a/a owl:Class } . 12 | MINUS { ?date2 a/a owl:Class } . 13 | FILTER( ?date1 < ?date2 ) . 14 | ?item ?p ?st . 15 | } LIMIT %(limit)i -------------------------------------------------------------------------------- /queries/unmerged_dates.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item WHERE { 2 | ?item schema:dateModified ?dateModified . hint:Prior hint:rangeSafe true . 3 | FILTER( ?dateModified > "%(date)s"^^xsd:dateTime ) . 4 | ?item p:%(prop)s ?statement1, ?statement2 . 5 | FILTER( ?statement1 != ?statement2 ) . 6 | FILTER( STR( ?statement1 ) < STR( ?statement2 ) ) . 7 | VALUES (?prec1 ?prec2) { (9 9) (10 10) } . 8 | ?statement1 psv:%(prop)s ?node1 . 9 | ?node1 wikibase:timeValue ?val1 . hint:Prior hint:rangeSafe true . 10 | ?node1 wikibase:timePrecision ?prec1 . 11 | ?statement2 psv:%(prop)s ?node2 . 12 | ?node2 wikibase:timeValue ?val2 . hint:Prior hint:rangeSafe true . 13 | ?node2 wikibase:timePrecision ?prec2 . 14 | FILTER( ?val1 = ?val2 ) . 
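  # i.e. two statements whose time values are identical at the same precision
  # (9 = year, 10 = month), so they are candidates for merging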
15 | } 16 | -------------------------------------------------------------------------------- /query_store.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class QueryStore: 5 | 6 | '''Interface for loading SPARQL queries from text files''' 7 | 8 | def __init__(self, path=None): 9 | if path is None: 10 | dirname = os.path.dirname(os.path.realpath(__file__)) 11 | path = os.path.join(dirname, 'queries') 12 | self.path = path 13 | 14 | def get_query(self, name): 15 | with open('%s.txt' % os.path.join(self.path, name), 'r', 16 | encoding='utf-8') as file: 17 | file.seek(0) 18 | return file.read() 19 | 20 | def build_query(self, name, **params): 21 | return self.get_query(name) % params 22 | 23 | 24 | if __name__ == '__main__': 25 | print('This script is not runnable from command line.') 26 | -------------------------------------------------------------------------------- /queries/units.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item WHERE { 2 | { 3 | ?pst rdf:type wdno:P2237 . 4 | } UNION { 5 | ?pst ps:P2237 wd:%(good)s . 6 | } . 7 | ?prop p:P2237 ?pst; 8 | wikibase:claim ?p; 9 | wikibase:statementValue ?psv; 10 | wikibase:qualifierValue ?pqv; 11 | wikibase:referenceValue ?prv . 12 | FILTER( ?prop != wd:P1092 ) . 13 | { 14 | ?statement ?psv ?value . 15 | ?value wikibase:quantityUnit ?unit . 16 | FILTER( ?unit != wd:Q199 ) . 17 | ?item ?p ?statement . 18 | } UNION { 19 | ?statement1 ?pqv ?value . 20 | ?value wikibase:quantityUnit ?unit . 21 | FILTER( ?unit != wd:Q199 ) . 22 | ?item ?claim1 ?statement1 . 23 | } UNION { 24 | ?ref ?prv ?value . 25 | ?value wikibase:quantityUnit ?unit . 26 | FILTER( ?unit != wd:Q199 ) . 27 | ?statement2 prov:wasDerivedFrom ?ref . 28 | ?item ?claim2 ?statement2 . 29 | } . 30 | } -------------------------------------------------------------------------------- /queries/duplicate_dates.txt: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?item { 2 | ?item schema:dateModified ?dateModified . hint:Prior hint:rangeSafe true . 3 | FILTER( ?dateModified > "%(date)s"^^xsd:dateTime ) . 4 | ?item p:%(prop)s ?statement1, ?statement2 FILTER( ?statement2 != ?statement1 ) . 5 | VALUES (?prec1 ?prec2) { 6 | (9 10) (9 11) 7 | } . 8 | ?statement1 psv:%(prop)s ?node1 . 9 | ?node1 wikibase:timeValue ?val1 . hint:Prior hint:rangeSafe true . 10 | ?node1 wikibase:timePrecision ?prec1 . 11 | ?statement1 wikibase:rank ?rank1 . 12 | ?statement2 psv:%(prop)s ?node2 . 13 | ?node2 wikibase:timeValue ?val2 . hint:Prior hint:rangeSafe true . 14 | ?node2 wikibase:timePrecision ?prec2 . 15 | ?statement2 wikibase:rank ?rank2 . 16 | FILTER( YEAR( ?val1 ) = YEAR( ?val2 ) ) . 17 | FILTER( ?rank1 = ?rank2 || ?rank2 = wikibase:NormalRank ) . 18 | MINUS { ?statement1 prov:wasDerivedFrom/!(pr:P143|pr:P813|pr:P4656) [] } . 19 | ?statement2 prov:wasDerivedFrom/!(pr:P143|pr:P813|pr:P4656) [] . 
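  # keep only pairs where ?statement1 has no substantive reference but ?statement2 does
  # (references consisting solely of pr:P143 / pr:P813 / pr:P4656 do not count as substantive)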
20 | } 21 | -------------------------------------------------------------------------------- /deferred.py: -------------------------------------------------------------------------------- 1 | import pywikibot 2 | 3 | from pywikibot.bot import BaseBot 4 | 5 | 6 | class DeferredCallbacksBot(BaseBot): 7 | 8 | ''' 9 | Bot deferring callbacks like purging pages 10 | ''' 11 | 12 | def __init__(self, **kwargs): 13 | super().__init__(**kwargs) 14 | self.callbacks = [] 15 | 16 | def addCallback(self, func, *data, **kwargs): 17 | callback = lambda *_, **__: func(*data, **kwargs) 18 | self.callbacks.append(callback) 19 | 20 | def queueLen(self): 21 | return len(self.callbacks) 22 | 23 | def hasCallbacks(self): 24 | return self.queueLen() > 0 25 | 26 | def doWithCallback(self, func, *data, **kwargs): 27 | if self.hasCallbacks(): 28 | kwargs['callback'] = self.callbacks.pop(0) 29 | return func(*data, **kwargs) 30 | 31 | def exit(self): 32 | pywikibot.info(f'Executing remaining deferred callbacks: {self.queueLen()} left') 33 | try: 34 | while self.hasCallbacks(): 35 | callback = self.callbacks.pop(0) 36 | callback() 37 | finally: 38 | super().exit() 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /cswiki/klementinum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import json 3 | import re 4 | 5 | from collections import OrderedDict 6 | 7 | import mwparserfromhell as parser 8 | import pywikibot 9 | import requests 10 | 11 | 12 | def get_single_year(year): 13 | return year.rpartition(', ')[2] 14 | 15 | 16 | def format_number(val): 17 | return re.sub(r'(\d+),(\d+)', r'\1.\2', str(val)) 18 | 19 | 20 | def main(): 21 | pywikibot.handle_args() 22 | site = pywikibot.Site('cs', 'wikipedia') 23 | url_pattern = 'https://www.chmi.cz/files/portal/docs/meteo/ok/klementinum/extrklem{:02d}_cs.html' 24 | 25 | data = OrderedDict() 26 | sources = [] 27 | for i in range(1, 13): 28 | url = url_pattern.format(i) 29 | response = requests.get(url) 30 | code = parser.parse(response.text) 31 | 32 | sources.append(url) 33 | data[str(i)] = month = OrderedDict() 34 | trs = (tr for tr in code.ifilter_tags() if tr.tag == 'tr') 35 | next(trs) # skip headline 36 | for day, tr in enumerate(trs, start=1): 37 | tags = tr.contents.filter_tags() 38 | if len(tags) != 6: 39 | break 40 | _, avg, mx, mx_year, mn, mn_year = [tag.contents for tag in tags] 41 | month[str(day)] = OrderedDict([ 42 | ('avg', format_number(avg)), 43 | ('max', format_number(mx)), 44 | ('max_year', get_single_year(mx_year)), 45 | ('min', format_number(mn)), 46 | ('min_year', get_single_year(mn_year)), 47 | ]) 48 | 49 | text = json.dumps({ 50 | '@metadata': { 51 | 'sources': sources, 52 | }, 53 | 'data': data, 54 | }) 55 | page = pywikibot.Page(site, 'Šablona:Klementinum/data.json') 56 | page.put(text, summary='aktualizace dat pro šablonu Klementinum', 57 | minor=False, bot=False, apply_cosmetic_changes=False) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /fix_commons_labels.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | 6 | from query_store import QueryStore 7 | from wikidata import WikidataEntityBot 8 | 9 | 10 | class LabelsFixingBot(WikidataEntityBot): 11 | 12 | use_from_page = False 13 | 14 | def __init__(self, generator, **kwargs): 15 | self.available_options.update({ 16 | 'always': True, 17 | 'limit': 50, 18 | }) 19 | super().__init__(**kwargs) 20 | self.store = QueryStore() 21 | self._generator = generator or self.custom_generator() 22 | self.summary = 'remove prefix from [en] 
label' 23 | 24 | @property 25 | def generator(self): 26 | return pagegenerators.PreloadingEntityGenerator(self._generator) 27 | 28 | def custom_generator(self): 29 | query = self.store.build_query('commons_labels', 30 | limit=self.opt['limit']) 31 | return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo) 32 | 33 | def treat_page_and_item(self, page, item): 34 | if any(cl.target_equals('Q4167836') for cl in item.claims.get('P31', [])): 35 | return 36 | if item.getSitelink('commonswiki').startswith('Category:'): 37 | if item.labels['en'].startswith('Category:'): 38 | data = {'en': item.labels['en'].removeprefix('Category:')} 39 | self.user_edit_entity(item, {'labels': data}, 40 | summary=self.summary) 41 | 42 | 43 | def main(*args): 44 | options = {} 45 | local_args = pywikibot.handle_args(args) 46 | site = pywikibot.Site() 47 | genFactory = pagegenerators.GeneratorFactory(site=site) 48 | for arg in genFactory.handle_args(local_args): 49 | if arg.startswith('-'): 50 | arg, sep, value = arg.partition(':') 51 | if value != '': 52 | options[arg[1:]] = value if not value.isdigit() else int(value) 53 | else: 54 | options[arg[1:]] = True 55 | 56 | generator = genFactory.getCombinedGenerator() 57 | bot = LabelsFixingBot(generator=generator, site=site, **options) 58 | bot.run() 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /wikidata_cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | 6 | from wikidata import WikidataEntityBot 7 | from wikidata_cleanup_toolkit import WikidataCleanupToolkit 8 | 9 | 10 | class WikidataCleanupBot(WikidataEntityBot): 11 | 12 | use_from_page = False 13 | 14 | def __init__(self, generator, fix, **kwargs): 15 | super().__init__(**kwargs) 16 | self._generator = generator 17 | self.fix = fix 18 | self.my_kit = WikidataCleanupToolkit([self.fix]) 19 | 20 | @property 21 | def generator(self): 22 | return pagegenerators.PreloadingEntityGenerator(self._generator) 23 | 24 | @property 25 | def summary(self): 26 | return { 27 | 'add_missing_labels': 'import labels from sitelinks', 28 | 'cleanup_labels': 'strip labels', 29 | 'deduplicate_aliases': 'remove duplicate aliases', 30 | 'deduplicate_claims': 'merge duplicate claims', 31 | 'deduplicate_references': 'remove duplicate references', 32 | 'fix_HTML': 'resolve HTML entities', 33 | 'fix_languages': 'resolve invalid languages', 34 | 'fix_quantities': 'remove explicit bounds', 35 | 'replace_invisible': 'replace invisible characters', 36 | }[self.fix] 37 | 38 | def treat_page_and_item(self, page, item): 39 | data = None # seems to work more reliably than empty dict 40 | if self.my_kit.cleanup(item, data): 41 | self.user_edit_entity(item, data, summary=self.summary) 42 | 43 | 44 | def main(*args): 45 | options = {} 46 | local_args = pywikibot.handle_args(args) 47 | site = pywikibot.Site() 48 | genFactory = pagegenerators.GeneratorFactory(site=site) 49 | for arg in genFactory.handle_args(local_args): 50 | if arg.startswith('-'): 51 | arg, sep, value = arg.partition(':') 52 | if value != '': 53 | options[arg[1:]] = value if not value.isdigit() else int(value) 54 | else: 55 | options[arg[1:]] = True 56 | 57 | generator = genFactory.getCombinedGenerator() 58 | bot = WikidataCleanupBot(generator=generator, site=site, **options) 59 | bot.run() 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | 
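# A possible invocation (sketch only; combine with any standard pagegenerators option):
#
#     python pwb.py wikidata_cleanup -fix:deduplicate_aliases <generator options>
#
# The -fix value has to be one of the keys of WikidataCleanupBot.summary above; it is
# wrapped in WikidataCleanupToolkit([fix]) and the matching summary is used for the edit.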
-------------------------------------------------------------------------------- /error_reporting.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from contextlib import suppress 4 | from threading import Lock, Timer 5 | 6 | import pywikibot 7 | 8 | from pywikibot.bot import BaseBot 9 | from pywikibot.exceptions import NoPageError 10 | 11 | 12 | class ErrorReportingBot(BaseBot): 13 | 14 | file_name = None 15 | page_pattern = None 16 | 17 | def __init__(self, **kwargs): 18 | self.available_options.update({ 19 | 'clearonly': False, 20 | 'interval': 5 * 60, 21 | }) 22 | super().__init__(**kwargs) 23 | self.timer = None 24 | self.file_lock = Lock() 25 | self.timer_lock = Lock() 26 | 27 | def run(self): 28 | self.open() 29 | self.save_file() 30 | if not self.opt['clearonly']: 31 | super().run() 32 | 33 | def open(self): 34 | with suppress(OSError): 35 | f = open(os.path.join('..', self.file_name), 'x') 36 | f.close() 37 | 38 | @property 39 | def log_page(self): 40 | log_page = pywikibot.Page( 41 | self.repo, self.page_pattern % self.repo.username()) 42 | try: 43 | log_page.get() 44 | except NoPageError: 45 | log_page.text = '' 46 | return log_page 47 | 48 | def append(self, text): 49 | with ( 50 | self.file_lock, 51 | open(os.path.join('..', self.file_name), 'a', encoding='utf-8') as f 52 | ): 53 | f.write(text) 54 | 55 | def save_file(self): 56 | with ( 57 | self.file_lock, 58 | open(os.path.join('..', self.file_name), 'r+', encoding='utf-8') as f 59 | ): 60 | f.seek(0) # jump to the beginning 61 | text = '\n'.join(f.read().splitlines()) # multi-platform 62 | if text: 63 | log_page = self.log_page 64 | log_page.text += text 65 | log_page.save(summary='update') 66 | f.seek(0) # jump to the beginning 67 | f.truncate() # and delete everything 68 | with self.timer_lock: 69 | self.timer = Timer(self.opt['interval'], self.save_file) 70 | self.timer.start() 71 | 72 | def teardown(self): 73 | with self.timer_lock: 74 | if self.timer: 75 | self.timer.cancel() 76 | super().teardown() 77 | -------------------------------------------------------------------------------- /import_displaytitle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot.pagegenerators import ( 5 | page_with_property_generator, 6 | GeneratorFactory, 7 | NamespaceFilterPageGenerator, 8 | ) 9 | from pywikibot.tools import first_lower 10 | 11 | from wikidata import WikidataEntityBot 12 | 13 | 14 | class LabelSettingBot(WikidataEntityBot): 15 | 16 | def __init__(self, **kwargs): 17 | self.available_options.update({ 18 | 'create': False, 19 | }) 20 | super().__init__(**kwargs) 21 | self.create_missing_item = self.opt['create'] is True 22 | 23 | def stripped(self, title): 24 | if title.endswith(')'): 25 | return title.partition(' (')[0] 26 | else: 27 | return title 28 | 29 | def treat_page_and_item(self, page, item): 30 | title = page.properties().get('displaytitle') 31 | if not title: 32 | return 33 | page_title = page.title() 34 | if first_lower(page_title) != title: 35 | return 36 | lang = page.site.lang 37 | label = item.labels.get(lang) 38 | if not label or self.stripped(label) == self.stripped(page_title): 39 | item.labels[lang] = first_lower(label) if label else title 40 | link = page.title(as_link=True, insite=item.site) 41 | summary = f'importing [{lang}] label from displaytitle in {link}' 42 | self.user_edit_entity(item, summary=summary) 43 | 44 | 45 | def main(*args): 46 | 
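    # turn the remaining command-line arguments of the form -name or -name:value into
    # bot options; generator-related arguments were already consumed by the factory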
options = {} 47 | local_args = pywikibot.handle_args(args) 48 | site = pywikibot.Site() 49 | genFactory = GeneratorFactory(site=site) 50 | for arg in genFactory.handle_args(local_args): 51 | if arg.startswith('-'): 52 | arg, sep, value = arg.partition(':') 53 | if value != '': 54 | options[arg[1:]] = value if not value.isdigit() else int(value) 55 | else: 56 | options[arg[1:]] = True 57 | 58 | generator = genFactory.getCombinedGenerator() 59 | if not generator: 60 | generator = page_with_property_generator('displaytitle', site=site) 61 | if genFactory.namespaces: 62 | generator = NamespaceFilterPageGenerator( 63 | generator, genFactory.namespaces, site=site) 64 | 65 | bot = LabelSettingBot(generator=generator, site=site, **options) 66 | bot.run() 67 | 68 | 69 | if __name__ == '__main__': 70 | main() 71 | -------------------------------------------------------------------------------- /wikidata/list_of_wikis.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3 2 | import json 3 | 4 | import pywikibot 5 | 6 | from pywikibot.data.sparql import SparqlQuery 7 | from pywikibot.exceptions import SiteDefinitionError, UnknownFamilyError 8 | from tqdm import tqdm 9 | 10 | 11 | pywikibot.handle_args() 12 | 13 | repo = pywikibot.Site('wikidata') 14 | page = pywikibot.Page(repo, 'Wikidata:List of wikis/python') 15 | data = json.loads(page.text) 16 | 17 | endpoint = SparqlQuery(repo=repo) 18 | query = '''SELECT * WHERE { ?item wdt:P1800 ?dbname } ORDER BY ?dbname''' 19 | missing_families = set() 20 | added = set() 21 | 22 | out = {} 23 | for entry in tqdm(endpoint.select(query, full_data=True)): 24 | item = entry['item'].getID() 25 | dbname = entry['dbname'].value 26 | code, sep, right = dbname.rpartition('wik') 27 | if not sep: 28 | pywikibot.output(f'dbname not recognized: {dbname}') 29 | continue 30 | 31 | if dbname == 'sourceswiki': 32 | code, family = 'mul', 'wikisource' 33 | else: 34 | family = sep + right 35 | if family == 'wiki': 36 | if code in data: # commons, etc. 
37 | family = code 38 | else: 39 | family = 'wikipedia' 40 | 41 | if family in missing_families: 42 | continue 43 | 44 | replace_hyphen = False 45 | if '_' in code: 46 | code = code.replace('_', '-') 47 | replace_hyphen = True 48 | 49 | try: 50 | site = pywikibot.Site(code, family) 51 | except UnknownFamilyError as e: 52 | missing_families.add(family) 53 | pywikibot.log(e.unicode) 54 | continue 55 | except SiteDefinitionError as e: 56 | pywikibot.log(e.unicode) 57 | continue 58 | 59 | if replace_hyphen: 60 | code = code.replace('-', '_') 61 | 62 | if code in out.setdefault(family, {}): 63 | pywikibot.warning(f'Duplicate {code}.{family} entry for {dbname}') 64 | continue 65 | 66 | out[family][code] = item 67 | if code not in data.get(family, {}): 68 | added.add(dbname) 69 | 70 | if added: 71 | total = sum(map(len, out.values())) 72 | summary = f'Updating list of wikis: {total} wikis; added: ' + ( 73 | ', '.join(sorted(added))) 74 | text = json.dumps(out, sort_keys=True, indent=4) 75 | pywikibot.showDiff(page.text, text) 76 | page.text = text 77 | pywikibot.output(f'Edit summary: {summary}') 78 | page.save(summary=summary, minor=False, bot=False) 79 | else: 80 | pywikibot.output('No wikis to be added') 81 | -------------------------------------------------------------------------------- /wikidata.py: -------------------------------------------------------------------------------- 1 | from contextlib import suppress 2 | import random 3 | 4 | import pywikibot 5 | 6 | from pywikibot.bot import WikidataBot 7 | from pywikibot.exceptions import NoPageError, IsRedirectPageError 8 | 9 | from wikidata_cleanup_toolkit import WikidataCleanupToolkit 10 | 11 | 12 | class WikidataEntityBot(WikidataBot): 13 | 14 | use_redirects = False 15 | 16 | ''' 17 | Bot editing Wikidata entities 18 | Features: 19 | * Caches properties so that iterating claims can be faster 20 | * Wraps around the WikidataBot class. 21 | * Item cleanup like missing labels, redundant data etc. 
22 | ''' 23 | 24 | def __init__(self, **kwargs): 25 | self.available_options.update({ 26 | 'nocleanup': False, 27 | }) 28 | self.bad_cache = set(kwargs.pop('bad_cache', [])) 29 | self.good_cache = set(kwargs.pop('good_cache', [])) 30 | self.kit = WikidataCleanupToolkit() 31 | super().__init__(**kwargs) 32 | 33 | def init_page(self, item): 34 | with suppress(NoPageError, IsRedirectPageError): 35 | item.get() 36 | return super().init_page(item) 37 | 38 | def checkProperty(self, prop): 39 | if prop in self.good_cache: 40 | return True 41 | if prop in self.bad_cache: 42 | return False 43 | 44 | self.cacheProperty(prop) 45 | return self.checkProperty(prop) 46 | 47 | def cacheProperty(self, prop): 48 | prop_page = pywikibot.PropertyPage(self.repo, prop) 49 | if self.filterProperty(prop_page): 50 | self.good_cache.add(prop) 51 | else: 52 | self.bad_cache.add(prop) 53 | 54 | def filterProperty(self, prop_page): 55 | raise NotImplementedError( 56 | f'{self.__class__.__name__}.filterProperty needs ' 57 | 'overriding in a subclass') 58 | 59 | def new_editgroups_summary(self): 60 | # https://www.wikidata.org/wiki/Wikidata:Edit_groups/Adding_a_tool 61 | n = random.randrange(0, 2**48) 62 | return f'[[:toollabs:editgroups/b/CB/{n:x}|details]]' 63 | 64 | def user_edit_entity(self, item, data=None, *, cleanup=None, **kwargs): 65 | # todo: support stub items 66 | if item.exists() and not (cleanup is False or ( 67 | self.opt['nocleanup'] and cleanup is not True)): 68 | if self.kit.cleanup(item, data): 69 | if kwargs.get('summary'): 70 | kwargs['summary'] += '; cleanup' 71 | else: 72 | kwargs['summary'] = 'cleanup' 73 | kwargs.setdefault('show_diff', not self.opt['always']) 74 | return super().user_edit_entity(item, data, **kwargs) 75 | -------------------------------------------------------------------------------- /importdata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from datetime import datetime 3 | 4 | import pywikibot 5 | 6 | pywikibot.handle_args() 7 | 8 | site = pywikibot.Site('wikidata', 'wikidata') 9 | repo = site.data_repository() 10 | 11 | path = pywikibot.input('Path to file: ') 12 | date = pywikibot.WbTime(year=2025, month=1, day=1, site=repo) 13 | 14 | ref_item = 'Q134497819' 15 | 16 | with open(path, 'r', encoding='utf-8') as file_data: 17 | next(file_data) # header 18 | for line in file_data: 19 | if not line: 20 | continue 21 | split = line.split('\t') 22 | item = pywikibot.ItemPage(repo, split[0]) 23 | hasNewClaim = False 24 | upToDateClaims = [] 25 | count = int(split[1]) 26 | for claim in item.claims.get('P1082', []): 27 | if claim.getRank() == 'preferred': 28 | claim.setRank('normal') 29 | upToDateClaims.append(claim) 30 | if (claim.qualifiers.get('P585') 31 | and claim.qualifiers['P585'][0].target_equals(date)): 32 | hasNewClaim = True 33 | break 34 | 35 | if hasNewClaim is True: 36 | continue 37 | 38 | newClaim = pywikibot.Claim(repo, 'P1082') 39 | newClaim.setTarget(pywikibot.WbQuantity(count, site=repo)) 40 | newClaim.setRank('preferred') 41 | 42 | newClaim_date = pywikibot.Claim(repo, 'P585', is_qualifier=True) 43 | newClaim_date.setTarget(date) 44 | newClaim.addQualifier(newClaim_date) 45 | 46 | newClaim_criter = pywikibot.Claim(repo, 'P1013', is_qualifier=True) 47 | newClaim_criter.setTarget(pywikibot.ItemPage(repo, 'Q2641256')) 48 | newClaim.addQualifier(newClaim_criter) 49 | 50 | newClaim_men = pywikibot.Claim(repo, 'P1540', is_qualifier=True) 51 | newClaim_men.setTarget(pywikibot.WbQuantity(int(split[2]), 
site=repo)) 52 | newClaim.addQualifier(newClaim_men) 53 | 54 | newClaim_women = pywikibot.Claim(repo, 'P1539', is_qualifier=True) 55 | newClaim_women.setTarget(pywikibot.WbQuantity(int(split[3]), site=repo)) 56 | newClaim.addQualifier(newClaim_women) 57 | 58 | ref = pywikibot.Claim(repo, 'P248', is_reference=True) 59 | ref.setTarget(pywikibot.ItemPage(repo, ref_item)) 60 | 61 | now = datetime.now() 62 | access_date = pywikibot.Claim(repo, 'P813', is_reference=True) 63 | access_date.setTarget(pywikibot.WbTime(year=now.year, month=now.month, 64 | day=now.day, site=repo)) 65 | newClaim.addSources([ref, access_date]) 66 | 67 | data = {'claims':[newClaim.toJSON()]} 68 | for upToDateClaim in upToDateClaims: 69 | data['claims'].append(upToDateClaim.toJSON()) 70 | 71 | item.editEntity( 72 | data, asynchronous=True, 73 | summary=f'Adding [[Property:P1082]]: {count} per data from ' 74 | f'[[Q3504917]], see [[{ref_item}]]') 75 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pywikibot 4 | from pywikibot.tools.chars import url2string 5 | 6 | FULL_ARTICLE_REGEX = r'\A[\s\S]*\Z' 7 | 8 | 9 | class FileRegexHolder: 10 | 11 | replaceR = None 12 | FLOAT_PATTERN = r'\d+(?:\.\d+)?' 13 | 14 | @classmethod 15 | def get_regex(cls, site): 16 | if not cls.replaceR: 17 | magic = ['img_baseline', 'img_border', 'img_bottom', 'img_center', 18 | 'img_class', 'img_framed', 'img_frameless', 'img_left', 19 | 'img_middle', 'img_none', 'img_right', 'img_sub', 20 | 'img_super', 'img_text_bottom', 'img_text_top', 21 | 'img_thumbnail', 'img_top'] 22 | words = [] 23 | for magicword in magic: 24 | words.extend(site.getmagicwords(magicword)) 25 | replace = '|'.join(map(re.escape, words)) 26 | for magicword in site.getmagicwords('img_manualthumb'): 27 | replace += '|' + magicword.replace('$1', cls.FLOAT_PATTERN) 28 | for magicword in site.getmagicwords('img_upright'): 29 | replace += '|' + magicword.replace('$1', cls.FLOAT_PATTERN) 30 | for magicword in site.getmagicwords('img_width'): 31 | replace += '|' + magicword.replace('$1', r'\d+') 32 | cls.replaceR = re.compile(replace) 33 | return cls.replaceR 34 | 35 | 36 | def deduplicate(arg): 37 | # todo: merge with filter_unique? 
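    # removes later duplicates in place, keeping first occurrences; for example
    # deduplicate(lst) turns lst == ['a', 'b', 'a', 'c', 'b'] into ['a', 'b', 'c']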
38 | for index, member in enumerate(arg, start=1): 39 | while member in arg[index:]: 40 | arg.pop(arg.index(member, index)) 41 | 42 | 43 | def parse_image(text, site): 44 | # TODO: merge with .migrate_infobox.InfoboxMigratingBot.handle_image 45 | image = caption = None 46 | imgR = re.compile(r'\[\[\s*(?:%s) *:' % '|'.join(site.namespaces[6]), 47 | flags=re.I) 48 | if imgR.match(text): 49 | split = text.rstrip()[:-2].split('|') 50 | matchR = FileRegexHolder.get_regex(site) 51 | while split[1:]: 52 | tmp = split.pop().strip() 53 | if not matchR.fullmatch(tmp): 54 | caption = tmp 55 | break 56 | if caption: 57 | while caption.count('[') != caption.count(']'): 58 | caption = split.pop() + '|' + caption 59 | caption = caption.rstrip('.').strip() 60 | image = split[0].partition(':')[2].rstrip(']') 61 | image = url2string(image) 62 | image = re.sub('[ _]+', ' ', image).strip() 63 | 64 | return image, caption 65 | 66 | 67 | def get_best_statements(statements): 68 | best = [] 69 | best_rank = 'normal' 70 | for st in statements: 71 | if st.rank == best_rank: 72 | best.append(st) 73 | elif st.rank == 'preferred': 74 | best[:] = [st] 75 | best_rank = st.rank 76 | return best 77 | 78 | 79 | def iter_all_snaks(data): 80 | for claims in data.values(): 81 | for claim in claims: 82 | yield claim 83 | for snaks in claim.qualifiers.values(): 84 | yield from snaks 85 | for ref in claim.sources: 86 | for snaks in ref.values(): 87 | yield from snaks 88 | -------------------------------------------------------------------------------- /lua_formatter.py: -------------------------------------------------------------------------------- 1 | """This module is deprecated. Using JSON is more practical.""" 2 | 3 | __all__ = ( 4 | 'format_dictionary', 5 | 'format_list', 6 | 'QUOTES_SINGLE', 7 | 'QUOTES_DOUBLE', 8 | ) 9 | 10 | QUOTES_SINGLE = 1 11 | QUOTES_DOUBLE = 2 12 | 13 | 14 | def _indent(level, **kwargs): 15 | if kwargs.get('use_tabs') is True: 16 | return (level + 1) * '\t' 17 | else: 18 | return ((level + 1) * 4) * ' ' 19 | 20 | 21 | def _wrap_quotes(text, quote): 22 | if quote in text: 23 | text = text.replace(quote, '\\' + quote) 24 | return f'{quote}{text}{quote}' 25 | 26 | 27 | def _format_string(text, **kwargs): 28 | opt = kwargs.get('force_quotes', 0) 29 | if opt == QUOTES_SINGLE: 30 | text = _wrap_quotes(text, "'") 31 | elif opt == QUOTES_DOUBLE: 32 | text = _wrap_quotes(text, '"') 33 | else: 34 | assert opt == 0 35 | text = (_wrap_quotes(text, '"') 36 | if "'" in text else _wrap_quotes(text, "'")) 37 | return text 38 | 39 | 40 | def _format_key(key, **kwargs): 41 | if key is None: 42 | return '[nil]' 43 | elif isinstance(key, (int, float)): 44 | key = str(key) 45 | return f'[{key.lower()}]' # lower for booleans (which are ints) 46 | else: 47 | assert isinstance(key, str) 48 | if not key.isalnum() or kwargs.get('quotes_always') is True: 49 | return '[%s]' % _format_string(key, **kwargs) 50 | else: 51 | return key 52 | 53 | 54 | def _format_value(value, level, **kwargs): 55 | if isinstance(value, dict): 56 | return _format_dictionary(value, level + 1, **kwargs) 57 | elif isinstance(value, (list, tuple)): 58 | return _format_list(value, level + 1, **kwargs) 59 | elif isinstance(value, (int, float)): 60 | return str(value).lower() # lower for booleans (which are ints) 61 | else: 62 | return _format_string(value, **kwargs) 63 | 64 | 65 | def _format_pair(key, value, level, **kwargs): 66 | return '%s = %s,' % (_format_key(key, **kwargs), 67 | _format_value(value, level, **kwargs)) 68 | 69 | 70 | def 
_format_list(data, level, **kwargs): 71 | init = '\n' + _indent(level, **kwargs) 72 | string = '' 73 | if kwargs.get('show_keys') is True: 74 | for i, item in enumerate(data, start=1): 75 | string += init + _format_pair(i, item, **kwargs) 76 | else: 77 | for item in data: 78 | string += init + _format_value(item, level, **kwargs) + ',' 79 | return '{' + string + '\n' + _indent(level-1, **kwargs) + '}' 80 | 81 | 82 | def _format_dictionary(data, level, **kwargs): 83 | init = '\n' + _indent(level, **kwargs) 84 | string = '' 85 | keys = data.keys() 86 | if kwargs.get('sort_keys') is True: 87 | keys = sorted(keys) 88 | for key in keys: 89 | string += init + _format_pair(key, data[key], level, **kwargs) 90 | return '{' + string + '\n' + _indent(level-1, **kwargs) + '}' 91 | 92 | 93 | def format_list(data, level=0, **kwargs): 94 | assert isinstance(data, (list, tuple)) 95 | return _format_list(data, level, **kwargs) 96 | 97 | 98 | def format_dictionary(data, level=0, **kwargs): 99 | assert isinstance(data, dict) 100 | return _format_dictionary(data, level, **kwargs) 101 | -------------------------------------------------------------------------------- /shift_ranks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | 6 | from query_store import QueryStore 7 | from wikidata import WikidataEntityBot 8 | 9 | 10 | class RanksShiftingBot(WikidataEntityBot): 11 | 12 | end_prop = 'P582' 13 | reason_prop = 'P2241' 14 | use_from_page = False 15 | 16 | def __init__(self, generator, **kwargs): 17 | self.available_options.update({ 18 | 'limit': 500, 19 | }) 20 | super().__init__(**kwargs) 21 | self.store = QueryStore() 22 | self._generator = generator or self.custom_generator() 23 | 24 | def custom_generator(self): 25 | query = self.store.build_query( 26 | 'shift_ranks', 27 | limit=self.opt['limit'], 28 | prop=self.end_prop 29 | ) 30 | return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo) 31 | 32 | @property 33 | def generator(self): 34 | return pagegenerators.PreloadingEntityGenerator(self._generator) 35 | 36 | @property 37 | def summary(self): 38 | return ('undeprecate claims and shift other ranks, see ' 39 | '[[Special:MyLanguage/Help:Ranking|Help:Ranking]]') 40 | 41 | def treat_page_and_item(self, page, item): 42 | changed = False 43 | for claims in item.claims.values(): 44 | by_rank = { 45 | 'preferred': [], 46 | 'normal': [], 47 | 'deprecated': [], 48 | } 49 | ok = False 50 | for claim in claims: 51 | by_rank[claim.rank].append(claim) 52 | if claim.rank == 'preferred': 53 | if claim.qualifiers.get(self.end_prop): 54 | ok = False 55 | break 56 | elif claim.rank == 'deprecated': 57 | if claim.qualifiers.get(self.reason_prop): 58 | ok = False 59 | break 60 | if not ok: 61 | ok = bool(claim.qualifiers.get(self.end_prop)) 62 | if not ok: 63 | continue 64 | for claim in by_rank['deprecated']: 65 | if claim.qualifiers.get(self.end_prop): 66 | claim.setRank('normal') 67 | changed = True 68 | if not by_rank['preferred']: 69 | for claim in by_rank['normal']: 70 | if not claim.qualifiers.get(self.end_prop): 71 | claim.setRank('preferred') 72 | changed = True 73 | if changed: 74 | self.user_edit_entity(item, summary=self.summary) 75 | 76 | 77 | def main(*args): 78 | options = {} 79 | local_args = pywikibot.handle_args(args) 80 | site = pywikibot.Site() 81 | genFactory = pagegenerators.GeneratorFactory(site=site) 82 | for arg in genFactory.handle_args(local_args): 83 | if 
arg.startswith('-'): 84 | arg, sep, value = arg.partition(':') 85 | if value != '': 86 | options[arg[1:]] = int(value) if value.isdigit() else value 87 | else: 88 | options[arg[1:]] = True 89 | 90 | generator = genFactory.getCombinedGenerator() 91 | bot = RanksShiftingBot(generator=generator, site=site, **options) 92 | bot.run() 93 | 94 | 95 | if __name__ == '__main__': 96 | main() 97 | -------------------------------------------------------------------------------- /update_deathdate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | from datetime import datetime 5 | from itertools import chain 6 | 7 | import pywikibot 8 | 9 | from pywikibot import i18n, textlib 10 | from pywikibot.bot import ExistingPageBot, SingleSiteBot 11 | from pywikibot.pagegenerators import PreloadingGenerator 12 | 13 | birth = { 14 | 'wikipedia': { 15 | 'cs': r'Narození v roce (\d+)', 16 | }, 17 | } 18 | 19 | death = { 20 | 'wikipedia': { 21 | 'cs': 'Úmrtí v roce %d', 22 | }, 23 | } 24 | 25 | replace_pattern = '[[{inside}]] ({left}{year1}{right}–{left}{year2}{right})' 26 | 27 | 28 | class DeathDateUpdatingBot(SingleSiteBot, ExistingPageBot): 29 | 30 | use_redirects = False 31 | 32 | def __init__(self, **kwargs): 33 | self.available_options.update({ 34 | 'year': datetime.today().year, 35 | }) 36 | super().__init__(**kwargs) 37 | self.categoryR = re.compile(i18n.translate(self.site, birth)) 38 | self.year = self.opt['year'] 39 | 40 | @property 41 | def generator(self): 42 | while True: 43 | category = pywikibot.Category( 44 | self.site, i18n.translate(self.site, death) % self.year) 45 | yield from category.articles(content=True, namespaces=[0]) 46 | self.year -= 1 47 | 48 | def treat_page(self): 49 | page = self.current_page 50 | categories = textlib.getCategoryLinks(page.text, site=self.site) 51 | titles = (cat.title(with_ns=False, with_section=False, 52 | allow_interwiki=False, insite=self.site) 53 | for cat in categories) 54 | matches = [match for match in map(self.categoryR.fullmatch, titles) 55 | if match] 56 | if not matches: 57 | pywikibot.info('No birthdate category found') 58 | return 59 | fullmatch = matches.pop() 60 | if matches: 61 | pywikibot.info('Multiple birthdate categories found') 62 | return 63 | birth_date = fullmatch[1] 64 | search_query = f'linksto:"{page.title()}"' # todo: sanitize? 
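        # together with the linksto: part above, the fragments below form one CirrusSearch
        # query, e.g. (hypothetical title and birth year):
        #   linksto:"John Doe" insource:/\[\[[^\[\]]+\]\] +\(\* *\[*1900\]*\)/ -intitle:"Seznam"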
65 | search_query += r' insource:/\[\[[^\[\]]+\]\]' 66 | search_query += fr' +\(\* *\[*{birth_date}\]*\)/' 67 | search_query += ' -intitle:"Seznam"' 68 | pattern = r'\[\[((?:%s)(?:\|[^\[\]]+)?)\]\]' % '|'.join( 69 | re.escape(p.title()) for p in chain([page], page.backlinks( 70 | follow_redirects=False, filter_redirects=True, namespaces=[0]))) 71 | pattern += fr' +\(\* *(\[\[)?({birth_date})(\]\])?\)' 72 | regex = re.compile(pattern) 73 | for ref_page in PreloadingGenerator( 74 | page.site.search(search_query, namespaces=[0])): 75 | new_text, num = regex.subn(self.replace_callback, ref_page.text) 76 | if num: 77 | self.userPut(ref_page, ref_page.text, new_text, 78 | summary='doplnění data úmrtí') 79 | 80 | def replace_callback(self, match): 81 | inside, left, year1, right = match.groups('') 82 | return replace_pattern.format( 83 | inside=inside, left=left, right=right, year1=year1, 84 | year2=self.year) 85 | 86 | 87 | def main(*args): 88 | options = {} 89 | for arg in pywikibot.handle_args(args): 90 | if arg.startswith('-'): 91 | arg, sep, value = arg.partition(':') 92 | if value != '': 93 | options[arg[1:]] = value if not value.isdigit() else int(value) 94 | else: 95 | options[arg[1:]] = True 96 | 97 | bot = DeathDateUpdatingBot(**options) 98 | bot.run() 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /wikitext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from itertools import chain 3 | from operator import methodcaller 4 | 5 | import pywikibot 6 | 7 | from pywikibot import pagegenerators 8 | from pywikibot.bot import SingleSiteBot, ExistingPageBot 9 | 10 | from custome_fixes import all_fixes 11 | 12 | 13 | class WikitextFixingBot(SingleSiteBot, ExistingPageBot): 14 | 15 | use_redirects = False 16 | 17 | ''' 18 | Class for bots that save wikitext. It uses all demanded fixes from 19 | custome_fixes.py and applies them before cosmetic changes are 20 | executed. 21 | 22 | You can enable each fix by using its name as a command line argument 23 | or all fixes using -all (then, each used fix is excluded). 
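    A hypothetical invocation, assuming custome_fixes.py defines a fix named
    'typos':

        python pwb.py wikitext -typos <generator options>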
24 | ''' 25 | 26 | def __init__(self, **kwargs): 27 | do_all = kwargs.pop('all', False) is True 28 | self.fixes = [] 29 | for fix, cls in all_fixes.items(): 30 | if do_all: 31 | demand = fix not in kwargs 32 | kwargs.pop(fix, None) 33 | else: 34 | demand = bool(kwargs.pop(fix, False)) 35 | if demand: 36 | options = {} 37 | for opt in cls.options.keys(): 38 | if opt in kwargs: 39 | options[opt] = kwargs.pop(opt) 40 | self.fixes.append(cls(**options)) 41 | 42 | self.fixes.sort(key=lambda fix: fix.order) 43 | 44 | super().__init__(**kwargs) 45 | for fix in self.fixes: 46 | fix.site = self.site 47 | if not self.generator: 48 | pywikibot.info('No generator provided, making own generator...') 49 | self.generator = pagegenerators.PreloadingGenerator( 50 | chain.from_iterable(map(methodcaller('generator'), self.fixes))) 51 | 52 | def treat_page(self): 53 | summaries = [] 54 | page = self.current_page 55 | old_text = page.text 56 | callbacks = self.applyFixes(page, summaries) 57 | if len(summaries) < 1: 58 | pywikibot.info('No replacements worth saving') 59 | return 60 | pywikibot.showDiff(old_text, page.text) 61 | # todo: method 62 | callback = lambda _, exc: [cb() for cb in callbacks if not exc] 63 | # todo: put_current 64 | self._save_page(page, page.save, callback=callback, 65 | summary='; '.join(summaries)) 66 | 67 | def applyFixes(self, page, summaries=[]): 68 | callbacks = [] 69 | for fix in self.fixes: 70 | fix.apply(page, summaries, callbacks) 71 | return callbacks 72 | 73 | def userPut(self, page, oldtext, newtext, **kwargs): 74 | if oldtext.rstrip() == newtext.rstrip(): 75 | pywikibot.info( 76 | f'No changes were needed on {page.title(as_link=True)}') 77 | return 78 | 79 | self.current_page = page 80 | 81 | show_diff = kwargs.pop('show_diff', not self.opt['always']) 82 | 83 | if show_diff: 84 | pywikibot.showDiff(oldtext, newtext) 85 | 86 | if 'summary' in kwargs: 87 | pywikibot.info(f"Edit summary: {kwargs['summary']}") 88 | 89 | page.text = newtext 90 | return self._save_page(page, self.fix_wikitext, page, **kwargs) 91 | 92 | def fix_wikitext(self, page, *args, **kwargs): 93 | summaries = [kwargs['summary']] 94 | callbacks = self.applyFixes(page, summaries) 95 | 96 | kwargs['summary'] = '; '.join(summaries) 97 | # todo: method 98 | kwargs['callback'] = lambda _, exc: [cb() for cb in callbacks 99 | if not exc] 100 | page.save(*args, **kwargs) 101 | 102 | 103 | def main(*args): 104 | options = {} 105 | local_args = pywikibot.handle_args(args) 106 | genFactory = pagegenerators.GeneratorFactory() 107 | for arg in genFactory.handle_args(local_args): 108 | if arg.startswith('-'): 109 | arg, sep, value = arg.partition(':') 110 | if value != '': 111 | options[arg[1:]] = value if not value.isdigit() else int(value) 112 | else: 113 | options[arg[1:]] = True 114 | 115 | generator = genFactory.getCombinedGenerator(preload=True) 116 | bot = WikitextFixingBot(generator=generator, **options) 117 | bot.run() 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /nounit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """This script is obsolete!""" 3 | import pywikibot 4 | 5 | from pywikibot import pagegenerators 6 | 7 | from query_store import QueryStore 8 | from wikidata import WikidataEntityBot 9 | 10 | 11 | class UnitsFixingBot(WikidataEntityBot): 12 | 13 | good_item = 'Q21027105' 14 | use_from_page = False 15 | 16 | def __init__(self, **kwargs): 
17 | super().__init__(**kwargs) 18 | self.store = QueryStore() 19 | 20 | @property 21 | def generator(self): 22 | query = self.store.build_query('units', good=self.good_item) 23 | return pagegenerators.PreloadingEntityGenerator( 24 | pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo)) 25 | 26 | def filterProperty(self, prop_page): 27 | if prop_page.type != 'quantity': 28 | return False 29 | prop_page.get() 30 | if 'P2237' not in prop_page.claims: 31 | return False 32 | for claim in prop_page.claims['P2237']: 33 | if claim.snaktype == 'novalue': 34 | continue 35 | if (claim.snaktype == 'value' and 36 | claim.target_equals(self.good_item)): 37 | continue 38 | return False 39 | return True 40 | 41 | def treat_page_and_item(self, page, item): 42 | for prop, claims in item.claims.items(): 43 | for claim in claims: 44 | if claim.type == 'quantity': 45 | if self.checkProperty(prop): 46 | target = claim.getTarget() 47 | if self.change_target(target): 48 | pywikibot.output('Removing unit for property %s' % prop) 49 | self._save_page( 50 | item, self._save_entity, claim.changeTarget, 51 | target, summary='removing invalid unit, see ' 52 | "[[P:%s#P2237|property's page]]" % prop) 53 | else: 54 | self.bad_cache.add(prop) 55 | 56 | json = claim.toJSON() 57 | changed = False 58 | for qprop, snaks in claim.qualifiers.items(): 59 | if not self.checkProperty(qprop): 60 | continue 61 | new_snaks = snaks.copy() 62 | if self.handle_snaks(new_snaks): 63 | changed = True 64 | json['qualifiers'][qprop] = new_snaks 65 | #pywikibot.output("Removing unit for qualifier %s of %s" % (qprop, prop)) 66 | 67 | for i, source in enumerate(claim.sources): 68 | for ref_prop, snaks in source.items(): 69 | if not self.checkProperty(ref_prop): 70 | continue 71 | new_snaks = snaks.copy() 72 | if self.handle_snaks(new_snaks): 73 | changed = True 74 | json['references'][i]['snaks'][ref_prop] = new_snaks 75 | #pywikibot.output("Removing unit for reference %s of %s" % (ref_prop, prop)) 76 | 77 | if changed is True: 78 | data = {'claims': [json]} 79 | self.user_edit_entity(item, data, summary='removing invalid unit(s)') 80 | 81 | def change_target(self, target): 82 | if target is None or target._unit == '1': 83 | return False 84 | 85 | target._unit = '1' 86 | return True 87 | 88 | def handle_snaks(self, snaks): 89 | changed = False 90 | for snak in snaks: 91 | target = snak.getTarget() 92 | if self.change_target(target): 93 | changed = True 94 | snak.setTarget(target) 95 | return changed 96 | 97 | 98 | def main(*args): 99 | options = {} 100 | for arg in pywikibot.handle_args(args): 101 | if arg.startswith('-'): 102 | arg, sep, value = arg.partition(':') 103 | if value != '': 104 | options[arg[1:]] = value if not value.isdigit() else int(value) 105 | else: 106 | options[arg[1:]] = True 107 | 108 | site = pywikibot.Site('wikidata', 'wikidata') 109 | bot = UnitsFixingBot(site=site, **options) 110 | bot.run() 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /cleanup_redirects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import csv 3 | import re 4 | import urllib 5 | 6 | from operator import methodcaller 7 | from urllib.request import urlopen 8 | 9 | import pywikibot 10 | 11 | from pywikibot.bot import WikidataBot 12 | from pywikibot.exceptions import NoPageError 13 | 14 | from merger import Merger 15 | 16 | 17 | class WikidataRedirectsBot(WikidataBot): 18 | 19 | 
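    # Works through Alphos' redirect-conflict reports on Toolforge (labs_url and
    # sub_directory below) and merges the item of each redirect page with the item of
    # its target (the merge direction is decided by Merger.sort_for_merge), or just
    # moves the sitelink when the target has no item yet.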
labs_url = 'https://tools.wmflabs.org' 20 | sub_directory = 'wikidata-redirects-conflicts-reports/reports' 21 | namespaces = {0, 10, 14} 22 | ignore = {'ignore_save_related_errors': True, 23 | 'ignore_server_errors': True, 24 | } 25 | treat_missing_item = False 26 | use_redirects = True 27 | 28 | def __init__(self, **kwargs): 29 | self.available_options.update({ 30 | 'always': False, 31 | 'date': None, 32 | 'force': False, 33 | 'skip': [], 34 | 'start': None, 35 | 'touch': False, 36 | }) 37 | super().__init__(**kwargs) 38 | 39 | @property 40 | def generator(self): 41 | if not self.opt['date']: 42 | self.options['date'] = pywikibot.input( 43 | 'Enter the date when the reports were created') 44 | 45 | url = f"{self.labs_url}/{self.sub_directory}/{self.opt['date']}/" 46 | response = urlopen(url) 47 | regex = re.compile('href="([^"]+)"') 48 | not_yet = bool(self.opt['start']) 49 | for match in regex.finditer(response.read().decode()): 50 | file_name = match[1] 51 | dbname = file_name.partition('-')[0] 52 | if not_yet: 53 | if dbname == self.opt['start']: 54 | not_yet = False 55 | else: 56 | continue 57 | 58 | if dbname in self.opt['skip']: 59 | continue 60 | 61 | try: 62 | site = pywikibot.site.APISite.fromDBName(dbname) 63 | except ValueError as e: 64 | pywikibot.exception(e) 65 | continue 66 | 67 | pywikibot.info(f"Working on '{dbname}'") 68 | resp = urlopen(url + file_name) 69 | lines = resp.readlines() 70 | if not lines: 71 | continue 72 | lines.pop(0) 73 | f = map(methodcaller('decode', 'utf-8'), lines) 74 | for row in csv.reader(f, delimiter='\t'): 75 | if len(set(row[1:3])) > 1: 76 | continue 77 | if int(row[1]) not in self.namespaces: 78 | continue 79 | if '#' in row[4]: 80 | continue 81 | 82 | yield pywikibot.Page(site, row[3], ns=int(row[1])) 83 | 84 | @property 85 | def summary(self): 86 | return (f"based on [[toollabs:{self.sub_directory}/{self.opt['date']}/" 87 | "|Alphos' reports]]") 88 | 89 | def user_confirm(self, *args): 90 | return True 91 | 92 | def treat_page_and_item(self, page, item): 93 | items = [item] 94 | 95 | target = page.getRedirectTarget() 96 | try: 97 | items.append(target.data_item()) 98 | target.get() 99 | except NoPageError: 100 | self._save_page(items[0], items[0].setSitelink, target, 101 | **self.ignore) # todo: summary 102 | return 103 | 104 | Merger.sort_for_merge(items, key=['sitelinks', 'id']) 105 | if not self._save_page(items[1], Merger.clean_merge, items[1], items[0], 106 | safe=not self.opt['force'], 107 | ignore_conflicts=['description'], 108 | summary=self.summary, **self.ignore): 109 | return 110 | 111 | if self.opt['touch'] is True: 112 | self._save_page(target, target.touch, **self.ignore) 113 | 114 | 115 | def main(*args): 116 | options = {} 117 | skip = [] 118 | for arg in pywikibot.handle_args(args): 119 | if arg.startswith('-skip:'): 120 | skip.append(arg.partition(':')[2]) 121 | continue 122 | if arg.startswith('-'): 123 | arg, sep, value = arg.partition(':') 124 | if value != '': 125 | options[arg[1:]] = value if not value.isdigit() else int(value) 126 | else: 127 | options[arg[1:]] = True 128 | 129 | bot = WikidataRedirectsBot(skip=skip, **options) 130 | bot.run() 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /fix_qualifiers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """This script is obsolete!""" 3 | import pywikibot 4 | 5 | from pywikibot import pagegenerators 6 | 7 | from 
query_store import QueryStore 8 | from wikidata import WikidataEntityBot 9 | 10 | 11 | class QualifiersFixingBot(WikidataEntityBot): 12 | 13 | blacklist = frozenset(['P143', 'P248', 'P459', 'P518', 'P577', 'P805', 14 | 'P972', 'P1065', 'P1135', 'P1480', 'P1545', 'P1932', 15 | 'P2315', 'P2701', 'P3274', ]) 16 | whitelist = frozenset(['P17', 'P21', 'P39', 'P155', 'P156', 'P281', 'P580', 17 | 'P582', 'P585', 'P669', 'P708', 'P969', 'P1355', 18 | 'P1356', ]) 19 | good_item = 'Q15720608' 20 | use_from_page = False 21 | 22 | def __init__(self, **kwargs): 23 | kwargs.update({ 24 | 'bad_cache': kwargs.get('bad_cache', []) + list(self.blacklist), 25 | 'good_cache': kwargs.get('good_cache', []) + list(self.whitelist), 26 | }) 27 | super().__init__(**kwargs) 28 | self.store = QueryStore() 29 | 30 | def filterProperty(self, prop_page): 31 | if prop_page.type == 'external-id': 32 | return False 33 | 34 | prop_page.get() 35 | if 'P31' not in prop_page.claims: 36 | pywikibot.warning('%s is not classified' % prop_page.getID()) 37 | return False 38 | 39 | for claim in prop_page.claims['P31']: 40 | if claim.target_equals(self.good_item): 41 | return True 42 | 43 | return False 44 | 45 | @property 46 | def generator(self): 47 | query = self.store.build_query( 48 | 'qualifiers', item=self.good_item, 49 | good=', wd:'.join(self.whitelist), 50 | bad=', wd:'.join(self.blacklist)) 51 | return pagegenerators.PreloadingItemGenerator( 52 | pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo)) 53 | 54 | def treat_page_and_item(self, page, item): 55 | for prop in item.claims.keys(): 56 | for claim in item.claims[prop]: 57 | moved = set() 58 | json = claim.toJSON() 59 | i = -1 60 | for source in claim.sources: 61 | i += 1 62 | for ref_prop in filter(self.checkProperty, source.keys()): 63 | for snak in source[ref_prop]: 64 | json.setdefault('qualifiers', {}).setdefault(ref_prop, []) 65 | for qual in (pywikibot.Claim.qualifierFromJSON(self.repo, q) 66 | for q in json['qualifiers'][ref_prop]): 67 | if qual.target_equals(snak.getTarget()): 68 | break 69 | else: 70 | snak.isReference = False 71 | snak.isQualifier = True 72 | json['qualifiers'][ref_prop].append(snak.toJSON()) 73 | json['references'][i]['snaks'][ref_prop].pop(0) 74 | if len(json['references'][i]['snaks'][ref_prop]) == 0: 75 | json['references'][i]['snaks'].pop(ref_prop) 76 | if len(json['references'][i]['snaks']) == 0: 77 | json['references'].pop(i) 78 | i -= 1 79 | moved.add(ref_prop) 80 | 81 | if len(moved) > 0: 82 | data = {'claims': [json]} 83 | self.user_edit_entity(item, data, summary=self.makeSummary(prop, moved), 84 | asynchronous=True) 85 | 86 | def makeSummary(self, prop, props): 87 | props = ['[[Property:P%s]]' % pid for pid in sorted( 88 | int(pid[1:]) for pid in props)] 89 | return '[[Property:%s]]: moving misplaced reference%s %s to qualifiers' % ( 90 | prop, 's' if len(props) > 1 else '', '%s and %s' % ( 91 | ', '.join(props[:-1]), props[-1]) if len(props) > 1 else props[0]) 92 | 93 | 94 | def main(*args): 95 | options = {} 96 | for arg in pywikibot.handle_args(args): 97 | if arg.startswith('-'): 98 | arg, sep, value = arg.partition(':') 99 | if value != '': 100 | options[arg[1:]] = value if not value.isdigit() else int(value) 101 | else: 102 | options[arg[1:]] = True 103 | 104 | site = pywikibot.Site('wikidata', 'wikidata') 105 | bot = QualifiersFixingBot(site=site, **options) 106 | bot.run() 107 | 108 | 109 | if __name__ == '__main__': 110 | main() 111 | 
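For orientation, a minimal sketch of driving the bot above from Python instead of the command line; it mirrors what `main()` in fix_qualifiers.py does. The import path is an assumption about how the script is deployed, and `always=True` is just the standard pywikibot option for saving without prompting.
```
import pywikibot

# Hypothetical import path; adjust to wherever fix_qualifiers.py is installed.
from fix_qualifiers import QualifiersFixingBot

# Same wiring as main() above: a Wikidata site plus keyword options.
site = pywikibot.Site('wikidata', 'wikidata')
bot = QualifiersFixingBot(site=site, always=True)
bot.run()
```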
-------------------------------------------------------------------------------- /captiontoimage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | 6 | from query_store import QueryStore 7 | from wikidata import WikidataEntityBot 8 | 9 | 10 | class CaptionToImageBot(WikidataEntityBot): 11 | 12 | ''' 13 | Bot re-adding file captions as qualifiers to the files on Wikidata 14 | 15 | Supported parameters: 16 | * -removeall - if a caption cannot be reused, remove it as well 17 | ''' 18 | 19 | caption_property = 'P2096' 20 | image_property = 'P18' 21 | use_from_page = False 22 | 23 | def __init__(self, generator, **kwargs): 24 | self.available_options.update({ 25 | 'removeall': False 26 | }) 27 | kwargs.setdefault('bad_cache', []).append(self.caption_property) 28 | super().__init__(**kwargs) 29 | self.store = QueryStore() 30 | self._generator = generator or self.custom_generator() 31 | 32 | def custom_generator(self): 33 | query = self.store.build_query('captions', prop=self.caption_property) 34 | return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo) 35 | 36 | @property 37 | def generator(self): 38 | return pagegenerators.PreloadingEntityGenerator(self._generator) 39 | 40 | def filterProperty(self, prop_page): 41 | return prop_page.type == 'commonsMedia' 42 | 43 | def skip_page(self, item): 44 | return super().skip_page(item) or ( 45 | self.caption_property not in item.claims) 46 | 47 | def _save_entity(self, func, *args, **kwargs): 48 | # fixme upstream 49 | if 'asynchronous' in kwargs: 50 | kwargs.pop('asynchronous') 51 | return func(*args, **kwargs) 52 | 53 | def treat_page_and_item(self, page, item): 54 | our_prop = self.image_property 55 | if our_prop not in item.claims: 56 | our_prop = None 57 | for prop in item.claims: 58 | if self.checkProperty(prop): 59 | if our_prop is None: 60 | our_prop = prop 61 | else: 62 | pywikibot.info('More than one media property used') 63 | return 64 | 65 | remove_claims = [] 66 | remove_all = self.opt['removeall'] is True 67 | if our_prop is None: 68 | pywikibot.info('No media property found') 69 | if remove_all: 70 | remove_claims.extend(item.claims[self.caption_property]) 71 | self._save_page(item, self._save_entity, item.removeClaims, 72 | remove_claims, summary='removing redundant property') 73 | return 74 | 75 | media_claim = item.claims[our_prop][0] 76 | if len(item.claims[our_prop]) > 1: 77 | pywikibot.info(f'Property {our_prop} has more than one value') 78 | return 79 | 80 | for caption in item.claims[self.caption_property]: 81 | if self.caption_property in media_claim.qualifiers: 82 | language = caption.getTarget().language 83 | has_same_lang = any( 84 | claim.getTarget().language == language 85 | for claim in media_claim.qualifiers[self.caption_property]) 86 | if has_same_lang: 87 | pywikibot.info(f'Property {our_prop} already has ' 88 | f'a caption in language {language}') 89 | if remove_all: 90 | remove_claims.append(caption) 91 | continue 92 | 93 | qualifier = caption.copy() 94 | qualifier.isQualifier = True 95 | if self._save_page(item, self._save_entity, media_claim.addQualifier, 96 | qualifier): 97 | remove_claims.append(caption) 98 | 99 | if remove_claims: 100 | self._save_page(item, self._save_entity, item.removeClaims, 101 | remove_claims, summary='removing redundant property') 102 | 103 | 104 | def main(*args): 105 | options = {} 106 | local_args = pywikibot.handle_args(args) 107 | site = 
pywikibot.Site() 108 | genFactory = pagegenerators.GeneratorFactory(site=site) 109 | for arg in genFactory.handle_args(local_args): 110 | if arg.startswith('-'): 111 | arg, sep, value = arg.partition(':') 112 | if value != '': 113 | options[arg[1:]] = value if not value.isdigit() else int(value) 114 | else: 115 | options[arg[1:]] = True 116 | 117 | generator = genFactory.getCombinedGenerator() 118 | bot = CaptionToImageBot(generator=generator, site=site, **options) 119 | bot.run() 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /check_disambigs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | from pywikibot.exceptions import NoPageError 6 | 7 | from error_reporting import ErrorReportingBot 8 | from wikidata import WikidataEntityBot 9 | 10 | 11 | class DisambigsCheckingBot(WikidataEntityBot, ErrorReportingBot): 12 | 13 | disambig_items = {'Q4167410', 'Q22808320', 'Q61996773'} 14 | file_name = 'log_disambigs.txt' 15 | page_pattern = 'User:%s/Disambig_errors' 16 | skip = { 17 | 'brwiki', 18 | 'enwiki', 19 | 'hakwiki', 20 | 'igwiki', 21 | 'mkwiki', 22 | 'mznwiki', 23 | 'specieswiki', 24 | 'towiki', 25 | } 26 | use_from_page = False 27 | 28 | def __init__(self, generator=None, **kwargs): 29 | self.available_options.update({ 30 | 'limit': 1000, 31 | 'min_sitelinks': 1, 32 | 'offset': 0, 33 | #'only': None, todo 34 | }) 35 | super().__init__(**kwargs) 36 | self.generator = pagegenerators.PreloadingEntityGenerator( 37 | generator or self.custom_generator() 38 | ) 39 | 40 | def skip_page(self, item): 41 | return super().skip_page(item) or ( 42 | item.title(as_link=True, insite=self.repo) in self.log_page.text 43 | or not self.is_disambig(item)) 44 | 45 | def is_disambig(self, item): 46 | for claim in item.claims.get('P31', []): 47 | if any(claim.target_equals(cls) for cls in self.disambig_items): 48 | return True 49 | return False 50 | 51 | def custom_generator(self): 52 | # todo: move to store 53 | QUERY = '''SELECT ?item WITH { 54 | SELECT DISTINCT ?item { 55 | ?item wdt:P31 wd:%s; wikibase:sitelinks ?links . 56 | FILTER( ?links >= %i ) . 57 | MINUS { ?item wdt:P31 wd:Q101352 } . 58 | } OFFSET %i LIMIT %i 59 | } AS %%disambig WHERE { 60 | INCLUDE %%disambig . 61 | BIND( MD5( CONCAT( STR( ?item ), STR( RAND() ) ) ) AS ?hash ) . 
62 | } ORDER BY ?hash''' % (self.disambig_item, self.opt['min_sitelinks'], 63 | self.opt['offset'], self.opt['limit']) 64 | 65 | return pagegenerators.WikidataSPARQLPageGenerator( 66 | QUERY, site=self.repo, result_type=list) 67 | 68 | def treat_page_and_item(self, page, item): 69 | append_text = '' 70 | count = len(item.sitelinks) 71 | if count == 0: 72 | append_text += '\n** no sitelinks' 73 | for dbname in item.sitelinks: 74 | if dbname in self.skip: 75 | continue 76 | page = pywikibot.Page(item.sitelinks[dbname]) 77 | if not page.exists(): 78 | append_text += "\n** {} – {} – doesn't exist".format( 79 | dbname, page.title(as_link=True, insite=self.repo)) 80 | continue 81 | if page.isRedirectPage(): 82 | target = page.getRedirectTarget() 83 | try: 84 | target_item = target.data_item() 85 | except NoPageError: 86 | link = "''no item''" 87 | else: 88 | link = target_item.title(as_link=True, insite=self.repo) 89 | if not target.isDisambig(): 90 | link += ', not a disambiguation' 91 | append_text += '\n** {} – {} – redirects to {} ({})'.format( 92 | dbname, page.title(as_link=True, insite=self.repo), 93 | target.title(as_link=True, insite=self.repo), link) 94 | continue 95 | if not page.isDisambig(): 96 | append_text += '\n** {} – {} – not a disambiguation'.format( 97 | dbname, page.title(as_link=True, insite=self.repo)) 98 | 99 | if append_text: 100 | prep = '\n* %s' % item.title(as_link=True, insite=self.repo) 101 | if count > 0: 102 | prep += f' ({count} sitelink' + ('s' if count > 1 else '') + ')' 103 | append_text = prep + append_text 104 | self.append(append_text) 105 | 106 | 107 | def main(*args): 108 | options = {} 109 | local_args = pywikibot.handle_args(args) 110 | site = pywikibot.Site() 111 | genFactory = pagegenerators.GeneratorFactory(site=site) 112 | for arg in genFactory.handle_args(local_args): 113 | if arg.startswith('-'): 114 | arg, sep, value = arg.partition(':') 115 | if value != '': 116 | options[arg[1:]] = value if not value.isdigit() else int(value) 117 | else: 118 | options[arg[1:]] = True 119 | 120 | generator = genFactory.getCombinedGenerator() 121 | 122 | bot = DisambigsCheckingBot(site=site, generator=generator, **options) 123 | bot.run() 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | -------------------------------------------------------------------------------- /cswiki/heritage_lists_diff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import math 3 | from collections import defaultdict 4 | 5 | import mwparserfromhell 6 | import pywikibot 7 | 8 | from pywikibot import Coordinate, pagegenerators 9 | from pywikibot.textlib import removeDisabledParts 10 | from pywikibot.data.sparql import * 11 | from tqdm import tqdm 12 | 13 | from tools import get_best_statements 14 | 15 | 16 | def tidy(value) -> str: 17 | return removeDisabledParts(str(value), site=site).strip() 18 | 19 | 20 | def distance(coord1: Coordinate, coord2: Coordinate): 21 | lat1, lon1 = coord1.lat, coord1.lon 22 | lat2, lon2 = coord2.lat, coord2.lon 23 | radius = 6372.795 24 | 25 | cosValue = \ 26 | math.sin(math.radians(lat1)) * math.sin(math.radians(lat2)) \ 27 | + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.cos(math.radians(lon2 - lon1)) 28 | 29 | if cosValue > 1: 30 | return 0 31 | elif cosValue < -1: 32 | return radius * math.pi 33 | else: 34 | return radius * math.acos(cosValue) 35 | 36 | 37 | args = pywikibot.handle_args() 38 | 39 | site = pywikibot.Site('cs', 'wikipedia') 40 | repo = 
site.data_repository() 41 | image_repo = site.image_repository() 42 | 43 | genFactory = pagegenerators.GeneratorFactory(site=site) 44 | genFactory.handle_arg('-ns:0') 45 | genFactory.handle_args(args) 46 | generator = genFactory.getCombinedGenerator(preload=True) 47 | if not generator: 48 | genFactory.handle_arg('-ref:Template:Památky v Česku') 49 | generator = genFactory.getCombinedGenerator(preload=True) 50 | 51 | ignore_images = {'Noimage 2-1.png'} 52 | 53 | pywikibot.info('Loading all identifiers...') 54 | 55 | query = 'SELECT * WHERE { ?item wdt:P762 ?id }' 56 | obj = SparqlQuery(repo=repo) 57 | result = obj.select(query, full_data=True) 58 | id_to_items = defaultdict(set) 59 | for entry in result: 60 | item = entry['item'].getID() 61 | id_ = entry['id'].value 62 | id_to_items[id_].add(item) 63 | del result 64 | 65 | entries = [] 66 | 67 | for page in tqdm(generator): 68 | code = mwparserfromhell.parse(page.text) 69 | for template in code.ifilter_templates( 70 | matches=lambda t: t.name.matches('Památky v Česku')): 71 | item = None 72 | id_ = None 73 | if template.has('Wikidata', ignore_empty=True): 74 | linked_item = tidy(template.get('Wikidata').value) 75 | else: 76 | linked_item = None 77 | 78 | if not linked_item and template.has('Id_objektu', ignore_empty=True): 79 | id_ = tidy(template.get('Id_objektu').value) 80 | items = id_to_items[id_] 81 | if len(items) == 1: 82 | item_id = items.pop() 83 | item = pywikibot.ItemPage(repo, item_id) 84 | items.add(item_id) 85 | elif linked_item: 86 | item = pywikibot.ItemPage(repo, linked_item) 87 | 88 | if not item: 89 | continue 90 | 91 | item.get(get_redirect=True) 92 | while item.isRedirectPage(): 93 | item = item.getRedirectTarget() 94 | item.get(get_redirect=True) 95 | 96 | if template.has('Zeměpisná_šířka', ignore_empty=True) \ 97 | or template.has('Zeměpisná_délka', ignore_empty=True): 98 | best = get_best_statements(item.claims.get('P625', [])) 99 | if best and best[0].getTarget(): 100 | coord_wd = best[0].getTarget() 101 | coord_list = Coordinate( 102 | lat=float(str(template.get('Zeměpisná_šířka').value)), 103 | lon=float(str(template.get('Zeměpisná_délka').value)), 104 | site=repo) 105 | dist = distance(coord_list, coord_wd) 106 | if dist > 0.05: 107 | entries.append(( 108 | page.title(), 109 | item.getID(), 110 | coord_list, 111 | coord_wd, 112 | dist, 113 | )) 114 | 115 | entries.sort(key=lambda t: t[-1], reverse=True) 116 | 117 | text = '{| class="wikitable"' 118 | text += '\n! Seznam' 119 | text += ' !! Položka na WD' 120 | text += ' !! Souřadnice v seznamu' 121 | text += ' !! Souřadnice na WD' 122 | text += ' !! 
Vzdálenost [km]' 123 | for title, item_id, coord_list, coord_wd, dist in entries: 124 | text += '\n|-' 125 | text += f"\n| [[{title}|{title.removeprefix('Seznam kulturních památek ')}]]" 126 | text += f'\n| [[d:{item_id}|{item_id}]]' 127 | text += '\n| {{Souřadnice|%f|%f}}' % (coord_list.lat, coord_list.lon) 128 | text += '\n| {{Souřadnice|%f|%f}}' % (coord_wd.lat, coord_wd.lon) 129 | text += f'\n| {dist:.4f}' 130 | text += '\n|}' 131 | 132 | out_page = pywikibot.Page(site, 'Matěj Suchánek/Reports/Souřadnice', ns=2) 133 | out_page.text = text 134 | out_page.save(summary='seznam', bot=False, minor=False) 135 | -------------------------------------------------------------------------------- /split_names_and_titles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | import pywikibot 5 | 6 | from pywikibot import pagegenerators, textlib 7 | from pywikibot.tools import first_upper 8 | from pywikibot.textlib import mwparserfromhell 9 | 10 | try: 11 | from wikitext import WikitextFixingBot 12 | except ImportError: 13 | from pywikibot.bot import SingleSiteBot, ExistingPageBot 14 | 15 | class WikitextFixingBot(SingleSiteBot, ExistingPageBot): 16 | use_redirects = False 17 | 18 | 19 | class TitlesMovingBot(WikitextFixingBot): 20 | 21 | param = 'jméno' 22 | param_before = 'titul před' 23 | param_after = 'titul za' 24 | 25 | summary = 'přesun titulů do vlastních parametrů' 26 | 27 | def __init__(self, template, offset=0, **kwargs): 28 | self.template = self.normalize(template) 29 | self.start_offset = offset 30 | self.offset = 0 31 | super().__init__(**kwargs) 32 | 33 | def normalize(self, template): 34 | return first_upper(template 35 | .partition(' {target}') 64 | backlinks = item.backlinks(follow_redirects=False, 65 | filter_redirects=None, 66 | namespaces=[0, 120]) 67 | summary = self.summary.format( 68 | item.title(with_ns=True), target.title(with_ns=True)) 69 | if self.opt.editgroups: 70 | summary += f' ({self.new_editgroups_summary()})' 71 | if target != item.getRedirectTarget(): 72 | item.set_redirect_target(target, summary=summary) 73 | for entity in PreloadingEntityGenerator(backlinks): 74 | if entity == target: 75 | continue 76 | if entity.isRedirectPage(): 77 | entity.set_redirect_target(target, summary=summary) 78 | continue 79 | callbacks = [] 80 | update = [] 81 | for claim in chain.from_iterable(entity.claims.values()): 82 | changed = False 83 | if self.update_snak(claim, item, target): 84 | changed = True 85 | callbacks.append(self._make_callback( 86 | claim.changeTarget, claim.target, summary=summary)) 87 | for snak in chain.from_iterable(claim.qualifiers.values()): 88 | if self.update_snak(snak, item, target): 89 | changed = True 90 | callbacks.append(self._make_callback( 91 | claim.repo.editQualifier, claim, snak, 92 | summary=summary)) 93 | for source in claim.sources: 94 | source_changed = False 95 | snaks = list(chain.from_iterable(source.values())) 96 | for snak in snaks: 97 | if self.update_snak(snak, item, target): 98 | source_changed = True 99 | if source_changed: 100 | changed = True 101 | callbacks.append(self._make_callback( 102 | claim.repo.editSource, claim, snaks, 103 | summary=summary)) 104 | if changed: 105 | update.append(claim) 106 | if len(callbacks) > 1: 107 | data = {'claims': [c.toJSON() for c in update]} 108 | self.user_edit_entity( 109 | entity, data, cleanup=False, summary=summary) 110 | elif len(callbacks) == 1: 111 | callbacks[0]() 112 | 113 | 114 | def main(*args): 115 | options 
= {} 116 | local_args = pywikibot.handle_args(args) 117 | site = pywikibot.Site() 118 | genFactory = GeneratorFactory(site=site) 119 | for arg in genFactory.handle_args(local_args): 120 | if arg.startswith('-'): 121 | arg, sep, value = arg.partition(':') 122 | if value != '': 123 | options[arg[1:]] = value if not value.isdigit() else int(value) 124 | else: 125 | options[arg[1:]] = True 126 | 127 | generator = genFactory.getCombinedGenerator() 128 | bot = WikidataRedirectsFixingBot(generator=generator, site=site, **options) 129 | bot.run() 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /cswiki/sync_heritage_lists.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from collections import defaultdict 3 | 4 | import mwparserfromhell 5 | import pywikibot 6 | 7 | from pywikibot import pagegenerators 8 | from pywikibot.textlib import removeDisabledParts 9 | from pywikibot.data.sparql import * 10 | 11 | from tools import get_best_statements 12 | 13 | 14 | def get_sources(page): 15 | wiki = pywikibot.Claim(repo, 'P143', is_reference=True) 16 | wiki.setTarget(pywikibot.ItemPage(repo, 'Q191168')) 17 | url = pywikibot.Claim(repo, 'P4656', is_reference=True) 18 | url.setTarget('https:' + page.permalink()) 19 | return [wiki, url] 20 | 21 | 22 | def tidy(value) -> str: 23 | return removeDisabledParts(str(value), site=site).strip() 24 | 25 | 26 | args = pywikibot.handle_args() 27 | 28 | site = pywikibot.Site('cs', 'wikipedia') 29 | repo = site.data_repository() 30 | image_repo = site.image_repository() 31 | 32 | genFactory = pagegenerators.GeneratorFactory(site=site) 33 | genFactory.handle_arg('-ns:0') 34 | genFactory.handle_args(args) 35 | generator = genFactory.getCombinedGenerator(preload=True) 36 | if not generator: 37 | genFactory.handle_arg('-ref:Template:Památky v Česku') 38 | generator = genFactory.getCombinedGenerator(preload=True) 39 | 40 | ignore_images = {'Noimage 2-1.png'} 41 | 42 | pywikibot.info('Loading all identifiers...') 43 | 44 | query = 'SELECT * WHERE { ?item wdt:P762 ?id }' 45 | obj = SparqlQuery(repo=repo) 46 | result = obj.select(query, full_data=True) 47 | #item_to_ids = defaultdict(set) 48 | id_to_items = defaultdict(set) 49 | for entry in result: 50 | item = entry['item'].getID() 51 | id_ = entry['id'].value 52 | #item_to_ids[item].add(id_) 53 | id_to_items[id_].add(item) 54 | del result 55 | 56 | for page in generator: 57 | pywikibot.info(page) 58 | code = mwparserfromhell.parse(page.text) 59 | change = False 60 | for template in code.ifilter_templates( 61 | matches=lambda t: t.name.matches('Památky v Česku')): 62 | item = None 63 | if template.has('Wikidata', ignore_empty=True): 64 | linked_item = tidy(template.get('Wikidata').value) 65 | else: 66 | linked_item = None 67 | if not linked_item and template.has('Id_objektu', ignore_empty=True): 68 | id_ = tidy(template.get('Id_objektu').value) 69 | items = id_to_items[id_] 70 | if len(items) == 1: 71 | item_id = items.pop() 72 | item = pywikibot.ItemPage(repo, item_id) 73 | items.add(item_id) 74 | elif linked_item: 75 | item = pywikibot.ItemPage(repo, linked_item) 76 | if not item: 77 | continue 78 | 79 | item.get(get_redirect=True) 80 | while item.isRedirectPage(): 81 | item = item.getRedirectTarget() 82 | item.get(get_redirect=True) 83 | 84 | if item.exists(): 85 | if item.getID() != linked_item: 86 | template.add('Wikidata', item.getID()) 87 | change = True 88 | ## else: 
89 | ## template.add('Wikidata', '') 90 | ## change = change or bool(linked_item) 91 | ## item = None 92 | 93 | if item and not template.has('Commons', ignore_empty=True): 94 | ccat = None 95 | best = get_best_statements(item.claims.get('P373', [])) 96 | if best: 97 | ccat = best[0].getTarget() 98 | if not ccat: 99 | link = item.sitelinks.get('commonswiki') 100 | if link and link.namespace == 14: 101 | ccat = link.title 102 | if ccat: 103 | template.add('Commons', ccat) 104 | change = True 105 | del best 106 | 107 | if item and not template.has('Článek', ignore_empty=True): 108 | article = item.sitelinks.get('cswiki') 109 | if article: 110 | template.add('Článek', article.ns_title()) 111 | change = True 112 | 113 | if item and not ( 114 | template.has('Zeměpisná_šířka', ignore_empty=True) 115 | and template.has('Zeměpisná_délka', ignore_empty=True) 116 | ): 117 | coord = None 118 | best = get_best_statements(item.claims.get('P625', [])) 119 | if best: 120 | coord = best[0].getTarget() 121 | if coord: 122 | template.add('Zeměpisná_šířka', str(coord.lat)) 123 | template.add('Zeměpisná_délka', str(coord.lon)) 124 | change = True 125 | del best 126 | 127 | if item and template.has('Obrázek', ignore_empty=True): 128 | image = pywikibot.FilePage( 129 | image_repo, tidy(template.get('Obrázek').value)) 130 | if ( 131 | image.exists() and not image.isRedirectPage() 132 | and image.title(with_ns=False) not in ignore_images 133 | and not item.claims.get('P18') 134 | ): 135 | # todo: check unique 136 | claim = pywikibot.Claim(repo, 'P18') 137 | claim.setTarget(image) 138 | claim.addSources(get_sources(page)) 139 | item.addClaim(claim, asynchronous=True) 140 | 141 | if change: 142 | page.text = str(code) 143 | page.save(summary='synchronizace s údaji na Wikidatech', 144 | asynchronous=True) 145 | -------------------------------------------------------------------------------- /cswiki/sync_tree_lists.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | import mwparserfromhell 5 | import pywikibot 6 | 7 | from pywikibot import pagegenerators 8 | from pywikibot.textlib import FILE_LINK_REGEX 9 | from pywikibot.tools import first_upper 10 | 11 | 12 | def get_sources(page): 13 | wiki = pywikibot.Claim(repo, 'P143', is_reference=True) 14 | wiki.setTarget(pywikibot.ItemPage(repo, 'Q191168')) 15 | url = pywikibot.Claim(repo, 'P4656', is_reference=True) 16 | url.setTarget('https:' + page.permalink()) 17 | return [wiki, url] 18 | 19 | 20 | args = pywikibot.handle_args() 21 | 22 | site = pywikibot.Site('cs', 'wikipedia') 23 | repo = site.data_repository() 24 | image_repo = site.image_repository() 25 | 26 | genFactory = pagegenerators.GeneratorFactory(site=site) 27 | genFactory.handle_arg('-ns:0') 28 | genFactory.handle_args(args) 29 | generator = genFactory.getCombinedGenerator(preload=True) 30 | if not generator: 31 | genFactory.handle_arg('-cat:Seznamy památných stromů v Česku podle okresů') 32 | generator = genFactory.getCombinedGenerator(preload=True) 33 | 34 | ignore_images = {'Noimage 2-1.png'} 35 | 36 | # todo: cache all in a single query 37 | query = '''SELECT DISTINCT ?item { 38 | { ?item wdt:P3296 "%s" } UNION { ?item wdt:P677 "%s" } 39 | } LIMIT 2''' 40 | 41 | titleR = re.compile(r'(\s*)([^[|\]<>]+?)((?: *†| *\(x\))?\s*)') 42 | fileR = re.compile(FILE_LINK_REGEX % '|'.join(site.namespaces[6]), re.VERBOSE) 43 | 44 | for page in generator: 45 | pywikibot.info(page) 46 | code = mwparserfromhell.parse(page.text) 47 | change = False 
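    # The table loop below works per wikitable: it reads the header row to
    # find the "název" (name), "obrázek" (image) and "kód" (code) columns,
    # resolves the Wikidata item for each row from the {{Pstrom}} template
    # parameters via the P3296/P677 query above, writes the item ID into the
    # template's third parameter, turns the name cell into a wikilink to the
    # local article, and adds the row's image as P18 when the item lacks one.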
48 | for table in code.ifilter_tags(matches=lambda t: t.tag == 'table'): 49 | rows = table.contents.ifilter_tags(matches=lambda t: t.tag == 'tr') 50 | first = next(rows) 51 | index = dict.fromkeys(('název', 'obrázek', 'kód'), None) 52 | for i, cell in enumerate(first.contents.ifilter_tags( 53 | matches=lambda t: t.tag == 'th')): 54 | for key, value in index.items(): 55 | if value is None and key in str(cell.contents).lower(): 56 | index[key] = i 57 | break 58 | 59 | for key, value in index.items(): 60 | if value is None: 61 | pywikibot.info(f"Couldn't determine column for '{key}'") 62 | if index['kód'] is None: 63 | continue 64 | 65 | for row in rows: 66 | cells = row.contents.filter_tags(matches=lambda t: t.tag == 'td') 67 | if not cells: 68 | continue 69 | code_cell = cells[index['kód']] 70 | templates = code_cell.contents.filter_templates( 71 | matches=lambda t: t.name.matches('Pstrom')) 72 | if len(templates) != 1: 73 | continue 74 | template = templates[0] 75 | params = [] 76 | for i in (1, 2, 3): 77 | if template.has_param(i, ignore_empty=True): 78 | params.append(str(template.get(i)).strip()) 79 | else: 80 | params.append('') 81 | items = list(pagegenerators.WikidataSPARQLPageGenerator( 82 | query % tuple(params[:2]), site=repo)) 83 | if len(items) != 1: 84 | pywikibot.info( 85 | f"Couldn't determine the item for values " 86 | f'{params[0]}/{params[1]} ({len(items)} items)') 87 | continue 88 | 89 | item = items.pop() 90 | if params[2] != item.getID(): # 3rd param is index 2 91 | template.add(3, item.getID()) 92 | change = True 93 | 94 | if index['název'] is not None: 95 | title_cell = cells[index['název']] 96 | nodes = title_cell.contents.nodes 97 | # fixme: ignore   98 | #wikilinks = title_cell.contents.filter_wikilinks() 99 | #if not wikilinks: 100 | if len(nodes) == 1: 101 | match = titleR.fullmatch(str(nodes[0])) 102 | link = item.sitelinks.get(page.site) 103 | if link and match: 104 | groups = match.groups() 105 | if first_upper(groups[1]) == link.title: 106 | new = '{}[[{}]]{}'.format(*groups) 107 | else: 108 | new = '{1}[[{0}|{2}]]{3}'.format( 109 | link.title, *groups) 110 | title_cell.contents.replace(nodes[0], new) 111 | change = True 112 | 113 | if index['obrázek'] is not None: 114 | match = fileR.search(str(cells[index['obrázek']])) 115 | if match: 116 | image = pywikibot.FilePage(image_repo, match['filename']) 117 | if ( 118 | image.exists() and not image.isRedirectPage() 119 | and image.title(with_ns=False) not in ignore_images 120 | and not item.claims.get('P18') 121 | ): 122 | # todo: check unique 123 | claim = pywikibot.Claim(repo, 'P18') 124 | claim.setTarget(image) 125 | claim.addSources(get_sources(page)) 126 | item.addClaim(claim, asynchronous=True) 127 | 128 | if change: 129 | page.text = str(code) 130 | page.save(summary='doplnění článků a/nebo položek na Wikidatech', 131 | asynchronous=True) 132 | -------------------------------------------------------------------------------- /clean_commonscat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import itertools 3 | import re 4 | 5 | import pywikibot 6 | 7 | from pywikibot import i18n, pagegenerators, textlib 8 | from pywikibot.exceptions import UnknownExtensionError 9 | 10 | from deferred import DeferredCallbacksBot 11 | from wikidata import WikidataEntityBot 12 | from wikitext import WikitextFixingBot 13 | 14 | 15 | save_summary = { 16 | 'cs': 'odstranění odkazu na neexistující kategorii na Commons', 17 | 'en': 'removed link to a non-existing 
Commons category', 18 | } 19 | 20 | 21 | class CommonscatCleaningBot(WikitextFixingBot, WikidataEntityBot, DeferredCallbacksBot): 22 | 23 | def __init__(self, **kwargs): 24 | self.available_options.update({ 25 | 'createnew': False, 26 | 'noclean': False, 27 | 'noimport': False, 28 | }) 29 | super().__init__(**kwargs) 30 | self.commons = pywikibot.Site('commons', 'commons') 31 | 32 | def setup(self): 33 | super().setup() 34 | self.cacheSources() 35 | # todo: l10n etc. 36 | templates = itertools.chain( 37 | map(re.escape, self.site.getmagicwords('defaultsort')), 38 | ('[Pp]ahýl', '[Pp]osloupnost', '[Aa]utoritní data', '[Pp]ortály')) 39 | templates = '|'.join(templates) 40 | ns = '|'.join(self.site.namespaces[14]) 41 | self.empty_sectionR = re.compile( 42 | r'\s*\n==+ *Externí odkazy *==+ *\n\s*' 43 | r'^(==|\{\{(?:%s)|\[\[(?:%s):)' % (templates, ns), 44 | flags=re.M) 45 | 46 | def treat_page(self): # todo: treat_page_and_item 47 | page = self.current_page 48 | item = page.data_item() 49 | if 'P373' in item.claims: 50 | self.addCallback(page.touch) 51 | pywikibot.info('Already has a category on Commons') 52 | return 53 | 54 | cat_name = None 55 | has_param = False 56 | for template, fielddict in page.raw_extracted_templates: 57 | # todo: l10n 58 | if template.lower() in ['commonscat', 'commons category']: 59 | cat_name = page.title(with_ns=False) 60 | value = fielddict.get('1', '').strip() 61 | if value: 62 | has_param = True 63 | cat_name = value 64 | break 65 | 66 | if cat_name is None: 67 | pywikibot.warning('Template not found') 68 | return 69 | 70 | commons_cat = pywikibot.Category(self.commons, cat_name) 71 | exists = commons_cat.exists() 72 | if not exists and not commons_cat.isEmptyCategory(): 73 | if self.opt['createnew'] is not True: 74 | pywikibot.warning(f'{commons_cat.title()} is not empty') 75 | return 76 | 77 | exists = self.doWithCallback( 78 | self.userPut, commons_cat, '', '{{Uncategorized}}', 79 | asynchronous=False) 80 | 81 | if not exists: 82 | if self.opt['noclean'] is True: 83 | pywikibot.info( 84 | "Category doesn't exist on Commons, cleanup restricted") 85 | return 86 | regex = r'(?:\n?|^)(?:\* *)?\{\{ *[Cc]ommons(?:cat|[_ ]?category)' 87 | if has_param: 88 | regex += r' *\| *' + re.escape(cat_name) 89 | regex += r' *\}\}' 90 | page_replaced_text = re.sub( 91 | regex, '', page.text, flags=re.M, count=1) 92 | if page_replaced_text != page.text: 93 | page_replaced_text = self.empty_sectionR.sub( 94 | r'\n\n\1', page_replaced_text, count=1) 95 | 96 | # fixme 97 | self.doWithCallback( 98 | self.put_current, page_replaced_text, 99 | summary=i18n.translate(page.site, save_summary)) 100 | else: 101 | if self.opt['noimport'] is True: 102 | pywikibot.info('Category exists on Commons, import restricted') 103 | return 104 | claim = pywikibot.Claim(self.repo, 'P373') 105 | claim.setTarget(cat_name) 106 | pywikibot.info('Category missing on Wikidata') 107 | self.user_add_claim(item, claim, page.site, asynchronous=True) 108 | self.addCallback(page.touch) 109 | 110 | 111 | def main(*args): 112 | options = {} 113 | local_args = pywikibot.handle_args(args) 114 | genFactory = pagegenerators.GeneratorFactory() 115 | for arg in genFactory.handle_args(local_args): 116 | if arg.startswith('-'): 117 | arg, sep, value = arg.partition(':') 118 | if value != '': 119 | options[arg[1:]] = value if not value.isdigit() else int(value) 120 | else: 121 | options[arg[1:]] = True 122 | 123 | generator = genFactory.getCombinedGenerator(preload=True) 124 | site = pywikibot.Site() 125 | if not generator: 
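        # No page generator was given on the command line: fall back to the
        # category bound to Q11925744 on this wiki (looked up through the
        # repository) and work through its articles and subcategories instead.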
126 | try: 127 | category = site.page_from_repository('Q11925744') 128 | except (NotImplementedError, UnknownExtensionError) as e: 129 | pywikibot.error(e) 130 | return 131 | 132 | if not category: 133 | pywikibot.info(f"{site} doesn't have an appropriate category") 134 | return 135 | 136 | generator = itertools.chain( 137 | category.articles(namespaces=0), 138 | category.subcategories()) 139 | 140 | generator = pagegenerators.WikibaseItemFilterPageGenerator(generator) 141 | bot = CommonscatCleaningBot(generator=generator, site=site, **options) 142 | bot.run() 143 | 144 | 145 | if __name__ == '__main__': 146 | main() 147 | -------------------------------------------------------------------------------- /split_claims.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | 6 | from query_store import QueryStore 7 | from wikidata import WikidataEntityBot 8 | 9 | 10 | class ClaimsSplittingBot(WikidataEntityBot): 11 | 12 | start_prop = 'P580' 13 | end_prop = 'P582' 14 | use_from_page = False 15 | 16 | def __init__(self, generator, **kwargs): 17 | self.available_options.update({ 18 | 'limit': 500, 19 | }) 20 | super().__init__(**kwargs) 21 | self.store = QueryStore() 22 | self._generator = generator or self.custom_generator() 23 | 24 | def custom_generator(self): 25 | query = self.store.build_query( 26 | 'mixed_claims', limit=self.opt['limit']) 27 | return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo) 28 | 29 | @property 30 | def generator(self): 31 | return pagegenerators.PreloadingEntityGenerator(self._generator) 32 | 33 | def has_multiple(self, claim): 34 | return (len(claim.qualifiers.get(self.start_prop, [])) > 1 35 | or len(claim.qualifiers.get(self.end_prop, [])) > 1) 36 | 37 | def can_divide(self, claim): 38 | qualifiers = (claim.qualifiers.get(self.start_prop, []) 39 | + claim.qualifiers.get(self.end_prop, [])) 40 | return (not claim.sources 41 | and set(claim.qualifiers.keys()) == { 42 | self.start_prop, self.end_prop} 43 | and all(qual.snaktype == 'value' for qual in qualifiers)) 44 | 45 | def sort_key(self, claim): 46 | return claim.target.toTimestamp() 47 | #{self.start_prop: 1, self.end_prop: 0}.get(claim.id) 48 | 49 | def get_qualifier_pairs(self, claim): 50 | qualifiers = (claim.qualifiers.get(self.start_prop, []) 51 | + claim.qualifiers.get(self.end_prop, [])) 52 | qualifiers.sort(key=self.sort_key) 53 | pairs = [] 54 | i = 0 55 | any_previous_finished = False 56 | while i < len(qualifiers): 57 | qual = qualifiers[i] 58 | if qual.id == self.start_prop: 59 | next_end = None 60 | if i + 1 < len(qualifiers): 61 | if qualifiers[i+1].id == self.end_prop: 62 | pairs.append( 63 | (qual, qualifiers[i+1]) 64 | ) 65 | i += 2 66 | any_previous_finished = True 67 | continue 68 | elif qualifiers[i+1].id == self.start_prop: 69 | next_end = pywikibot.Claim(self.repo, self.end_prop) 70 | next_end.setSnakType('somevalue') 71 | any_previous_finished = True 72 | pairs.append( 73 | (qual, next_end) 74 | ) 75 | elif qual.id == self.end_prop: 76 | next_start = None 77 | if any_previous_finished: 78 | next_start = pywikibot.Claim(self.repo, self.start_prop) 79 | next_start.setSnakType('somevalue') 80 | pairs.append( 81 | (next_start, qual) 82 | ) 83 | any_previous_finished = True 84 | i += 1 85 | return pairs 86 | 87 | def treat_page_and_item(self, page, item): 88 | to_remove = [] 89 | for claims in item.claims.values(): 90 | for claim in claims: 91 | if 
self.has_multiple(claim) and self.can_divide(claim): 92 | assert not claim.sources # todo 93 | to_remove.append(claim) 94 | pairs = self.get_qualifier_pairs(claim) 95 | for start, end in pairs: 96 | new_claim = pywikibot.Claim(self.repo, claim.id) 97 | if claim.target: 98 | new_claim.setTarget(claim.target) 99 | else: 100 | new_claim.setSnakType(claim.snaktype) 101 | new_claim.setRank(claim.rank) 102 | if start: 103 | start.hash = None 104 | new_claim.addQualifier(start) 105 | if end: 106 | end.hash = None 107 | new_claim.addQualifier(end) 108 | for ref in claim.sources: 109 | sources = [] 110 | for snaks in ref.values(): 111 | sources.extend(snaks) 112 | new_claim.addSources(sources) 113 | if not self.user_add_claim( 114 | item, new_claim, summary='split claim'): 115 | break 116 | if to_remove: 117 | data = {'claims': [ 118 | {'id': cl.toJSON()['id'], 'remove': ''} for cl in to_remove]} 119 | self.user_edit_entity( 120 | item, data, summary='remove splitted claim(s)') 121 | 122 | 123 | def main(*args): 124 | options = {} 125 | local_args = pywikibot.handle_args(args) 126 | site = pywikibot.Site() 127 | genFactory = pagegenerators.GeneratorFactory(site=site) 128 | for arg in genFactory.handle_args(local_args): 129 | if arg.startswith('-'): 130 | arg, sep, value = arg.partition(':') 131 | if value != '': 132 | options[arg[1:]] = int(value) if value.isdigit() else value 133 | else: 134 | options[arg[1:]] = True 135 | 136 | generator = genFactory.getCombinedGenerator() 137 | bot = ClaimsSplittingBot(generator=generator, site=site, **options) 138 | bot.run() 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /import_descriptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | import pywikibot 5 | 6 | from pywikibot import textlib 7 | from pywikibot.pagegenerators import ( 8 | GeneratorFactory, 9 | PreloadingEntityGenerator, 10 | PreloadingGenerator, 11 | SearchPageGenerator, 12 | WikidataSPARQLPageGenerator, 13 | ) 14 | 15 | from query_store import QueryStore 16 | from wikidata import WikidataEntityBot 17 | 18 | 19 | class BaseDescriptionBot(WikidataEntityBot): 20 | 21 | def __init__(self, **kwargs): 22 | self.available_options.update({ 23 | 'min_words': 2, 24 | }) 25 | super().__init__(**kwargs) 26 | self.FORMATTING_REGEX = re.compile("'{5}|'{2,3}") 27 | self.REF_REGEX = re.compile(r'.*?') 28 | 29 | def get_regex_for_title(self, escaped_title): 30 | pattern = fr'^\*+ *\[\[({escaped_title})(?:\|[^][]+)?\]\]' 31 | pattern += r' *(?:\([^)]+\))?' 
32 | pattern += '(?:,| [-–]) *(.*)$' 33 | return re.compile(pattern, re.M) 34 | 35 | @staticmethod 36 | def handle_link(match): 37 | text = match[2] 38 | if text: 39 | return text.lstrip('|').strip() 40 | else: 41 | return match['title'].strip() 42 | 43 | def validate_description(self, desc): 44 | return (bool(desc) and len(desc.split()) >= self.opt['min_words']) 45 | 46 | def parse_description(self, text): 47 | desc = textlib.removeDisabledParts( 48 | text, 49 | ['comment', 'file', 'nowiki', 'template', self.FORMATTING_REGEX, 50 | self.REF_REGEX]) 51 | desc = LINK_REGEX.sub(self.handle_link, desc) 52 | desc = desc.replace(' ', ' ').strip() 53 | desc = re.sub(r' *\([^)]+\)$', '', desc) 54 | desc = desc.partition(';')[0] 55 | desc = re.sub(r'^.*\) [-–] +', '', desc) 56 | desc = re.sub(r'^\([^)]+\) +', '', desc) 57 | while ' ' * 2 in desc: 58 | desc = desc.replace(' ' * 2, ' ') 59 | if re.search(r'[^IVX]\.$', desc) or desc.endswith(tuple(',:')): 60 | desc = desc[:-1].rstrip() 61 | if desc.startswith(('a ', 'an ')): 62 | desc = desc.partition(' ')[2] 63 | return desc 64 | 65 | def get_summary(self, page, desc): 66 | link = page.title(as_link=True, insite=self.repo) 67 | return f'importing [{page.site.lang}] description "{desc}" from {link}' 68 | 69 | 70 | class MissingDescriptionBot(BaseDescriptionBot): 71 | 72 | use_from_page = False 73 | 74 | def __init__(self, **kwargs): 75 | self.available_options.update({ 76 | 'allpages': False, 77 | }) 78 | super().__init__(**kwargs) 79 | self.store = QueryStore() 80 | 81 | @property 82 | def generator(self): 83 | query = self.store.build_query( 84 | 'missing_descriptions', 85 | hostname=self.site.hostname(), 86 | lang=self.site.lang) 87 | return PreloadingEntityGenerator( 88 | WikidataSPARQLPageGenerator(query, site=self.repo)) 89 | 90 | def treat_page_and_item(self, page, item): 91 | if self.site.lang in item.descriptions: 92 | return 93 | title = item.getSitelink(self.site) 94 | link_start = re.escape('[[' + title) 95 | search_query = fr'linksto:"{title}" insource:/\* *{link_start}/' 96 | regex = self.get_regex_for_title(re.escape(title)) 97 | for ref_page in PreloadingGenerator( 98 | SearchPageGenerator(search_query, namespaces=[0])): 99 | # todo: first polish text 100 | match = regex.search(ref_page.text) 101 | if not match: 102 | continue 103 | if not self.opt['allpages'] and not ref_page.isDisambig(): 104 | continue 105 | desc = self.parse_description(match[2]) 106 | if not self.validate_description(desc): 107 | continue 108 | summary = self.get_summary(ref_page, desc) 109 | item.descriptions[self.site.lang] = desc.strip() 110 | if self.user_edit_entity(item, summary=summary): 111 | break 112 | 113 | 114 | class MappingDescriptionBot(BaseDescriptionBot): 115 | 116 | def __init__(self, **kwargs): 117 | super().__init__(**kwargs) 118 | self.regex = self.get_regex_for_title(r'[^\[\|\]]+') 119 | 120 | def get_pages_with_descriptions(self, text): 121 | data = {} 122 | for match in self.regex.finditer(text): 123 | title, desc = match.groups() 124 | page = pywikibot.Page(self.site, title) 125 | data[page] = self.parse_description(desc) 126 | return data 127 | 128 | def treat_page(self): 129 | page = self.current_page 130 | descriptions = self.get_pages_with_descriptions(page.text) 131 | for item in PreloadingEntityGenerator(descriptions.keys()): 132 | if self.site.lang in item.descriptions: 133 | continue 134 | target = pywikibot.Page(item.sitelinks[self.site]) 135 | desc = descriptions.get(target) 136 | if not self.validate_description(desc): 137 | 
continue 138 | summary = self.get_summary(page, desc) 139 | item.descriptions[self.site.lang] = desc.strip() 140 | self.current_page = item 141 | self.user_edit_entity(item, summary=summary) 142 | 143 | 144 | def main(*args): 145 | options = {} 146 | local_args = pywikibot.handle_args(args) 147 | site = pywikibot.Site() 148 | genFactory = GeneratorFactory(site=site) 149 | for arg in genFactory.handle_args(local_args): 150 | if arg.startswith('-'): 151 | arg, sep, value = arg.partition(':') 152 | if value != '': 153 | options[arg[1:]] = int(value) if value.isdigit() else value 154 | else: 155 | options[arg[1:]] = True 156 | 157 | generator = genFactory.getCombinedGenerator(preload=True) 158 | if generator: 159 | bot = MappingDescriptionBot(generator=generator, site=site, **options) 160 | else: 161 | bot = MissingDescriptionBot(site=site, **options) 162 | bot.run() 163 | 164 | 165 | if __name__ == '__main__': 166 | main() 167 | -------------------------------------------------------------------------------- /merger.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | 4 | from operator import attrgetter 5 | 6 | import pywikibot 7 | 8 | from pywikibot.exceptions import APIError, OtherPageSaveError 9 | from pywikibot.data.sparql import SparqlQuery 10 | 11 | 12 | class Merger: 13 | 14 | strategies = { 15 | 'id': '_sort_by_id', 16 | 'claims': '_sort_by_claims', 17 | 'revisions': '_sort_by_revisions', 18 | 'sitelinks': '_sort_by_sitelinks', 19 | } 20 | no_conflict_props = {'P17', 'P21', 'P105', 'P170', 'P171', 'P225', 'P271', 21 | 'P296', 'P495', 'P569', 'P570', 'P734', 'P856'} 22 | no_conflict_trees = { 23 | 'P19': 'P131', 24 | 'P31': 'P279', 25 | 'P131': 'P131', 26 | 'P279': 'P279', 27 | } 28 | no_conflict_types = ['external-id'] 29 | 30 | @classmethod 31 | def merge(cls, item_from, item_to, **kwargs): 32 | try: 33 | item_from.mergeInto(item_to, **kwargs) 34 | except APIError as e: 35 | raise OtherPageSaveError(item_from, e) 36 | 37 | @classmethod 38 | def clean_merge(cls, item_from, item_to, safe=False, quick=True, **kwargs): 39 | kwargs.pop('asynchronous', None) # fixme 40 | if safe and not cls.can_merge(item_from, item_to, quick=quick): 41 | raise OtherPageSaveError( 42 | item_from, f'Cannot merge {item_from} with {item_to}') 43 | 44 | cls.merge(item_from, item_to, **kwargs) 45 | if not item_from.isRedirectPage(): 46 | try: 47 | item_from.editEntity( 48 | {}, clear=True, summary='Clearing item to prepare for redirect') 49 | except APIError as e: 50 | raise OtherPageSaveError(item_from, e) 51 | 52 | cls.merge(item_from, item_to) 53 | 54 | @classmethod 55 | def _conflicts(cls, data1, data2): 56 | set1 = {repr(x.target) for x in data1} # hack 57 | set2 = {repr(x.target) for x in data2} # hack 58 | return not bool(set1 & set2) 59 | 60 | @classmethod 61 | def _has_dtype(cls, dtype, claims): 62 | for cl in claims: 63 | if cl.type == dtype: 64 | return True 65 | return False 66 | 67 | @classmethod 68 | def _same_tree(cls, prop, data1, data2): 69 | sparql = SparqlQuery() # fixme: dependencies 70 | pattern = ('ASK { VALUES ?x1 { wd:%s } . VALUES ?x2 { wd:%s } . 
' 71 | '?x1 wdt:%s* ?x2 }') 72 | item1 = ' wd:'.join(map(attrgetter('target.id'), data1)) 73 | item2 = ' wd:'.join(map(attrgetter('target.id'), data2)) 74 | tries = 3 75 | for ask in (pattern % (item1, item2, prop), 76 | pattern % (item2, item1, prop)): 77 | res = False 78 | while True: 79 | try: 80 | res = sparql.ask(ask) 81 | except requests.exceptions.ConnectionError: 82 | tries -= 1 83 | if tries == 0: 84 | raise 85 | time.sleep(1) 86 | continue 87 | else: 88 | break 89 | if res: 90 | return True 91 | 92 | return False 93 | 94 | @classmethod 95 | def can_merge(cls, item1, item2, quick=True): 96 | props = list(cls.no_conflict_props) 97 | if quick: 98 | props.extend(cls.no_conflict_trees.keys()) 99 | 100 | for prop in props: 101 | item1.get() 102 | data1 = item1.claims.get(prop, []) 103 | if not data1: 104 | continue 105 | item2.get() 106 | data2 = item2.claims.get(prop, []) 107 | if not data2: 108 | continue 109 | if cls._conflicts(data1, data2): 110 | return False 111 | 112 | key = lambda claims: claims[0].id 113 | for dtype in cls.no_conflict_types: 114 | callback = lambda claims: claims[0].type == dtype 115 | item1.get() 116 | keys1 = set(map(key, filter(callback, item1.claims.values()))) 117 | if not keys1: 118 | continue 119 | item2.get() 120 | keys2 = set(map(key, filter(callback, item2.claims.values()))) 121 | if not keys2: 122 | continue 123 | for prop in keys1 & keys2: 124 | if cls._conflicts(item1.claims[prop], item2.claims[prop]): 125 | return False 126 | 127 | if not quick: 128 | for prop in cls.no_conflict_trees: 129 | item1.get() 130 | data1 = item1.claims.get(prop, []) 131 | if not data1: 132 | continue 133 | item2.get() 134 | data2 = item2.claims.get(prop, []) 135 | if not data2: 136 | continue 137 | if not cls._same_tree(cls.no_conflict_trees[prop], data1, data2): 138 | return False 139 | 140 | return True 141 | 142 | @classmethod 143 | def _sort_by_id(cls, item1, item2): 144 | id1, id2 = item1.getID(numeric=True), item2.getID(numeric=True) 145 | return (id1 < id2) - (id1 > id2) 146 | 147 | @classmethod 148 | def _sort_by_revisions(cls, item1, item2): 149 | len1, len2 = map( 150 | lambda item: len(list(item.revisions())), [item1, item2]) 151 | return (len1 > len2) - (len1 < len2) 152 | 153 | @classmethod 154 | def _sort_by_claims(cls, item1, item2): 155 | callback = lambda item: sum(map(len, item.claims.values())) 156 | count1, count2 = map(callback, [item1, item2]) 157 | return (count1 > count2) - (count1 < count2) 158 | 159 | @classmethod 160 | def _sort_by_sitelinks(cls, item1, item2): 161 | len1, len2 = map(lambda item: len(item.sitelinks), [item1, item2]) 162 | return (len1 > len2) - (len1 < len2) 163 | 164 | @classmethod 165 | def sort_for_merge(cls, items, key=['id']): 166 | for strategy in key: 167 | if strategy not in cls.strategies: 168 | continue 169 | callback = getattr(cls, cls.strategies[strategy]) 170 | res = callback(*items) 171 | if res == 0: 172 | continue 173 | if res == -1: 174 | items[:] = items[::-1] 175 | break 176 | target_item, from_item = items 177 | return target_item, from_item 178 | -------------------------------------------------------------------------------- /slice_externalids.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | import pywikibot 5 | 6 | from pywikibot.data.sparql import SparqlQuery 7 | from pywikibot.pagegenerators import ( 8 | PreloadingEntityGenerator, 9 | WikidataSPARQLPageGenerator, 10 | ) 11 | 12 | from query_store import QueryStore 13 | 
from wikidata import WikidataEntityBot 14 | 15 | 16 | class ExternalIdSlicingBot(WikidataEntityBot): 17 | 18 | blacklist = {'P2013'} 19 | use_from_page = False 20 | 21 | def __init__(self, **options): 22 | self.available_options.update({ 23 | 'step': 10, 24 | 'offset': 0, 25 | }) 26 | super().__init__(**options) 27 | self.cache = {} 28 | self.failed = {} 29 | self.sparql = SparqlQuery(repo=self.repo) 30 | self.store = QueryStore() 31 | 32 | @property 33 | def generator(self): 34 | step = self.opt['step'] 35 | opts = { 36 | # fixme: don't use this word 37 | 'blacklist': ' wd:'.join(self.blacklist), 38 | 'limit': step, 39 | } 40 | offset = self.opt['offset'] 41 | while True: 42 | pywikibot.info(f'\nLoading items (offset {offset})...') 43 | opts['offset'] = offset 44 | ask = self.store.build_query('ask_externalid_props', **opts) 45 | if not self.sparql.ask(ask): 46 | break 47 | query = self.store.build_query('external-ids', **opts) 48 | gen = PreloadingEntityGenerator( 49 | WikidataSPARQLPageGenerator(query, site=self.repo)) 50 | yield from gen 51 | offset += step 52 | 53 | def treat_page_and_item(self, page, item): 54 | for prop, claims in item.claims.items(): 55 | if prop in self.blacklist: 56 | continue 57 | if claims[0].type != 'external-id': 58 | continue 59 | for cl in claims: 60 | if not cl.target or not cl.target.startswith('http'): 61 | continue 62 | formatter, regex = self.get_formatter_and_regex(prop) 63 | if not formatter: 64 | pywikibot.info(f"{prop} doesn't have a formatter") 65 | break 66 | value = self.find_value(cl.target, formatter) 67 | if not value: 68 | pywikibot.info( 69 | f'Value not found in "{cl.target}" for property {prop}') 70 | self.failed.setdefault(prop, set()).add(item) 71 | continue 72 | if regex: 73 | try: 74 | match = re.match(f'({regex})', value) 75 | except re.error: 76 | pywikibot.info(f'Couldn\'t apply regex "{regex}"') 77 | break 78 | if not match: 79 | pywikibot.info( 80 | f'Value "{value}" not matched by regex "{regex}"') 81 | self.failed.setdefault(prop, set()).add(item) 82 | continue 83 | value = match.group() 84 | summary = 'harvested the identifier based on [[Property:P1630]]' 85 | if regex: 86 | summary += ' and [[Property:P1793]]' 87 | cl.changeTarget(value, summary=summary) 88 | 89 | def get_formatter_and_regex(self, prop): 90 | if prop not in self.cache: 91 | formatter = regex = None 92 | ppage = pywikibot.PropertyPage(self.repo, prop) 93 | if 'P1630' in ppage.claims: 94 | if len(ppage.claims['P1630']) > 1: 95 | preferred = [cl for cl in ppage.claims['P1630'] 96 | if cl.rank == 'preferred'] 97 | if len(preferred) == 1: 98 | formatter = preferred[0].target 99 | else: 100 | formatter = ppage.claims['P1630'][0].target 101 | 102 | if 'P1793' in ppage.claims: 103 | if len(ppage.claims['P1793']) > 1: 104 | preferred = [cl for cl in ppage.claims['P1793'] 105 | if cl.rank == 'preferred'] 106 | if len(preferred) == 1: 107 | regex = preferred[0].target 108 | else: 109 | regex = ppage.claims['P1793'][0].target 110 | 111 | self.cache[prop] = (formatter, regex) 112 | 113 | return self.cache[prop] 114 | 115 | def strip_init_stuff(self, string): 116 | if string.startswith(('http://', 'https://')): 117 | string = string.partition('//')[2] 118 | if string.startswith('www.'): 119 | string = string[4:] 120 | return string 121 | 122 | def find_value(self, url, formatter): 123 | url = self.strip_init_stuff(url) 124 | formatter = self.strip_init_stuff(formatter) 125 | value = pywikibot.page.url2unicode(url) 126 | split = formatter.split('$1') 127 | if not 
value.startswith(split[0]): 128 | return None 129 | if not split[1]: 130 | return value[len(split[0]):].rstrip('/') 131 | 132 | value = value[:-len(split[-1])] 133 | 134 | try: 135 | index = value.index(split[1], len(split[0])) 136 | except ValueError: 137 | return None 138 | else: 139 | return value[len(split[0]):index].rstrip('/') 140 | 141 | def exit(self): # fixme: teardown 142 | if self.failed: 143 | text = '' 144 | for prop in sorted(self.failed): 145 | text += f'* [[Property:{prop}]]:\n' 146 | for item in sorted(self.failed[prop]): 147 | text += f'** [[{item.title()}]]\n' 148 | username = self.repo.username() 149 | page = pywikibot.Page( 150 | self.repo, f'User:{username}/Wrong external ids') 151 | page.put(text, summary='update') 152 | super().exit() 153 | 154 | 155 | def main(*args): 156 | options = {} 157 | for arg in pywikibot.handle_args(args): 158 | if arg.startswith('-'): 159 | arg, sep, value = arg.partition(':') 160 | if value != '': 161 | options[arg[1:]] = int(value) if value.isdigit() else value 162 | else: 163 | options[arg[1:]] = True 164 | 165 | site = pywikibot.Site('wikidata', 'wikidata') 166 | bot = ExternalIdSlicingBot(site=site, **options) 167 | bot.run() 168 | 169 | 170 | if __name__ == '__main__': 171 | main() 172 | -------------------------------------------------------------------------------- /list_typos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | from collections import defaultdict 5 | 6 | import pywikibot 7 | 8 | from pywikibot import textlib 9 | from pywikibot.bot import SingleSiteBot, ExistingPageBot 10 | from pywikibot.pagegenerators import PreloadingGenerator 11 | from pywikibot.tools.itertools import itergroup 12 | 13 | from typoloader import TypoRule, TyposLoader 14 | 15 | 16 | class TypoReportBot(SingleSiteBot): 17 | 18 | pattern = '# {} \u2013 {}' 19 | 20 | def __init__(self, **kwargs): 21 | self.available_options.update({ 22 | 'always': True, 23 | 'anything': False, 24 | 'outputpage': None, 25 | 'typospage': None, 26 | 'whitelistpage': None, 27 | 'false_positives': None, 28 | }) 29 | super().__init__(**kwargs) 30 | self.loader = TyposLoader( 31 | self.site, allrules=True, typospage=self.opt.typospage, 32 | whitelistpage=self.opt.whitelistpage) 33 | self.false_positives = set() 34 | 35 | def setup(self): 36 | super().setup() 37 | self.typoRules = self.loader.loadTypos() 38 | #self.fp_page = self.loader.getWhitelistPage() 39 | self.whitelist = self.loader.loadWhitelist() 40 | self.data = defaultdict(list) 41 | self.order = [] # remove when dictionaries are ordered 42 | self.load_false_positives() 43 | 44 | def load_false_positives(self): 45 | if not self.opt.false_positives: 46 | return 47 | page = pywikibot.Page(self.site, self.opt.false_positives) 48 | fps = self.false_positives 49 | for line in page.text.splitlines(): 50 | if line.startswith(('#', '*')): 51 | fps.add(line.lstrip('#* ')) 52 | 53 | @property 54 | def generator(self): 55 | for rule in self.typoRules: 56 | if rule.query is None: 57 | continue 58 | 59 | pywikibot.info(f'Query: "{rule.query}"') 60 | self.current_rule = rule 61 | yield from PreloadingGenerator( 62 | self.site.search(rule.query, namespaces=[0])) 63 | 64 | def skip_page(self, page): 65 | # TODO: better terminology 66 | if page.title() in self.whitelist: 67 | pywikibot.warning(f'Skipped {page} because it is whitelisted') 68 | return True 69 | 70 | if self.current_rule.find.search(page.title()): 71 | pywikibot.warning( 72 | f'Skipped {page} 
because the rule matches the title') 73 | return True 74 | 75 | return super().skip_page(page) 76 | 77 | def remove_disabled_parts(self, text): 78 | return textlib.removeDisabledParts( 79 | text, TypoRule.exceptions, site=self.site) 80 | 81 | def treat(self, page): 82 | match = self.current_rule.find.search(page.text) 83 | if not match: 84 | return 85 | text = self.remove_disabled_parts(page.text) 86 | found = set() 87 | for match in self.current_rule.find.finditer(text): 88 | match_text = match[0] 89 | if match_text in found: 90 | continue 91 | found.add(match_text) 92 | link = page.title(as_link=True) 93 | put_text = self.pattern.format(link, match_text) 94 | if put_text[2:] not in self.false_positives: 95 | pywikibot.stdout(put_text) 96 | if not self.data.get(link): 97 | self.order.append(link) 98 | self.data[link].append(match_text) 99 | 100 | def teardown(self): 101 | outputpage = self.opt.outputpage 102 | if (self.generator_completed or self.opt.anything) and outputpage: 103 | put = [] 104 | for link in self.order: 105 | for match in self.data[link]: 106 | put.append(self.pattern.format(link, match)) 107 | page = pywikibot.Page(self.site, outputpage) 108 | page.text = '\n'.join(put) 109 | page.save(summary='aktualizace seznamu překlepů', minor=False, 110 | bot=False, apply_cosmetic_changes=False) 111 | super().teardown() 112 | 113 | 114 | class PurgeTypoReportBot(SingleSiteBot, ExistingPageBot): 115 | 116 | def __init__(self, **kwargs): 117 | self.helper = TypoReportBot(**kwargs) 118 | super().__init__(site=self.helper.site) 119 | self.put = [] 120 | self.cache = defaultdict(list) 121 | 122 | def setup(self): 123 | super().setup() 124 | self.whitelist = self.helper.loader.loadWhitelist() 125 | self.generator = [pywikibot.Page(self.site, self.helper.opt.outputpage)] 126 | self.helper.load_false_positives() 127 | 128 | def line_iterator(self, text): 129 | regex = re.compile(self.helper.pattern.format( 130 | r'\[\[([^]]+)\]\]', '(.+)')) 131 | for line in text.splitlines(): 132 | match = regex.fullmatch(line) 133 | if match: 134 | title, text = match.groups() 135 | entry = pywikibot.Page(self.site, title) 136 | self.cache[entry.title()].append(text) 137 | yield entry 138 | else: 139 | self.put.append(line) 140 | 141 | def treat(self, page): 142 | pattern = self.helper.pattern 143 | for entry in PreloadingGenerator(self.line_iterator(page.text)): 144 | key = title = entry.title() 145 | if not entry.exists(): 146 | self.cache.pop(key) 147 | continue 148 | while entry.isRedirectPage(): 149 | entry = entry.getRedirectTarget() 150 | title = entry.title() 151 | text = self.helper.remove_disabled_parts(entry.text) 152 | for string in self.cache.pop(key): 153 | if string not in text: 154 | continue 155 | put_text = pattern.format(f'[[{title}]]', string) 156 | if put_text[2:] in self.helper.false_positives: 157 | continue 158 | self.put.append(put_text) 159 | 160 | page.text = '\n'.join(self.put) 161 | page.save(summary='odstranění vyřešených překlepů', minor=True, 162 | bot=True, apply_cosmetic_changes=False) 163 | 164 | 165 | def main(*args): 166 | options = {} 167 | cls = TypoReportBot 168 | for arg in pywikibot.handle_args(args): 169 | if arg == 'purge': 170 | cls = PurgeTypoReportBot 171 | elif arg.startswith('-'): 172 | arg, sep, value = arg.partition(':') 173 | if value != '': 174 | options[arg[1:]] = int(value) if value.isdigit() else value 175 | else: 176 | options[arg[1:]] = True 177 | 178 | bot = cls(**options) 179 | bot.run() 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 
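A minimal, illustrative way to drive the report script above, assuming a standard pywikibot installation with this scripts directory importable; the page titles below are hypothetical placeholders, not pages referenced by the repository:

```python
# Sketch only -- the "User:ExampleBot/..." titles are hypothetical placeholders.
# Command-line use through pywikibot's wrapper would look like:
#   python pwb.py list_typos -outputpage:"User:ExampleBot/Typo report"
#   python pwb.py list_typos purge -outputpage:"User:ExampleBot/Typo report"
# The same options can be passed to main() directly:
from list_typos import main

main('-outputpage:User:ExampleBot/Typo report',
     '-false_positives:User:ExampleBot/False positives')
```

The bare `purge` argument switches to `PurgeTypoReportBot`, which re-reads an existing report page and drops entries that no longer match or that have been whitelisted as false positives.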
184 | -------------------------------------------------------------------------------- /cswiki/pageviews.py: -------------------------------------------------------------------------------- 1 | import heapq 2 | import json 3 | import os.path as osp 4 | from collections import defaultdict 5 | from datetime import date, datetime, timedelta 6 | 7 | import pywikibot 8 | import requests 9 | from pywikibot.comms.http import user_agent 10 | from pywikibot.pagegenerators import PreloadingGenerator 11 | 12 | pywikibot.handle_args() 13 | 14 | site = pywikibot.Site() 15 | 16 | headers = {'User-Agent': user_agent()} 17 | hostname = site.hostname() 18 | prefix = 'https://wikimedia.org/api/rest_v1/metrics/pageviews' 19 | pattern = f'{prefix}/top/{hostname}/all-access/%Y/%m/%d' 20 | 21 | check_templates = { 22 | 'Aktualizovat', 'Celkově zpochybněno', 'Globalizovat', 'Neověřeno', 'NPOV', 23 | 'Pahýl', 'Pravopis', 'Reklama', 'Sloh', 'Upravit', 'Vlastní výzkum', 24 | 'Vyhýbavá slova', 25 | } 26 | check_categories = { 27 | 'Wikipedie:Polozamčené stránky', 28 | 'Wikipedie:Rozšířeně polozamčené stránky', 29 | 'Wikipedie:Dlouhodobě zamčené stránky', 30 | 'Wikipedie:Dobré články', 31 | 'Wikipedie:Nejlepší články', 32 | 'Žijící lidé', 33 | } 34 | 35 | top = 100 36 | days = 7 37 | gamma = 0.85 38 | weights = [pow(gamma, i) for i in range(days)] 39 | 40 | today = date.today() 41 | this = today - timedelta(days=1) 42 | first = today - timedelta(days=days) 43 | min_per_day = [] 44 | 45 | check_categories.add(f'Úmrtí v roce {this.year}') 46 | check_categories.add(f'Úmrtí v roce {this.year - 1}') 47 | 48 | aggregate_url = '{}/aggregate/{}/all-access/user/daily/{}/{}'.format( 49 | prefix, 50 | hostname, 51 | first.strftime('%Y%m%d'), 52 | this.strftime('%Y%m%d') 53 | ) 54 | resp = requests.get(aggregate_url, headers=headers) 55 | data = resp.json() 56 | daily = [entry['views'] for entry in data['items']] 57 | 58 | index = defaultdict(lambda: [None] * days) 59 | for diff in range(days): 60 | the_day = this - timedelta(days=diff) 61 | resp = requests.get(the_day.strftime(pattern), headers=headers) 62 | data = resp.json() 63 | 64 | array = [] 65 | for info in data['items'][0]['articles']: 66 | page = info['article'] 67 | views = info['views'] 68 | index[page][diff] = views 69 | array.append(views) 70 | min_per_day.append(min(array)) 71 | del data 72 | 73 | done_heap = [] 74 | stack = [] 75 | 76 | for page, values in index.items(): 77 | if page.startswith('Speciální:'): 78 | continue 79 | complete = True 80 | total = 0 81 | for views, at_most in zip(values, min_per_day): 82 | if views is None: 83 | complete = False 84 | total += at_most 85 | else: 86 | total += views 87 | 88 | if complete: 89 | done_heap.append((total, page, values)) 90 | else: 91 | stack.append((total, page, values)) 92 | 93 | done_heap.sort() 94 | del done_heap[:-top] 95 | stack.sort() 96 | 97 | while True: 98 | possible, page, values = stack.pop() 99 | lowest = done_heap[0][0] 100 | if possible < lowest: 101 | break 102 | 103 | present = [i for i, val in enumerate(values) if val is None] 104 | 105 | start = this - timedelta(days=max(present)) 106 | end = this - timedelta(days=min(present)) 107 | 108 | url = f'{prefix}/per-article/{hostname}/all-access/user/' 109 | url += page.replace('/', '%2F') + '/daily/' 110 | url += start.strftime('%Y%m%d00') + '/' + end.strftime('%Y%m%d00') 111 | resp = requests.get(url, headers=headers) 112 | if resp.ok: 113 | data = resp.json() 114 | for entry in data['items']: 115 | dt = datetime.strptime(entry['timestamp'], 
'%Y%m%d%H') 116 | delta = this - dt.date() 117 | values[delta.days] = entry['views'] 118 | 119 | for i in range(days): 120 | if values[i] is None: 121 | values[i] = 0 122 | 123 | total = sum(values) 124 | assert total <= possible 125 | if total >= lowest: 126 | heapq.heappushpop(done_heap, (total, page, values)) 127 | 128 | done_heap.sort(reverse=True) 129 | 130 | lines = [] 131 | lines.append( 132 | f"Nejčtenější stránky za období {first.day}. {first.month}. {first.year}" 133 | f" – {this.day}. {this.month}. {this.year}." 134 | ) 135 | lines.append('') 136 | lines.append('{| class="wikitable sortable"') 137 | lines.append('! Pořadí') 138 | lines.append('! Stránka') 139 | lines.append('! Celkový
<br>počet návštěv') 140 | lines.append('! Vážený<br>
počet návštěv') 141 | lines.append('! Koeficient') 142 | lines.append('! Problémy') 143 | lines.append('! Příznaky') 144 | lines.append('! class="unsortable" | Graf') 145 | 146 | aggregate = sum(daily) 147 | weighted = sum(v * w for v, w in zip(daily, weights)) 148 | coef = weighted / aggregate 149 | 150 | lines.append('|-') 151 | lines.append('|') 152 | lines.append("| ''vše''") 153 | lines.append(f'| {aggregate}') 154 | lines.append(f'| {weighted:.0f}') 155 | lines.append('| %s' % f'{coef:.3f}'.replace('.', ',', 1)) 156 | lines.append(f'|') 157 | lines.append(f'|') 158 | lines.append(f"| [https://pageviews.wmcloud.org/siteviews/?sites={hostname}" 159 | f"&agent=user&range=latest-20 [0]]") 160 | 161 | gen = (pywikibot.Page(site, title) for _, title, _ in done_heap) 162 | for rank, (page, (total, title, values)) in enumerate(zip( 163 | site.preloadpages(gen, templates=True, categories=True, content=False), 164 | done_heap 165 | ), start=1): 166 | weighted = sum(v * w for v, w in zip(values, weights)) 167 | coef = weighted / total 168 | link_title = title.replace('_', ' ') 169 | if link_title.startswith(('Soubor:', 'Kategorie:')): 170 | link_title = f':{link_title}' 171 | 172 | lines.append('|-') 173 | lines.append(f'| {rank}') 174 | lines.append(f'| [[{link_title}]]') 175 | lines.append(f'| {total}') 176 | lines.append(f'| {weighted:.0f}') 177 | lines.append('| %s' % f'{coef:.3f}'.replace('.', ',', 1)) 178 | 179 | show_templates = check_templates.intersection(map( 180 | lambda p: p.title(with_ns=False), page.templates())) 181 | show_categories = check_categories.intersection(map( 182 | lambda p: p.title(with_ns=False), page.categories())) 183 | 184 | if show_templates: 185 | lines.append('| ' + ('
<br>'.join( 186 |             f'[[Šablona:{t}|{t}]]' for t in sorted(show_templates)))) 187 |     else: 188 |         lines.append('|') 189 | 190 |     if show_categories: 191 |         lines.append('| ' + ('<br>
'.join( 192 | f"[[:Kategorie:{c}|{c.removeprefix('Wikipedie:')}]]" 193 | for c in sorted(show_categories)))) 194 | else: 195 | lines.append('|') 196 | 197 | lines.append(f"| [https://pageviews.wmcloud.org/pageviews/?project={hostname}" 198 | f"&agent=user&range=latest-20&pages={title}]") 199 | 200 | lines.append('|}') 201 | 202 | the_page = pywikibot.Page(site, f'{site.username()}/Návštěvy', ns=2) 203 | the_page.text = '\n'.join(lines) 204 | the_page.save(minor=False, bot=False, apply_cosmetic_changes=False, 205 | summary='aktualizace') 206 | -------------------------------------------------------------------------------- /fake_references.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from contextlib import suppress 3 | 4 | import pywikibot 5 | 6 | from pywikibot import pagegenerators 7 | 8 | from query_store import QueryStore 9 | from wikidata import WikidataEntityBot 10 | 11 | 12 | class FakeReferencesBot(WikidataEntityBot): 13 | 14 | item_ids = ['Q2013'] 15 | inferred_from = 'P3452' 16 | ref_props = ['P143', 'P248'] 17 | url_props = ['P854'] 18 | use_from_page = False 19 | whitelist_props = {'P813', 'P4656'} 20 | 21 | def __init__(self, generator, **kwargs): 22 | self.available_options.update({ 23 | 'limit': None, 24 | }) 25 | super().__init__(**kwargs) 26 | self.store = QueryStore() 27 | self._generator = generator or self.subgenerator() 28 | self.url_start = self.repo.base_url(self.repo.article_path) 29 | 30 | def subgenerator(self): 31 | limit = self.opt['limit'] 32 | for ident in self.item_ids: 33 | from_item = pywikibot.ItemPage(self.repo, ident) 34 | for item in pagegenerators.WikibaseItemGenerator( 35 | from_item.backlinks( 36 | total=limit, filterRedirects=False, namespaces=[0])): 37 | yield item 38 | if limit is not None: 39 | limit -= 1 40 | 41 | if limit == 0: 42 | return 43 | 44 | for prop in self.url_props: 45 | ok = True 46 | while ok and limit != 0: 47 | ok = False 48 | query = self.store.build_query( 49 | 'fake_references_url', 50 | limit=500 if limit is None else min(500, limit), 51 | prop=prop) 52 | for item in pagegenerators.WikidataSPARQLPageGenerator( 53 | query, site=self.repo): 54 | ok = True 55 | yield item 56 | if limit is not None: 57 | limit -= 1 58 | 59 | for prop in self.ref_props: 60 | ok = True 61 | while ok and limit != 0: 62 | ok = False 63 | query = self.store.build_query( 64 | 'fake_references', 65 | limit=100 if limit is None else min(100, limit), 66 | prop=prop) 67 | for item in pagegenerators.WikidataSPARQLPageGenerator( 68 | query, site=self.repo): 69 | ok = True 70 | yield item 71 | if limit is not None: 72 | limit -= 1 73 | 74 | @property 75 | def generator(self): 76 | return pagegenerators.PreloadingEntityGenerator(self._generator) 77 | 78 | @property 79 | def summary(self): 80 | return ('update reference per [[Wikidata:Requests for permissions/' 81 | 'Bot/MatSuBot 8|RfPB]]') 82 | 83 | def treat_page_and_item(self, page, item): 84 | changed = False 85 | for prop, claims in item.claims.items(): 86 | for claim in claims: 87 | if self.handle_claim(claim): 88 | changed = True 89 | if changed: 90 | self.user_edit_entity(item, summary=self.summary) 91 | 92 | def handle_claim(self, claim): 93 | ret = False 94 | if not claim.sources: 95 | return ret 96 | if claim.type == 'wikibase-item': 97 | if claim.id == 'P1343' and 'P805' in claim.qualifiers: 98 | target = claim.qualifiers['P805'][0].getTarget() 99 | else: 100 | target = claim.getTarget() 101 | if target: 102 | for source in 
claim.sources: 103 | ret = self.handle_source_item(source, target) or ret 104 | for source in claim.sources: 105 | ret = self.handle_source_url(source) or ret 106 | return ret 107 | 108 | def handle_source_item(self, source, target): 109 | ret = False 110 | for prop in self.ref_props: 111 | keys = set(source.keys()) 112 | if prop not in keys: 113 | continue 114 | if keys - (self.whitelist_props | {prop}): 115 | continue 116 | if len(source[prop]) > 1: 117 | #continue? 118 | return ret 119 | 120 | fake = next(iter(source[prop])) 121 | items = list(self.item_ids) + [target] 122 | if any(fake.target_equals(tgt) for tgt in items): 123 | snak = pywikibot.Claim( 124 | self.repo, self.inferred_from, isReference=True) 125 | snak.setTarget(target) 126 | source.setdefault(self.inferred_from, []).append(snak) 127 | source.pop(prop) 128 | ret = True 129 | return ret 130 | 131 | def handle_source_url(self, source): 132 | ret = False 133 | for prop in self.url_props: 134 | keys = set(source.keys()) 135 | if prop not in keys: 136 | continue 137 | if keys - (self.whitelist_props | {prop}): 138 | continue 139 | if len(source[prop]) > 1: 140 | #continue? 141 | return ret 142 | 143 | snak = next(iter(source[prop])) 144 | url = snak.getTarget() 145 | if not url: 146 | continue 147 | target = None 148 | with suppress(pywikibot.InvalidTitle, ValueError): 149 | for prefix in [self.url_start, self.repo.concept_base_uri]: 150 | target_id = url.removeprefix(prefix) 151 | if target_id != url: 152 | target = pywikibot.ItemPage(self.repo, target_id) 153 | break 154 | if target: 155 | if target.isRedirectPage(): 156 | target = target.getRedirectTarget() 157 | if target != snak.on_item: 158 | snak = pywikibot.Claim( 159 | self.repo, self.inferred_from, isReference=True) 160 | snak.setTarget(target) 161 | source.setdefault(self.inferred_from, []).append(snak) 162 | source.pop(prop) 163 | ret = True 164 | return ret 165 | 166 | 167 | def main(*args): 168 | options = {} 169 | local_args = pywikibot.handle_args(args) 170 | site = pywikibot.Site() 171 | genFactory = pagegenerators.GeneratorFactory(site=site) 172 | for arg in genFactory.handle_args(local_args): 173 | if arg.startswith('-'): 174 | arg, sep, value = arg.partition(':') 175 | if value != '': 176 | options[arg[1:]] = value if not value.isdigit() else int(value) 177 | else: 178 | options[arg[1:]] = True 179 | 180 | generator = genFactory.getCombinedGenerator() 181 | bot = FakeReferencesBot(generator=generator, site=site, **options) 182 | bot.run() 183 | 184 | 185 | if __name__ == '__main__': 186 | main() 187 | -------------------------------------------------------------------------------- /typos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import time 3 | 4 | import pywikibot 5 | from pywikibot import pagegenerators 6 | 7 | from typoloader import TyposLoader 8 | from wikitext import WikitextFixingBot 9 | 10 | 11 | class TypoBot(WikitextFixingBot): 12 | 13 | ''' 14 | Bot for typo fixing 15 | 16 | Supported parameters: 17 | * -allrules - use if you want to load rules that need user's decision 18 | * -offset:# - what typo rule do you want to start from 19 | * -quick - use if you want the bot to focus on the current rule, 20 | ie. 
skip the page if the rule couldn't be applied 21 | * -threshold:# - skip rule when loaded/replaced ratio gets over # 22 | * -typospage: - what page do you want to load typo rules from 23 | * -whitelistpage: - what page holds pages which should be skipped 24 | ''' 25 | 26 | def __init__(self, generator, *, offset=0, **kwargs): 27 | self.available_options.update({ 28 | 'allrules': False, 29 | 'quick': False, 30 | 'threshold': 10, 31 | 'typospage': None, 32 | 'whitelistpage': None, 33 | }) 34 | kwargs['typos'] = False 35 | self.own_generator = not bool(generator) 36 | if self.own_generator: 37 | self.generator = self.make_generator() 38 | else: 39 | self.generator = generator 40 | 41 | super().__init__(**kwargs) 42 | self.offset = offset 43 | 44 | def setup(self): 45 | loader = TyposLoader( 46 | self.site, allrules=self.opt['allrules'], 47 | typospage=self.opt['typospage'], 48 | whitelistpage=self.opt['whitelistpage']) 49 | self.typoRules = loader.loadTypos() 50 | self.fp_page = loader.getWhitelistPage() 51 | self.whitelist = loader.loadWhitelist() 52 | 53 | @property 54 | def is_rule_accurate(self): 55 | threshold = self.opt['threshold'] 56 | result = (self.processed < threshold or 57 | self.processed / threshold < self.replaced) 58 | return result 59 | 60 | def make_generator(self): 61 | for i, rule in enumerate(self.typoRules[:]): 62 | if self.offset > i: 63 | continue 64 | if rule.query is None: 65 | continue 66 | 67 | # todo: if not allrules:... 68 | self.offset = i 69 | pywikibot.info(f'\nQuery: "{rule.query}"') 70 | old_max = rule.longest 71 | rule.longest = 0.0 72 | self.current_rule = rule 73 | self.skip_rule = False 74 | self.processed = self.replaced = 0 75 | for page in self.site.search(rule.query, namespaces=[0]): 76 | if self.skip_rule: 77 | break 78 | yield page 79 | if not self.is_rule_accurate: 80 | pywikibot.info( 81 | f'Skipped inefficient query "{rule.query}" ' 82 | f'({self.replaced}/{self.processed}') 83 | break 84 | else: 85 | if self.processed < 1: 86 | pywikibot.info(f'No results from query "{rule.query}"') 87 | else: 88 | percent = (self.replaced / self.processed) * 100 89 | pywikibot.info( 90 | f'{percent:.f}% accuracy of query "{rule.query}"') 91 | 92 | if self.processed > 0: 93 | pywikibot.info(f'Longest match: {rule.longest}s') 94 | rule.longest = max(old_max, rule.longest) 95 | 96 | def save_false_positive(self, page): 97 | link = page.title(as_link=True) 98 | self.fp_page.text += f'\n* {link}' 99 | self.fp_page.save(summary=link, asynchronous=True) 100 | self.whitelist.append(page.title()) 101 | 102 | def skip_page(self, page): 103 | if page.title() in self.whitelist: 104 | pywikibot.warning(f'Skipped {page} because it is whitelisted') 105 | return True 106 | 107 | if self.own_generator and self.current_rule.find.search(page.title()): 108 | pywikibot.warning( 109 | f'Skipped {page} because the rule matches the title') 110 | return True 111 | 112 | return super().skip_page(page) 113 | 114 | def init_page(self, page): 115 | out = super().init_page(page) 116 | if self.own_generator: 117 | self.processed += 1 118 | return out 119 | 120 | def treat_page(self): 121 | page = self.current_page 122 | text = page.text 123 | done_replacements = [] 124 | quickly = self.opt['quick'] is True 125 | start = time.time() 126 | if self.own_generator: 127 | text = self.current_rule.apply(page.text, done_replacements) 128 | if page.text == text: 129 | if quickly: 130 | pywikibot.info('Typo not found, not fixing another ' 131 | 'typos in quick mode') 132 | return 133 | else: 134 | 
self.replaced += 1 135 | 136 | for rule in self.typoRules: 137 | if self.own_generator and rule == self.current_rule: # __eq__ 138 | continue 139 | if rule.find.search(page.title()): 140 | continue 141 | if quickly and rule.needs_decision(): 142 | continue 143 | 144 | text = rule.apply(text, done_replacements) 145 | stop = time.time() 146 | if quickly and stop - start > 15: 147 | pywikibot.warning('Other typos exceeded 15s, skipping') 148 | break 149 | 150 | self.put_current( 151 | text, summary='oprava překlepů: %s' % ', '.join(done_replacements)) 152 | 153 | def user_confirm(self, question): 154 | if self.opt['always']: 155 | return True 156 | 157 | options = [('yes', 'y'), ('no', 'n'), ('all', 'a')] 158 | if self.fp_page.exists(): 159 | options.append(('false positive', 'f')) 160 | if self.own_generator: 161 | options.append(('skip rule', 's')) 162 | options += [('open in browser', 'b'), ('quit', 'q')] 163 | 164 | choice = pywikibot.input_choice(question, options, default='N', 165 | automatic_quit=False) 166 | 167 | if choice == 'n': 168 | return False 169 | 170 | if choice == 's': 171 | self.skip_rule = True 172 | return False 173 | 174 | if choice == 'b': 175 | pywikibot.bot.open_webbrowser(self.current_page) 176 | return False 177 | 178 | if choice == 'f': 179 | self.save_false_positive(self.current_page) 180 | return False 181 | 182 | if choice == 'q': 183 | self.quit() 184 | 185 | if choice == 'a': 186 | self.options['always'] = True 187 | 188 | return True 189 | 190 | def teardown(self): 191 | rules = sorted( 192 | (rule for rule in self.typoRules if not rule.needs_decision()), 193 | key=lambda rule: rule.longest, reverse=True)[:3] 194 | pywikibot.info('\nSlowest autonomous rules:') 195 | for i, rule in enumerate(rules, start=1): 196 | pywikibot.info(f'{i}. 
"{rule.find.pattern}" - {rule.longest}') 197 | if self.own_generator: 198 | pywikibot.info(f'\nCurrent offset: {self.offset}\n') 199 | super().teardown() 200 | 201 | 202 | def main(*args): 203 | options = {} 204 | local_args = pywikibot.handle_args(args) 205 | genFactory = pagegenerators.GeneratorFactory() 206 | genFactory.handle_arg('-ns:0') 207 | for arg in genFactory.handle_args(local_args): 208 | if arg.startswith('-'): 209 | arg, sep, value = arg.partition(':') 210 | if value != '': 211 | options[arg[1:]] = value if not value.isdigit() else int(value) 212 | else: 213 | options[arg[1:]] = True 214 | 215 | generator = genFactory.getCombinedGenerator(preload=True) 216 | bot = TypoBot(generator, **options) 217 | bot.run() 218 | 219 | 220 | if __name__ == '__main__': 221 | main() 222 | -------------------------------------------------------------------------------- /cswiki/iucn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from datetime import datetime 4 | 5 | import mwparserfromhell as parser 6 | import pywikibot 7 | import pywikibot.pagegenerators as pg 8 | from pywikibot.exceptions import NoWikibaseEntityError 9 | from pywikibot.page import PropertyPage 10 | 11 | def get_revision_wrapper(item, rev_id: int): 12 | # https://github.com/matejsuchanek/wikidata-constraints/blob/11602b4050e4623c9f1e4e0b279cf2f6c14b2a53/retrieval.py#L131-L164 13 | cls = type(item) 14 | repo = item.repo 15 | entity_id = item.getID() 16 | 17 | rev = cls(repo, entity_id) 18 | data = json.loads(item.getOldVersion(rev_id)) 19 | for key, val in data.items(): 20 | # handle old serialization 21 | if val == []: 22 | data[key] = {} 23 | 24 | rev._content = data 25 | while True: 26 | try: 27 | rev.get() 28 | except (KeyError, NoWikibaseEntityError) as exc: 29 | # handle deleted properties 30 | if isinstance(exc, NoWikibaseEntityError): 31 | key = exc.entity.id 32 | else: 33 | key = exc.args[0] 34 | # in theory, this isn't needed 35 | if not PropertyPage.is_valid_id(key): 36 | raise 37 | 38 | if key.lower() in data['claims']: 39 | data['claims'].pop(key.lower()) 40 | elif key.upper() in data['claims']: 41 | data['claims'].pop(key.upper()) 42 | else: 43 | raise 44 | else: 45 | return rev 46 | 47 | 48 | def get_best_statements(statements): 49 | best = [] 50 | best_rank = 'normal' 51 | for st in statements: 52 | if st.rank == best_rank: 53 | best.append(st) 54 | elif st.rank == 'preferred': 55 | best[:] = [st] 56 | best_rank = st.rank 57 | return best 58 | 59 | 60 | def is_different(old, new): 61 | if old == new: 62 | return False 63 | 64 | if old.getID() == 'Q11394' and new.getID() == 'Q96377276': 65 | return False 66 | 67 | return True 68 | 69 | 70 | args = pywikibot.handle_args() 71 | 72 | site = pywikibot.Site('cs', 'wikipedia') 73 | repo = pywikibot.Site('wikidata', 'wikidata') 74 | 75 | needle = re.compile(r'\b[Pp]141\b') 76 | 77 | editions = { 78 | #'2012.1': '20120619', 79 | '2012.2': '20121017', 80 | '2013.1': '20130702', 81 | '2013.2': '20131126', 82 | '2014.1': '20140612', 83 | '2014.2': '20140724', 84 | '2014.3': '20141117', 85 | '2015.1': '20150603', 86 | '2015.2': '20150623', 87 | '2015.4': '20151119', 88 | '2016.2': '20160904', 89 | '2016.3': '20161208', 90 | '2017.2': '20170914', 91 | '2017.3': '20171205', 92 | '2018.1': '20180705', 93 | '2019.2': '20190718', 94 | '2019.3': '20191210', 95 | '2020.2': '20200709', 96 | '2020.3': '20201210', 97 | '2021.1': '20210325', 98 | '2021.2': '20210904', 99 | '2021.3': '20211209', 100 | '2022.1': 
'20220101', 101 |     '2022.2': '20221209', 102 |     '2023.1': '20231211', 103 |     '2025.2': '20251010', 104 | } 105 | stat_to_label = { 106 |     'Q719675': 'téměř ohrožený', 107 |     'Q211005': 'málo dotčený', 108 |     'Q219127': 'kriticky ohrožený druh', 109 |     'Q237350': 'vyhynulý', 110 |     'Q239509': 'vyhynulý v přírodě', 111 |     'Q278113': 'zranitelný', 112 |     'Q719675': 'téměř ohrožený', 113 |     'Q3245245': 'chybí údaje', 114 |     'Q123509': 'vymírání', 115 |     'Q11394': 'ohrožený', 116 |     'Q96377276': 'ohrožený', 117 | } 118 | links = { 119 |     pywikibot.Page(site, 'Kriticky_ohrožený_taxon'), 120 |     pywikibot.Page(site, 'Málo_dotčený_taxon'), 121 |     pywikibot.Page(site, 'O_taxonu_chybí_údaje'), 122 |     pywikibot.Page(site, 'Nevyhodnocený_taxon'), 123 |     pywikibot.Page(site, 'Ohrožený_taxon'), 124 |     pywikibot.Page(site, 'Téměř_ohrožený_taxon'), 125 |     pywikibot.Page(site, 'Zranitelný_taxon'), 126 |     pywikibot.Page(site, 'Taxon vyhynulý v přírodě'), 127 |     pywikibot.Page(site, 'Vyhynulý_taxon'), 128 | } 129 | 130 | lines = [ 131 |     '<div>
', 132 | '{| class="wikitable sortable"', 133 | '! Č.', 134 | '! Taxon', 135 | '! class="unsortable" | Wikidata', 136 | '! Naposled', 137 | '! class="unsortable" | Odkazuje na', 138 | ] 139 | lines.extend(f'! class="unsortable" | {ed}' for ed in editions) 140 | 141 | i = 0 142 | 143 | sparql = '''SELECT ?item WHERE { 144 | ?article schema:about ?item; schema:isPartOf . 145 | ?item wdt:P141 ?iucn . 146 | } ORDER BY ?item''' 147 | 148 | gen = pg.PreloadingEntityGenerator( 149 | pg.WikidataSPARQLPageGenerator(sparql, site=repo) 150 | ) 151 | 152 | for item in gen: 153 | best = get_best_statements(item.claims.get('P141', [])) 154 | if not best: 155 | continue 156 | 157 | ts_to_status = {} 158 | cur = None 159 | 160 | for rev in item.revisions(reverse=True, content=False): 161 | if not rev.parentid: 162 | continue 163 | 164 | if not needle.search(rev.comment): 165 | continue 166 | 167 | if rev.comment.startswith('/* wbsetreference-set:'): 168 | continue 169 | 170 | if 'mw-reverted' in rev.tags: 171 | continue 172 | 173 | this = get_revision_wrapper(item, rev.revid) 174 | claims = get_best_statements(this.claims.get('P141', [])) 175 | if claims: 176 | new = claims[0].getTarget() 177 | if cur is None or is_different(cur, new): 178 | key = rev.timestamp.strftime('%Y%m%d%H%M%S') 179 | ts_to_status[key] = new.getID() 180 | cur = new 181 | 182 | if len(ts_to_status) < 2: 183 | continue 184 | 185 | last_change = max(ts_to_status) 186 | 187 | new = best[0].getTarget() 188 | if cur is None or is_different(cur, new): 189 | key = item.latest_revision.timestamp.strftime('%Y%m%d%H%M%S') 190 | ts_to_status[key] = new.getID() 191 | 192 | link = item.sitelinks[site] 193 | page = pywikibot.Page(link) 194 | created = page.oldest_revision.timestamp 195 | if created > datetime.strptime(last_change, '%Y%m%d%H%M%S'): 196 | continue 197 | 198 | per_edition = {} 199 | for ts, stat in ts_to_status.items(): # asc 200 | last_release_date = max( 201 | (date for date in editions.values() if date < ts), 202 | default=0 203 | ) 204 | for ed, date in editions.items(): 205 | if last_release_date <= date: 206 | per_edition[ed] = stat 207 | 208 | links_to = [ 209 | other.title(as_link=True) 210 | for other in page.linkedPages( 211 | namespaces=0, 212 | content=False, 213 | follow_redirects=True 214 | ) 215 | if other in links 216 | ] 217 | 218 | i += 1 219 | ymd = f'{last_change[:4]}-{last_change[4:6]}-{last_change[6:8]}' 220 | 221 | lines.append('|-') 222 | lines.append(f'| {i}') 223 | lines.append(f'| {link.astext()}') 224 | lines.append(f'| [[d:{item.getID()}|{item.getID()}]]') 225 | lines.append(f'| data-sort-value="{last_change}" | {ymd}') 226 | lines.append('| ' + ('
<br>'.join(sorted(links_to)))) 227 | 228 |     last = '?' 229 |     streak = 0 230 |     for ed in editions: # asc 231 |         stat = per_edition.get(ed, '?') 232 |         if stat == last: 233 |             streak += 1 234 |             continue 235 | 236 |         if streak > 1: 237 |             lines.append( 238 |                 f'| colspan="{streak}" align="center" | {stat_to_label.get(last, last)}' 239 |             ) 240 |         elif streak == 1: 241 |             lines.append(f'| {stat_to_label.get(last, last)}') 242 | 243 |         last = stat 244 |         streak = 1 245 | 246 |     if streak > 1: 247 |         lines.append( 248 |             f'| colspan="{streak}" align="center" | {stat_to_label.get(last, last)}' 249 |         ) 250 |     elif streak == 1: 251 |         lines.append(f'| {stat_to_label.get(last, last)}') 252 | 253 | lines.append('|}') 254 | lines.append('</div>
') 255 | 256 | new_text = '\n'.join(lines) 257 | 258 | site.login() 259 | 260 | output_page = pywikibot.Page(site, 'Wikipedie:WikiProjekt_Biologie/Status_ohrožení/vše') 261 | code = parser.parse(output_page.text) 262 | for old in code.ifilter_tags(matches='div'): 263 | code.replace(old, new_text) 264 | output_page.text = str(code) 265 | break 266 | else: 267 | output_page.text = new_text 268 | 269 | output_page.save( 270 | summary='tabulka', apply_cosmetic_changes=False, bot=False, minor=False 271 | ) 272 | -------------------------------------------------------------------------------- /cleanup_dates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from contextlib import suppress 3 | from datetime import datetime, timedelta 4 | from itertools import chain, combinations 5 | 6 | import pywikibot 7 | 8 | from pywikibot import Claim 9 | from pywikibot.exceptions import APIError 10 | from pywikibot.pagegenerators import ( 11 | GeneratorFactory, 12 | PreloadingEntityGenerator, 13 | WikidataSPARQLPageGenerator, 14 | ) 15 | 16 | from query_store import QueryStore 17 | from wikidata import WikidataEntityBot 18 | 19 | 20 | class DuplicateDatesBot(WikidataEntityBot): 21 | 22 | invalid_refs = {'P143', 'P813', 'P3452', 'P4656'} 23 | use_from_page = False 24 | 25 | def __init__(self, generator, **kwargs): 26 | self.available_options.update({ 27 | 'days': 30, 28 | 'props': ['P569', 'P570', 'P2031', 'P2032'], 29 | }) 30 | super().__init__(**kwargs) 31 | self.store = QueryStore() 32 | self._generator = generator or self.custom_generator() 33 | 34 | def custom_generator(self): 35 | for prop in self.opt['props']: 36 | for key in ('duplicate_dates', 'unmerged_dates'): 37 | time = datetime.now() - timedelta(days=self.opt['days']) 38 | query = self.store.build_query( 39 | key, prop=prop, date=time.isoformat(timespec='seconds')) 40 | yield from WikidataSPARQLPageGenerator(query, site=self.repo) 41 | 42 | @property 43 | def generator(self): 44 | return PreloadingEntityGenerator(self._generator) 45 | 46 | @property 47 | def summary(self): 48 | return ('remove redundant and less precise unsourced claim(s), ' 49 | '[[Wikidata:Requests for permissions/Bot/MatSuBot 7|see RfPB]]') 50 | 51 | @staticmethod 52 | def first_inside_second(first, second): 53 | if first.precision > second.precision: 54 | if second.precision in {9, 10}: 55 | if first.year == second.year: 56 | if second.precision == 9: 57 | return True 58 | elif second.precision == 10: 59 | return first.month == second.month 60 | return False 61 | 62 | @staticmethod 63 | def first_same_as_second(first, second): 64 | if first == second: 65 | return True 66 | if first.precision == second.precision: 67 | if first.precision in {9, 10} and first.year == second.year: 68 | if first.precision == 10: 69 | return first.month == second.month 70 | else: 71 | return True 72 | return False 73 | 74 | @classmethod 75 | def is_valid_source(cls, source): 76 | return bool(set(source) - cls.invalid_refs) 77 | 78 | @classmethod 79 | def number_of_sources(cls, claim): 80 | number = 0 81 | for source in claim.sources: 82 | number += cls.is_valid_source(source) 83 | return number 84 | 85 | @classmethod 86 | def is_sourced(cls, claim): 87 | return cls.number_of_sources(claim) > 0 88 | 89 | @classmethod 90 | def can_merge_claims(cls, claim1, claim2): 91 | if claim1.getSnakType() != claim2.getSnakType(): 92 | return False 93 | 94 | if ( 95 | claim1.getSnakType() == 'value' 96 | and not cls.first_same_as_second( 97 | 
claim1.getTarget(), 98 | claim2.getTarget() 99 | ) 100 | ): 101 | return False 102 | 103 | if ( 104 | claim1.qualifiers != claim2.qualifiers 105 | and not ( 106 | claim1.rank != 'deprecated' 107 | and claim2.rank == 'normal' 108 | and not claim2.qualifiers 109 | and not cls.is_sourced(claim2) 110 | ) 111 | and not ( 112 | claim2.rank != 'deprecated' 113 | and claim1.rank == 'normal' 114 | and not claim1.qualifiers 115 | and not cls.is_sourced(claim1) 116 | ) 117 | ): 118 | return False 119 | 120 | return True 121 | 122 | def treat_page_and_item(self, page, item): 123 | redundant = [] 124 | unmerged = [] 125 | for prop in self.opt['props']: 126 | claims = item.claims.get(prop, []) 127 | if len(claims) < 2: 128 | continue 129 | 130 | already = set() 131 | for claim1, claim2 in combinations(claims, 2): 132 | if claim1.snak in already or claim2.snak in already: 133 | continue 134 | 135 | if (claim1.rank, claim2.rank) in ( 136 | ('preferred', 'deprecated'), 137 | ('deprecated', 'preferred'), 138 | ): 139 | # this would need manual intervention 140 | continue 141 | 142 | if self.can_merge_claims(claim1, claim2): 143 | # never remove preferred/deprecated claim 144 | # if either is normal 145 | if claim1.rank != claim2.rank: 146 | if claim1.rank == 'normal': 147 | claim1, claim2 = claim2, claim1 148 | elif claim2.qualifiers and not claim1.qualifiers: 149 | claim1, claim2 = claim2, claim1 150 | elif ( 151 | self.number_of_sources(claim2) > 152 | self.number_of_sources(claim1) 153 | ): 154 | claim1, claim2 = claim2, claim1 155 | 156 | for source in claim2.sources: 157 | if not self.is_valid_source(source): 158 | continue 159 | sources_copy = [ 160 | c.copy() for c in chain(*source.values())] 161 | with suppress(APIError): # duplicate reference present 162 | claim1.addSources(sources_copy) 163 | 164 | unmerged.append(claim2) 165 | already.add(claim2.snak) 166 | continue 167 | 168 | if not (claim1.getSnakType() == 'value' == claim2.getSnakType()): 169 | continue 170 | 171 | pairs = [(claim1, claim2), (claim2, claim1)] 172 | for first, second in pairs: 173 | if self.is_sourced(second): 174 | continue 175 | # never remove preferred/deprecated claim 176 | # if either is normal 177 | if first.rank != second.rank and second.rank != 'normal': 178 | continue 179 | 180 | if ( 181 | first.qualifiers != second.qualifiers 182 | and not ( 183 | first.rank == 'preferred' 184 | and second.rank == 'normal' 185 | and not second.qualifiers 186 | ) 187 | ): 188 | continue 189 | 190 | if self.first_inside_second( 191 | first.getTarget(), 192 | second.getTarget() 193 | ): 194 | redundant.append(second) 195 | already.add(second.snak) 196 | break 197 | 198 | if redundant or unmerged: 199 | if redundant: 200 | summary = self.summary 201 | else: 202 | summary = 'remove redundant claim(s)' 203 | item.removeClaims(redundant + unmerged, summary=summary) 204 | 205 | 206 | def main(*args): 207 | options = {} 208 | local_args = pywikibot.handle_args(args) 209 | site = pywikibot.Site() 210 | genFactory = GeneratorFactory(site=site) 211 | for arg in genFactory.handle_args(local_args): 212 | if arg.startswith('-'): 213 | arg, sep, value = arg.partition(':') 214 | if arg == '-prop': 215 | options.setdefault('props', []).append( 216 | value or pywikibot.input('Which property should be treated?')) 217 | elif value: 218 | options[arg[1:]] = int(value) if value.isdigit() else value 219 | else: 220 | options[arg[1:]] = True 221 | 222 | generator = genFactory.getCombinedGenerator() 223 | bot = DuplicateDatesBot(generator=generator, 
site=site, **options) 224 | bot.run() 225 | 226 | 227 | if __name__ == '__main__': 228 | main() 229 | -------------------------------------------------------------------------------- /typoloader.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | import pywikibot 5 | 6 | from pywikibot import textlib 7 | 8 | 9 | class IncompleteTypoRuleException(Exception): 10 | 11 | '''Exception raised when constructing a typo rule from incomplete data''' 12 | 13 | def __init__(self, message): 14 | self.message = message 15 | 16 | 17 | class InvalidExpressionException(Exception): 18 | 19 | '''Exception raised when an expression has invalid syntax''' 20 | 21 | def __init__(self, error, aspect='regular expression'): 22 | self.message = error.msg 23 | self.aspect = aspect 24 | 25 | 26 | class TypoRule: 27 | 28 | '''Class representing one typo rule''' 29 | 30 | exceptions = [ 31 | 'category', 'comment', 'header', 'hyperlink', 'interwiki', 'invoke', 32 | 'property', 'template', 33 | 34 | # tags 35 | 'blockquote', 'code', 'gallery', 'graph', 'imagemap', 'kbd', 36 | 'mapframe', 'maplink', 'math', 'nowiki', 'poem', 'pre', 'score', 37 | 'section', 'syntaxhighlight', 'timeline', 'tt', 'var', 38 | 39 | # "target-part" of a wikilink 40 | re.compile(r'\[\[([^][|]+)(\]\]\w*|([^][|]+\|)+)'), 41 | 42 | re.compile('<[a-z]+ [^<>]+>|'), # HTML tag 43 | re.compile(r'„[^\n"„“]+["“]|(?') 49 | 50 | def __init__(self, find, replacements, auto=False, query=None): 51 | self.find = find 52 | self.replacements = replacements 53 | self.auto = auto 54 | self.query = query 55 | self.longest = 0 56 | 57 | def __eq__(self, other): 58 | if isinstance(other, self.__class__): 59 | return self.id == other.id 60 | else: 61 | return False 62 | 63 | def __ne__(self, other): 64 | return not self.__eq__(other) 65 | 66 | def __repr__(self): 67 | return ( 68 | f'{self.__class__.name}({self.find!r}, {self.replacements!r}, ' 69 | f'auto={self.auto!r}, query={self.query!r})' 70 | ) 71 | 72 | def needs_decision(self): 73 | return not self.auto or len(self.replacements) > 1 74 | 75 | @classmethod 76 | def newFromParameters(cls, parameters): 77 | if '1' not in parameters: 78 | raise IncompleteTypoRuleException('Missing find expression') 79 | 80 | find = cls.nowikiR.sub('', parameters['1']) 81 | try: 82 | find = re.compile(find, re.M) 83 | except re.error as exc: 84 | raise InvalidExpressionException(exc) 85 | 86 | replacements = [] 87 | for key in '23456': 88 | if key in parameters: 89 | replacement = re.sub(r'\$([1-9])', r'\\\1', cls.nowikiR.sub( 90 | '', parameters[key])) 91 | replacements.append(replacement) 92 | 93 | if not replacements: 94 | raise IncompleteTypoRuleException( 95 | f'No replacements found for rule "{find.pattern}"') 96 | 97 | query = None 98 | if parameters.get('hledat'): 99 | part = parameters['hledat'].replace('{{!}}', '|') 100 | if parameters.get('insource') == 'ne': 101 | query = part 102 | else: 103 | try: 104 | re.compile(part) 105 | query = f'insource:/{part}/' 106 | except re.error as exc: 107 | raise InvalidExpressionException(exc, 'query') 108 | 109 | auto = parameters.get('auto') == 'ano' 110 | 111 | return cls(find, replacements, auto, query) 112 | 113 | def summary_hook(self, match, replaced): 114 | def underscores(string): 115 | if string.startswith(' '): 116 | string = '_' + string[1:] 117 | if string.endswith(' '): 118 | string = string[:-1] + '_' 119 | return string 120 | 121 | new = old = match.group() 122 | if self.needs_decision(): 123 | 
options = [('keep', 'k')] 124 | replacements = [] 125 | for i, repl in enumerate(self.replacements, start=1): 126 | replacement = match.expand(repl) 127 | replacements.append(replacement) 128 | options.append((f'{i} {underscores(replacement)}', str(i))) 129 | text = match.string 130 | pre = text[max(0, match.start() - 30):match.start()].rpartition('\n')[2] 131 | post = text[match.end():match.end() + 30].partition('\n')[0] 132 | pywikibot.info(f'{pre}<>{old}<>{pos}') 133 | choice = pywikibot.input_choice('Choose the best replacement', 134 | options, automatic_quit=False, 135 | default='k') 136 | if choice != 'k': 137 | new = replacements[int(choice) - 1] 138 | else: 139 | new = match.expand(self.replacements[0]) 140 | if old == new: 141 | pywikibot.warning(f'No replacement done in string "{old}"') 142 | 143 | if old != new: 144 | old_str = underscores(old.replace('\n', '\\n')) 145 | new_str = underscores(new.replace('\n', '\\n')) 146 | fragment = f'{old_str} → {new_str}' 147 | if fragment.lower() not in map(str.lower, replaced): 148 | replaced.append(fragment) 149 | return new 150 | 151 | def apply(self, text, replaced=None): 152 | if replaced is None: 153 | replaced = [] 154 | hook = lambda match: self.summary_hook(match, replaced) 155 | start = time.clock() 156 | text = textlib.replaceExcept( 157 | text, self.find, hook, self.exceptions, site=self.site) 158 | finish = time.clock() 159 | delta = finish - start 160 | self.longest = max(delta, self.longest) 161 | if delta > 5: 162 | pywikibot.warning(f'Slow typo rule "{self.find.pattern}" ({delta})') 163 | return text 164 | 165 | 166 | class TyposLoader: 167 | 168 | top_id = 0 169 | 170 | '''Class loading and holding typo rules''' 171 | 172 | def __init__(self, site, *, allrules=False, typospage=None, 173 | whitelistpage=None): 174 | self.site = site 175 | self.load_all = allrules 176 | self.typos_page_name = typospage 177 | self.whitelist_page_name = whitelistpage 178 | 179 | def getWhitelistPage(self): 180 | if self.whitelist_page_name is None: 181 | self.whitelist_page_name = 'Wikipedie:WPCleaner/Typo/False' 182 | 183 | return pywikibot.Page(self.site, self.whitelist_page_name) 184 | 185 | def loadTypos(self): 186 | pywikibot.info('Loading typo rules...') 187 | self.typoRules = [] 188 | 189 | if self.typos_page_name is None: 190 | self.typos_page_name = 'Wikipedie:WPCleaner/Typo' 191 | typos_page = pywikibot.Page(self.site, self.typos_page_name) 192 | if not typos_page.exists(): 193 | # todo: feedback 194 | return 195 | 196 | text = textlib.removeDisabledParts( 197 | typos_page.text, include=['nowiki'], site=self.site) 198 | load_all = self.load_all is True 199 | for template, fielddict in textlib.extract_templates_and_params( 200 | text, remove_disabled_parts=False, strip=False): 201 | if template.lower() == 'typo': 202 | try: 203 | rule = TypoRule.newFromParameters(fielddict) 204 | except IncompleteTypoRuleException as exc: 205 | pywikibot.warning(exc.message) # pwb.exception? 206 | except InvalidExpressionException as exc: 207 | if 'fixed-width' not in exc.message: 208 | pywikibot.warning('Invalid {} {}: {}'.format( 209 | exc.aspect, fielddict['1'], exc.message)) 210 | else: 211 | rule.id = self.top_id 212 | # fixme: cvar or ivar? 
213 | self.top_id += 1 214 | if load_all or not rule.needs_decision(): 215 | self.typoRules.append(rule) 216 | 217 | pywikibot.info(f'{len(self.typoRules)} typo rules loaded') 218 | return self.typoRules 219 | 220 | def loadWhitelist(self): 221 | self.whitelist = [] 222 | self.fp_page = self.getWhitelistPage() 223 | if self.fp_page.exists(): 224 | for match in re.finditer(r'\[\[([^]|]+)\]\]', self.fp_page.text): 225 | self.whitelist.append(match[1].strip()) 226 | return self.whitelist 227 | -------------------------------------------------------------------------------- /clean_dupes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from queue import Queue 3 | from threading import Lock, Thread 4 | 5 | import pywikibot 6 | 7 | from pywikibot.exceptions import NoPageError 8 | from pywikibot.pagegenerators import ( 9 | GeneratorFactory, 10 | PreloadingEntityGenerator, 11 | WikidataSPARQLPageGenerator, 12 | ) 13 | 14 | from merger import Merger 15 | from query_store import QueryStore 16 | from wikidata import WikidataEntityBot 17 | from scripts.revertbot import BaseRevertBot 18 | 19 | 20 | class DupesMergingBot(WikidataEntityBot): 21 | 22 | dupe_items = {'Q1263068', 'Q17362920', 'Q21528878'} 23 | use_from_page = False 24 | 25 | def __init__(self, generator, offset=0, **kwargs): 26 | self.available_options.update({ 27 | 'threads': 1, # unstable 28 | }) 29 | super().__init__(**kwargs) 30 | self.offset = offset 31 | self.store = QueryStore() 32 | self._generator = generator or self.custom_generator() 33 | self.save_lock = Lock() 34 | self.access_lock = Lock() 35 | self.site_locks = {} 36 | 37 | @property 38 | def generator(self): 39 | return PreloadingEntityGenerator(self._generator) 40 | 41 | def custom_generator(self): 42 | query = self.store.build_query( 43 | 'dupes', dupe=' wd:'.join(self.dupe_items), offset=self.offset) 44 | return WikidataSPARQLPageGenerator(query, site=self.repo, 45 | result_type=list) 46 | 47 | def setup(self): 48 | super().setup() 49 | count = self.opt['threads'] 50 | self.workers = [] 51 | if count > 1: 52 | self.queue = Queue(count) 53 | for i in range(count): 54 | thread = Thread(target=self.work) 55 | thread.start() 56 | self.workers.append(thread) 57 | 58 | def get_lock_for(self, site): 59 | with self.access_lock: 60 | return self.site_locks.setdefault(site, Lock()) 61 | 62 | def work(self): 63 | while True: 64 | item = self.queue.get() 65 | if item is None: 66 | break 67 | self.process_item(item) 68 | self.queue.task_done() 69 | 70 | def init_page(self, item): 71 | self.offset += 1 72 | return super().init_page(item) 73 | 74 | def skip_page(self, item): 75 | return 'P31' not in item.claims or super().skip_page(item) 76 | 77 | def treat_page_and_item(self, page, item): 78 | if self.opt['threads'] > 1: 79 | self.queue.put(item) 80 | else: 81 | self.process_item(item) 82 | 83 | def process_item(self, item): 84 | claims = [] 85 | targets = set() 86 | for claim in item.claims['P31']: 87 | if claim.snaktype != 'value': 88 | continue 89 | if claim.target.id not in self.dupe_items: 90 | continue 91 | claims.append(claim) 92 | for snak in claim.qualifiers.get('P460', []): 93 | if snak.snaktype == 'value': 94 | targets.add(snak.getTarget()) 95 | 96 | for claim in item.claims.get('P460', []): 97 | if claim.snaktype == 'value': 98 | claims.append(claim) 99 | targets.add(claim.getTarget()) 100 | 101 | sitelinks = [] 102 | if not targets: 103 | for page in item.iterlinks(): 104 | site = page.site 105 | with 
self.get_lock_for(site): 106 | if not page.exists(): 107 | sitelinks.append(site) 108 | continue 109 | if page.isRedirectPage(): 110 | try: 111 | target = page.getRedirectTarget().data_item() 112 | except NoPageError: 113 | pass 114 | else: 115 | targets.add(target) 116 | 117 | if not targets: 118 | pywikibot.info('No target found') 119 | return 120 | 121 | target = targets.pop() 122 | if targets: 123 | pywikibot.info('Multiple targets found') 124 | return 125 | 126 | while target.isRedirectPage(): 127 | pywikibot.warning(f'Target {target.getID()} is redirect') 128 | target = target.getRedirectTarget() 129 | 130 | if item == target: 131 | self._save_page(item, self._save_entity, item.removeClaims, claims) 132 | return 133 | 134 | target_sitelinks = [] 135 | for dbname in item.sitelinks: 136 | if dbname not in target.sitelinks: 137 | continue 138 | 139 | link = item.sitelinks[dbname] 140 | site = link.site 141 | with self.get_lock_for(site): 142 | page = pywikibot.Page(link) 143 | if not page.exists(): 144 | sitelinks.append(site) 145 | continue 146 | 147 | target_link = target.sitelinks[dbname] 148 | target_page = pywikibot.Page(target_link) 149 | if not target_page.exists(): 150 | target_sitelinks.append(site) 151 | continue 152 | 153 | if self.redirectsTo(page, target_page): 154 | if link.badges: 155 | sitelinks.append(site) 156 | continue 157 | 158 | if self.redirectsTo(target_page, page): 159 | if target_link.badges: 160 | target_sitelinks.append(site) 161 | continue 162 | 163 | pywikibot.info(f'Target has a conflicting sitelink: {dbname}') 164 | return 165 | 166 | target_claims = [] 167 | for claim in target.claims.get('P460', []): 168 | if claim.snaktype != 'value': 169 | continue 170 | if claim.target_equals(item): 171 | target_claims.append(claim) 172 | 173 | for claim in target.claims.get('P31', []): 174 | if claim.snaktype != 'value': 175 | continue 176 | if claim.target.id not in self.dupe_items: 177 | continue 178 | for snak in claim.qualifiers.get('P460', []): 179 | if snak.snaktype == 'value' and snak.target_equals(item): 180 | target_claims.append(claim) 181 | 182 | if sitelinks: 183 | self._save_page( 184 | item, self._save_entity, item.removeSitelinks, sitelinks, 185 | summary='removing sitelink(s) to non-existing / redirected page(s)') 186 | if claims: 187 | self._save_page(item, self._save_entity, item.removeClaims, claims) 188 | if target_sitelinks: 189 | self._save_page( 190 | target, self._save_entity, target.removeSitelinks, target_sitelinks, 191 | summary='removing sitelink(s) to non-existing / redirected page(s)') 192 | if target_claims: 193 | self._save_page( 194 | target, self._save_entity, target.removeClaims, target_claims) 195 | 196 | target, item = Merger.sort_for_merge( 197 | [item, target], key=['sitelinks', 'claims', 'id']) 198 | 199 | if not self._save_page( 200 | item, self._save_entity, Merger.clean_merge, item, target, 201 | ignore_conflicts=['description']): 202 | pywikibot.info('Reverting changes...') 203 | bot = BaseRevertBot(self.site) # todo: integrate to Merger 204 | comment = 'Error occurred when attempting to merge with %s' 205 | bot.comment = comment % target.title(as_link=True) 206 | bot.revert({'title': item.title()}) 207 | bot.comment = comment % item.title(as_link=True) 208 | bot.revert({'title': target.title()}) 209 | return 210 | 211 | self.offset -= 1 212 | 213 | def redirectsTo(self, page, target): 214 | return page.isRedirectPage() and page.getRedirectTarget() == target 215 | 216 | def _save_entity(self, callback, *args, **kwargs): 
217 | with self.save_lock: 218 | if 'asynchronous' in kwargs: 219 | kwargs.pop('asynchronous') 220 | return callback(*args, **kwargs) 221 | 222 | def teardown(self): 223 | count = len(self.workers) 224 | for i in range(count): 225 | self.queue.put(None) 226 | for worker in self.workers: 227 | worker.join() 228 | super().teardown() 229 | 230 | def exit(self): 231 | super().exit() 232 | bound = self.offset - self.offset % 50 233 | pywikibot.info(f'\nCurrent offset: {self.offset} (use {bound})\n') 234 | 235 | 236 | def main(*args): 237 | options = {} 238 | local_args = pywikibot.handle_args(args) 239 | site = pywikibot.Site() 240 | genFactory = GeneratorFactory(site=site) 241 | for arg in genFactory.handle_args(local_args): 242 | if arg.startswith('-'): 243 | arg, sep, value = arg.partition(':') 244 | if value != '': 245 | options[arg[1:]] = value if not value.isdigit() else int(value) 246 | else: 247 | options[arg[1:]] = True 248 | 249 | generator = genFactory.getCombinedGenerator() 250 | bot = DupesMergingBot(generator=generator, site=site, **options) 251 | bot.run() 252 | 253 | 254 | if __name__ == '__main__': 255 | main() 256 | -------------------------------------------------------------------------------- /connect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import datetime 3 | 4 | import pywikibot 5 | 6 | from pywikibot import pagegenerators, textlib 7 | from pywikibot.exceptions import APIError, NoPageError 8 | from pywikibot.tools import first_lower 9 | 10 | pywikibot.handle_args() 11 | 12 | start = datetime.datetime.now() 13 | 14 | do_only = [] 15 | dont_do = [] 16 | 17 | tp_map = { 18 | 'cs|wikipedia': { 19 | 'commons': { 20 | '1': { 21 | 'lang': 'commons', 22 | 'family': 'commons' 23 | }, 24 | }, 25 | 'commonscat': { 26 | '1': { 27 | 'lang': 'commons', 28 | 'family': 'commons', 29 | 'pattern': 'Category:%s', 30 | 'namespaces': [14] 31 | }, 32 | }, 33 | 'wikicitáty': { 34 | 'dílo': { 35 | 'family': 'wikiquote', 36 | 'pattern': 'Dílo:%s' 37 | }, 38 | 'kategorie': { 39 | 'family': 'wikiquote', 40 | 'pattern': 'Kategorie:%s' 41 | }, 42 | 'osoba': 'wikiquote', 43 | 'téma': 'wikiquote' 44 | }, 45 | 'wikizdroje': { 46 | 'dílo': 'wikisource', 47 | 'autor': { 48 | 'family': 'wikisource', 49 | 'pattern': 'Autor:%s' 50 | }, 51 | 'kategorie': { 52 | 'family': 'wikiquote', 53 | 'pattern': 'Kategorie:%s' 54 | }, 55 | }, 56 | 'wikidruhy': { 57 | 'taxon': { 58 | 'family': 'species', 59 | 'lang': 'species', 60 | }, 61 | }, 62 | }, 63 | 'cs|wikiquote': { 64 | 'commons': { 65 | 'galerie': { 66 | 'lang': 'commons', 67 | 'family': 'commons' 68 | }, 69 | 'kategorie': { 70 | 'lang': 'commons', 71 | 'family': 'commons', 72 | 'pattern': 'Category:%s', 73 | 'namespaces': [14] 74 | }, 75 | }, 76 | 'wikipedie': { 77 | 'článek': 'wikipedia' 78 | }, 79 | }, 80 | 'cs|wikisource': { 81 | 'commons': { 82 | 'galerie': { 83 | 'lang': 'commons', 84 | 'family': 'commons' 85 | }, 86 | 'kategorie': { 87 | 'lang': 'commons', 88 | 'family': 'commons', 89 | 'pattern': 'Category:%s', 90 | 'namespaces': [14] 91 | }, 92 | }, 93 | 'autorinfo': { 94 | 'BiografieWiki': 'wikipedia', 95 | 'WikiquoteCS': 'wikiquote' 96 | }, 97 | }, 98 | 'de|wikiquote': { 99 | 'wikipedia': { 100 | '1': 'wikipedia' 101 | }, 102 | }, 103 | 'es|wikiquote': { 104 | 'wikipedia': { 105 | '1': 'wikipedia' 106 | }, 107 | }, 108 | 'fi|wikiquote': { 109 | 'wikipedia': { 110 | '1': 'wikipedia' 111 | }, 112 | }, 113 | 'fr|wikiquote': { 114 | 'autres projets': { 115 | 'w': 'wikipedia', 116 | 's': 
'wikisource', 117 | 'species': { 118 | 'family': 'species', 119 | 'lang': 'species' 120 | }, 121 | 'wikispecies': { 122 | 'family': 'species', 123 | 'lang': 'species' 124 | }, 125 | 'commons': { 126 | 'lang': 'commons', 127 | 'family': 'commons' 128 | }, 129 | '1': { 130 | 'lang': 'commons', 131 | 'family': 'commons' 132 | }, 133 | }, 134 | }, 135 | 'fr|wikiquote': { 136 | 'wikipedia': { 137 | '1': 'wikipedia' 138 | }, 139 | }, 140 | 'id|wikiquote': { 141 | 'wikipedia': { 142 | '1': 'wikipedia' 143 | }, 144 | }, 145 | 'pl|wikiquote': { 146 | 'commons': { 147 | '1': { 148 | 'lang': 'commons', 149 | 'family': 'commons' 150 | } 151 | }, 152 | 'wikinews': {str(i): 'wikinews' for i in range(1, 10)}, 153 | 'wikipediakat': { 154 | '1': { 155 | 'lang': 'pl', 156 | 'family': 'wikipedia', 157 | 'pattern': 'Category:%s', 158 | 'namespaces': [14], 159 | }, 160 | }, 161 | 'wikisource': {}, # todo 162 | }, 163 | 'pt|wikiquote': { 164 | 'autor': { 165 | 'Wikinoticias': 'wikinews', 166 | 'Wikipedia': 'wikipedia', 167 | 'Wikisource': 'wikisource' 168 | }, 169 | 'wikipédia': { 170 | '1': 'wikipedia' 171 | }, 172 | 'wikisource': { 173 | '1': 'wikisource' 174 | }, 175 | }, 176 | 'ru|wikiquote': { 177 | 'википедия': { 178 | '1': 'wikipedia' 179 | }, 180 | 'wikipedia': { 181 | '1': 'wikipedia' 182 | }, 183 | 'навигация': { 184 | 'Википедия': 'wikipedia', 185 | 'Викитека': 'wikisource', 186 | 'Викивиды': { 187 | 'family': 'species', 188 | 'lang': 'species' 189 | }, 190 | 'Викисклад': { 191 | 'lang': 'commons', 192 | 'family': 'commons' 193 | }, 194 | 'Викигид': 'wikivoyage', 195 | }, 196 | }, 197 | 'sk|wikiquote': { 198 | 'wikipedia': { 199 | '1': 'wikipedia' 200 | }, 201 | }, 202 | 'sv|wikiquote': { 203 | 'wikipedia': { 204 | '1': 'wikipedia' 205 | }, 206 | }, 207 | } 208 | 209 | for project in tp_map.keys(): 210 | lang, family = project.split('|', 1) 211 | if len(do_only) > 0 and lang + family not in do_only and family not in do_only: 212 | continue 213 | if lang + family in dont_do or family in dont_do: 214 | continue 215 | 216 | site = pywikibot.Site(lang, family) 217 | pywikibot.info(f'Doing {lang}{family}') 218 | site.login() 219 | 220 | genFactory = pagegenerators.GeneratorFactory(site=site) 221 | for ns in (0, 14, 100): 222 | if family != 'wikisource' and ns == 100: # fixme: cswikiquote 223 | continue 224 | if family == 'wikisource' and ns == 0: 225 | continue 226 | genFactory.handle_arg(f'-ns:{ns}') 227 | genFactory.handle_arg('-unconnectedpages') 228 | generator = genFactory.getCombinedGenerator(preload=True) 229 | 230 | for page in generator: 231 | if page.namespace() != 14 and page.isDisambig(): 232 | continue 233 | 234 | for template, fields in textlib.extract_templates_and_params(page.text): 235 | if first_lower(template) not in tp_map[project]: 236 | continue 237 | 238 | params = tp_map[project][first_lower(template)] 239 | for key in fields: 240 | if key not in params: 241 | continue 242 | 243 | title = fields[key].strip() 244 | if not title: 245 | continue 246 | 247 | target_lang = lang 248 | target_family = family 249 | if isinstance(params[key], dict): 250 | if params[key].get('namespaces', []) and page.namespace() not in params[key]['namespaces']: 251 | continue 252 | if 'pattern' in params[key].keys(): 253 | title = params[key]['pattern'] % title 254 | if 'family' in params[key].keys(): 255 | target_family = params[key]['family'] 256 | if 'lang' in params[key].keys(): 257 | target_lang = params[key]['lang'] 258 | else: 259 | target_family = params[key] 260 | 261 | target_site = 
pywikibot.Site(target_lang, target_family) 262 | if '{{' in title: 263 | title = site.expand_text(title, page.title()) 264 | target_page = pywikibot.Page(target_site, title) 265 | if not target_page.exists(): 266 | pywikibot.info(f"{target_page} doesn't exist") 267 | continue 268 | while target_page.isRedirectPage(): 269 | target_page = target_page.getRedirectTarget() 270 | if target_page.isDisambig(): 271 | pywikibot.info(f'{target_page} is a disambiguation') 272 | continue 273 | 274 | try: 275 | item = target_page.data_item() 276 | except NoPageError: 277 | repo = site.data_repository() 278 | # fixme: unused return value 279 | data = repo.linkTitles(page, target_page) 280 | pywikibot.info('Item created') 281 | pywikibot.info(data) # todo 282 | break 283 | if site.dbName() in item.sitelinks: 284 | pywikibot.info(page) 285 | pywikibot.info('%s already has sitelink to %s%s' % ( 286 | item, lang, family)) 287 | continue 288 | 289 | try: 290 | item.setSitelink( 291 | page, summary='Adding sitelink %s' % page.title( 292 | asLink=True, insite=item.site)) 293 | except APIError: 294 | pass 295 | else: 296 | page.purge() 297 | break 298 | 299 | end = datetime.datetime.now() 300 | 301 | pywikibot.info('Complete! Took %d seconds' % (end - start).total_seconds()) 302 | -------------------------------------------------------------------------------- /manage_duos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pywikibot 3 | 4 | from pywikibot import pagegenerators 5 | from pywikibot.data.sparql import SparqlQuery 6 | 7 | from query_store import QueryStore 8 | from wikidata import WikidataEntityBot 9 | 10 | 11 | class DuosManagingBot(WikidataEntityBot): 12 | 13 | conj = { 14 | 'af': ' en ', 15 | 'az': ' və ', 16 | 'be': ' і ', 17 | 'be-tarask': ' і ', 18 | 'bg': ' и ', 19 | 'br': ' ha ', 20 | 'ca': ' i ', 21 | 'cs': ' a ', 22 | 'cy': ' a ', 23 | 'da': ' og ', 24 | 'de': ' und ', 25 | 'de-at': ' und ', 26 | 'el': ' και ', 27 | 'eo': ' kaj ', 28 | 'es': ' y ', 29 | 'et': ' ja ', 30 | 'eu': ' eta ', 31 | 'fi': ' ja ', 32 | 'fr': ' et ', 33 | 'fy': ' en ', 34 | 'gl': ' e ', 35 | 'hr': ' i ', 36 | 'hu': ' és ', 37 | 'id': ' dan ', 38 | 'it': ' e ', 39 | 'ka': ' და ', 40 | 'la': ' et ', 41 | 'lt': ' ir ', 42 | 'lv': ' un ', 43 | 'ms': ' dan ', 44 | 'nb': ' og ', 45 | 'nl': ' en ', 46 | 'nn': ' og ', 47 | 'oc': ' e ', 48 | 'pl': ' i ', 49 | 'pt': ' e ', 50 | 'ro': ' și ', 51 | 'ru': ' и ', 52 | 'sk': ' a ', 53 | 'sl': ' in ', 54 | 'sr': ' и ', 55 | 'sr-ec': ' и ', 56 | 'sr-el': ' i ', 57 | 'sv': ' och ', 58 | 'sw': ' na ', 59 | 'tr': ' ve ', 60 | 'uk': ' і ', 61 | 'vi': ' và ', 62 | 'war': ' ngan ', 63 | } 64 | distribute_properties = [ 65 | 'P22', 'P25', 'P27', 'P40', 'P53', 'P106', 'P1412', 66 | ] 67 | class_to_relation = [ 68 | ('Q132776479', 'twin-sisters'), 69 | ('Q132776456', 'twin-brothers'), 70 | ('Q14756018', 'twin'), 71 | ('Q14073567', 'sibling'), 72 | ('Q3046146', 'spouse'), 73 | ('Q106925878', 'father-son'), 74 | ('Q1313923', 'relative'), 75 | # TODO: ('Q1141470', 'comedians'), not a "relation by blood" 76 | ] 77 | relation_map = { 78 | #'comedians': 'P1327', 79 | #'father-son': '', we don't know who is who 80 | # TODO: 'partner': 'P451', 81 | 'relative': 'P1038', 82 | 'sibling': 'P3373', 83 | 'spouse': 'P26', 84 | 'twin': 'P3373/P1039/Q131440579', 85 | 'twin-brothers': 'P3373/P1039/Q108714555', 86 | 'twin-sisters': 'P3373/P1039/Q108714611', 87 | } 88 | use_from_page = False 89 | 90 | def __init__(self, generator, **kwargs): 91 | 
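        # Bot options set below: 'class' is the Wikidata class whose instances
        # the 'duos' query collects (see custom_generator), 'min_labels' is the
        # minimum number of language labels that must split cleanly before the
        # duo is taken apart (see treat_page_and_item), and 'always' saves
        # edits without per-page confirmation.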
self.available_options.update({ 92 | 'always': True, 93 | 'class': 'Q10648343', 94 | 'min_labels': 1, 95 | }) 96 | super().__init__(**kwargs) 97 | self.store = QueryStore() 98 | self.sparql = SparqlQuery(repo=self.repo) 99 | self._generator = generator or self.custom_generator() 100 | 101 | def skip_page(self, item): 102 | if super().skip_page(item): 103 | return True 104 | if 'P31' not in item.claims: 105 | pywikibot.info(f'{item} is missing P31 property') 106 | return True 107 | if 'P527' in item.claims: 108 | pywikibot.info(f'{item} already has P527 property') 109 | return True 110 | return False 111 | 112 | def custom_generator(self): 113 | kwargs = {'class': self.opt['class']} 114 | query = self.store.build_query('duos', **kwargs) 115 | return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo) 116 | 117 | @property 118 | def generator(self): 119 | return pagegenerators.PreloadingEntityGenerator(self._generator) 120 | 121 | def get_relation(self, item): 122 | ask_pattern = 'ASK { wd:%s wdt:P31/wdt:P279* wd:%%s }' % item.id 123 | for key, rel in self.class_to_relation: 124 | if self.sparql.ask(ask_pattern % key): 125 | return rel 126 | return None 127 | 128 | def get_labels(self, item, relation): 129 | labels = [{}, {}] 130 | for lang, value in item.labels.items(): 131 | delim = [] 132 | if lang in self.conj: 133 | delim.append(self.conj[lang]) 134 | delim.append(' and ') 135 | delim.append(' & ') 136 | for conj in delim: 137 | label = value.partition(' (')[0] 138 | if ', ' in label: 139 | continue 140 | split = label.split(conj) 141 | if len(split) != 2: 142 | continue 143 | split0 = split[0].split() 144 | split1 = split[1].split() 145 | if split1[0].islower(): 146 | continue 147 | # TODO: if len(split1) > 1 and split1[0][-1] == '.': 148 | if len(split1) > len(split0): 149 | if len(split1) > 2 and split1[-2].islower(): 150 | split1[-2:] = [' '.join(split1[-2:])] 151 | if len(split1) - len(split0) == 1: 152 | # if items are in a relation, then 153 | # they probably share their surname 154 | if relation: 155 | split[0] += ' %s' % split1[-1] 156 | split0.append(split1[-1]) 157 | if len(split0) > 1 or len(split1) == 1: 158 | labels[0][lang] = split[0] 159 | labels[1][lang] = split[1] 160 | break 161 | 162 | return labels 163 | 164 | def treat_page_and_item(self, page, item): 165 | relation = self.get_relation(item) 166 | labels = self.get_labels(item, relation) 167 | count = max(map(len, labels)) 168 | if count == 0: 169 | pywikibot.info('No labels, skipping...') 170 | return 171 | 172 | if count < self.opt['min_labels']: 173 | pywikibot.info(f'Too few labels ({count}), skipping...') 174 | return 175 | 176 | to_add = [] 177 | to_remove = [] 178 | if relation and relation.startswith('twin'): 179 | distribute = self.distribute_properties + ['P569', 'P19'] 180 | if relation.startswith('twin-'): 181 | distribute.append('P21') 182 | else: 183 | distribute = self.distribute_properties 184 | 185 | for prop in distribute: 186 | for claim in item.claims.get(prop, []): 187 | if claim.getTarget(): 188 | to_remove.append(claim) 189 | json = claim.toJSON() 190 | json.pop('id') 191 | to_add.append(json) 192 | 193 | items = [self.create_item(item, data, relation, to_add) 194 | for data in labels] 195 | if self.relation_map.get(relation): 196 | recipe = self.relation_map[relation].split('/') 197 | if len(recipe) == 3: 198 | prop, qprop, qval = recipe 199 | else: 200 | prop, qprop, qval = recipe[0], None, None 201 | for it, target in zip(items, reversed(items)): 202 | claim = 
pywikibot.Claim(self.repo, prop) 203 | claim.setTarget(target) 204 | if qprop: 205 | qualifier = pywikibot.Claim(self.repo, qprop, is_qualifier=True) 206 | qualifier.setTarget(pywikibot.ItemPage(self.repo, qval)) 207 | claim.addQualifier(qualifier) 208 | source = pywikibot.Claim(self.repo, 'P3452', is_reference=True) 209 | source.setTarget(item) 210 | claim.addSource(source) 211 | self.user_add_claim(it, claim, asynchronous=False) 212 | 213 | for it in items: 214 | claim = pywikibot.Claim(self.repo, 'P527') 215 | claim.setTarget(it) 216 | self.user_add_claim(item, claim, asynchronous=False) 217 | 218 | for claim in to_remove: 219 | pywikibot.info(f'Removing {claim.id} --> {claim.getTarget()}') 220 | json = claim.toJSON() 221 | json['remove'] = '' 222 | self.user_edit_entity( 223 | item, 224 | {'claims': [json]}, 225 | asynchronous=False, 226 | summary='moved [[Property:{}]] to {} & {}'.format( 227 | claim.id, 228 | items[0].title(as_link=True, insite=self.repo), 229 | items[1].title(as_link=True, insite=self.repo) 230 | ) 231 | ) 232 | 233 | def create_item(self, item, labels, relation, to_add): 234 | instance_of = pywikibot.Claim(self.repo, 'P31') 235 | instance_of.setTarget(pywikibot.ItemPage(self.repo, 'Q5')) 236 | part_of = pywikibot.Claim(self.repo, 'P361') 237 | part_of.setTarget(item) 238 | 239 | pywikibot.info(f'Creating item (relation "{relation}")...') 240 | new_item = pywikibot.ItemPage(self.repo) 241 | self.user_edit_entity( 242 | new_item, 243 | { 244 | 'labels': labels, 245 | 'claims': [instance_of.toJSON(), part_of.toJSON()] + to_add, 246 | }, 247 | asynchronous=False, 248 | summary='based on data in {}'.format( 249 | item.title(as_link=True, insite=self.repo) 250 | ) 251 | ) 252 | 253 | return new_item 254 | 255 | 256 | def main(*args): 257 | options = {} 258 | local_args = pywikibot.handle_args(args) 259 | site = pywikibot.Site() 260 | genFactory = pagegenerators.GeneratorFactory(site=site) 261 | for arg in genFactory.handle_args(local_args): 262 | if arg.startswith('-'): 263 | arg, sep, value = arg.partition(':') 264 | if value != '': 265 | options[arg[1:]] = value if not value.isdigit() else int(value) 266 | else: 267 | options[arg[1:]] = True 268 | 269 | generator = genFactory.getCombinedGenerator() 270 | bot = DuosManagingBot(generator=generator, site=site, **options) 271 | bot.run() 272 | 273 | 274 | if __name__ == '__main__': 275 | main() 276 | -------------------------------------------------------------------------------- /checkwiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import re 3 | 4 | import pywikibot 5 | import requests 6 | from pywikibot import pagegenerators 7 | from pywikibot.exceptions import UnknownExtension 8 | 9 | from checkwiki_errors import * 10 | from wikitext import WikitextFixingBot 11 | 12 | 13 | class CheckWikiSettings: 14 | 15 | prio_map = { 16 | '0': '', 17 | '1': 'high', 18 | '2': 'medium', 19 | '3': 'low' 20 | } 21 | 22 | def __init__(self, data): 23 | self.data = data 24 | 25 | def get_priority(self, error): 26 | return self.data[error]['priority'] 27 | 28 | def get_errors_by_priority(self, priority): 29 | for error, data in self.data.items(): 30 | if data['priority'] == priority: 31 | yield error 32 | 33 | @classmethod 34 | def new_from_text(cls, text, dbName): 35 | data = {} 36 | inside_setting = False 37 | setting = None 38 | setting_text = '' 39 | parsed_settings = {} 40 | for line in text.splitlines(): 41 | if inside_setting is False: 42 | match = re.match(' *([a-z0-9_]+) *=', 
line) 43 | if match is not None: 44 | setting = match[1] 45 | setting_text = '' 46 | inside_setting = True 47 | line = line[match.end():] 48 | 49 | if inside_setting is True: 50 | if 'END' in line: 51 | setting_text += line[:line.index('END')].strip() 52 | inside_setting = False 53 | parsed_settings[setting] = setting_text 54 | else: 55 | setting_text += line.strip() + '\n' 56 | 57 | project = parsed_settings.pop('project', dbName) 58 | for setting, text in parsed_settings.items(): 59 | split = setting.split('_') 60 | if len(split) != 4: 61 | continue 62 | if split[0] != 'error': 63 | continue 64 | if split[-1] != project: 65 | continue 66 | if not split[1].isdigit(): 67 | continue 68 | num = int(split[1]) 69 | if num > 500: 70 | continue 71 | data.setdefault(num, {}) 72 | if split[2] == 'prio': 73 | text = text.strip() 74 | if text in cls.prio_map.keys(): 75 | data[num]['priority'] = cls.prio_map[text] 76 | elif split[2] == 'whitelistpage': 77 | data[num].setdefault('whitelists', []).append(text) 78 | return cls(data) 79 | 80 | @classmethod 81 | def new_from_site(cls, site): 82 | try: 83 | page = site.page_from_repository('Q10784379') 84 | except (NotImplementedError, UnknownExtension) as e: 85 | pywikibot.error(e) 86 | return None 87 | return cls.new_from_text(page.text, site.dbName()) 88 | 89 | 90 | class CheckWikiErrorGenerator: 91 | 92 | def __init__(self, checkwiki, priorities=None, ids=None): 93 | self.checkwiki = checkwiki 94 | self.priorities = priorities or [] 95 | self.ids = ids or [] 96 | 97 | def __iter__(self): 98 | for error in self.ids: 99 | yield from self.checkwiki.iter_pages(error) 100 | already = set(self.ids) 101 | for prio in self.priorities: 102 | for error in self.checkwiki.settings.get_errors_by_priority(prio): 103 | if error not in already: 104 | yield from self.checkwiki.iter_pages(error) 105 | 106 | 107 | class CheckWiki: 108 | 109 | url = 'https://tools.wmflabs.org/checkwiki/cgi-bin/checkwiki_bots.cgi' 110 | 111 | errorMap = { 112 | 1: PrefixedTemplate, 113 | 2: BrokenHTMLTag, 114 | 7: LowHeadersLevel, 115 | 8: MissingEquation, 116 | 9: SingleLineCategories, 117 | #10: NoEndSquareBrackets, 118 | 11: HTMLEntity, 119 | 16: InvisibleChars, 120 | 17: DuplicateCategory, 121 | 18: LowerCaseCategory, 122 | 19: SingleEquationHeader, 123 | 20: Dagger, 124 | 21: EnglishCategory, 125 | 22: CategoryWithSpace, 126 | 25: HeaderHierarchy, 127 | 26: Bold, 128 | #27: Unicode, 129 | 32: MultiplePipes, 130 | 34: MagicWords, 131 | 38: Italics, 132 | 42: StrikedText, 133 | 44: BoldHeader, 134 | 48: SelfLink, 135 | 49: HTMLHeader, 136 | 50: EntitesAsDashes, 137 | 51: InterwikiBeforeHeader, 138 | 52: CategoriesBeforeHeader, 139 | 53: InterwikiBeforeCategory, 140 | 54: ListWithBreak, 141 | 57: HeaderWithColon, 142 | 59: ParameterWithBreak, 143 | 61: RefBeforePunctuation, 144 | 63: SmallInsideTags, 145 | #75: BadListStructure, 146 | #76: NoSpace, 147 | 80: BrokenExternalLink, 148 | 81: DuplicateReferences, 149 | 85: EmptyTag, 150 | 86: ExternalLinkLikeInternal, 151 | 88: DefaultsortSpace, 152 | 89: DefaultsortComma, 153 | 93: DoubleHttp, 154 | 101: Ordinals, 155 | 103: SuperfluousPipe, 156 | 104: ReferenceQuotes, 157 | } 158 | 159 | def __init__(self, site): 160 | self.site = site 161 | 162 | def purge(self): 163 | self.__cache = {} 164 | 165 | @property 166 | def site(self): 167 | return self._site 168 | 169 | @site.setter 170 | def site(self, value): 171 | self._site = value 172 | self.purge() 173 | self.load_settings() 174 | 175 | def load_settings(self): 176 | 
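        # The settings page loaded below is parsed by CheckWikiSettings.new_from_text:
        # a plain-text list of "key = value ... END" entries whose keys are suffixed
        # with the project name (dbName by default), e.g. (illustrative values only)
        #   error_054_prio_cswiki = 3 END
        #   error_054_whitelistpage_cswiki = Some/Whitelist_page END
        # where the numeric priority maps through prio_map ('1' high, '2' medium, '3' low).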
pywikibot.info('Loading CheckWiki settings...') 177 | self._settings = CheckWikiSettings.new_from_site(self.site) 178 | 179 | @property 180 | def settings(self): 181 | if not hasattr(self, '_settings'): 182 | self.load_settings() 183 | return self._settings 184 | 185 | def get_error(self, number): 186 | return self.__cache.setdefault(number, self.errorMap[number](self)) 187 | 188 | def iter_errors(self, numbers=None, only_for_fixes=False, priorities=None): 189 | for num in self.errorMap: 190 | if numbers and num not in numbers: 191 | continue 192 | if priorities and self.settings.get_priority(num) not in priorities: 193 | continue 194 | 195 | error = self.get_error(num) 196 | if only_for_fixes and not error.isForFixes(): 197 | continue 198 | 199 | yield error 200 | 201 | def apply(self, text, page, replaced=[], fixed=[], errors=[], **kwargs): 202 | # todo: use a graph algorithm 203 | errors = list(self.iter_errors(set(errors))) 204 | while errors: 205 | error = errors.pop(0) 206 | if error.needsDecision() or error.handledByCC(): # todo 207 | continue 208 | 209 | numbers = [err.number for err in errors] 210 | i = max([numbers.index(num) for num in error.needsFirst 211 | if num in numbers] + [0]) 212 | if i > 0: 213 | errors.insert(i, error) 214 | continue 215 | 216 | new_text = error.apply(text, page) 217 | if new_text != text: 218 | text = new_text 219 | summary = error.summary 220 | fixed.append(error.number) 221 | if summary not in replaced: 222 | replaced.append(summary) 223 | 224 | return text 225 | 226 | def iter_titles(self, num, **kwargs): 227 | data = { 228 | 'action': 'list', 229 | 'id': num, 230 | 'project': self.site.dbName(), 231 | } 232 | for line in self.get(data, **kwargs).iter_lines(): 233 | yield line.decode().replace('title=', '') # fixme: b/c 234 | 235 | def iter_pages(self, num, **kwargs): 236 | for title in self.iter_titles(num, **kwargs): 237 | yield pywikibot.Page(self.site, title) 238 | 239 | def get(self, data, **kwargs): 240 | return requests.get(self.url, data, **kwargs) 241 | 242 | def post(self, data, **kwargs): 243 | return requests.post(self.url, data, **kwargs) 244 | 245 | def mark_as_fixed(self, page, error): 246 | data = { 247 | 'action': 'mark', 248 | 'id': error, 249 | 'project': page.site.dbName(), 250 | 'title': page.title(), 251 | } 252 | return self.post(data) 253 | 254 | def mark_as_fixed_multiple(self, page, errors): 255 | for error in errors: 256 | self.mark_as_fixed(page, error) 257 | 258 | @staticmethod 259 | def parse_option(option): 260 | ids = [] 261 | priorities = [] 262 | for part in option.split(','): 263 | if part.isdigit(): 264 | ids.append(int(part)) 265 | elif part in CheckWikiSettings.prio_map.values(): 266 | priorities.append(part) 267 | return ids, priorities 268 | 269 | 270 | class CheckWikiBot(WikitextFixingBot): 271 | 272 | def __init__(self, checkwiki, numbers, **kwargs): 273 | kwargs['checkwiki'] = False 274 | super().__init__(**kwargs) 275 | self.checkwiki = checkwiki 276 | self.numbers = numbers 277 | 278 | def treat_page(self): 279 | page = self.current_page 280 | replaced = [] 281 | fixed = [] 282 | text = self.checkwiki.apply( 283 | page.text, page, replaced, fixed, self.numbers) 284 | summary = 'opravy dle [[WP:WCW|CheckWiki]]: %s' % ', '.join(replaced) 285 | self.put_current( 286 | text, summary=summary, 287 | callback=lambda *args: self.mark_as_fixed_on_success(fixed, *args)) 288 | 289 | def mark_as_fixed_on_success(self, numbers, page, exc=None): 290 | if exc is not None: 291 | return 292 | 
self.checkwiki.mark_as_fixed_multiple(page, numbers) 293 | 294 | 295 | def main(*args): 296 | options = {} 297 | local_args = pywikibot.handle_args(args) 298 | site = pywikibot.Site() 299 | checkwiki = CheckWiki(site) 300 | genFactory = pagegenerators.GeneratorFactory(site=site) 301 | numbers = [] 302 | gens = [] 303 | for arg in genFactory.handle_args(local_args): 304 | if arg.startswith('-checkwiki:'): 305 | ids, priorities = checkwiki.parse_option(arg.partition(':')[2]) 306 | gen = CheckWikiErrorGenerator( 307 | checkwiki, ids=ids, priorities=priorities) 308 | gens.append(gen) 309 | continue 310 | if arg.startswith('-'): 311 | arg, sep, value = arg.partition(':') 312 | if value != '': 313 | options[arg[1:]] = int(value) if value.isdigit() else value 314 | else: 315 | options[arg[1:]] = True 316 | else: 317 | numbers.extend(checkwiki.parse_option(arg)[0]) 318 | 319 | if gens: 320 | genFactory.gens.extend(gens) 321 | generator = genFactory.getCombinedGenerator(preload=True) 322 | if not generator: 323 | genFactory.gens.append(CheckWikiErrorGenerator(checkwiki, ids=numbers)) 324 | generator = genFactory.getCombinedGenerator(preload=True) 325 | 326 | bot = CheckWikiBot(checkwiki, numbers, generator=generator, 327 | site=site, **options) 328 | bot.run() 329 | 330 | 331 | if __name__ == '__main__': 332 | main() 333 | --------------------------------------------------------------------------------
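For reference, the `CheckWiki` helper in `checkwiki.py` can also be driven directly from Python. The sketch below is illustrative only (it assumes this scripts directory is importable and that the CheckWiki web service is reachable); it lists the pages flagged with error 81 (duplicate references) on the configured site:
```
#!/usr/bin/python
# Illustrative sketch, not part of the repository.
import pywikibot

from checkwiki import CheckWiki

site = pywikibot.Site()
checkwiki = CheckWiki(site)  # loads the per-wiki settings page on construction
for page in checkwiki.iter_pages(81):  # 81 maps to DuplicateReferences in errorMap
    pywikibot.info(page.title())
```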