├── data
│   ├── missing.csv
│   ├── input.csv
│   ├── new_york.csv
│   ├── prague.csv
│   ├── capitals.csv
│   └── long_list.csv
├── .gitignore
├── requirements.txt
├── LICENSE
├── liftwing.py
├── README.md
└── checkDataQuality.py
/data/missing.csv:
--------------------------------------------------------------------------------
Q6
--------------------------------------------------------------------------------
/data/input.csv:
--------------------------------------------------------------------------------
long_list.csv
--------------------------------------------------------------------------------
/data/new_york.csv:
--------------------------------------------------------------------------------
Q60
--------------------------------------------------------------------------------
/data/prague.csv:
--------------------------------------------------------------------------------
Q1085,Prague
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
*.iml
*.out.*
*.log
--------------------------------------------------------------------------------
/data/capitals.csv:
--------------------------------------------------------------------------------
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.7.4.post0
async-timeout==3.0.1
attrs==21.2.0
chardet==4.0.0
idna==3.2
multidict==5.1.0
numpy==1.21.2
typing-extensions==3.10.0.0
yarl==1.6.3
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2020, Wikimedia Deutschland e. V.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/data/long_list.csv:
--------------------------------------------------------------------------------
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
--------------------------------------------------------------------------------
/liftwing.py:
--------------------------------------------------------------------------------
import asyncio
import logging
from collections import defaultdict
from typing import List

import aiohttp


async def get_liftwing_response(
    wiki_id: str,
    model_name: str,
    rev_id: int,
    features: bool,
    liftwing_url: str,
) -> dict:
    url = f"{liftwing_url}/v1/models/{wiki_id}-{model_name}:predict"

    data = {"rev_id": rev_id, "extended_output": features}
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(url, json=data) as response:
                response_json = await response.json()
        except aiohttp.ClientError as e:
            logging.error(
                f"LiftWing call for model {model_name} and rev-id {rev_id} failed: {e}"
            )
            # return an empty dict so merge_liftwing_responses skips this response
            return {}
    return response_json
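
# Each LiftWing response nests its scores under the wiki id;
# merge_liftwing_responses (below) folds the per-model responses into a single
# dict per rev-id, roughly this shape (illustrative rev-id, truncated scores):
#
#   {"wikidatawiki": {"scores": {"123456": {"itemquality": {...},
#                                           "damaging": {...}}}}}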
def merge_liftwing_responses(wiki_id: str, responses: List[dict]) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: defaultdict()))
    for d in responses:
        if not d:
            continue
        for k, v in d[wiki_id].items():
            if isinstance(v, dict) and k == "scores":
                for rev_id, scores in v.items():
                    if rev_id in result[wiki_id][k]:
                        result[wiki_id][k][rev_id].update(scores)
                    else:
                        result[wiki_id][k][rev_id] = scores
            else:
                result[wiki_id][k].update(v)
    return result


async def make_liftwing_calls(
    wiki_id: str,
    models: List[str],
    rev_ids: List[int],
    features: bool = False,
    liftwing_url: str = "https://api.wikimedia.org/service/lw/inference",
):
    tasks = [
        get_liftwing_response(
            wiki_id=wiki_id,
            model_name=model,
            rev_id=revid,
            features=features,
            liftwing_url=liftwing_url,
        )
        for revid in rev_ids
        for model in models
    ]
    result = await asyncio.gather(*tasks)
    return merge_liftwing_responses(wiki_id, result)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wikidata Constraints Violation Checker

The Wikidata Constraints Violation Checker allows you to analyze the number of constraint violations on a list of Wikidata Items. This is useful for understanding which Items need improvement the most and for assessing the data quality of a specific area of Wikidata.

## Installation
This script requires at least Python 3.6. In your terminal, run:

```bash
git clone https://github.com/wmde/wikidata-constraints-violation-checker.git
cd wikidata-constraints-violation-checker
pip3 install -r requirements.txt
```

## Usage

```bash
# To run the script with an input file
python3 checkDataQuality.py -i <input file>

# To run the script using randomly generated Item IDs
python3 checkDataQuality.py -r <number of Items>

# You can also specify an output filename
python3 checkDataQuality.py -i <input file> -o <output file>

# Or a batch size
python3 checkDataQuality.py -r <number of Items> -b <batch size>
```

| Arg | Name                    | Description                                                                    |
| :-: | ----------------------- | ------------------------------------------------------------------------------ |
| -i  | Input file              | The path to the file containing the input data                                  |
| -r  | Randomly generate Items | The number of Items to randomly generate                                        |
| -o  | Output file             | The path to the file for output                                                 |
| -b  | Batch size              | The list of Items is broken down into batches for processing. Default value is 10 |
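
For example, to check the Items listed in `data/capitals.csv` in batches of 5 and write the results to a custom file (the output file name here is just an illustration):

```bash
python3 checkDataQuality.py -i data/capitals.csv -o capitals.out.csv -b 5
```

Generated `*.out.*` files are already covered by the repository's `.gitignore`.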

## Input Data

The script can read CSV files or generate random Item IDs.

### CSV File

Example input file; the first column will be used to query for constraint violations:

```csv
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
```

## Output Data

The following fields are provided in the output data for Items that are successfully checked.

| Field                       | Description                                                                                                                     |
| :-------------------------: | ------------------------------------------------------------------------------------------------------------------------------- |
| QID                         | The unique Item identifier                                                                                                        |
| statements                  | Total number of statements on the Item                                                                                            |
| violations_mandatory_level  | # of violations at a [mandatory level](https://www.wikidata.org/wiki/Wikidata:2020_report_on_Property_constraints#mandatory)      |
| violations_normal_level     | # of violations at a [normal level](https://www.wikidata.org/wiki/Wikidata:2020_report_on_Property_constraints#normal)            |
| violations_suggestion_level | # of violations at a [suggestion level](https://www.wikidata.org/wiki/Wikidata:2020_report_on_Property_constraints#suggestion)    |
| violated_statements         | # of statements with violations                                                                                                   |
| total_sitelinks             | # of sitelinks on the Item                                                                                                        |
| wikipedia_sitelinks         | # of sitelinks to Wikipedia                                                                                                       |
| ores_score                  | [ORES Item quality score](https://www.wikidata.org/wiki/Wikidata:Item_quality), from 1 to 5 (lowest to highest)                   |

## Note

Please be aware that some large Items are skipped during the analysis because the constraint check API times out for them.
--------------------------------------------------------------------------------
/checkDataQuality.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import asyncio
import csv
import getopt
import json
import os
import random
import sys
from datetime import datetime

import numpy
from aiohttp import ClientSession

from liftwing import make_liftwing_calls

OUTPUT_DELIMITER = ';'
STATEMENT_COUNT_URL = 'https://www.wikidata.org/w/api.php?format=json&action=query&prop=pageprops|revisions&ppprop=wb-claims&rvprop=ids'
SITELINK_COUNT_URL = 'https://www.wikidata.org/w/api.php?format=json&action=wbgetentities&props=sitelinks'
CONSTRAINT_CHECK_URL = 'https://www.wikidata.org/w/api.php?format=json&action=wbcheckconstraints'

# The ORES score is the sum of the itemquality class probabilities, each
# weighted by its class, see ORES on https://www.wikidata.org/wiki/Wikidata:Item_quality#ORES
ORES_WEIGHTS = {
    "E": 1,
    "D": 2,
    "C": 3,
    "B": 4,
    "A": 5
}
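# For example, if the itemquality model returned the (hypothetical) class
# probabilities {"A": 0.05, "B": 0.10, "C": 0.20, "D": 0.40, "E": 0.25},
# the weighted score would be 0.05*5 + 0.10*4 + 0.20*3 + 0.40*2 + 0.25*1 = 2.3
# (see fetchOresScore below).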

batchSize = 10

def usage(exitCode=None):
    print('checkDataQuality.py -i <input file> | -r <number of random Items> [-o <output file> -b <batch size>]')
    if exitCode is not None:
        sys.exit(exitCode)


def parseArguments(argv):
    global batchSize
    numberOfRandomItems = False
    outputFileName = ''
    inputFileName = ''

    try:
        opts, args = getopt.getopt(argv, "hi:o:r:b:", ["help", "ifile=", "ofile=", "random=", "batch-size="])
    except getopt.GetoptError:
        usage(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(0)
        elif opt in ("-i", "--ifile"):
            inputFileName = arg
        elif opt in ("-o", "--ofile"):
            outputFileName = arg
        elif opt in ("-r", "--random"):
            numberOfRandomItems = arg
        elif opt in ("-b", "--batch-size"):
            batchSize = int(arg)

    # exactly one of -i and -r must be given
    if not (inputFileName or numberOfRandomItems) or (inputFileName and numberOfRandomItems):
        usage(2)

    if inputFileName:
        if not outputFileName:
            name, extension = os.path.splitext(inputFileName)
            outputFileName = name + ".out" + extension
        startMessage = 'checking quality on items from input file ' + inputFileName +\
            ', write to ' + outputFileName
    else:
        if not outputFileName:
            outputFileName = "./random-" + datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + ".out.csv"
        startMessage = 'checking quality on ' + str(numberOfRandomItems) + ' random items' +\
            ', write to ' + outputFileName

    startMessage += ', processing in batches of ' + str(batchSize)

    print(startMessage)
    return numberOfRandomItems, outputFileName, inputFileName

def generateRandomItemIds(numberofItems):
    items = []
    for i in range(numberofItems):
        randomId = 'Q' + str(random.randint(1, 100000000))
        items.append(randomId)

    return items

# generator to create batches of random Q-IDs,
# then fetch the number of statements for them from the query API
async def queryRandomItems(numberOfItems):
    counter = 0
    while counter < numberOfItems:
        batchOfIds = generateRandomItemIds(min(batchSize, numberOfItems - counter))
        batchOfResults = await fetchNumberOfStatements(batchOfIds)
        counter += len(batchOfResults)
        yield batchOfResults

# generator to read batches of Q-IDs from a file,
# then fetch the number of statements for them from the query API
async def queryItemsFromFile(inputFileName):
    with open(inputFileName, newline='') as inputFile:
        lines = [row[0] for row in csv.reader(inputFile) if row]

    # ceiling division so that batches hold at most batchSize items
    numberOfBatches = max(1, -(-len(lines) // batchSize))
    batches = numpy.array_split(lines, numberOfBatches)
    for batchOfIds in batches:
        batchOfResults = await fetchNumberOfStatements(batchOfIds)
        yield batchOfResults

def printHeader(outputFileName):
    with open(outputFileName, 'w') as outputFile:
        print(OUTPUT_DELIMITER.join([
            'QID',
            'statements',
            'violations_mandatory_level',
            'violations_normal_level',
            'violations_suggestion_level',
            'violated_statements',
            'total_sitelinks',
            'wikipedia_sitelinks',
            'ores_score'
        ]), file=outputFile)

def printResults(batchOfResults, outputFileName):
    with open(outputFileName, 'a') as outputFile:
        for itemId, itemResults in batchOfResults.items():
            if 'failed' in itemResults.keys():
                continue

            # list of str-mapped values, delimited by OUTPUT_DELIMITER
            print(OUTPUT_DELIMITER.join(map(str, [
                itemId,
                itemResults['statements'],
                itemResults['violations_mandatory'],
                itemResults['violations_normal'],
                itemResults['violations_suggestion'],
                itemResults['violated_statements'],
                itemResults['total_sitelinks'],
                itemResults['wikipedia_sitelinks'],
                itemResults['ores_score'],
            ])), file=outputFile)

def logException(exception):
    with open('error.log', 'a') as outputFile:
        print(exception, file=outputFile)

def logErrorMessage(message):
    with open('error.log', 'a') as outputFile:
        print(message, file=outputFile)

def displayProgress(step, overwrite=True):
    character = ''
    if step < 0:
        # ANSI escape sequence for 'ERROR' (red)
        character = '\033[91m'
        step *= -1

    if step == 0:
        character += '.'
    elif step == 1:
        character += '-'
    elif step == 2:
        character += '+'
    elif step == 99:
        character += '|'

    if overwrite:
        character = '\b' + character

    # turn off color
    character += '\033[0m'

    print(character, end='', flush=True)
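
# fetchNumberOfStatements reads the statement count from each Item's page props.
# A trimmed response from the query API looks roughly like this (the page id,
# revid and claim count are hypothetical):
#
#   {"query": {"pages": {"316": {"title": "Q64",
#                                "pageprops": {"wb-claims": "771"},
#                                "revisions": [{"revid": 123456}]}}}}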
async def fetchNumberOfStatements(itemIds):
    # Returns a dictionary of items, each with its revision id and number of statements
    batchOfResults = {}
    async with ClientSession() as session:
        async with session.get(STATEMENT_COUNT_URL + '&titles=' + '|'.join(itemIds)) as statementCountResponse:
            statementCountResponse = await statementCountResponse.read()
            r = json.loads(str(statementCountResponse, 'utf-8'))

    for page in r['query']['pages'].values():
        if 'pageprops' not in page:
            logErrorMessage("Item " + page['title'] + ' does not exist or is a redirect.')
            continue

        # add revid and number of statements to the item's results dictionary in batchOfResults
        results = {
            'revid': page['revisions'][0]['revid'],
            'statements': page['pageprops']['wb-claims'],
        }
        batchOfResults.update({page['title']: results})

    return batchOfResults

async def fetchNumberOfSitelinks(batchOfResults):
    # Gets a dictionary of itemIds and their statement count results
    # and adds to it the total number of sitelinks and the number of Wikipedia sitelinks per itemId
    async with ClientSession() as session:
        async with session.get(SITELINK_COUNT_URL + '&ids=' + '|'.join(batchOfResults.keys())) as sitelinksResponse:
            sitelinksResponse = await sitelinksResponse.read()
            r = json.loads(str(sitelinksResponse, 'utf-8'))

    if 'entities' not in r:
        raise Exception("could not find sitelinks for items", batchOfResults.keys())

    for itemId, item in r['entities'].items():
        total_sitelinks = item['sitelinks']
        # Wikipedia site ids end in 'wiki'; Commons and Wikispecies are not Wikipedias
        wikipedia_sitelinks = {k: v for k, v in total_sitelinks.items()
                               if k.endswith('wiki') and k not in ['commonswiki', 'specieswiki']}
        # add total and wikipedia sitelinks to the item's results dictionary in batchOfResults
        results = {'total_sitelinks': len(total_sitelinks), 'wikipedia_sitelinks': len(wikipedia_sitelinks)}
        batchOfResults[itemId].update(results)
    return batchOfResults

async def checkConstraints(batchOfResults):
    items = '|'.join(batchOfResults.keys())
    async with ClientSession() as session:
        async with session.get(CONSTRAINT_CHECK_URL + '&id=' + items) as r:
            if r.status != 200:
                raise Exception(
                    'wbcheckconstraints API returned status code ' +
                    str(r.status) + ' for item(s) ' + items
                )

            r = await r.read()

    jsonResponse = json.loads(str(r, 'utf-8'))
    if 'error' in jsonResponse:
        raise Exception(
            'wbcheckconstraints API returned error \'' +
            jsonResponse['error']['code'] +
            '\' for items ' + items
        )
    for itemId in jsonResponse['wbcheckconstraints']:
        itemCheck = jsonResponse['wbcheckconstraints'][itemId]
        constraintCheckResults = parseItemCheck(itemCheck)
        batchOfResults[itemId].update(constraintCheckResults)

    return batchOfResults
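
# parseItemCheck walks the nested wbcheckconstraints response: for every
# statement it visits the mainsnak results plus any qualifier and reference
# results. The structure it traverses is roughly (illustrative property id,
# heavily truncated):
#
#   {"claims": {"P31": [{"mainsnak": {"results": [{"status": "warning"}]},
#                        "qualifiers": {...},
#                        "references": [...]}]}}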
def parseItemCheck(jsonConstraintCheckResponse):
    results = {
        'violations_mandatory': 0,
        'violations_normal': 0,
        'violations_suggestion': 0,
        'violated_statements': 0,
        'statement_is_violated': False
    }
    claims = jsonConstraintCheckResponse['claims']

    # claims is a list (not a dict) if it's empty... yikes.
    if not isinstance(claims, dict):
        # no statements -> no violations
        return results

    for (property_id, statement_group) in claims.items():
        for statement in statement_group:
            results['statement_is_violated'] = False

            violated_mainsnaks = statement['mainsnak']['results']
            for violated_mainsnak in violated_mainsnaks:
                results = countResults(violated_mainsnak['status'], results)

            if 'qualifiers' in statement.keys():
                qualifier_items = statement['qualifiers'].items()
                for (qualifier_property_id, qualifier_item) in qualifier_items:
                    for qualifier_constraint_check in qualifier_item:
                        qualifier_results = qualifier_constraint_check['results']
                        for qualifier_result in qualifier_results:
                            results = countResults(qualifier_result['status'], results)

            if 'references' in statement.keys():
                reference_items = statement['references']
                for reference_item in reference_items:
                    for (snak_property_id, reference_constraint_checks) in reference_item['snaks'].items():
                        for reference_constraint_check in reference_constraint_checks:
                            reference_results = reference_constraint_check['results']
                            for reference_result in reference_results:
                                results = countResults(reference_result['status'], results)

    del results['statement_is_violated']
    return results

def countResults(status, results):
    # 'bad-parameters' means the constraint itself is broken; ignore it
    if status == 'bad-parameters':
        return results

    if status == 'violation':
        results['violations_mandatory'] += 1
    elif status == 'warning':
        results['violations_normal'] += 1
    elif status == 'suggestion':
        results['violations_suggestion'] += 1
    else:
        # any other status (e.g. 'compliance') is not a violation
        return results

    if not results['statement_is_violated']:
        results['statement_is_violated'] = True
        results['violated_statements'] += 1

    return results

async def checkQualityByBatch(batchOfItems):
    try:
        batchOfItems = await checkConstraints(batchOfItems)
    except Exception as ex:
        logErrorMessage("failed to check quality constraints on items " +
                        '|'.join(batchOfItems.keys()))
        logErrorMessage("now checking them one-by-one")
        logException(ex)
        for itemId, itemResults in batchOfItems.items():
            checkedItemResults = await checkQualityByItem(itemId, itemResults)
            batchOfItems[itemId].update(checkedItemResults)

    return batchOfItems

async def checkQualityByItem(itemId, itemResults):
    try:
        checkedBatch = await checkConstraints({itemId: itemResults})
    except Exception as ex:
        logErrorMessage("failed to check quality constraints on item " + itemId)
        logException(ex)
        return {'failed': True}

    return checkedBatch[itemId]
async def fetchOresScore(batchOfItems):
    # collect Q-ids and revids from the items dictionary (revid -> Q-id)
    itemIds = {}
    for itemId, results in batchOfItems.items():
        itemIds[results['revid']] = itemId

    r = await make_liftwing_calls(
        wiki_id="wikidatawiki",
        models=["damaging", "goodfaith", "itemquality", "itemtopic"],
        rev_ids=list(itemIds.keys()),
    )

    if 'wikidatawiki' not in r:
        logErrorMessage("no ORES scores found for items " + '|'.join(itemIds.values()))
        return batchOfItems

    for revid, score in r['wikidatawiki']['scores'].items():
        itemId = itemIds[int(revid)]
        # only the itemquality score is used for the output
        probability = score['itemquality']['score']['probability']
        # weighted sum of the class probabilities, see ORES_WEIGHTS above
        weightedSum = 0
        for x in probability:
            weightedSum += probability[x] * ORES_WEIGHTS[x]
        batchOfItems[itemId].update({'ores_score': round(weightedSum, 2)})

    return batchOfItems

async def main(argv):
    numberOfItems, outputFileName, inputFileName = parseArguments(argv)

    printHeader(outputFileName)

    if numberOfItems:
        # we use randomly generated Q-IDs
        batchesOfItems = queryRandomItems(int(numberOfItems))
    else:
        # we read the Q-IDs from a file
        batchesOfItems = queryItemsFromFile(inputFileName)

    async for batch in batchesOfItems:
        itemsWithSitelinks = await fetchNumberOfSitelinks(batch)
        itemsWithConstraintChecks = await checkQualityByBatch(itemsWithSitelinks)
        itemsWithOresScore = await fetchOresScore(itemsWithConstraintChecks)
        printResults(itemsWithOresScore, outputFileName)
        print('', len(itemsWithOresScore))

    print()

loop = asyncio.new_event_loop()
loop.run_until_complete(main(sys.argv[1:]))
loop.close()
--------------------------------------------------------------------------------