├── data
│   ├── missing.csv
│   ├── input.csv
│   ├── new_york.csv
│   ├── prague.csv
│   ├── capitals.csv
│   └── long_list.csv
├── .gitignore
├── requirements.txt
├── LICENSE
├── liftwing.py
├── README.md
└── checkDataQuality.py
/data/missing.csv:
--------------------------------------------------------------------------------
Q6
--------------------------------------------------------------------------------
/data/input.csv:
--------------------------------------------------------------------------------
long_list.csv
--------------------------------------------------------------------------------
/data/new_york.csv:
--------------------------------------------------------------------------------
Q60
--------------------------------------------------------------------------------
/data/prague.csv:
--------------------------------------------------------------------------------
Q1085,Prague
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
*.iml
*.out.*
*.log
--------------------------------------------------------------------------------
/data/capitals.csv:
--------------------------------------------------------------------------------
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.7.4.post0
async-timeout==3.0.1
attrs==21.2.0
chardet==4.0.0
idna==3.2
multidict==5.1.0
numpy==1.21.2
typing-extensions==3.10.0.0
yarl==1.6.3
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2020, Wikimedia Deutschland e. V.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/data/long_list.csv:
--------------------------------------------------------------------------------
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
Q239,Brussels
Q270,Warsaw
Q727,Amsterdam
Q1741,Vienna
Q1748,Copenhagen
Q1842,Luxembourg
--------------------------------------------------------------------------------
/liftwing.py:
--------------------------------------------------------------------------------
import asyncio
import logging
from collections import defaultdict
from typing import List

import aiohttp


async def get_liftwing_response(
    wiki_id: str,
    model_name: str,
    rev_id: int,
    features: bool,
    liftwing_url: str,
) -> dict:
    url = f"{liftwing_url}/v1/models/{wiki_id}-{model_name}:predict"

    data = {"rev_id": rev_id, "extended_output": features}
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(url, json=data) as response:
                response_json = await response.json()
        except aiohttp.ClientError as e:
            logging.error(
                f"LiftWing call for model {model_name} and rev-id {rev_id} failed: {e}"
            )
            # return an empty dict so merge_liftwing_responses skips this response
            return {}
    return response_json
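
# Each LiftWing response nests its scores under the wiki id;
# merge_liftwing_responses (below) folds the per-model responses into a single
# dict per rev-id, roughly this shape (illustrative rev-id, truncated scores):
#
#   {"wikidatawiki": {"scores": {"123456": {"itemquality": {...},
#                                           "damaging": {...}}}}}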
def merge_liftwing_responses(wiki_id: str, responses: List[dict]) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: defaultdict()))
    for d in responses:
        if not d:
            continue
        for k, v in d[wiki_id].items():
            if isinstance(v, dict) and k == "scores":
                for rev_id, scores in v.items():
                    if rev_id in result[wiki_id][k]:
                        result[wiki_id][k][rev_id].update(scores)
                    else:
                        result[wiki_id][k][rev_id] = scores
            else:
                result[wiki_id][k].update(v)
    return result


async def make_liftwing_calls(
    wiki_id: str,
    models: List[str],
    rev_ids: List[int],
    features: bool = False,
    liftwing_url: str = "https://api.wikimedia.org/service/lw/inference",
):
    tasks = [
        get_liftwing_response(
            wiki_id=wiki_id,
            model_name=model,
            rev_id=revid,
            features=features,
            liftwing_url=liftwing_url,
        )
        for revid in rev_ids
        for model in models
    ]
    result = await asyncio.gather(*tasks)
    return merge_liftwing_responses(wiki_id, result)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wikidata Constraints Violation Checker

The Wikidata Constraints Violation Checker allows you to analyze the number of constraint violations on a list of Wikidata Items. This is useful for understanding which Items need improvement the most and for assessing the data quality of a specific area of Wikidata.

## Installation
This script requires at least Python 3.6. In your terminal, run:

```bash
git clone https://github.com/wmde/wikidata-constraints-violation-checker.git
cd wikidata-constraints-violation-checker
pip3 install -r requirements.txt
```

## Usage

```bash
# To run the script with an input file
python3 checkDataQuality.py -i <input file>

# To run the script using randomly generated Item IDs
python3 checkDataQuality.py -r <number of Items>

# You can also specify an output filename
python3 checkDataQuality.py -i <input file> -o <output file>

# Or a batch size
python3 checkDataQuality.py -r <number of Items> -b <batch size>
```

| Arg | Name                    | Description                                                                    |
| :-: | ----------------------- | ------------------------------------------------------------------------------ |
| -i  | Input file              | The path to the file containing the input data                                  |
| -r  | Randomly generate Items | The number of Items to randomly generate                                        |
| -o  | Output file             | The path to the file for output                                                 |
| -b  | Batch size              | The list of Items is broken down into batches for processing. Default value is 10 |
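
For example, to check the Items listed in `data/capitals.csv` in batches of 5 and write the results to a custom file (the output file name here is just an illustration):

```bash
python3 checkDataQuality.py -i data/capitals.csv -o capitals.out.csv -b 5
```

Generated `*.out.*` files are already covered by the repository's `.gitignore`.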

## Input Data

The script can read CSV files or generate random Item IDs.

### CSV File

Example input file; the first column will be used to query for constraint violations:

```csv
Q60,New York
Q64,Berlin
Q70,Bern
Q84,London
Q90,Paris
```

## Output Data

The following fields are provided in the output data for Items that are successfully checked.

| Field                       | Description                                                                                                                     |
| :-------------------------: | ------------------------------------------------------------------------------------------------------------------------------- |
| QID                         | The unique Item identifier                                                                                                        |
| statements                  | Total number of statements on the Item                                                                                            |
| violations_mandatory_level  | # of violations at a [mandatory level](https://www.wikidata.org/wiki/Wikidata:2020_report_on_Property_constraints#mandatory)      |
| violations_normal_level     | # of violations at a [normal level](https://www.wikidata.org/wiki/Wikidata:2020_report_on_Property_constraints#normal)            |
| violations_suggestion_level | # of violations at a [suggestion level](https://www.wikidata.org/wiki/Wikidata:2020_report_on_Property_constraints#suggestion)    |
| violated_statements         | # of statements with violations                                                                                                   |
| total_sitelinks             | # of sitelinks on the Item                                                                                                        |
| wikipedia_sitelinks         | # of sitelinks to Wikipedia                                                                                                       |
| ores_score                  | [ORES Item quality score](https://www.wikidata.org/wiki/Wikidata:Item_quality), from 1 to 5 (lowest to highest)                   |

## Note

Please be aware that some large Items are skipped during the analysis because the constraint check API times out for them.
--------------------------------------------------------------------------------
/checkDataQuality.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import asyncio
import csv
import getopt
import json
import os
import random
import sys
from datetime import datetime

import numpy
from aiohttp import ClientSession

from liftwing import make_liftwing_calls

OUTPUT_DELIMITER = ';'
STATEMENT_COUNT_URL = 'https://www.wikidata.org/w/api.php?format=json&action=query&prop=pageprops|revisions&ppprop=wb-claims&rvprop=ids'
SITELINK_COUNT_URL = 'https://www.wikidata.org/w/api.php?format=json&action=wbgetentities&props=sitelinks'
CONSTRAINT_CHECK_URL = 'https://www.wikidata.org/w/api.php?format=json&action=wbcheckconstraints'

# The ORES score is the sum of the itemquality class probabilities, each
# weighted by its class, see ORES on https://www.wikidata.org/wiki/Wikidata:Item_quality#ORES
ORES_WEIGHTS = {
    "E": 1,
    "D": 2,
    "C": 3,
    "B": 4,
    "A": 5
}
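# For example, if the itemquality model returned the (hypothetical) class
# probabilities {"A": 0.05, "B": 0.10, "C": 0.20, "D": 0.40, "E": 0.25},
# the weighted score would be 0.05*5 + 0.10*4 + 0.20*3 + 0.40*2 + 0.25*1 = 2.3
# (see fetchOresScore below).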

batchSize = 10

def usage(exitCode=None):
    print('checkDataQuality.py -i <input file> | -r <number of random Items> [-o <output file> -b <batch size>]')
    if exitCode is not None:
        sys.exit(exitCode)


def parseArguments(argv):
    global batchSize
    numberOfRandomItems = False
    outputFileName = ''
    inputFileName = ''

    try:
        opts, args = getopt.getopt(argv, "hi:o:r:b:", ["help", "ifile=", "ofile=", "random=", "batch-size="])
    except getopt.GetoptError:
        usage(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(0)
        elif opt in ("-i", "--ifile"):
            inputFileName = arg
        elif opt in ("-o", "--ofile"):
            outputFileName = arg
        elif opt in ("-r", "--random"):
            numberOfRandomItems = arg
        elif opt in ("-b", "--batch-size"):
            batchSize = int(arg)

    # exactly one of -i and -r must be given
    if not (inputFileName or numberOfRandomItems) or (inputFileName and numberOfRandomItems):
        usage(2)

    if inputFileName:
        if not outputFileName:
            name, extension = os.path.splitext(inputFileName)
            outputFileName = name + ".out" + extension
        startMessage = 'checking quality on items from input file ' + inputFileName +\
            ', write to ' + outputFileName
    else:
        if not outputFileName:
            outputFileName = "./random-" + datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + ".out.csv"
        startMessage = 'checking quality on ' + str(numberOfRandomItems) + ' random items' +\
            ', write to ' + outputFileName

    startMessage += ', processing in batches of ' + str(batchSize)

    print(startMessage)
    return numberOfRandomItems, outputFileName, inputFileName

def generateRandomItemIds(numberofItems):
    items = []
    for i in range(numberofItems):
        randomId = 'Q' + str(random.randint(1, 100000000))
        items.append(randomId)

    return items

# generator to create batches of random Q-IDs,
# then fetch the number of statements for them from the query API
async def queryRandomItems(numberOfItems):
    counter = 0
    while counter < numberOfItems:
        batchOfIds = generateRandomItemIds(min(batchSize, numberOfItems - counter))
        batchOfResults = await fetchNumberOfStatements(batchOfIds)
        counter += len(batchOfResults)
        yield batchOfResults

# generator to read batches of Q-IDs from a file,
# then fetch the number of statements for them from the query API
async def queryItemsFromFile(inputFileName):
    with open(inputFileName, newline='') as inputFile:
        lines = [row[0] for row in csv.reader(inputFile) if row]

    # ceiling division so that batches hold at most batchSize items
    numberOfBatches = max(1, -(-len(lines) // batchSize))
    batches = numpy.array_split(lines, numberOfBatches)
    for batchOfIds in batches:
        batchOfResults = await fetchNumberOfStatements(batchOfIds)
        yield batchOfResults

def printHeader(outputFileName):
    with open(outputFileName, 'w') as outputFile:
        print(OUTPUT_DELIMITER.join([
            'QID',
            'statements',
            'violations_mandatory_level',
            'violations_normal_level',
            'violations_suggestion_level',
            'violated_statements',
            'total_sitelinks',
            'wikipedia_sitelinks',
            'ores_score'
        ]), file=outputFile)

def printResults(batchOfResults, outputFileName):
    with open(outputFileName, 'a') as outputFile:
        for itemId, itemResults in batchOfResults.items():
            if 'failed' in itemResults.keys():
                continue

            # list of str-mapped values, delimited by OUTPUT_DELIMITER
            print(OUTPUT_DELIMITER.join(map(str, [
                itemId,
                itemResults['statements'],
                itemResults['violations_mandatory'],
                itemResults['violations_normal'],
                itemResults['violations_suggestion'],
                itemResults['violated_statements'],
                itemResults['total_sitelinks'],
                itemResults['wikipedia_sitelinks'],
                itemResults['ores_score'],
            ])), file=outputFile)

def logException(exception):
    with open('error.log', 'a') as outputFile:
        print(exception, file=outputFile)

def logErrorMessage(message):
    with open('error.log', 'a') as outputFile:
        print(message, file=outputFile)

def displayProgress(step, overwrite=True):
    character = ''
    if step < 0:
        # ANSI escape sequence for 'ERROR' (red)
        character = '\033[91m'
        step *= -1

    if step == 0:
        character += '.'
    elif step == 1:
        character += '-'
    elif step == 2:
        character += '+'
    elif step == 99:
        character += '|'

    if overwrite:
        character = '\b' + character

    # turn off color
    character += '\033[0m'

    print(character, end='', flush=True)
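
# fetchNumberOfStatements reads the statement count from each Item's page props.
# A trimmed response from the query API looks roughly like this (the page id,
# revid and claim count are hypothetical):
#
#   {"query": {"pages": {"316": {"title": "Q64",
#                                "pageprops": {"wb-claims": "771"},
#                                "revisions": [{"revid": 123456}]}}}}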
async def fetchNumberOfStatements(itemIds):
    # Returns a dictionary of items, each with its revision id and number of statements
    batchOfResults = {}
    async with ClientSession() as session:
        async with session.get(STATEMENT_COUNT_URL + '&titles=' + '|'.join(itemIds)) as statementCountResponse:
            statementCountResponse = await statementCountResponse.read()
            r = json.loads(str(statementCountResponse, 'utf-8'))

    for page in r['query']['pages'].values():
        if 'pageprops' not in page:
            logErrorMessage("Item " + page['title'] + ' does not exist or is a redirect.')
            continue

        # add revid and number of statements to the item's results dictionary in batchOfResults
        results = {
            'revid': page['revisions'][0]['revid'],
            'statements': page['pageprops']['wb-claims'],
        }
        batchOfResults.update({page['title']: results})

    return batchOfResults

async def fetchNumberOfSitelinks(batchOfResults):
    # Gets a dictionary of itemIds and their statement count results
    # and adds to it the total number of sitelinks and the number of Wikipedia sitelinks per itemId
    async with ClientSession() as session:
        async with session.get(SITELINK_COUNT_URL + '&ids=' + '|'.join(batchOfResults.keys())) as sitelinksResponse:
            sitelinksResponse = await sitelinksResponse.read()
            r = json.loads(str(sitelinksResponse, 'utf-8'))

    if 'entities' not in r:
        raise Exception("could not find sitelinks for items", batchOfResults.keys())

    for itemId, item in r['entities'].items():
        total_sitelinks = item['sitelinks']
        # Wikipedia site ids end in 'wiki'; Commons and Wikispecies are not Wikipedias
        wikipedia_sitelinks = {k: v for k, v in total_sitelinks.items()
                               if k.endswith('wiki') and k not in ['commonswiki', 'specieswiki']}
        # add total and wikipedia sitelinks to the item's results dictionary in batchOfResults
        results = {'total_sitelinks': len(total_sitelinks), 'wikipedia_sitelinks': len(wikipedia_sitelinks)}
        batchOfResults[itemId].update(results)
    return batchOfResults

async def checkConstraints(batchOfResults):
    items = '|'.join(batchOfResults.keys())
    async with ClientSession() as session:
        async with session.get(CONSTRAINT_CHECK_URL + '&id=' + items) as r:
            if r.status != 200:
                raise Exception(
                    'wbcheckconstraints API returned status code ' +
                    str(r.status) + ' for item(s) ' + items
                )

            r = await r.read()

    jsonResponse = json.loads(str(r, 'utf-8'))
    if 'error' in jsonResponse:
        raise Exception(
            'wbcheckconstraints API returned error \'' +
            jsonResponse['error']['code'] +
            '\' for items ' + items
        )
    for itemId in jsonResponse['wbcheckconstraints']:
        itemCheck = jsonResponse['wbcheckconstraints'][itemId]
        constraintCheckResults = parseItemCheck(itemCheck)
        batchOfResults[itemId].update(constraintCheckResults)

    return batchOfResults
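
# parseItemCheck walks the nested wbcheckconstraints response: for every
# statement it visits the mainsnak results plus any qualifier and reference
# results. The structure it traverses is roughly (illustrative property id,
# heavily truncated):
#
#   {"claims": {"P31": [{"mainsnak": {"results": [{"status": "warning"}]},
#                        "qualifiers": {...},
#                        "references": [...]}]}}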
def parseItemCheck(jsonConstraintCheckResponse):
    results = {
        'violations_mandatory': 0,
        'violations_normal': 0,
        'violations_suggestion': 0,
        'violated_statements': 0,
        'statement_is_violated': False
    }
    claims = jsonConstraintCheckResponse['claims']

    # claims is a list (not a dict) if it's empty... yikes.
    if not isinstance(claims, dict):
        # no statements -> no violations
        return results

    for (property_id, statement_group) in claims.items():
        for statement in statement_group:
            results['statement_is_violated'] = False

            violated_mainsnaks = statement['mainsnak']['results']
            for violated_mainsnak in violated_mainsnaks:
                results = countResults(violated_mainsnak['status'], results)

            if 'qualifiers' in statement.keys():
                qualifier_items = statement['qualifiers'].items()
                for (qualifier_property_id, qualifier_item) in qualifier_items:
                    for qualifier_constraint_check in qualifier_item:
                        qualifier_results = qualifier_constraint_check['results']
                        for qualifier_result in qualifier_results:
                            results = countResults(qualifier_result['status'], results)

            if 'references' in statement.keys():
                reference_items = statement['references']
                for reference_item in reference_items:
                    for (snak_property_id, reference_constraint_checks) in reference_item['snaks'].items():
                        for reference_constraint_check in reference_constraint_checks:
                            reference_results = reference_constraint_check['results']
                            for reference_result in reference_results:
                                results = countResults(reference_result['status'], results)

    del results['statement_is_violated']
    return results

def countResults(status, results):
    # 'bad-parameters' means the constraint itself is broken; ignore it
    if status == 'bad-parameters':
        return results

    if status == 'violation':
        results['violations_mandatory'] += 1
    elif status == 'warning':
        results['violations_normal'] += 1
    elif status == 'suggestion':
        results['violations_suggestion'] += 1
    else:
        # any other status (e.g. 'compliance') is not a violation
        return results

    if not results['statement_is_violated']:
        results['statement_is_violated'] = True
        results['violated_statements'] += 1

    return results

async def checkQualityByBatch(batchOfItems):
    try:
        batchOfItems = await checkConstraints(batchOfItems)
    except Exception as ex:
        logErrorMessage("failed to check quality constraints on items " +
                        '|'.join(batchOfItems.keys()))
        logErrorMessage("now checking them one-by-one")
        logException(ex)
        for itemId, itemResults in batchOfItems.items():
            checkedItemResults = await checkQualityByItem(itemId, itemResults)
            batchOfItems[itemId].update(checkedItemResults)

    return batchOfItems

async def checkQualityByItem(itemId, itemResults):
    try:
        checkedBatch = await checkConstraints({itemId: itemResults})
    except Exception as ex:
        logErrorMessage("failed to check quality constraints on item " + itemId)
        logException(ex)
        return {'failed': True}

    return checkedBatch[itemId]
async def fetchOresScore(batchOfItems):
    # collect Q-ids and revids from the items dictionary (revid -> Q-id)
    itemIds = {}
    for itemId, results in batchOfItems.items():
        itemIds[results['revid']] = itemId

    r = await make_liftwing_calls(
        wiki_id="wikidatawiki",
        models=["damaging", "goodfaith", "itemquality", "itemtopic"],
        rev_ids=list(itemIds.keys()),
    )

    if 'wikidatawiki' not in r:
        logErrorMessage("no ORES scores found for items " + '|'.join(itemIds.values()))
        return batchOfItems

    for revid, score in r['wikidatawiki']['scores'].items():
        itemId = itemIds[int(revid)]
        # only the itemquality score is used for the output
        probability = score['itemquality']['score']['probability']
        # weighted sum of the class probabilities, see ORES_WEIGHTS above
        weightedSum = 0
        for x in probability:
            weightedSum += probability[x] * ORES_WEIGHTS[x]
        batchOfItems[itemId].update({'ores_score': round(weightedSum, 2)})

    return batchOfItems

async def main(argv):
    numberOfItems, outputFileName, inputFileName = parseArguments(argv)

    printHeader(outputFileName)

    if numberOfItems:
        # we use randomly generated Q-IDs
        batchesOfItems = queryRandomItems(int(numberOfItems))
    else:
        # we read the Q-IDs from a file
        batchesOfItems = queryItemsFromFile(inputFileName)

    async for batch in batchesOfItems:
        itemsWithSitelinks = await fetchNumberOfSitelinks(batch)
        itemsWithConstraintChecks = await checkQualityByBatch(itemsWithSitelinks)
        itemsWithOresScore = await fetchOresScore(itemsWithConstraintChecks)
        printResults(itemsWithOresScore, outputFileName)
        print('', len(itemsWithOresScore))

    print()

loop = asyncio.new_event_loop()
loop.run_until_complete(main(sys.argv[1:]))
loop.close()
--------------------------------------------------------------------------------